From c09d3c29d67a00d206809988458574031b8fda8f Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Thu, 24 Apr 2008 17:21:03 +0000 Subject: [PATCH] =?utf8?q?Part=20of=20Bug=2097545=20=E2=80=93=20Make=20pan?= =?utf8?q?go=5Fdefault=5Fbreak=20follow=20Unicode=20TR=20#29?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 2008-04-24 Behdad Esfahbod Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 * pango/break.c (pango_default_break): Make Grapheme Boundary code exactly follow UAX#29 of Unicode 5.1.0 svn path=/trunk/; revision=2615 --- ChangeLog | 7 ++ pango/break.c | 210 ++++++++++++++++++++++++---------------------------------- 2 files changed, 94 insertions(+), 123 deletions(-) diff --git a/ChangeLog b/ChangeLog index f715665..dc9a8b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 2008-04-24 Behdad Esfahbod + Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 + + * pango/break.c (pango_default_break): Make Grapheme Boundary code + exactly follow UAX#29 of Unicode 5.1.0 + +2008-04-24 Behdad Esfahbod + * pango/break.c (pango_default_break): Update GraphemeBoundary to Unicode 5.1.0. Pretty close now. Passes the TR14 test. diff --git a/pango/break.c b/pango/break.c index e6b10fc..69c8650 100644 --- a/pango/break.c +++ b/pango/break.c @@ -417,26 +417,6 @@ static const CharJamoProps HangulJamoProps[] = { #define JAMO_TYPE(btype) \ (IS_JAMO(btype) ? (btype - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO) - - - -/* "virama script" is just an optimization; it includes a bunch of - * scripts without viramas in them - */ -#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF) -#define VIRAMA(wc) ((wc) == 0x094D || \ - (wc) == 0x09CD || \ - (wc) == 0x0A4D || \ - (wc) == 0x0ACD || \ - (wc) == 0x0B4D || \ - (wc) == 0x0BCD || \ - (wc) == 0x0C4D || \ - (wc) == 0x0CCD || \ - (wc) == 0x0D4D || \ - (wc) == 0x0DCA || \ - (wc) == 0x0F84 || \ - (wc) == 0x1039 || \ - (wc) == 0x17D2) /* Types of Japanese characters */ #define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF) #define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF) @@ -450,10 +430,6 @@ static const CharJamoProps HangulJamoProps[] = { #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3) #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc)) -#define LOGICAL_ORDER_EXCEPTION(wc) (((wc) >= 0x0E40 && (wc) <= 0x0E44) || \ - ((wc) >= 0x0EC0 && (wc) <= 0x0EC4)) - - /* p. 132-133 of Unicode spec table 5-6 will help understand this */ typedef enum { @@ -534,6 +510,18 @@ pango_default_break (const gchar *text, GUnicodeBreakType prev_break_type; /* skips spaces */ gboolean prev_was_break_space; + /* See Grapheme_Cluster_Break Property Values table of UAX#29 */ + typedef enum + { + GB_Other, + GB_ControlCRLF, + GB_Extend, + GB_Prepend, + GB_SpacingMark, + GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */ + } GraphemeBreakType; + GraphemeBreakType prev_GB_type = GB_Other; + WordType current_word_type = WordNone; gunichar last_word_letter = 0; gunichar base_character = 0; @@ -638,105 +626,81 @@ pango_default_break (const gchar *text, /* ---- Cursor position breaks (Grapheme breaks) ---- */ - /* TODO: This is quite close to TR29 of Unicode 5.1 now. - * We need to implement Extend and SpacingMark support, as well - * as checking the category exceptions. - */ - - if (wc == '\n') - { - /* Break before line feed unless prev char is a CR */ - - if (prev_wc != '\r') - attrs[i].is_cursor_position = TRUE; - else - attrs[i].is_cursor_position = FALSE; - } - else if (prev_type == G_UNICODE_CONTROL || - prev_type == G_UNICODE_FORMAT) - { - /* break after all format or control characters */ - attrs[i].is_cursor_position = TRUE; - } - else - { - switch (type) - { - case G_UNICODE_CONTROL: - case G_UNICODE_FORMAT: - /* Break before all format or control characters */ - attrs[i].is_cursor_position = TRUE; - break; - - case G_UNICODE_COMBINING_MARK: - case G_UNICODE_ENCLOSING_MARK: - case G_UNICODE_NON_SPACING_MARK: - /* Unicode spec includes "Combining marks plus Tibetan - * subjoined characters" as joining chars, but lists the - * Tibetan subjoined characters as combining marks, and - * g_unichar_type() returns NON_SPACING_MARK for the Tibetan - * subjoined characters. So who knows, beats me. - */ - - /* It's a joining character, break only if preceded by - * control or format; we already handled the case where - * it was preceded earlier, so here we know it wasn't, - * don't break - */ - attrs[i].is_cursor_position = FALSE; - break; - - case G_UNICODE_LOWERCASE_LETTER: - case G_UNICODE_MODIFIER_LETTER: - case G_UNICODE_OTHER_LETTER: - case G_UNICODE_TITLECASE_LETTER: - case G_UNICODE_UPPERCASE_LETTER: - - /* Handle non-Hangul-syllable non-combining chars */ - - /* Break before Jamo if they are in a broken sequence or - * next to non-Jamo; break if preceded by Jamo; don't - * break if a letter is preceded by a virama; break in - * all other cases. No need to check whether we are or are - * preceded by Jamo explicitly, since a Jamo is not - * a virama, we just break in all cases where we - * aren't a or preceded by a virama. Don't fool with - * viramas if we aren't part of a script that uses them. - */ - - if (makes_hangul_syllable) - { - attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */ - break; - } - - if (VIRAMA_SCRIPT (wc)) - { - /* Check whether we're preceded by a virama; this - * could use some optimization. - */ - if (VIRAMA (prev_wc)) - { - attrs[i].is_cursor_position = FALSE; - break; - } - } - - /* fall through */ - - default: - - if (LOGICAL_ORDER_EXCEPTION (prev_wc)) - { - attrs[i].is_cursor_position = FALSE; /* Rule GB9b */ - break; - } - - /* Some weirdo char, just break here, why not */ - attrs[i].is_cursor_position = TRUE; /* Rule GB10 */ - break; - } - } + { + /* Find the GraphemeBreakType of wc */ + GraphemeBreakType GB_type = GB_Other; + switch (type) + { + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + if (wc != 0x200C && wc != 0x200D) + GB_type = GB_ControlCRLF; + else + GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */ + break; + + case G_UNICODE_OTHER_LETTER: + if (makes_hangul_syllable) + GB_type = GB_InHangulSyllable; + else if ((wc & 0x0E00) == 0x0E00) + { + /* Thai and Lao stuff hardcoded in UAX#29 */ + if ((wc >= 0x0E40 && wc <= 0x0E44) || (wc >= 0x0EC0 && wc <= 0x0EC4)) + GB_type = GB_Prepend; /* Prepend */ + else if (wc == 0x0E30 || wc == 0x0E32 || wc == 0x0E33 || wc == 0x0E45 || + wc == 0x0EB0 || wc == 0x0EB2 || wc == 0x0EB3) + GB_type = GB_Extend; /* Exceptions in the Extend definition */ + } + break; + + case G_UNICODE_MODIFIER_LETTER: + if (wc >= 0xFF9E && wc <= 0xFF9F) + GB_type = GB_Extend; /* Other_Grapheme_Extend */ + break; + + case G_UNICODE_COMBINING_MARK: + GB_type = GB_SpacingMark; /* SpacingMark */ + if (wc >= 0x0900) + { + if (wc == 0x09BE || wc == 0x09D7 || + wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 || + wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 || + wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF || + wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172)) + GB_type = GB_Extend; /* Other_Grapheme_Extend */ + } + break; + + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_NON_SPACING_MARK: + GB_type = GB_Extend; /* Grapheme_Extend */ + break; + + default: + break; + } + + /* Grapheme Cluster Boundary Rules */ + /* We apply Rules GB1 and GB2 at the end of the function */ + if (wc == '\n' && prev_wc == '\r') + attrs[i].is_cursor_position = FALSE; /* Rule GB3 */ + else if (GB_type == GB_ControlCRLF || prev_GB_type == GB_ControlCRLF) + attrs[i].is_cursor_position = TRUE; /* Rules GB4 and GB5 */ + else if (GB_type == GB_InHangulSyllable) + attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */ + else if (GB_type == GB_Extend) + attrs[i].is_cursor_position = FALSE; /* Rule GB9 */ + else if (GB_type == GB_SpacingMark) + attrs[i].is_cursor_position = FALSE; /* Rule GB9a */ + else if (prev_GB_type == GB_Prepend) + attrs[i].is_cursor_position = FALSE; /* Rule GB9b */ + else + attrs[i].is_cursor_position = TRUE; /* Rule GB10 */ + + prev_GB_type = GB_type; + } /* If this is a grapheme boundary, we have to decide if backspace * deletes a character or the whole grapheme cluster */ -- 2.7.4