From 35cfe752f8e36e6c7642e4dd0e25261f9aca8842 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Thu, 24 Apr 2008 15:59:50 +0000 Subject: [PATCH] Update GraphemeBoundary to Unicode 5.1.0. Pretty close now. Passes the 2008-04-24 Behdad Esfahbod * pango/break.c (pango_default_break): Update GraphemeBoundary to Unicode 5.1.0. Pretty close now. Passes the TR14 test. svn path=/trunk/; revision=2614 --- ChangeLog | 5 ++++ pango/break.c | 90 +++++++++++++++++++++++++++++++++++------------------------ 2 files changed, 59 insertions(+), 36 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3c25ad7..f715665 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2008-04-24 Behdad Esfahbod + * pango/break.c (pango_default_break): Update GraphemeBoundary to + Unicode 5.1.0. Pretty close now. Passes the TR14 test. + +2008-04-24 Behdad Esfahbod + * pango/break.c (pango_default_break): Allow line break at the end of string. UAX#14 rule LB3 says "Always break at the end of text." With this test, Pango passes the LineBreakTest.txt, sans the bug in diff --git a/pango/break.c b/pango/break.c index 5a1e917..e6b10fc 100644 --- a/pango/break.c +++ b/pango/break.c @@ -450,6 +450,9 @@ static const CharJamoProps HangulJamoProps[] = { #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3) #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc)) +#define LOGICAL_ORDER_EXCEPTION(wc) (((wc) >= 0x0E40 && (wc) <= 0x0E44) || \ + ((wc) >= 0x0EC0 && (wc) <= 0x0EC4)) + /* p. 132-133 of Unicode spec table 5-6 will help understand this */ typedef enum @@ -550,7 +553,7 @@ pango_default_break (const gchar *text, next = text; - prev_type = (GUnicodeType) -1; + prev_type = G_UNICODE_PARAGRAPH_SEPARATOR; prev_break_type = G_UNICODE_BREAK_UNKNOWN; prev_was_break_space = FALSE; prev_wc = 0; @@ -635,6 +638,11 @@ pango_default_break (const gchar *text, /* ---- Cursor position breaks (Grapheme breaks) ---- */ + /* TODO: This is quite close to TR29 of Unicode 5.1 now. + * We need to implement Extend and SpacingMark support, as well + * as checking the category exceptions. + */ + if (wc == '\n') { /* Break before line feed unless prev char is a CR */ @@ -644,15 +652,9 @@ pango_default_break (const gchar *text, else attrs[i].is_cursor_position = FALSE; } - else if (i == 0 || - prev_type == G_UNICODE_CONTROL || + else if (prev_type == G_UNICODE_CONTROL || prev_type == G_UNICODE_FORMAT) { - /* Break at first position (must be special cased, or if the - * first char is say a combining mark there won't be a - * cursor position at the start, which seems wrong to me - * ???? - maybe it makes sense though, who knows) - */ /* break after all format or control characters */ attrs[i].is_cursor_position = TRUE; } @@ -690,42 +692,48 @@ pango_default_break (const gchar *text, case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: + /* Handle non-Hangul-syllable non-combining chars */ + + /* Break before Jamo if they are in a broken sequence or + * next to non-Jamo; break if preceded by Jamo; don't + * break if a letter is preceded by a virama; break in + * all other cases. No need to check whether we are or are + * preceded by Jamo explicitly, since a Jamo is not + * a virama, we just break in all cases where we + * aren't a or preceded by a virama. Don't fool with + * viramas if we aren't part of a script that uses them. + */ + if (makes_hangul_syllable) - attrs[i].is_cursor_position = FALSE; - else + { + attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */ + break; + } + + if (VIRAMA_SCRIPT (wc)) { - /* Handle non-Hangul-syllable non-combining chars */ - - /* Break before Jamo if they are in a broken sequence or - * next to non-Jamo; break if preceded by Jamo; don't - * break if a letter is preceded by a virama; break in - * all other cases. No need to check whether we are or are - * preceded by Jamo explicitly, since a Jamo is not - * a virama, we just break in all cases where we - * aren't a or preceded by a virama. Don't fool with - * viramas if we aren't part of a script that uses them. + /* Check whether we're preceded by a virama; this + * could use some optimization. */ - - if (VIRAMA_SCRIPT (wc)) + if (VIRAMA (prev_wc)) { - /* Check whether we're preceded by a virama; this - * could use some optimization. - */ - if (VIRAMA (prev_wc)) - attrs[i].is_cursor_position = FALSE; - else - attrs[i].is_cursor_position = TRUE; - } - else - { - attrs[i].is_cursor_position = TRUE; + attrs[i].is_cursor_position = FALSE; + break; } } - break; + + /* fall through */ default: + + if (LOGICAL_ORDER_EXCEPTION (prev_wc)) + { + attrs[i].is_cursor_position = FALSE; /* Rule GB9b */ + break; + } + /* Some weirdo char, just break here, why not */ - attrs[i].is_cursor_position = TRUE; + attrs[i].is_cursor_position = TRUE; /* Rule GB10 */ break; } } @@ -743,7 +751,7 @@ pango_default_break (const gchar *text, g_assert (prev_break_type != G_UNICODE_BREAK_SPACE); - attrs[i].is_line_break = done; /* XXX ugly */ + attrs[i].is_line_break = FALSE; attrs[i].is_mandatory_break = FALSE; if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary, @@ -1468,6 +1476,16 @@ pango_default_break (const gchar *text, type != G_UNICODE_NON_SPACING_MARK) base_character = wc; } + i--; + + attrs[i].is_line_break = TRUE; /* Rule LB3 */ + attrs[0].is_line_break = FALSE; /* Rule LB2 */ + + attrs[i].is_word_end = TRUE; /* Rule WB2 */ + attrs[0].is_word_start = TRUE; /* Rule WB1 */ + + attrs[i].is_cursor_position = TRUE; /* Rule GB2 */ + attrs[0].is_cursor_position = TRUE; /* Rule GB1 */ } static gboolean -- 2.7.4