From c09d3c29d67a00d206809988458574031b8fda8f Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod <behdad@gnome.org>
Date: Thu, 24 Apr 2008 17:21:03 +0000
Subject: [PATCH] =?utf8?q?Part=20of=20Bug=2097545=20=E2=80=93=20Make=20pan?=
 =?utf8?q?go=5Fdefault=5Fbreak=20follow=20Unicode=20TR=20#29?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

2008-04-24  Behdad Esfahbod  <behdad@gnome.org>

        Part of Bug 97545 â Make pango_default_break follow Unicode TR #29

        * pango/break.c (pango_default_break): Make Grapheme Boundary code
        exactly follow UAX#29 of Unicode 5.1.0


svn path=/trunk/; revision=2615
---
 ChangeLog     |   7 ++
 pango/break.c | 210 ++++++++++++++++++++++++----------------------------------
 2 files changed, 94 insertions(+), 123 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index f715665..dc9a8b8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2008-04-24  Behdad Esfahbod  <behdad@gnome.org>
 
+	Part of Bug 97545 â Make pango_default_break follow Unicode TR #29
+
+	* pango/break.c (pango_default_break): Make Grapheme Boundary code
+	exactly follow UAX#29 of Unicode 5.1.0
+
+2008-04-24  Behdad Esfahbod  <behdad@gnome.org>
+
 	* pango/break.c (pango_default_break): Update GraphemeBoundary to
 	Unicode 5.1.0.  Pretty close now.  Passes the TR14 test.
 
diff --git a/pango/break.c b/pango/break.c
index e6b10fc..69c8650 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -417,26 +417,6 @@ static const CharJamoProps HangulJamoProps[] = {
 #define JAMO_TYPE(btype)      \
 	(IS_JAMO(btype) ? (btype - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
 
-
-
-
-/* "virama script" is just an optimization; it includes a bunch of
- * scripts without viramas in them
- */
-#define VIRAMA_SCRIPT(wc)        ((wc) >= 0x0901 && (wc) <= 0x17FF)
-#define VIRAMA(wc) ((wc) == 0x094D || \
-		    (wc) == 0x09CD || \
-		    (wc) == 0x0A4D || \
-		    (wc) == 0x0ACD || \
-		    (wc) == 0x0B4D || \
-		    (wc) == 0x0BCD || \
-		    (wc) == 0x0C4D || \
-		    (wc) == 0x0CCD || \
-		    (wc) == 0x0D4D || \
-		    (wc) == 0x0DCA || \
-		    (wc) == 0x0F84 || \
-		    (wc) == 0x1039 || \
-		    (wc) == 0x17D2)
 /* Types of Japanese characters */
 #define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
 #define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
@@ -450,10 +430,6 @@ static const CharJamoProps HangulJamoProps[] = {
 #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
 #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))
 
-#define LOGICAL_ORDER_EXCEPTION(wc) (((wc) >= 0x0E40 && (wc) <= 0x0E44) || \
-				     ((wc) >= 0x0EC0 && (wc) <= 0x0EC4))
-
-
 /* p. 132-133 of Unicode spec table 5-6 will help understand this */
 typedef enum
 {
@@ -534,6 +510,18 @@ pango_default_break (const gchar   *text,
   GUnicodeBreakType prev_break_type; /* skips spaces */
   gboolean prev_was_break_space;
 
+  /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    GB_Other,
+    GB_ControlCRLF,
+    GB_Extend,
+    GB_Prepend,
+    GB_SpacingMark,
+    GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
+  } GraphemeBreakType;
+  GraphemeBreakType prev_GB_type = GB_Other;
+
   WordType current_word_type = WordNone;
   gunichar last_word_letter = 0;
   gunichar base_character = 0;
@@ -638,105 +626,81 @@ pango_default_break (const gchar   *text,
 
       /* ---- Cursor position breaks (Grapheme breaks) ---- */
 
-      /* TODO: This is quite close to TR29 of Unicode 5.1 now.
-       * We need to implement Extend and SpacingMark support, as well
-       * as checking the category exceptions.
-       */
-
-      if (wc == '\n')
-	{
-	  /* Break before line feed unless prev char is a CR */
-
-	  if (prev_wc != '\r')
-	    attrs[i].is_cursor_position = TRUE;
-	  else
-	    attrs[i].is_cursor_position = FALSE;
-	}
-      else if (prev_type == G_UNICODE_CONTROL ||
-	       prev_type == G_UNICODE_FORMAT)
-	{
-	  /* break after all format or control characters */
-	  attrs[i].is_cursor_position = TRUE;
-	}
-      else
-	{
-	  switch (type)
-	    {
-	    case G_UNICODE_CONTROL:
-	    case G_UNICODE_FORMAT:
-	      /* Break before all format or control characters */
-	      attrs[i].is_cursor_position = TRUE;
-	      break;
-
-	    case G_UNICODE_COMBINING_MARK:
-	    case G_UNICODE_ENCLOSING_MARK:
-	    case G_UNICODE_NON_SPACING_MARK:
-	      /* Unicode spec includes "Combining marks plus Tibetan
-	       * subjoined characters" as joining chars, but lists the
-	       * Tibetan subjoined characters as combining marks, and
-	       * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
-	       * subjoined characters. So who knows, beats me.
-	       */
-
-	      /* It's a joining character, break only if preceded by
-	       * control or format; we already handled the case where
-	       * it was preceded earlier, so here we know it wasn't,
-	       * don't break
-	       */
-	      attrs[i].is_cursor_position = FALSE;
-	      break;
-
-	    case G_UNICODE_LOWERCASE_LETTER:
-	    case G_UNICODE_MODIFIER_LETTER:
-	    case G_UNICODE_OTHER_LETTER:
-	    case G_UNICODE_TITLECASE_LETTER:
-	    case G_UNICODE_UPPERCASE_LETTER:
-
-	      /* Handle non-Hangul-syllable non-combining chars */
-
-	      /* Break before Jamo if they are in a broken sequence or
-	       * next to non-Jamo; break if preceded by Jamo; don't
-	       * break if a letter is preceded by a virama; break in
-	       * all other cases. No need to check whether we are or are
-	       * preceded by Jamo explicitly, since a Jamo is not
-	       * a virama, we just break in all cases where we
-	       * aren't a or preceded by a virama.  Don't fool with
-	       * viramas if we aren't part of a script that uses them.
-	       */
-
-	      if (makes_hangul_syllable)
-	        {
-		  attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */
-		  break;
-		}
-
-	      if (VIRAMA_SCRIPT (wc))
-		{
-		  /* Check whether we're preceded by a virama; this
-		   * could use some optimization.
-		   */
-		  if (VIRAMA (prev_wc))
-		    {
-		      attrs[i].is_cursor_position = FALSE;
-		      break;
-		    }
-		}
-
-	      /* fall through */
-
-	    default:
-
-	      if (LOGICAL_ORDER_EXCEPTION (prev_wc))
-	        {
-		  attrs[i].is_cursor_position = FALSE; /* Rule GB9b */
-		  break;
-		}
-
-	      /* Some weirdo char, just break here, why not */
-	      attrs[i].is_cursor_position = TRUE; /* Rule GB10 */
-	      break;
-	    }
-	}
+      {
+        /* Find the GraphemeBreakType of wc */
+	GraphemeBreakType GB_type = GB_Other;
+	switch (type)
+	  {
+	  case G_UNICODE_CONTROL:
+	  case G_UNICODE_FORMAT:
+	  case G_UNICODE_LINE_SEPARATOR:
+	  case G_UNICODE_PARAGRAPH_SEPARATOR:
+	    if (wc != 0x200C && wc != 0x200D)
+	      GB_type = GB_ControlCRLF;
+	    else
+	      GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
+	    break;
+
+	  case G_UNICODE_OTHER_LETTER:
+	    if (makes_hangul_syllable)
+	      GB_type = GB_InHangulSyllable;
+	    else if ((wc & 0x0E00) == 0x0E00)
+	      {
+	        /* Thai and Lao stuff hardcoded in UAX#29 */
+		if ((wc >= 0x0E40 && wc <= 0x0E44) || (wc >= 0x0EC0 && wc <= 0x0EC4))
+		  GB_type = GB_Prepend; /* Prepend */
+		else if (wc == 0x0E30 || wc == 0x0E32 || wc == 0x0E33 || wc == 0x0E45 ||
+			 wc == 0x0EB0 || wc == 0x0EB2 || wc == 0x0EB3)
+		  GB_type = GB_Extend; /* Exceptions in the Extend definition */
+	      }
+	    break;
+
+	  case G_UNICODE_MODIFIER_LETTER:
+	    if (wc >= 0xFF9E && wc <= 0xFF9F)
+	      GB_type = GB_Extend; /* Other_Grapheme_Extend */
+	    break;
+
+	  case G_UNICODE_COMBINING_MARK:
+	    GB_type = GB_SpacingMark; /* SpacingMark */
+	    if (wc >= 0x0900)
+	      {
+		if (wc == 0x09BE || wc == 0x09D7 ||
+		    wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
+		    wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
+		    wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
+		    wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
+		  GB_type = GB_Extend; /* Other_Grapheme_Extend */
+	      }
+	    break;
+
+	  case G_UNICODE_ENCLOSING_MARK:
+	  case G_UNICODE_NON_SPACING_MARK:
+	    GB_type = GB_Extend; /* Grapheme_Extend */
+	    break;
+
+	  default:
+	    break;
+	  }
+
+	/* Grapheme Cluster Boundary Rules */
+	/* We apply Rules GB1 and GB2 at the end of the function */
+	if (wc == '\n' && prev_wc == '\r')
+	  attrs[i].is_cursor_position = FALSE; /* Rule GB3 */
+	else if (GB_type == GB_ControlCRLF || prev_GB_type == GB_ControlCRLF)
+	  attrs[i].is_cursor_position = TRUE; /* Rules GB4 and GB5 */
+	else if (GB_type == GB_InHangulSyllable)
+	  attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */
+	else if (GB_type == GB_Extend)
+	  attrs[i].is_cursor_position = FALSE; /* Rule GB9 */
+	else if (GB_type == GB_SpacingMark)
+	  attrs[i].is_cursor_position = FALSE; /* Rule GB9a */
+	else if (prev_GB_type == GB_Prepend)
+	  attrs[i].is_cursor_position = FALSE; /* Rule GB9b */
+	else
+	  attrs[i].is_cursor_position = TRUE;  /* Rule GB10 */
+
+	prev_GB_type = GB_type;
+      }
 
       /* If this is a grapheme boundary, we have to decide if backspace
        * deletes a character or the whole grapheme cluster */
-- 
2.7.4