Implement Unicode compatibility decompositions

author Behdad Esfahbod <behdad@behdad.org>

Wed, 1 Aug 2012 01:36:16 +0000 (21:36 -0400)

committer Behdad Esfahbod <behdad@behdad.org>

Wed, 1 Aug 2012 01:36:16 +0000 (21:36 -0400)
author Behdad Esfahbod <behdad@behdad.org>
Wed, 1 Aug 2012 01:36:16 +0000 (21:36 -0400)
committer Behdad Esfahbod <behdad@behdad.org>
Wed, 1 Aug 2012 01:36:16 +0000 (21:36 -0400)
diff --git a/src/hb-glib.cc b/src/hb-glib.cc

index 6b655dd..5246363 100644 (file)
--- a/src/hb-glib.cc
+++ b/src/hb-glib.cc
@@ -336,6 +336,36 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
    return ret;
  }
  
+/* Unicode-funcs callback backed by GLib: store the full compatibility (NFKD)
+ * decomposition of @u in @decomposed and return its length in codepoints.
+ * @decomposed is handed to GLib with a capacity of
+ * HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints. */
+static unsigned int
+hb_glib_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
+                                        hb_codepoint_t      u,
+                                        hb_codepoint_t     *decomposed,
+                                        void               *user_data HB_UNUSED)
+{
+#if GLIB_CHECK_VERSION(2,29,12)
+  return g_unichar_fully_decompose (u, TRUE, decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN);
+#else
+  /* If the user doesn't have GLib >= 2.29.12 we have to perform
+   * a round trip to UTF-8 and the associated memory management dance.
+   * This lives in the #else branch so it is not compiled as dead code
+   * after the early return above. */
+  gchar utf8[6];
+  gchar *utf8_decomposed, *c;
+  gsize utf8_len, utf8_decomposed_len, i;
+
+  /* Convert @u to UTF-8 and normalise it in NFKD mode. This performs the compatibility decomposition. */
+  utf8_len = g_unichar_to_utf8 (u, utf8);
+  utf8_decomposed = g_utf8_normalize (utf8, utf8_len, G_NORMALIZE_NFKD);
+
+  /* g_utf8_normalize() returns NULL when its input is not valid UTF-8;
+   * report "no decomposition" instead of passing NULL on. */
+  if (!utf8_decomposed)
+    return 0;
+
+  utf8_decomposed_len = g_utf8_strlen (utf8_decomposed, -1);
+
+  /* Strictly less-than: hb_unicode_decompose_compatibility() stores a
+   * terminating 0 at decomposed[len]. */
+  assert (utf8_decomposed_len < HB_UNICODE_MAX_DECOMPOSITION_LEN);
+
+  for (i = 0, c = utf8_decomposed; i < utf8_decomposed_len; i++, c = g_utf8_next_char (c))
+    *decomposed++ = g_utf8_get_char (c);
+
+  g_free (utf8_decomposed);
+
+  return utf8_decomposed_len;
+#endif
+}
  
  extern HB_INTERNAL const hb_unicode_funcs_t _hb_glib_unicode_funcs;
  const hb_unicode_funcs_t _hb_glib_unicode_funcs = {
diff --git a/src/hb-icu.cc b/src/hb-icu.cc

index 491c1c8..dce6103 100644 (file)
--- a/src/hb-icu.cc
+++ b/src/hb-icu.cc
@@ -207,7 +207,7 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
                           hb_codepoint_t     *b,
                           void               *user_data HB_UNUSED)
  {
-  UChar utf16[2], normalized[20];
+  UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
    int len;
    hb_bool_t ret, err;
    UErrorCode icu_err;
@@ -271,6 +271,40 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
    return ret;
  }
  
+/* Unicode-funcs callback backed by ICU: NFKD-normalise @u and store the
+ * result, converted to UTF-32, in @decomposed.  Returns the number of
+ * codepoints reported by u_strToUTF32(), or 0 as soon as any step reports
+ * a non-zero status. */
+static unsigned int
+hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
+                                       hb_codepoint_t      u,
+                                       hb_codepoint_t     *decomposed,
+                                       void               *user_data HB_UNUSED)
+{
+  UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
+  /* Plain int, matching hb_icu_unicode_decompose(); gint is a GLib typedef
+   * and has no business in the ICU backend. */
+  int len;
+  int32_t utf32_len;
+  hb_bool_t err;
+  UErrorCode icu_err;
+
+  /* Copy @u into a UTF-16 array to be passed to ICU. */
+  len = 0;
+  err = FALSE;
+  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);
+  if (err)
+    return 0;
+
+  /* Normalise the codepoint using NFKD mode.  Any non-zero status,
+   * warnings included, is treated as failure. */
+  icu_err = U_ZERO_ERROR;
+  len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
+  if (icu_err)
+    return 0;
+
+  /* Convert the decomposed form from UTF-16 to UTF-32. */
+  icu_err = U_ZERO_ERROR;
+  u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);
+  if (icu_err)
+    return 0;
+
+  return utf32_len;
+}
+
+
  
  extern HB_INTERNAL const hb_unicode_funcs_t _hb_icu_unicode_funcs;
  const hb_unicode_funcs_t _hb_icu_unicode_funcs = {
diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc

index d4b0b27..46c89ec 100644 (file)
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -62,7 +62,8 @@
   *     knowledge too.  We need to provide assistance to the itemizer.
   *
   *   - When a font does not support a character but supports its decomposition,
- *     well, use the decomposition.
+ *     well, use the decomposition (preferring the canonical decomposition, but
+ *     falling back to the compatibility decomposition if necessary).
   *
   *   - The Indic shaper requests decomposed output.  This will handle splitting
   *     matra for the Indic shaper.
@@ -111,29 +112,45 @@ decompose (hb_font_t *font, hb_buffer_t *buffer,
    return false;
  }
  
-static void
-decompose_current_glyph (hb_font_t *font, hb_buffer_t *buffer,
-                        bool shortest)
+static bool
+decompose_compatibility (hb_font_t *font, hb_buffer_t *buffer,
+                        hb_codepoint_t u)
  {
-  if (decompose (font, buffer, shortest, buffer->cur().codepoint))
-    buffer->skip_glyph ();
-  else
-    buffer->next_glyph ();
+  unsigned int len, i;
+  hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN];
+
+  len = hb_unicode_decompose_compatibility (buffer->unicode, u, decomposed);
+  if (!len)
+    return false;
+
+  hb_codepoint_t glyph;
+  for (i = 0; i < len; i++)
+    if (!hb_font_get_glyph (font, decomposed[i], 0, &glyph))
+      return false;
+
+  for (i = 0; i < len; i++)
+    output_glyph (buffer, decomposed[i]);
+
+  return true;
  }
  
  static void
-decompose_single_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
-                              bool will_recompose)
+decompose_current_character (hb_font_t *font, hb_buffer_t *buffer,
+                            bool shortest)
  {
    hb_codepoint_t glyph;
  
-  /* If recomposing and font supports this, we're good to go */
-  if (will_recompose && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) {
+  /* Kind of a cute waterfall here... */
+  if (shortest && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph))
+    buffer->next_glyph ();
+  else if (decompose (font, buffer, shortest, buffer->cur().codepoint))
+    buffer->skip_glyph ();
+  else if (!shortest && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph))
+    buffer->next_glyph ();
+  else if (decompose_compatibility (font, buffer, buffer->cur().codepoint))
+    buffer->skip_glyph ();
+  else
      buffer->next_glyph ();
-    return;
-  }
-
-  decompose_current_glyph (font, buffer, will_recompose);
  }
  
  static void
@@ -149,7 +166,7 @@ decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
      }
  
    while (buffer->idx < end)
-    decompose_current_glyph (font, buffer, false);
+    decompose_current_character (font, buffer, false);
  }
  
  static int
@@ -188,7 +205,7 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
          break;
  
      if (buffer->idx + 1 == end)
-      decompose_single_char_cluster (font, buffer, recompose);
+      decompose_current_character (font, buffer, recompose);
      else {
        decompose_multi_char_cluster (font, buffer, end);
        has_multichar_clusters = true;
diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh

index 1ce5adc..ba791eb 100644 (file)
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@@ -50,6 +50,7 @@
    HB_UNICODE_FUNC_IMPLEMENT (script) \
    HB_UNICODE_FUNC_IMPLEMENT (compose) \
    HB_UNICODE_FUNC_IMPLEMENT (decompose) \
+  HB_UNICODE_FUNC_IMPLEMENT (decompose_compatibility) \
    /* ^--- Add new callbacks here */
  
  /* Simple callbacks are those taking a hb_codepoint_t and returning a hb_codepoint_t */
diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc

index b05b290..f300fed 100644 (file)
--- a/src/hb-unicode.cc
+++ b/src/hb-unicode.cc
@@ -99,6 +99,15 @@ hb_unicode_decompose_nil (hb_unicode_funcs_t *ufuncs    HB_UNUSED,
  }
  
  
+static unsigned int
+hb_unicode_decompose_compatibility_nil (hb_unicode_funcs_t *ufuncs     HB_UNUSED,
+                                       hb_codepoint_t      u          HB_UNUSED,
+                                       hb_codepoint_t     *decomposed HB_UNUSED,
+                                       void               *user_data  HB_UNUSED)
+{
+  return 0; /* Nil callback: ignores every argument and always reports a length of 0. */
+}
+
  
  hb_unicode_funcs_t *
  hb_unicode_funcs_get_default (void)
@@ -312,6 +321,23 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs,
    return ufuncs->func.decompose (ufuncs, ab, a, b, ufuncs->user_data.decompose);
  }
  
+/* Run the decompose_compatibility callback of @ufuncs on @u.
+ *
+ * On return @decomposed holds the decomposition followed by a terminating 0,
+ * and the number of codepoints before the terminator is returned.  A result
+ * that is just @u itself, or whose reported length leaves no room for the
+ * terminator in an array of HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints, is
+ * reported as "no decomposition": @decomposed[0] is set to 0 and 0 is
+ * returned. */
+unsigned int
+hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs,
+                                   hb_codepoint_t      u,
+                                   hb_codepoint_t     *decomposed)
+{
+  unsigned int ret = ufuncs->func.decompose_compatibility (ufuncs, u,
+                                                          decomposed,
+                                                          ufuncs->user_data.decompose_compatibility);
+
+  /* The length check guards the terminator store below against a callback
+   * that returns a length the array cannot hold. */
+  if ((ret == 1 && u == decomposed[0]) ||
+      ret >= HB_UNICODE_MAX_DECOMPOSITION_LEN) {
+    decomposed[0] = 0;
+    return 0;
+  }
+
+  decomposed[ret] = 0;
+
+  return ret;
+}
  
  
  unsigned int
@@ -380,4 +406,3 @@ _hb_unicode_modified_combining_class (hb_unicode_funcs_t *ufuncs,
  
    return c;
  }
-
diff --git a/src/hb-unicode.h b/src/hb-unicode.h

index 808c6e1..2af2d67 100644 (file)
--- a/src/hb-unicode.h
+++ b/src/hb-unicode.h
@@ -1,7 +1,7 @@
  /*
   * Copyright © 2009  Red Hat, Inc.
   * Copyright © 2011  Codethink Limited
- * Copyright © 2011  Google, Inc.
+ * Copyright © 2011,2012  Google, Inc.
   *
   *  This is part of HarfBuzz, a text shaping library.
   *
@@ -122,6 +122,32 @@ typedef hb_bool_t                  (*hb_unicode_decompose_func_t)          (hb_unicode_funcs_t *ufuncs,
                                                                                  hb_codepoint_t     *b,
                                                                                  void               *user_data);
  
+/**
+ * hb_unicode_decompose_compatibility_func_t:
+ * @ufuncs: Unicode function structure
+ * @u: codepoint to decompose
+ * @decomposed: address of codepoint array (of length %HB_UNICODE_MAX_DECOMPOSITION_LEN) to write decomposition into
+ * @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_compatibility_func()
+ *
+ * Fully decompose @u to its Unicode compatibility decomposition. The codepoints of the decomposition will be written to @decomposed.
+ * The complete length of the decomposition will be returned.
+ *
+ * If @u has no compatibility decomposition, zero should be returned.
+ *
+ * The Unicode standard guarantees that a buffer of length %HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints will always be sufficient for any
+ * compatibility decomposition plus a terminating value of 0.  Consequently, @decomposed must be allocated by the caller to be at least this length.  Implementations
+ * of this function type must ensure that they do not write past the provided array.
+ *
+ * Return value: number of codepoints in the full compatibility decomposition of @u, or 0 if no decomposition available.
+ */
+typedef unsigned int                   (*hb_unicode_decompose_compatibility_func_t)    (hb_unicode_funcs_t *ufuncs,
+                                                                                        hb_codepoint_t      u,
+                                                                                        hb_codepoint_t     *decomposed,
+                                                                                        void               *user_data);
+
+/* See Unicode 6.1 for details on the maximum decomposition length. */
+#define HB_UNICODE_MAX_DECOMPOSITION_LEN (18+1) /* codepoints */
+
  /* setters */
  
  void
@@ -159,6 +185,10 @@ hb_unicode_funcs_set_decompose_func (hb_unicode_funcs_t *ufuncs,
                                      hb_unicode_decompose_func_t decompose_func,
                                      void *user_data, hb_destroy_func_t destroy);
  
+void
+hb_unicode_funcs_set_decompose_compatibility_func (hb_unicode_funcs_t *ufuncs,
+                                                  hb_unicode_decompose_compatibility_func_t decompose_compatibility_func,
+                                                  void *user_data, hb_destroy_func_t destroy);
  
  /* accessors */
  
@@ -193,6 +223,11 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs,
                       hb_codepoint_t     *a,
                       hb_codepoint_t     *b);
  
+unsigned int
+hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs,
+                                   hb_codepoint_t      u,
+                                   hb_codepoint_t     *decomposed);
+
  HB_END_DECLS
  
  #endif /* HB_UNICODE_H */
diff --git a/test/api/hb-test.h b/test/api/hb-test.h

index d569757..8655f41 100644 (file)
--- a/test/api/hb-test.h
+++ b/test/api/hb-test.h
@@ -33,6 +33,7 @@
  
  #include <stdlib.h>
  #include <string.h>
+#include <stdio.h>
  
  HB_BEGIN_DECLS
  
diff --git a/test/api/test-unicode.c b/test/api/test-unicode.c

index a420bf3..96c61dd 100644 (file)
--- a/test/api/test-unicode.c
+++ b/test/api/test-unicode.c
@@ -786,6 +786,7 @@ test_unicode_normalization (gconstpointer user_data)
  {
    hb_unicode_funcs_t *uf = (hb_unicode_funcs_t *) user_data;
    gunichar a, b, ab;
+  hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN];
  
  
    /* Test compose() */
@@ -849,6 +850,55 @@ test_unicode_normalization (gconstpointer user_data)
    g_assert (hb_unicode_decompose (uf, 0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8);
    g_assert (hb_unicode_decompose (uf, 0xCE20, &a, &b) && a == 0x110E && b == 0x1173);
  
+
+  /* Test decompose_compatibility() */
+
+  /* Not decomposable */
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x0041, decomposed) == 0);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x1F632, decomposed) == 0);
+
+  /* Singletons */
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x00B5, decomposed) == 1 && decomposed[0] == 0x03BC);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x03D6, decomposed) == 1 && decomposed[0] == 0x03C0);
+
+  /* Arabic compatibility */
+  g_assert (hb_unicode_decompose_compatibility (uf, 0xFB54, decomposed) == 1 && decomposed[0] == 0x067B);
+
+  /* Longest decomposition ever */
+  g_assert (18 <= HB_UNICODE_MAX_DECOMPOSITION_LEN);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0xFDFA, decomposed) == 18 && decomposed[17] == 0x0645);
+
+  /* Note: we deliberately don't test characters that have canonical decompositions but no
+   * compatibility decomposition against the decompose_compatibility() function, as we
+   * leave that behaviour up to implementations (for now). */
+
+  /* Spaces */
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2002, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2003, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2004, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2005, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2006, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2008, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2009, decomposed) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x200A, decomposed) == 1 && decomposed[0] == 0x0020);
+
+  /* Pairs */
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x0587, decomposed) == 2 &&
+            decomposed[0] == 0x0565 && decomposed[1] == 0x0582);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2017, decomposed) == 2 &&
+            decomposed[0] == 0x0020 && decomposed[1] == 0x0333);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2025, decomposed) == 2 &&
+            decomposed[0] == 0x002E && decomposed[1] == 0x002E);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2033, decomposed) == 2 &&
+            decomposed[0] == 0x2032 && decomposed[1] == 0x2032);
+
+  /* Triples */
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2026, decomposed) == 3 &&
+            decomposed[0] == 0x002E && decomposed[1] == 0x002E && decomposed[2] == 0x002E);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x2034, decomposed) == 3 &&
+            decomposed[0] == 0x2032 && decomposed[1] == 0x2032 && decomposed[2] == 0x2032);
+  g_assert (hb_unicode_decompose_compatibility (uf, 0x213B, decomposed) == 3 &&
+            decomposed[0] == 0x0046 && decomposed[1] == 0x0041 && decomposed[2] == 0x0058);
  }
author	Behdad Esfahbod <behdad@behdad.org>
	Wed, 1 Aug 2012 01:36:16 +0000 (21:36 -0400)
committer	Behdad Esfahbod <behdad@behdad.org>
	Wed, 1 Aug 2012 01:36:16 +0000 (21:36 -0400)
src/hb-glib.cc		patch \| blob \| history
src/hb-icu.cc		patch \| blob \| history
src/hb-ot-shape-normalize.cc		patch \| blob \| history
src/hb-unicode-private.hh		patch \| blob \| history
src/hb-unicode.cc		patch \| blob \| history
src/hb-unicode.h		patch \| blob \| history
test/api/hb-test.h		patch \| blob \| history
test/api/test-unicode.c		patch \| blob \| history