Replace zerowidth invisible chars with a zero-advance space glyph

author Behdad Esfahbod <behdad@behdad.org>

Wed, 9 May 2012 13:04:13 +0000 (15:04 +0200)

committer Behdad Esfahbod <behdad@behdad.org>

Wed, 9 May 2012 13:04:13 +0000 (15:04 +0200)
author Behdad Esfahbod <behdad@behdad.org>
Wed, 9 May 2012 13:04:13 +0000 (15:04 +0200)
committer Behdad Esfahbod <behdad@behdad.org>
Wed, 9 May 2012 13:04:13 +0000 (15:04 +0200)
diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc

index 880a6b9..746f2d7 100644 (file)
--- a/src/hb-ot-shape-complex-arabic.cc
+++ b/src/hb-ot-shape-complex-arabic.cc
@@ -25,6 +25,7 @@
   */
  
  #include "hb-ot-shape-complex-private.hh"
+#include "hb-ot-shape-private.hh"
  
  
  
@@ -248,7 +249,7 @@ _hb_ot_shape_complex_setup_masks_arabic (hb_ot_map_t *map, hb_buffer_t *buffer,
  
    for (unsigned int i = 0; i < count; i++)
    {
-    unsigned int this_type = get_joining_type (buffer->info[i].codepoint, (hb_unicode_general_category_t) buffer->info[i].general_category());
+    unsigned int this_type = get_joining_type (buffer->info[i].codepoint, _hb_glyph_info_get_general_category (&buffer->info[i]));
  
      if (unlikely (this_type == JOINING_TYPE_T)) {
        buffer->info[i].arabic_shaping_action() = NONE;
diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc

index 6f14777..f198fba 100644 (file)
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@@ -433,24 +433,6 @@ found_non_indic (const hb_ot_map_t *map, hb_buffer_t *buffer, hb_mask_t *mask_ar
  #include "hb-ot-shape-complex-indic-machine.hh"
  
  static void
-remove_joiners (hb_buffer_t *buffer)
-{
-  /* For now we remove joiners.  However, Uniscbire seems to keep them
-   * and output a zero-width space glyph for them.  It is not clear to
-   * me how that is supposed to interact with GSUB. */
-
-  buffer->clear_output ();
-  unsigned int count = buffer->len;
-  for (buffer->idx = 0; buffer->idx < count;)
-    if (unlikely (is_joiner (buffer->info[buffer->idx])))
-      buffer->skip_glyph ();
-    else
-      buffer->next_glyph ();
-
-  buffer->swap_buffers ();
-}
-
-static void
  initial_reordering (const hb_ot_map_t *map,
                     hb_face_t *face,
                     hb_buffer_t *buffer,
@@ -462,8 +444,6 @@ initial_reordering (const hb_ot_map_t *map,
      mask_array[i] = map->get_1_mask (indic_basic_features[i].tag);
  
    find_syllables (map, buffer, mask_array);
-
-  remove_joiners (buffer);
  }
  
  static void
diff --git a/src/hb-ot-shape-complex-private.hh b/src/hb-ot-shape-complex-private.hh

index 38edaa0..3f99781 100644 (file)
--- a/src/hb-ot-shape-complex-private.hh
+++ b/src/hb-ot-shape-complex-private.hh
@@ -35,8 +35,8 @@
  
  
  /* buffer var allocations, used during the entire shaping process */
-#define general_category() var1.u8[0] /* unicode general_category (hb_unicode_general_category_t) */
-#define combining_class() var1.u8[1] /* unicode combining_class (uint8_t) */
+#define unicode_props0()       var1.u8[0] /* unicode general_category (low 7 bits) | zero-width flag (0x80) */
+#define unicode_props1()       var1.u8[1] /* unicode modified combining_class */
  
  /* buffer var allocations, used by complex shapers */
  #define complex_var_persistent_u8_0()  var2.u8[0]
diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc

index 0bcf7f5..4a378a8 100644 (file)
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@@ -68,19 +68,12 @@
   *     matra for the Indic shaper.
   */
  
-static inline void
-set_unicode_props (hb_glyph_info_t *info, hb_unicode_funcs_t *unicode)
-{
-  info->general_category() = hb_unicode_general_category (unicode, info->codepoint);
-  info->combining_class() = _hb_unicode_modified_combining_class (unicode, info->codepoint);
-}
-
  static void
  output_glyph (hb_font_t *font, hb_buffer_t *buffer,
               hb_codepoint_t glyph)
  {
    buffer->output_glyph (glyph);
-  set_unicode_props (&buffer->out_info[buffer->out_len - 1], buffer->unicode);
+  _hb_glyph_info_set_unicode_props (&buffer->out_info[buffer->out_len - 1], buffer->unicode);
  }
  
  static bool
@@ -163,8 +156,8 @@ decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
  static int
  compare_combining_class (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb)
  {
-  unsigned int a = pa->combining_class();
-  unsigned int b = pb->combining_class();
+  unsigned int a = _hb_glyph_info_get_modified_combining_class (pa);
+  unsigned int b = _hb_glyph_info_get_modified_combining_class (pb);
  
    return a < b ? -1 : a == b ? 0 : +1;
  }
@@ -214,12 +207,12 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
    count = buffer->len;
    for (unsigned int i = 0; i < count; i++)
    {
-    if (buffer->info[i].combining_class() == 0)
+    if (_hb_glyph_info_get_modified_combining_class (&buffer->info[i]) == 0)
        continue;
  
      unsigned int end;
      for (end = i + 1; end < count; end++)
-      if (buffer->info[end].combining_class() == 0)
+      if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0)
          break;
  
      /* We are going to do a bubble-sort.  Only do this if the
@@ -254,11 +247,11 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
      if (/* If mode is NOT COMPOSED_FULL (ie. it's COMPOSED_DIACRITICS), we don't try to
          * compose a CCC=0 character with it's preceding starter. */
         (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL ||
-        buffer->info[buffer->idx].combining_class() != 0) &&
+        _hb_glyph_info_get_modified_combining_class (&buffer->info[buffer->idx]) != 0) &&
         /* If there's anything between the starter and this char, they should have CCC
          * smaller than this character's. */
         (starter == buffer->out_len - 1 ||
-        buffer->out_info[buffer->out_len - 1].combining_class() < buffer->info[buffer->idx].combining_class()) &&
+        _hb_glyph_info_get_modified_combining_class (&buffer->out_info[buffer->out_len - 1]) < _hb_glyph_info_get_modified_combining_class (&buffer->info[buffer->idx])) &&
         /* And compose. */
         hb_unicode_compose (buffer->unicode,
                             buffer->out_info[starter].codepoint,
@@ -270,7 +263,7 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
        /* Composes. Modify starter and carry on. */
        buffer->out_info[starter].codepoint = composed;
        /* XXX update cluster */
-      set_unicode_props (&buffer->out_info[starter], buffer->unicode);
+      _hb_glyph_info_set_unicode_props (&buffer->out_info[starter], buffer->unicode);
  
        buffer->skip_glyph ();
        continue;
@@ -279,7 +272,7 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
      /* Blocked, or doesn't compose. */
      buffer->next_glyph ();
  
-    if (buffer->out_info[buffer->out_len - 1].combining_class() == 0)
+    if (_hb_glyph_info_get_modified_combining_class (&buffer->out_info[buffer->out_len - 1]) == 0)
        starter = buffer->out_len - 1;
    }
    buffer->swap_buffers ();
diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh

index 5fc69b1..df0c705 100644 (file)
--- a/src/hb-ot-shape-private.hh
+++ b/src/hb-ot-shape-private.hh
@@ -53,4 +53,31 @@ _hb_ot_shape (hb_font_t          *font,
               const hb_feature_t *features,
               unsigned int        num_features);
  
+/* Stores general category (plus 0x80 if the codepoint is zero-width) in props0 and modified combining class in props1. */
+static inline void
+_hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_unicode_funcs_t *unicode)
+{
+  info->unicode_props0() = ((unsigned int) hb_unicode_general_category (unicode, info->codepoint)) |
+                          (_hb_unicode_is_zero_width (info->codepoint) ? 0x80 : 0);
+  info->unicode_props1() = _hb_unicode_modified_combining_class (unicode, info->codepoint);
+}
+/* Returns the general category kept in props0, with the zero-width flag bit (0x80) masked off. */
+static inline hb_unicode_general_category_t
+_hb_glyph_info_get_general_category (const hb_glyph_info_t *info)
+{
+  return (hb_unicode_general_category_t) (info->unicode_props0() & 0x7F);
+}
+/* Returns the modified combining class kept in props1. */
+static inline unsigned int
+_hb_glyph_info_get_modified_combining_class (const hb_glyph_info_t *info)
+{
+  return info->unicode_props1();
+}
+/* Returns true if the zero-width flag bit (0x80) is set in props0. */
+static inline hb_bool_t
+_hb_glyph_info_is_zero_width (const hb_glyph_info_t *info)
+{
+  return !!(info->unicode_props0() & 0x80);
+}
+
  #endif /* HB_OT_SHAPE_PRIVATE_HH */
diff --git a/src/hb-ot-shape.cc b/src/hb-ot-shape.cc

index 167b1d7..dbfcf18 100644 (file)
--- a/src/hb-ot-shape.cc
+++ b/src/hb-ot-shape.cc
@@ -43,6 +43,7 @@ hb_tag_t common_features[] = {
    HB_TAG('r','l','i','g'),
  };
  
+
  hb_tag_t horizontal_features[] = {
    HB_TAG('c','a','l','t'),
    HB_TAG('c','l','i','g'),
@@ -170,19 +171,12 @@ hb_ot_shape_setup_masks (hb_ot_shape_context_t *c)
  
  /* Prepare */
  
-static inline void
-set_unicode_props (hb_glyph_info_t *info, hb_unicode_funcs_t *unicode)
-{
-  info->general_category() = hb_unicode_general_category (unicode, info->codepoint);
-  info->combining_class() = _hb_unicode_modified_combining_class (unicode, info->codepoint);
-}
-
  static void
  hb_set_unicode_props (hb_buffer_t *buffer)
  {
    unsigned int count = buffer->len;
    for (unsigned int i = 0; i < count; i++)
-    set_unicode_props (&buffer->info[i], buffer->unicode);
+    _hb_glyph_info_set_unicode_props (&buffer->info[i], buffer->unicode);
  }
  
  static void
@@ -190,7 +184,7 @@ hb_form_clusters (hb_buffer_t *buffer)
  {
    unsigned int count = buffer->len;
    for (unsigned int i = 1; i < count; i++)
-    if (FLAG (buffer->info[i].general_category()) &
+    if (FLAG (_hb_glyph_info_get_general_category (&buffer->info[i])) &
         (FLAG (HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK) |
          FLAG (HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) |
          FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)))
@@ -379,6 +373,23 @@ hb_position_complex_fallback_visual (hb_ot_shape_context_t *c)
    hb_truetype_kern (c);
  }
  
+static void
+hb_hide_zerowidth (hb_ot_shape_context_t *c)
+{
+  /* Shows every glyph flagged zero-width as the space glyph with zero advance.  TODO Save the space character in the font? */
+  hb_codepoint_t space;
+  if (!hb_font_get_glyph (c->font, ' ', 0, &space))
+    return; /* No glyph for ' ' could be looked up; nothing to substitute. */
+
+  unsigned int count = c->buffer->len;
+  for (unsigned int i = 0; i < count; i++)
+    if (unlikely (_hb_glyph_info_is_zero_width (&c->buffer->info[i]))) {
+      c->buffer->info[i].codepoint = space;
+      c->buffer->pos[i].x_advance = 0;
+      c->buffer->pos[i].y_advance = 0;
+    }
+}
+
  
  /* Do it! */
  
@@ -390,10 +401,10 @@ hb_ot_shape_execute_internal (hb_ot_shape_context_t *c)
    /* Save the original direction, we use it later. */
    c->target_direction = c->buffer->props.direction;
  
-  HB_BUFFER_ALLOCATE_VAR (c->buffer, general_category);
-  HB_BUFFER_ALLOCATE_VAR (c->buffer, combining_class);
+  HB_BUFFER_ALLOCATE_VAR (c->buffer, unicode_props0);
+  HB_BUFFER_ALLOCATE_VAR (c->buffer, unicode_props1);
  
-  hb_set_unicode_props (c->buffer); /* BUFFER: Set general_category and combining_class */
+  hb_set_unicode_props (c->buffer);
  
    hb_form_clusters (c->buffer);
  
@@ -427,8 +438,10 @@ hb_ot_shape_execute_internal (hb_ot_shape_context_t *c)
        hb_position_complex_fallback_visual (c);
    }
  
-  HB_BUFFER_DEALLOCATE_VAR (c->buffer, combining_class);
-  HB_BUFFER_DEALLOCATE_VAR (c->buffer, general_category);
+  hb_hide_zerowidth (c);
+
+  HB_BUFFER_DEALLOCATE_VAR (c->buffer, unicode_props1);
+  HB_BUFFER_DEALLOCATE_VAR (c->buffer, unicode_props0);
  
    c->buffer->props.direction = c->target_direction;
  
diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh

index ad85be7..c06dfe5 100644 (file)
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@@ -114,5 +114,43 @@ _hb_unicode_is_variation_selector (hb_codepoint_t unicode)
                    (unicode >= 0xE0100 && unicode <= 0xE01EF));  /* VARIATION SELECTOR-17..256 */
  }
  
+/* Zero-Width invisible characters -- the set _hb_unicode_is_zero_width() returns true for:
+ *
+ *  00AD  SOFT HYPHEN
+ *  034F  COMBINING GRAPHEME JOINER
+ *
+ *  200B  ZERO WIDTH SPACE
+ *  200C  ZERO WIDTH NON-JOINER
+ *  200D  ZERO WIDTH JOINER
+ *  200E  LEFT-TO-RIGHT MARK
+ *  200F  RIGHT-TO-LEFT MARK
+ *
+ *  2028  LINE SEPARATOR
+ *
+ *  202A  LEFT-TO-RIGHT EMBEDDING
+ *  202B  RIGHT-TO-LEFT EMBEDDING
+ *  202C  POP DIRECTIONAL FORMATTING
+ *  202D  LEFT-TO-RIGHT OVERRIDE
+ *  202E  RIGHT-TO-LEFT OVERRIDE
+ *
+ *  2060  WORD JOINER
+ *  2061  FUNCTION APPLICATION
+ *  2062  INVISIBLE TIMES
+ *  2063  INVISIBLE SEPARATOR
+ *
+ *  FEFF  ZERO WIDTH NO-BREAK SPACE
+ */
+static inline hb_bool_t
+_hb_unicode_is_zero_width (hb_codepoint_t ch)
+{
+  return ((ch & ~0x007F) == 0x2000 && ( /* pre-filter: only U+2000..U+207F reach the range tests */
+         (ch >= 0x200B && ch <= 0x200F) || /* ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK */
+         (ch >= 0x202A && ch <= 0x202E) || /* LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE */
+         (ch >= 0x2060 && ch <= 0x2063) || /* WORD JOINER..INVISIBLE SEPARATOR */
+         (ch == 0x2028)                    /* LINE SEPARATOR */
+        )) || unlikely (ch == 0x00AD       /* SOFT HYPHEN */
+                     || ch == 0x034F       /* COMBINING GRAPHEME JOINER */
+                     || ch == 0xFEFF);     /* ZERO WIDTH NO-BREAK SPACE */
+}
  
  #endif /* HB_UNICODE_PRIVATE_HH */
author	Behdad Esfahbod <behdad@behdad.org>
	Wed, 9 May 2012 13:04:13 +0000 (15:04 +0200)
committer	Behdad Esfahbod <behdad@behdad.org>
	Wed, 9 May 2012 13:04:13 +0000 (15:04 +0200)
src/hb-ot-shape-complex-arabic.cc		patch \| blob \| history
src/hb-ot-shape-complex-indic.cc		patch \| blob \| history
src/hb-ot-shape-complex-private.hh		patch \| blob \| history
src/hb-ot-shape-normalize.cc		patch \| blob \| history
src/hb-ot-shape-private.hh		patch \| blob \| history
src/hb-ot-shape.cc		patch \| blob \| history
src/hb-unicode-private.hh		patch \| blob \| history