src/hb-ot-shape-normalize.cc

   1 /*
   2  * Copyright © 2011,2012  Google, Inc.
   3  *
   4  *  This is part of HarfBuzz, a text shaping library.
   5  *
   6  * Permission is hereby granted, without written agreement and without
   7  * license or royalty fees, to use, copy, modify, and distribute this
   8  * software and its documentation for any purpose, provided that the
   9  * above copyright notice and the following two paragraphs appear in
  10  * all copies of this software.
  11  *
  12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  16  * DAMAGE.
  17  *
  18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  23  *
  24  * Google Author(s): Behdad Esfahbod
  25  */
  26
  27 #include "hb-ot-shape-normalize-private.hh"
  28 #include "hb-ot-shape-private.hh"
  29
  30
  31 /*
  32  * HIGHLEVEL DESIGN:
  33  *
  34  * This file exports one main function: _hb_ot_shape_normalize().
  35  *
  36  * This function closely reflects the Unicode Normalization Algorithm,
  37  * yet it's different.
  38  *
  39  * Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
  40  * The logic however tries to use whatever the font can support.
  41  *
  42  * In general what happens is that: each grapheme is decomposed in a chain
  43  * of 1:2 decompositions, marks reordered, and then recomposed if desired,
  44  * so far it's like Unicode Normalization.  However, the decomposition and
  45  * recomposition only happens if the font supports the resulting characters.
  46  *
  47  * The goals are:
  48  *
  49  *   - Try to render all canonically equivalent strings similarly.  To really
  50  *     achieve this we have to always do the full decomposition and then
  51  *     selectively recompose from there.  It's kinda too expensive though, so
  52  *     we skip some cases.  For example, if composed is desired, we simply
  53  *     don't touch 1-character clusters that are supported by the font, even
  54  *     though their NFC may be different.
  55  *
  56  *   - When a font has a precomposed character for a sequence but the 'ccmp'
  57  *     feature in the font is not adequate, use the precomposed character
  58  *     which typically has better mark positioning.
  59  *
  60  *   - When a font does not support a combining mark, but supports it precomposed
  61  *     with previous base, use that.  This needs the itemizer to have this
  62  *     knowledge too.  We need to provide assistance to the itemizer.
  63  *
  64  *   - When a font does not support a character but supports its decomposition,
  65  *     well, use the decomposition.
  66  *
  67  *   - The Indic shaper requests decomposed output.  This will handle splitting
  68  *     matra for the Indic shaper.
  69  */
  70
  71 static void
  72 output_glyph (hb_buffer_t *buffer, hb_codepoint_t glyph)
  73 {
  74   buffer->output_glyph (glyph);
  75   _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer->unicode);
  76 }
  77
  78 static bool
  79 decompose (hb_font_t *font, hb_buffer_t *buffer,
  80            bool shortest,
  81            hb_codepoint_t ab)
  82 {
  83   hb_codepoint_t a, b, glyph;
  84
  85   if (!hb_unicode_decompose (buffer->unicode, ab, &a, &b) ||
  86       (b && !hb_font_get_glyph (font, b, 0, &glyph)))
  87     return false;
  88
  89   bool has_a = hb_font_get_glyph (font, a, 0, &glyph);
  90   if (shortest && has_a) {
  91     /* Output a and b */
  92     output_glyph (buffer, a);
  93     if (b)
  94       output_glyph (buffer, b);
  95     return true;
  96   }
  97
  98   if (decompose (font, buffer, shortest, a)) {
  99     if (b)
 100       output_glyph (buffer, b);
 101     return true;
 102   }
 103
 104   if (has_a) {
 105     output_glyph (buffer, a);
 106     if (b)
 107       output_glyph (buffer, b);
 108     return true;
 109   }
 110
 111   return false;
 112 }
 113
 114 static void
 115 decompose_current_glyph (hb_font_t *font, hb_buffer_t *buffer,
 116                          bool shortest)
 117 {
 118   if (decompose (font, buffer, shortest, buffer->cur().codepoint))
 119     buffer->skip_glyph ();
 120   else
 121     buffer->next_glyph ();
 122 }
 123
 124 static void
 125 decompose_single_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
 126                                bool will_recompose)
 127 {
 128   hb_codepoint_t glyph;
 129
 130   /* If recomposing and font supports this, we're good to go */
 131   if (will_recompose && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) {
 132     buffer->next_glyph ();
 133     return;
 134   }
 135
 136   decompose_current_glyph (font, buffer, will_recompose);
 137 }
 138
 139 static void
 140 decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
 141                               unsigned int end)
 142 {
 143   /* TODO Currently if there's a variation-selector we give-up, it's just too hard. */
 144   for (unsigned int i = buffer->idx; i < end; i++)
 145     if (unlikely (_hb_unicode_is_variation_selector (buffer->info[i].codepoint))) {
 146       while (buffer->idx < end)
 147         buffer->next_glyph ();
 148       return;
 149     }
 150
 151   while (buffer->idx < end)
 152     decompose_current_glyph (font, buffer, false);
 153 }
 154
 155 static int
 156 compare_combining_class (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb)
 157 {
 158   unsigned int a = _hb_glyph_info_get_modified_combining_class (pa);
 159   unsigned int b = _hb_glyph_info_get_modified_combining_class (pb);
 160
 161   return a < b ? -1 : a == b ? 0 : +1;
 162 }
 163
 164 void
 165 _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
 166                         hb_ot_shape_normalization_mode_t mode)
 167 {
 168   bool recompose = mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED;
 169   bool has_multichar_clusters = false;
 170   unsigned int count;
 171
 172   /* We do a fairly straightforward yet custom normalization process in three
 173    * separate rounds: decompose, reorder, recompose (if desired).  Currently
 174    * this makes two buffer swaps.  We can make it faster by moving the last
 175    * two rounds into the inner loop for the first round, but it's more readable
 176    * this way. */
 177
 178
 179   /* First round, decompose */
 180
 181   buffer->clear_output ();
 182   count = buffer->len;
 183   for (buffer->idx = 0; buffer->idx < count;)
 184   {
 185     unsigned int end;
 186     for (end = buffer->idx + 1; end < count; end++)
 187       if (buffer->cur().cluster != buffer->info[end].cluster)
 188         break;
 189
 190     if (buffer->idx + 1 == end)
 191       decompose_single_char_cluster (font, buffer, recompose);
 192     else {
 193       decompose_multi_char_cluster (font, buffer, end);
 194       has_multichar_clusters = true;
 195     }
 196   }
 197   buffer->swap_buffers ();
 198
 199
 200   if (mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL && !has_multichar_clusters)
 201     return; /* Done! */
 202
 203
 204   /* Second round, reorder (inplace) */
 205
 206   count = buffer->len;
 207   for (unsigned int i = 0; i < count; i++)
 208   {
 209     if (_hb_glyph_info_get_modified_combining_class (&buffer->info[i]) == 0)
 210       continue;
 211
 212     unsigned int end;
 213     for (end = i + 1; end < count; end++)
 214       if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0)
 215         break;
 216
 217     /* We are going to do a bubble-sort.  Only do this if the
 218      * sequence is short.  Doing it on long sequences can result
 219      * in an O(n^2) DoS. */
 220     if (end - i > 10) {
 221       i = end;
 222       continue;
 223     }
 224
 225     hb_bubble_sort (buffer->info + i, end - i, compare_combining_class);
 226
 227     i = end;
 228   }
 229
 230
 231   if (!recompose)
 232     return;
 233
 234   /* Third round, recompose */
 235
 236   /* As noted in the comment earlier, we don't try to combine
 237    * ccc=0 chars with their previous Starter. */
 238
 239   buffer->clear_output ();
 240   count = buffer->len;
 241   unsigned int starter = 0;
 242   buffer->next_glyph ();
 243   while (buffer->idx < count)
 244   {
 245     hb_codepoint_t composed, glyph;
 246     if (/* If mode is NOT COMPOSED_FULL (ie. it's COMPOSED_DIACRITICS), we don't try to
 247          * compose a CCC=0 character with it's preceding starter. */
 248         (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL ||
 249          _hb_glyph_info_get_modified_combining_class (&buffer->cur()) != 0) &&
 250         /* If there's anything between the starter and this char, they should have CCC
 251          * smaller than this character's. */
 252         (starter == buffer->out_len - 1 ||
 253          _hb_glyph_info_get_modified_combining_class (&buffer->prev()) < _hb_glyph_info_get_modified_combining_class (&buffer->cur())) &&
 254         /* And compose. */
 255         hb_unicode_compose (buffer->unicode,
 256                             buffer->out_info[starter].codepoint,
 257                             buffer->cur().codepoint,
 258                             &composed) &&
 259         /* And the font has glyph for the composite. */
 260         hb_font_get_glyph (font, composed, 0, &glyph))
 261     {
 262       /* Composes. Modify starter and carry on. */
 263       buffer->out_info[starter].codepoint = composed;
 264       /* XXX update cluster */
 265       _hb_glyph_info_set_unicode_props (&buffer->out_info[starter], buffer->unicode);
 266
 267       buffer->skip_glyph ();
 268       continue;
 269     }
 270
 271     /* Blocked, or doesn't compose. */
 272     buffer->next_glyph ();
 273
 274     if (_hb_glyph_info_get_modified_combining_class (&buffer->prev()) == 0)
 275       starter = buffer->out_len - 1;
 276   }
 277   buffer->swap_buffers ();
 278
 279 }