From: Behdad Esfahbod Date: Tue, 31 Jul 2018 18:45:32 +0000 (-0700) Subject: [khmer] Rewrite most of shaper to better follow spec X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1a96cc825dc9c8e3b6eef1403fe0864a1cfc0245;p=platform%2Fupstream%2FlibHarfBuzzSharp.git [khmer] Rewrite most of shaper to better follow spec Khmer spec has only one reordering phase, and only simple prebase matra and Coeng-Ro reordering. Implement that. Specifically, this was done to address recognizing different orders of the matra and Coeng-Ro sequence. That said, some combinations are now reordered differently from Uniscribe. Not clear if that's intended or a bug in Uniscribe. The following two sequences render the same in Uniscribe whereas we reorder them differently: U+17A0,U+17D2,U+179A,U+17C2 U+17A0,U+17C2,U+17D2,U+179A For that reason, our test suite numbers regressed slightly. Used to be at 34 for fails, now at: KHMER: 299080 out of 299124 tests passed. 44 failed (0.0147096%) But generally a good change, and removed lots of code. Fixes https://github.com/harfbuzz/harfbuzz/issues/1026 --- diff --git a/src/hb-ot-shape-complex-indic-private.hh b/src/hb-ot-shape-complex-indic-private.hh index 9554994..bb7fff3 100644 --- a/src/hb-ot-shape-complex-indic-private.hh +++ b/src/hb-ot-shape-complex-indic-private.hh @@ -300,7 +300,9 @@ static const hb_codepoint_t ra_chars[] = { 0x0CB0u, /* Kannada */ 0x0D30u, /* Malayalam */ /* No Reph, Logical Repha */ - 0x0DBBu, /* Sinhala */ /* Reph formed only with ZWJ */ + 0x0DBBu, /* Sinhala */ /* Reph formed only with ZWJ */ + + 0x179Au, /* Khmer */ }; static inline bool diff --git a/src/hb-ot-shape-complex-khmer.cc b/src/hb-ot-shape-complex-khmer.cc index 7876d36..ba5b64e 100644 --- a/src/hb-ot-shape-complex-khmer.cc +++ b/src/hb-ot-shape-complex-khmer.cc @@ -42,7 +42,7 @@ khmer_features[] = { /* * Basic features. - * These features are applied in order, one at a time, after initial_reordering. + * These features are applied in order, one at a time, after reordering. */ {HB_TAG('p','r','e','f'), F_NONE}, {HB_TAG('b','l','w','f'), F_NONE}, @@ -51,9 +51,7 @@ khmer_features[] = {HB_TAG('c','f','a','r'), F_NONE}, /* * Other features. - * These features are applied all at once, after final_reordering. - * Default Bengali font in Windows for example has intermixed - * lookups for init,pres,abvs,blws features. + * These features are applied all at once. */ {HB_TAG('p','r','e','s'), F_GLOBAL}, {HB_TAG('a','b','v','s'), F_GLOBAL}, @@ -92,13 +90,9 @@ setup_syllables (const hb_ot_shape_plan_t *plan, hb_font_t *font, hb_buffer_t *buffer); static void -initial_reordering (const hb_ot_shape_plan_t *plan, - hb_font_t *font, - hb_buffer_t *buffer); -static void -final_reordering (const hb_ot_shape_plan_t *plan, - hb_font_t *font, - hb_buffer_t *buffer); +reorder (const hb_ot_shape_plan_t *plan, + hb_font_t *font, + hb_buffer_t *buffer); static void clear_syllables (const hb_ot_shape_plan_t *plan, hb_font_t *font, @@ -119,12 +113,11 @@ collect_features_khmer (hb_ot_shape_planner_t *plan) unsigned int i = 0; - map->add_gsub_pause (initial_reordering); + map->add_gsub_pause (reorder); for (; i < KHMER_BASIC_FEATURES; i++) { map->add_feature (khmer_features[i].tag, 1, khmer_features[i].flags | F_MANUAL_ZWJ | F_MANUAL_ZWNJ); map->add_gsub_pause (nullptr); } - map->add_gsub_pause (final_reordering); for (; i < KHMER_NUM_FEATURES; i++) { map->add_feature (khmer_features[i].tag, 1, khmer_features[i].flags | F_MANUAL_ZWJ | F_MANUAL_ZWNJ); } @@ -264,162 +257,58 @@ setup_syllables (const hb_ot_shape_plan_t *plan HB_UNUSED, buffer->unsafe_to_break (start, end); } -static int -compare_khmer_order (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb) -{ - int a = pa->khmer_position(); - int b = pb->khmer_position(); - - return a < b ? -1 : a == b ? 0 : +1; -} - /* Rules from: * https://docs.microsoft.com/en-us/typography/script-development/devanagari */ static void -initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, - hb_face_t *face, - hb_buffer_t *buffer, - unsigned int start, unsigned int end) +reorder_consonant_syllable (const hb_ot_shape_plan_t *plan, + hb_face_t *face, + hb_buffer_t *buffer, + unsigned int start, unsigned int end) { const khmer_shape_plan_t *khmer_plan = (const khmer_shape_plan_t *) plan->data; hb_glyph_info_t *info = buffer->info; - /* 1. Khmer shaping assumes that a syllable will begin with a Cons, IndV, or Number. */ - - /* The first consonant is always the base. */ - unsigned int base = start; - info[base].khmer_position() = POS_BASE_C; - - /* Mark all subsequent consonants as below. */ - for (unsigned int i = base + 1; i < end; i++) - if (is_consonant_or_vowel (info[i])) - info[i].khmer_position() = POS_BELOW_C; - - /* Mark final consonants. A final consonant is one appearing after a matra, - * like in Khmer. */ - for (unsigned int i = base + 1; i < end; i++) - if (info[i].khmer_category() == OT_M) { - for (unsigned int j = i + 1; j < end; j++) - if (is_consonant_or_vowel (info[j])) { - info[j].khmer_position() = POS_FINAL_C; - break; - } - break; - } - - /* Attach misc marks to previous char to move with them. */ - { - khmer_position_t last_pos = POS_START; - for (unsigned int i = start; i < end; i++) - { - if ((FLAG_UNSAFE (info[i].khmer_category()) & (JOINER_FLAGS | FLAG (OT_N) | FLAG (OT_RS) | MEDIAL_FLAGS | FLAG (OT_Coeng)))) - { - info[i].khmer_position() = last_pos; - if (unlikely (info[i].khmer_category() == OT_Coeng && - info[i].khmer_position() == POS_PRE_M)) - { - /* - * Uniscribe doesn't move the Halant with Left Matra. - * TEST: U+092B,U+093F,U+094DE - * We follow. This is important for the Sinhala - * U+0DDA split matra since it decomposes to U+0DD9,U+0DCA - * where U+0DD9 is a left matra and U+0DCA is the virama. - * We don't want to move the virama with the left matra. - * TEST: U+0D9A,U+0DDA - */ - for (unsigned int j = i; j > start; j--) - if (info[j - 1].khmer_position() != POS_PRE_M) { - info[i].khmer_position() = info[j - 1].khmer_position(); - break; - } - } - } else if (info[i].khmer_position() != POS_SMVD) { - last_pos = (khmer_position_t) info[i].khmer_position(); - } - } - } - /* For post-base consonants let them own anything before them - * since the last consonant or matra. */ - { - unsigned int last = base; - for (unsigned int i = base + 1; i < end; i++) - if (is_consonant_or_vowel (info[i])) - { - for (unsigned int j = last + 1; j < i; j++) - if (info[j].khmer_position() < POS_SMVD) - info[j].khmer_position() = info[i].khmer_position(); - last = i; - } else if (info[i].khmer_category() == OT_M) - last = i; - } - - { - /* Use syllable() for sort accounting temporarily. */ - unsigned int syllable = info[start].syllable(); - for (unsigned int i = start; i < end; i++) - info[i].syllable() = i - start; - - /* Sit tight, rock 'n roll! */ - hb_stable_sort (info + start, end - start, compare_khmer_order); - /* Find base again */ - base = end; - for (unsigned int i = start; i < end; i++) - if (info[i].khmer_position() == POS_BASE_C) - { - base = i; - break; - } - - if (unlikely (end - start >= 127)) - buffer->merge_clusters (start, end); - else - /* Note! syllable() is a one-byte field. */ - for (unsigned int i = base; i < end; i++) - if (info[i].syllable() != 255) - { - unsigned int max = i; - unsigned int j = start + info[i].syllable(); - while (j != i) - { - max = MAX (max, j); - unsigned int next = start + info[j].syllable(); - info[j].syllable() = 255; /* So we don't process j later again. */ - j = next; - } - if (i != max) - buffer->merge_clusters (i, max + 1); - } - - /* Put syllable back in. */ - for (unsigned int i = start; i < end; i++) - info[i].syllable() = syllable; - } - - /* Setup masks now */ - + /* Setup masks. */ { - hb_mask_t mask; - /* Post-base */ - mask = khmer_plan->mask_array[BLWF] | khmer_plan->mask_array[ABVF] | khmer_plan->mask_array[PSTF]; - for (unsigned int i = base + 1; i < end; i++) + hb_mask_t mask = khmer_plan->mask_array[BLWF] | khmer_plan->mask_array[ABVF] | khmer_plan->mask_array[PSTF]; + for (unsigned int i = start + 1; i < end; i++) info[i].mask |= mask; } - unsigned int pref_len = 2; - if (khmer_plan->mask_array[PREF] && base + pref_len < end) + unsigned int num_coengs = 0; + for (unsigned int i = start + 1; i < end; i++) { - /* Find a Halant,Ra sequence and mark it for pre-base-reordering processing. */ - for (unsigned int i = base + 1; i + pref_len - 1 < end; i++) { - hb_codepoint_t glyphs[2]; - for (unsigned int j = 0; j < pref_len; j++) - glyphs[j] = info[i + j].codepoint; - if (khmer_plan->pref.would_substitute (glyphs, pref_len, face)) + /* """ + * When a COENG + (Cons | IndV) combination are found (and subscript count + * is less than two) the character combination is handled according to the + * subscript type of the character following the COENG. + * + * ... + * + * Subscript Type 2 - The COENG + RO characters are reordered to immediately + * before the base glyph. Then the COENG + RO characters are assigned to have + * the 'pref' OpenType feature applied to them. + * """ + */ + if (info[i].khmer_category() == OT_Coeng && num_coengs <= 2 && i + 1 < end) + { + num_coengs++; + + if (info[i + 1].khmer_category() == OT_Ra) { - for (unsigned int j = 0; j < pref_len; j++) - info[i++].mask |= khmer_plan->mask_array[PREF]; + for (unsigned int j = 0; j < 2; j++) + info[i + j].mask |= khmer_plan->mask_array[PREF]; + + /* Move the Coeng,Ro sequence to the start. */ + buffer->merge_clusters (start, i + 2); + hb_glyph_info_t t0 = info[i]; + hb_glyph_info_t t1 = info[i + 1]; + memmove (&info[start + 2], &info[start], (i - start) * sizeof (info[0])); + info[start] = t0; + info[start + 1] = t1; /* Mark the subsequent stuff with 'cfar'. Used in Khmer. * Read the feature spec. @@ -428,12 +317,22 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, * U+1784,U+17D2,U+1782,U+17D2,U+179A */ if (khmer_plan->mask_array[CFAR]) - for (; i < end; i++) - info[i].mask |= khmer_plan->mask_array[CFAR]; + for (unsigned int j = i + 2; j < end; j++) + info[j].mask |= khmer_plan->mask_array[CFAR]; - break; + num_coengs = 2; /* Done. */ } } + + /* Reorder left matra piece. */ + else if (info[i].khmer_position() == POS_PRE_M) + { + /* Move to the start. */ + buffer->merge_clusters (start, i + 1); + hb_glyph_info_t t = info[i]; + memmove (&info[start + 1], &info[start], (i - start) * sizeof (info[0])); + info[start] = t; + } } } @@ -448,7 +347,7 @@ initial_reordering_syllable (const hb_ot_shape_plan_t *plan, { case broken_cluster: /* We already inserted dotted-circles, so just call the consonant_syllable. */ case consonant_syllable: - initial_reordering_consonant_syllable (plan, face, buffer, start, end); + reorder_consonant_syllable (plan, face, buffer, start, end); break; case non_khmer_cluster: @@ -518,263 +417,26 @@ insert_dotted_circles (const hb_ot_shape_plan_t *plan HB_UNUSED, } static void -initial_reordering (const hb_ot_shape_plan_t *plan, - hb_font_t *font, - hb_buffer_t *buffer) +reorder (const hb_ot_shape_plan_t *plan, + hb_font_t *font, + hb_buffer_t *buffer) { insert_dotted_circles (plan, font, buffer); foreach_syllable (buffer, start, end) initial_reordering_syllable (plan, font->face, buffer, start, end); -} - -static void -final_reordering_syllable (const hb_ot_shape_plan_t *plan, - hb_buffer_t *buffer, - unsigned int start, unsigned int end) -{ - const khmer_shape_plan_t *khmer_plan = (const khmer_shape_plan_t *) plan->data; - hb_glyph_info_t *info = buffer->info; - - - /* This function relies heavily on halant glyphs. Lots of ligation - * and possibly multiple substitutions happened prior to this - * phase, and that might have messed up our properties. Recover - * from a particular case of that where we're fairly sure that a - * class of OT_Coeng is desired but has been lost. */ - if (khmer_plan->virama_glyph) - { - unsigned int virama_glyph = khmer_plan->virama_glyph; - for (unsigned int i = start; i < end; i++) - if (info[i].codepoint == virama_glyph && - _hb_glyph_info_ligated (&info[i]) && - _hb_glyph_info_multiplied (&info[i])) - { - /* This will make sure that this glyph passes is_coeng() test. */ - info[i].khmer_category() = OT_Coeng; - _hb_glyph_info_clear_ligated_and_multiplied (&info[i]); - } - } - - - /* 4. Final reordering: - * - * After the localized forms and basic shaping forms GSUB features have been - * applied (see below), the shaping engine performs some final glyph - * reordering before applying all the remaining font features to the entire - * syllable. - */ - - bool try_pref = !!khmer_plan->mask_array[PREF]; - - /* Find base again */ - unsigned int base; - for (base = start; base < end; base++) - if (info[base].khmer_position() >= POS_BASE_C) - { - if (try_pref && base + 1 < end) - { - for (unsigned int i = base + 1; i < end; i++) - if ((info[i].mask & khmer_plan->mask_array[PREF]) != 0) - { - if (!(_hb_glyph_info_substituted (&info[i]) && - _hb_glyph_info_ligated_and_didnt_multiply (&info[i]))) - { - /* Ok, this was a 'pref' candidate but didn't form any. - * Base is around here... */ - base = i; - while (base < end && is_coeng (info[base])) - base++; - info[base].khmer_position() = POS_BASE_C; - - try_pref = false; - } - break; - } - } - - if (start < base && info[base].khmer_position() > POS_BASE_C) - base--; - break; - } - if (base == end && start < base && - is_one_of (info[base - 1], FLAG (OT_ZWJ))) - base--; - if (base < end) - while (start < base && - is_one_of (info[base], (FLAG (OT_N) | FLAG (OT_Coeng)))) - base--; - - - /* o Reorder matras: - * - * If a pre-base matra character had been reordered before applying basic - * features, the glyph can be moved closer to the main consonant based on - * whether half-forms had been formed. Actual position for the matra is - * defined as “after last standalone halant glyph, after initial matra - * position and before the main consonant”. If ZWJ or ZWNJ follow this - * halant, position is moved after it. - */ - - if (start + 1 < end && start < base) /* Otherwise there can't be any pre-base matra characters. */ - { - /* If we lost track of base, alas, position before last thingy. */ - unsigned int new_pos = base == end ? base - 2 : base - 1; - - while (new_pos > start && - !(is_one_of (info[new_pos], (FLAG (OT_M) | FLAG (OT_Coeng))))) - new_pos--; - - /* If we found no Halant we are done. - * Otherwise only proceed if the Halant does - * not belong to the Matra itself! */ - if (is_coeng (info[new_pos]) && - info[new_pos].khmer_position() != POS_PRE_M) - { - /* -> If ZWJ or ZWNJ follow this halant, position is moved after it. */ - if (new_pos + 1 < end && is_joiner (info[new_pos + 1])) - new_pos++; - } - else - new_pos = start; /* No move. */ - - if (start < new_pos && info[new_pos].khmer_position () != POS_PRE_M) - { - /* Now go see if there's actually any matras... */ - for (unsigned int i = new_pos; i > start; i--) - if (info[i - 1].khmer_position () == POS_PRE_M) - { - unsigned int old_pos = i - 1; - if (old_pos < base && base <= new_pos) /* Shouldn't actually happen. */ - base--; - - hb_glyph_info_t tmp = info[old_pos]; - memmove (&info[old_pos], &info[old_pos + 1], (new_pos - old_pos) * sizeof (info[0])); - info[new_pos] = tmp; - - /* Note: this merge_clusters() is intentionally *after* the reordering. - * Indic matra reordering is special and tricky... */ - buffer->merge_clusters (new_pos, MIN (end, base + 1)); - - new_pos--; - } - } else { - for (unsigned int i = start; i < base; i++) - if (info[i].khmer_position () == POS_PRE_M) { - buffer->merge_clusters (i, MIN (end, base + 1)); - break; - } - } - } - - - /* o Reorder pre-base-reordering consonants: - * - * If a pre-base-reordering consonant is found, reorder it according to - * the following rules: - */ - - if (try_pref && base + 1 < end) /* Otherwise there can't be any pre-base-reordering Ra. */ - { - for (unsigned int i = base + 1; i < end; i++) - if ((info[i].mask & khmer_plan->mask_array[PREF]) != 0) - { - /* 1. Only reorder a glyph produced by substitution during application - * of the feature. (Note that a font may shape a Ra consonant with - * the feature generally but block it in certain contexts.) - */ - /* Note: We just check that something got substituted. We don't check that - * the feature actually did it... - * - * Reorder pref only if it ligated. */ - if (_hb_glyph_info_ligated_and_didnt_multiply (&info[i])) - { - /* - * 2. Try to find a target position the same way as for pre-base matra. - * If it is found, reorder pre-base consonant glyph. - * - * 3. If position is not found, reorder immediately before main - * consonant. - */ - - unsigned int new_pos = base; - while (new_pos > start && - !(is_one_of (info[new_pos - 1], FLAG(OT_M) | FLAG (OT_Coeng)))) - new_pos--; - - /* In Khmer coeng model, a H,Ra can go *after* matras. If it goes after a - * split matra, it should be reordered to *before* the left part of such matra. */ - if (new_pos > start && info[new_pos - 1].khmer_category() == OT_M) - { - unsigned int old_pos = i; - for (unsigned int j = base + 1; j < old_pos; j++) - if (info[j].khmer_category() == OT_M) - { - new_pos--; - break; - } - } - - if (new_pos > start && is_coeng (info[new_pos - 1])) - { - /* -> If ZWJ or ZWNJ follow this halant, position is moved after it. */ - if (new_pos < end && is_joiner (info[new_pos])) - new_pos++; - } - - { - unsigned int old_pos = i; - - buffer->merge_clusters (new_pos, old_pos + 1); - hb_glyph_info_t tmp = info[old_pos]; - memmove (&info[new_pos + 1], &info[new_pos], (old_pos - new_pos) * sizeof (info[0])); - info[new_pos] = tmp; - - if (new_pos <= base && base < old_pos) - base++; - } - } - - break; - } - } - - - /* - * Finish off the clusters and go home! - */ - if (hb_options ().uniscribe_bug_compatible) - { - /* Uniscribe merges the entire syllable into a single cluster... Except for Tamil & Sinhala. - * This means, half forms are submerged into the main consonant's cluster. - * This is unnecessary, and makes cursor positioning harder, but that's what - * Uniscribe does. */ - buffer->merge_clusters (start, end); - } -} - - -static void -final_reordering (const hb_ot_shape_plan_t *plan, - hb_font_t *font HB_UNUSED, - hb_buffer_t *buffer) -{ - unsigned int count = buffer->len; - if (unlikely (!count)) return; - - foreach_syllable (buffer, start, end) - final_reordering_syllable (plan, buffer, start, end); HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_category); HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_position); } - static void clear_syllables (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_font_t *font HB_UNUSED, hb_buffer_t *buffer) { + /* TODO: In USE, we clear syllables right after reorder. Figure out + * what Uniscribe does. */ hb_glyph_info_t *info = buffer->info; unsigned int count = buffer->len; for (unsigned int i = 0; i < count; i++) diff --git a/src/hb-ot-shape-complex-private.hh b/src/hb-ot-shape-complex-private.hh index ed6849b..37a4d91 100644 --- a/src/hb-ot-shape-complex-private.hh +++ b/src/hb-ot-shape-complex-private.hh @@ -279,20 +279,7 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner) return &_hb_ot_complex_shaper_indic; case HB_SCRIPT_KHMER: - /* A number of Khmer fonts in the wild don't have a 'pref' feature, - * and as such won't shape properly via the Indic shaper; - * however, they typically have 'liga' / 'clig' features that implement - * the necessary "reordering" by means of ligature substitutions. - * So we send such pref-less fonts through the generic shaper instead. */ - if (planner->map.found_script[0] && - hb_ot_layout_language_find_feature (planner->face, HB_OT_TAG_GSUB, - planner->map.script_index[0], - planner->map.language_index[0], - HB_TAG ('p','r','e','f'), - nullptr)) return &_hb_ot_complex_shaper_khmer; - else - return &_hb_ot_complex_shaper_default; case HB_SCRIPT_MYANMAR: if (planner->map.chosen_script[0] == HB_TAG ('m','y','m','2'))