Harfbuzz-thai: Hide ZWJ and ZWNJ characters and show Inherited characters
author John Tapsell <john.tapsell.ext@basyskom.com>
Wed, 8 Feb 2012 10:12:13 +0000 (10:12 +0000)
committer Qt by Nokia <qt-info@nokia.com>
Thu, 23 Feb 2012 14:07:58 +0000 (15:07 +0100)
Thai text is not supposed to contain ZWJ and ZWNJ characters, or any other
character of the Inherited Unicode script
(http://www.verisigninc.com/assets/idn-inherited-unicode-script.pdf):
they have no mapping in the Thai encoding tis620, which libthai
requires.  However, it is an unfortunate fact that many websites etc.
liberally place these ZWJ and ZWNJ characters throughout Thai text
to force word boundaries, so we must also deal with them.

We deal with all Inherited characters by mapping them to the invalid code ~0
in the tis620 encoding, following what libthai does internally in its own
tis620 encoding functions.  When the glyph string is built, this invalid code
is replaced with the original Unicode character, and dontPrint is set to true
for the ZWJ and ZWNJ characters so that they are hidden.

Includes a unit test to check the behaviour.

Change-Id: I1ee8388b650cb5fc61bcb265efb9843c73f327ac
Reviewed-by: Adrian Yanes <ext-adrian.yanes@nokia.com>
Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com>
src/3rdparty/harfbuzz/src/harfbuzz-thai.c
tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp

index 386fd5c..bf6c35b 100644 (file)
@@ -77,7 +77,7 @@ static void to_tis620(const HB_UChar16 *string, hb_uint32 len, const char *cstr)
         else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
             result[i] = (unsigned char)(string[i] - 0xe00 + 0xa0);
         else
-            result[i] = '?';
+            result[i] = (unsigned char)~0; // Same encoding as libthai uses for invalid chars
     }
 
     result[len] = 0;
@@ -259,8 +259,13 @@ static HB_Bool HB_ThaiConvertStringToGlyphIndices (HB_ShaperItem *item)
         for (int lgi = 0; lgi < lgn; lgi++) {
             if ( rglyphs[lgi] == 0xdd/*TH_BLANK_BASE_GLYPH*/ ) {
                 glyphString[slen++] = C_DOTTED_CIRCLE;
-            }
-            else {
+            } else if (cstr[i] == (signed char)~0) {
+                // The only glyphs that should be passed to this function that cannot be mapped to
+                // tis620 are the ones of type Inherited class.  Pass these glyphs untouched.
+                glyphString[slen++] = string[i];
+                if (string[i] == 0x200D || string[i] == 0x200C)
+                    item->attributes[slen-1].dontPrint = true; // Hide ZWJ and ZWNJ characters
+            } else {
                 glyphString[slen++] = (HB_UChar16) thai_get_glyph_index (font_type, rglyphs[lgi]);
             }
         }
index cf02d1a..7db12ed 100644 (file)
@@ -104,6 +104,7 @@ private slots:
     void mirroredChars();
 
     void thaiIsolatedSaraAm();
+    void thaiWithZWJ();
 
 private:
     bool haveTestFonts;
@@ -1280,5 +1281,46 @@ void tst_QTextScriptEngine::thaiIsolatedSaraAm()
         QSKIP("Cannot find Waree.");
 }
 
+void tst_QTextScriptEngine::thaiWithZWJ()
+{
+    QString s(QString::fromUtf8("ร‍ร‌ร“ร…ร”ร\xA0ร本ร") + QChar(0x0363)/*superscript 'a', for testing Inherited class*/);
+    QTextLayout layout(s);
+    layout.beginLayout();
+    layout.createLine();
+    layout.endLayout();
+
+    QTextEngine *e = layout.engine();
+    e->width(0, s.length()); // force itemize and shape
+
+    // A Thai implementation could either remove the ZWJ and ZWNJ characters, or hide them.
+    // The current implementation hides them, so we test for that,
+    // while making sure that nothing else is hidden.
+    QCOMPARE(e->layoutData->items.size(), 11);
+    QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(5));  // Thai: The ZWJ and ZWNJ characters are inherited, so should be part of the Thai script
+    QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1));  // Common: The smart quotes cannot be handled by Thai, so should be a separate item
+    QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(1));  // Thai: Thai character
+    QCOMPARE(e->layoutData->items[3].num_glyphs, ushort(1));  // Common: Ellipsis
+    QCOMPARE(e->layoutData->items[4].num_glyphs, ushort(1));  // Thai: Thai character
+    QCOMPARE(e->layoutData->items[5].num_glyphs, ushort(1));  // Common: Smart quote
+    QCOMPARE(e->layoutData->items[6].num_glyphs, ushort(1));  // Thai: Thai character
+    QCOMPARE(e->layoutData->items[7].num_glyphs, ushort(1));  // Common: \xA0 = non-breaking space. Could be useful to have in Thai, but not currently implemented
+    QCOMPARE(e->layoutData->items[8].num_glyphs, ushort(1));  // Thai: Thai character
+    QCOMPARE(e->layoutData->items[9].num_glyphs, ushort(1));  // Japanese: Kanji for tree
+    QCOMPARE(e->layoutData->items[10].num_glyphs, ushort(2)); // Thai: Thai character followed by superscript "a" which is of inherited type
+
+    // A quick sanity check of the log clusters: the first five characters map to clusters 0..4, each of the next ten characters maps to cluster 0, and the last character maps to cluster 1 (presumably indices are relative to the start of each item - TODO confirm)
+    unsigned short *logClusters = e->layoutData->logClustersPtr;
+    for (int i = 0; i < 5; i++)
+        QCOMPARE(logClusters[i], ushort(i));
+    for (int i = 0; i < 10; i++)
+        QCOMPARE(logClusters[i+5], ushort(0));
+    QCOMPARE(logClusters[15], ushort(1));
+
+    // The only characters that we should be hiding are the ZWJ and ZWNJ characters at
+    // positions 1 and 3.
+    for (int i = 0; i < 16; i++)
+        QCOMPARE((bool)e->layoutData->glyphLayout.attributes[i].dontPrint, (i == 1 || i == 3));
+}
+
 QTEST_MAIN(tst_QTextScriptEngine)
 #include "tst_qtextscriptengine.moc"