Update Arabic joining table to include Mandaic
author Behdad Esfahbod <behdad@behdad.org>
Wed, 17 Nov 2010 21:52:58 +0000 (16:52 -0500)
committer Behdad Esfahbod <behdad@behdad.org>
Wed, 17 Nov 2010 21:52:58 +0000 (16:52 -0500)
Mandaic was added to Unicode 6.0, but the joining data was not updated.
Draft ArabicShaping.txt from 6.1 includes the joining data for Mandaic.
Use that.

src/gen-arabic-joining-table.py
src/hb-ot-shape-complex-arabic.cc

index f12c207..75ea733 100755 (executable)
@@ -11,22 +11,39 @@ for line in sys.stdin:
        fields = [x.strip() for x in line.split(';')]
        u = int(fields[0], 16)
 
-       if u < 0x0600 or (u > 0x07FF and u != 0x200C and u != 0x200D):
+       if u == 0x200C or u == 0x200D:
+               continue
+       if u < 0x0600:
                raise Exception ("Ooops, unexpected unicode character: ", fields)
        dic[u] = fields
 
-print "  /*"
-print "   * The following table is generated by running:"
-print "   *"
-print "   *   ./gen-arabic-joining-table.py < ArabicShaping.txt"
-print "   *"
-print "   * on the ArabicShaping.txt file with the header:"
-print "   *"
+v = dic.keys()
+v.sort()
+min_u, max_u = v[0], v[-1]
+occupancy = len(v) * 100 / (max_u - min_u + 1)
+
+# Maintain at least 40% occupancy in the table
+if occupancy < 40:
+       raise Exception ("Table too sparse, please investigate: ", occupancy)
+
+print "/* == Start of generated table == */"
+print "/*"
+print " * The following table is generated by running:"
+print " *"
+print " *   ./gen-arabic-joining-table.py < ArabicShaping.txt"
+print " *"
+print " * on the ArabicShaping.txt file with the header:"
+print " *"
 for line in header:
-       print "   * %s" % (line.strip())
-print "   */"
-print "  /* == Start of generated table == */"
-for i in range(0x0600, 0x0800):
+       print " * %s" % (line.strip())
+print " */"
+
+print "#define JOINING_TABLE_FIRST     0x%04x" % min_u
+print "#define JOINING_TABLE_LAST      0x%04x" % max_u
+print "static const uint8_t joining_table[JOINING_TABLE_LAST-JOINING_TABLE_FIRST+2] ="
+print "{"
+
+for i in range(min_u, max_u + 1):
        if i not in dic:
                print "  JOINING_TYPE_X, /* %04X */" % i
        else:
@@ -36,4 +53,6 @@ for i in range(0x0600, 0x0800):
                else:
                        value = "JOINING_TYPE_" + entry[2]
                print "  %s, /* %s */" % (value, '; '.join(entry))
-print "  /* == End of generated table == */"
+print "  JOINING_TYPE_X  /* dummy */"
+print "};"
+print "/* == End of generated table == */"
index a63060c..63f836b 100644 (file)
@@ -56,23 +56,21 @@ enum {
  */
 
 
+/* == Start of generated table == */
 /*
- * Main joining-type table, covering U+0600..U+07FF.
- * Includes Arabic, Syriac, and N'ko.
+ * The following table is generated by running:
+ *
+ *   ./gen-arabic-joining-table.py < ArabicShaping.txt
+ *
+ * on the ArabicShaping.txt file with the header:
+ *
+ * # ArabicShaping-6.1.0.txt
+ * # Date: 2010-11-09, 12:10:00 PST [KW]
  */
-static const uint8_t arabic_syriac_nko_joining_types[0x0800 - 0x0600 + 1] =
+#define JOINING_TABLE_FIRST    0x0600
+#define JOINING_TABLE_LAST     0x0858
+static const uint8_t joining_table[JOINING_TABLE_LAST-JOINING_TABLE_FIRST+2] =
 {
-  /*
-   * The following table is generated by running:
-   *
-   *   ./gen-arabic-joining-table.py < ArabicShaping.txt
-   *
-   * on the ArabicShaping.txt file with the header:
-   *
-   * # ArabicShaping-6.0.0.txt
-   * # Date: 2010-04-30, 13:47:00 PDT [KW]
-   */
-  /* == Start of generated table == */
   JOINING_TYPE_U, /* 0600; ARABIC NUMBER SIGN; U; No_Joining_Group */
   JOINING_TYPE_U, /* 0601; ARABIC SIGN SANAH; U; No_Joining_Group */
   JOINING_TYPE_U, /* 0602; ARABIC FOOTNOTE MARKER; U; No_Joining_Group */
@@ -585,16 +583,105 @@ static const uint8_t arabic_syriac_nko_joining_types[0x0800 - 0x0600 + 1] =
   JOINING_TYPE_X, /* 07FD */
   JOINING_TYPE_X, /* 07FE */
   JOINING_TYPE_X, /* 07FF */
-  /* == End of generated table == */
-  JOINING_TYPE_X
+  JOINING_TYPE_X, /* 0800 */
+  JOINING_TYPE_X, /* 0801 */
+  JOINING_TYPE_X, /* 0802 */
+  JOINING_TYPE_X, /* 0803 */
+  JOINING_TYPE_X, /* 0804 */
+  JOINING_TYPE_X, /* 0805 */
+  JOINING_TYPE_X, /* 0806 */
+  JOINING_TYPE_X, /* 0807 */
+  JOINING_TYPE_X, /* 0808 */
+  JOINING_TYPE_X, /* 0809 */
+  JOINING_TYPE_X, /* 080A */
+  JOINING_TYPE_X, /* 080B */
+  JOINING_TYPE_X, /* 080C */
+  JOINING_TYPE_X, /* 080D */
+  JOINING_TYPE_X, /* 080E */
+  JOINING_TYPE_X, /* 080F */
+  JOINING_TYPE_X, /* 0810 */
+  JOINING_TYPE_X, /* 0811 */
+  JOINING_TYPE_X, /* 0812 */
+  JOINING_TYPE_X, /* 0813 */
+  JOINING_TYPE_X, /* 0814 */
+  JOINING_TYPE_X, /* 0815 */
+  JOINING_TYPE_X, /* 0816 */
+  JOINING_TYPE_X, /* 0817 */
+  JOINING_TYPE_X, /* 0818 */
+  JOINING_TYPE_X, /* 0819 */
+  JOINING_TYPE_X, /* 081A */
+  JOINING_TYPE_X, /* 081B */
+  JOINING_TYPE_X, /* 081C */
+  JOINING_TYPE_X, /* 081D */
+  JOINING_TYPE_X, /* 081E */
+  JOINING_TYPE_X, /* 081F */
+  JOINING_TYPE_X, /* 0820 */
+  JOINING_TYPE_X, /* 0821 */
+  JOINING_TYPE_X, /* 0822 */
+  JOINING_TYPE_X, /* 0823 */
+  JOINING_TYPE_X, /* 0824 */
+  JOINING_TYPE_X, /* 0825 */
+  JOINING_TYPE_X, /* 0826 */
+  JOINING_TYPE_X, /* 0827 */
+  JOINING_TYPE_X, /* 0828 */
+  JOINING_TYPE_X, /* 0829 */
+  JOINING_TYPE_X, /* 082A */
+  JOINING_TYPE_X, /* 082B */
+  JOINING_TYPE_X, /* 082C */
+  JOINING_TYPE_X, /* 082D */
+  JOINING_TYPE_X, /* 082E */
+  JOINING_TYPE_X, /* 082F */
+  JOINING_TYPE_X, /* 0830 */
+  JOINING_TYPE_X, /* 0831 */
+  JOINING_TYPE_X, /* 0832 */
+  JOINING_TYPE_X, /* 0833 */
+  JOINING_TYPE_X, /* 0834 */
+  JOINING_TYPE_X, /* 0835 */
+  JOINING_TYPE_X, /* 0836 */
+  JOINING_TYPE_X, /* 0837 */
+  JOINING_TYPE_X, /* 0838 */
+  JOINING_TYPE_X, /* 0839 */
+  JOINING_TYPE_X, /* 083A */
+  JOINING_TYPE_X, /* 083B */
+  JOINING_TYPE_X, /* 083C */
+  JOINING_TYPE_X, /* 083D */
+  JOINING_TYPE_X, /* 083E */
+  JOINING_TYPE_X, /* 083F */
+  JOINING_TYPE_R, /* 0840; MANDAIC HALQA; R; No_Joining_Group */
+  JOINING_TYPE_D, /* 0841; MANDAIC AB; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0842; MANDAIC AG; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0843; MANDAIC AD; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0844; MANDAIC AH; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0845; MANDAIC USHENNA; D; No_Joining_Group */
+  JOINING_TYPE_R, /* 0846; MANDAIC AZ; R; No_Joining_Group */
+  JOINING_TYPE_D, /* 0847; MANDAIC IT; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0848; MANDAIC ATT; D; No_Joining_Group */
+  JOINING_TYPE_R, /* 0849; MANDAIC AKSA; R; No_Joining_Group */
+  JOINING_TYPE_D, /* 084A; MANDAIC AK; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 084B; MANDAIC AL; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 084C; MANDAIC AM; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 084D; MANDAIC AN; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 084E; MANDAIC AS; D; No_Joining_Group */
+  JOINING_TYPE_R, /* 084F; MANDAIC IN; R; No_Joining_Group */
+  JOINING_TYPE_D, /* 0850; MANDAIC AP; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0851; MANDAIC ASZ; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0852; MANDAIC AQ; D; No_Joining_Group */
+  JOINING_TYPE_D, /* 0853; MANDAIC AR; D; No_Joining_Group */
+  JOINING_TYPE_R, /* 0854; MANDAIC ASH; R; No_Joining_Group */
+  JOINING_TYPE_D, /* 0855; MANDAIC AT; D; No_Joining_Group */
+  JOINING_TYPE_U, /* 0856; MANDAIC DUSHENNA; U; No_Joining_Group */
+  JOINING_TYPE_U, /* 0857; MANDAIC KAD; U; No_Joining_Group */
+  JOINING_TYPE_U, /* 0858; MANDAIC AIN; U; No_Joining_Group */
+  JOINING_TYPE_X  /* dummy */
 };
+/* == End of generated table == */
 
 static unsigned int get_joining_type (hb_codepoint_t u, hb_category_t gen_cat)
 {
   /* TODO Macroize the magic bit operations */
 
-  if (likely ((u & ~(0x0600^0x07FF)) == 0x0600)) {
-    unsigned int j_type = arabic_syriac_nko_joining_types[u - 0x0600];
+  if (likely (JOINING_TABLE_FIRST <= u && u <= JOINING_TABLE_LAST)) {
+    unsigned int j_type = joining_table[u - JOINING_TABLE_FIRST];
     if (likely (j_type != JOINING_TYPE_X))
       return j_type;
   }