Add script to/from ISO 15924 tag support
author Behdad Esfahbod <behdad@behdad.org>
Wed, 16 Mar 2011 20:36:32 +0000 (17:36 -0300)
committer Behdad Esfahbod <behdad@behdad.org>
Wed, 16 Mar 2011 20:36:32 +0000 (17:36 -0300)
Also adds --script support to hb-view.

If a script tag is not known to us, we pass the ISO 15924 tag around.
Right now, the OT layer ignores that, but we can fix it to blindly
convert that to an OT script tag.

src/hb-ot-tag.c
src/hb-unicode.c
src/hb-unicode.h
src/hb-view.c

index 426fff7..df86bd4 100644 (file)
@@ -152,6 +152,8 @@ hb_ot_tags_from_script (hb_script_t script)
 {
   static const hb_tag_t def_tag[] = {HB_OT_TAG_DEFAULT_SCRIPT, HB_TAG_NONE};
 
+  /* XXX Handle non-enum scripts */
+
   if (unlikely ((unsigned int) script >= ARRAY_LENGTH (ot_scripts)))
     return def_tag;
 
@@ -170,6 +172,8 @@ hb_ot_tag_to_script (hb_tag_t tag)
         return i;
   }
 
+  /* XXX Convert to non-enum scripts */
+
   return HB_SCRIPT_UNKNOWN;
 }
 
index 98b7bc5..b69eec6 100644 (file)
@@ -234,6 +234,279 @@ hb_unicode_get_eastasian_width (hb_unicode_funcs_t *ufuncs,
 }
 
 
+/* hb_script_t */
+
+/* Script -> ISO 15924 tag table.
+ *
+ * hb_script_to_iso15924_tag() indexes this array directly with the numeric
+ * value of an hb_script_t, so entry N must be the tag of the script whose
+ * enumerator value is N.  The trailing comment on each row names the script
+ * the row is meant for.
+ * NOTE(review): the hb_script_t enum is not visible here -- confirm that its
+ * enumerator order matches the row order below whenever either is changed. */
+static const hb_tag_t script_to_iso15924_tag[] =
+{
+  HB_TAG('Z','y','y','y'),     /* HB_SCRIPT_COMMON */
+  HB_TAG('Q','a','a','i'),     /* HB_SCRIPT_INHERITED */
+  HB_TAG('A','r','a','b'),     /* HB_SCRIPT_ARABIC */
+  HB_TAG('A','r','m','n'),     /* HB_SCRIPT_ARMENIAN */
+  HB_TAG('B','e','n','g'),     /* HB_SCRIPT_BENGALI */
+  HB_TAG('B','o','p','o'),     /* HB_SCRIPT_BOPOMOFO */
+  HB_TAG('C','h','e','r'),     /* HB_SCRIPT_CHEROKEE */
+  HB_TAG('Q','a','a','c'),     /* HB_SCRIPT_COPTIC */
+  HB_TAG('C','y','r','l'),     /* HB_SCRIPT_CYRILLIC */
+  HB_TAG('D','s','r','t'),     /* HB_SCRIPT_DESERET */
+  HB_TAG('D','e','v','a'),     /* HB_SCRIPT_DEVANAGARI */
+  HB_TAG('E','t','h','i'),     /* HB_SCRIPT_ETHIOPIC */
+  HB_TAG('G','e','o','r'),     /* HB_SCRIPT_GEORGIAN */
+  HB_TAG('G','o','t','h'),     /* HB_SCRIPT_GOTHIC */
+  HB_TAG('G','r','e','k'),     /* HB_SCRIPT_GREEK */
+  HB_TAG('G','u','j','r'),     /* HB_SCRIPT_GUJARATI */
+  HB_TAG('G','u','r','u'),     /* HB_SCRIPT_GURMUKHI */
+  HB_TAG('H','a','n','i'),     /* HB_SCRIPT_HAN */
+  HB_TAG('H','a','n','g'),     /* HB_SCRIPT_HANGUL */
+  HB_TAG('H','e','b','r'),     /* HB_SCRIPT_HEBREW */
+  HB_TAG('H','i','r','a'),     /* HB_SCRIPT_HIRAGANA */
+  HB_TAG('K','n','d','a'),     /* HB_SCRIPT_KANNADA */
+  HB_TAG('K','a','n','a'),     /* HB_SCRIPT_KATAKANA */
+  HB_TAG('K','h','m','r'),     /* HB_SCRIPT_KHMER */
+  HB_TAG('L','a','o','o'),     /* HB_SCRIPT_LAO */
+  HB_TAG('L','a','t','n'),     /* HB_SCRIPT_LATIN */
+  HB_TAG('M','l','y','m'),     /* HB_SCRIPT_MALAYALAM */
+  HB_TAG('M','o','n','g'),     /* HB_SCRIPT_MONGOLIAN */
+  HB_TAG('M','y','m','r'),     /* HB_SCRIPT_MYANMAR */
+  HB_TAG('O','g','a','m'),     /* HB_SCRIPT_OGHAM */
+  HB_TAG('I','t','a','l'),     /* HB_SCRIPT_OLD_ITALIC */
+  HB_TAG('O','r','y','a'),     /* HB_SCRIPT_ORIYA */
+  HB_TAG('R','u','n','r'),     /* HB_SCRIPT_RUNIC */
+  HB_TAG('S','i','n','h'),     /* HB_SCRIPT_SINHALA */
+  HB_TAG('S','y','r','c'),     /* HB_SCRIPT_SYRIAC */
+  HB_TAG('T','a','m','l'),     /* HB_SCRIPT_TAMIL */
+  HB_TAG('T','e','l','u'),     /* HB_SCRIPT_TELUGU */
+  HB_TAG('T','h','a','a'),     /* HB_SCRIPT_THAANA */
+  HB_TAG('T','h','a','i'),     /* HB_SCRIPT_THAI */
+  HB_TAG('T','i','b','t'),     /* HB_SCRIPT_TIBETAN */
+  HB_TAG('C','a','n','s'),     /* HB_SCRIPT_CANADIAN_ABORIGINAL */
+  HB_TAG('Y','i','i','i'),     /* HB_SCRIPT_YI */
+  HB_TAG('T','g','l','g'),     /* HB_SCRIPT_TAGALOG */
+  HB_TAG('H','a','n','o'),     /* HB_SCRIPT_HANUNOO */
+  HB_TAG('B','u','h','d'),     /* HB_SCRIPT_BUHID */
+  HB_TAG('T','a','g','b'),     /* HB_SCRIPT_TAGBANWA */
+
+  /* Unicode-4.0 additions */
+  HB_TAG('B','r','a','i'),     /* HB_SCRIPT_BRAILLE */
+  HB_TAG('C','p','r','t'),     /* HB_SCRIPT_CYPRIOT */
+  HB_TAG('L','i','m','b'),     /* HB_SCRIPT_LIMBU */
+  HB_TAG('O','s','m','a'),     /* HB_SCRIPT_OSMANYA */
+  HB_TAG('S','h','a','w'),     /* HB_SCRIPT_SHAVIAN */
+  HB_TAG('L','i','n','b'),     /* HB_SCRIPT_LINEAR_B */
+  HB_TAG('T','a','l','e'),     /* HB_SCRIPT_TAI_LE */
+  HB_TAG('U','g','a','r'),     /* HB_SCRIPT_UGARITIC */
+
+  /* Unicode-4.1 additions */
+  HB_TAG('T','a','l','u'),     /* HB_SCRIPT_NEW_TAI_LUE */
+  HB_TAG('B','u','g','i'),     /* HB_SCRIPT_BUGINESE */
+  HB_TAG('G','l','a','g'),     /* HB_SCRIPT_GLAGOLITIC */
+  HB_TAG('T','f','n','g'),     /* HB_SCRIPT_TIFINAGH */
+  HB_TAG('S','y','l','o'),     /* HB_SCRIPT_SYLOTI_NAGRI */
+  HB_TAG('X','p','e','o'),     /* HB_SCRIPT_OLD_PERSIAN */
+  HB_TAG('K','h','a','r'),     /* HB_SCRIPT_KHAROSHTHI */
+
+  /* Unicode-5.0 additions */
+  HB_TAG('Z','z','z','z'),     /* HB_SCRIPT_UNKNOWN */
+  HB_TAG('B','a','l','i'),     /* HB_SCRIPT_BALINESE */
+  HB_TAG('X','s','u','x'),     /* HB_SCRIPT_CUNEIFORM */
+  HB_TAG('P','h','n','x'),     /* HB_SCRIPT_PHOENICIAN */
+  HB_TAG('P','h','a','g'),     /* HB_SCRIPT_PHAGS_PA */
+  HB_TAG('N','k','o','o'),     /* HB_SCRIPT_NKO */
+
+  /* Unicode-5.1 additions */
+  HB_TAG('K','a','l','i'),     /* HB_SCRIPT_KAYAH_LI */
+  HB_TAG('L','e','p','c'),     /* HB_SCRIPT_LEPCHA */
+  HB_TAG('R','j','n','g'),     /* HB_SCRIPT_REJANG */
+  HB_TAG('S','u','n','d'),     /* HB_SCRIPT_SUNDANESE */
+  HB_TAG('S','a','u','r'),     /* HB_SCRIPT_SAURASHTRA */
+  HB_TAG('C','h','a','m'),     /* HB_SCRIPT_CHAM */
+  HB_TAG('O','l','c','k'),     /* HB_SCRIPT_OL_CHIKI */
+  HB_TAG('V','a','i','i'),     /* HB_SCRIPT_VAI */
+  HB_TAG('C','a','r','i'),     /* HB_SCRIPT_CARIAN */
+  HB_TAG('L','y','c','i'),     /* HB_SCRIPT_LYCIAN */
+  HB_TAG('L','y','d','i'),     /* HB_SCRIPT_LYDIAN */
+
+  /* Unicode-5.2 additions */
+  HB_TAG('A','v','s','t'),     /* HB_SCRIPT_AVESTAN */
+  HB_TAG('B','a','m','u'),     /* HB_SCRIPT_BAMUM */
+  HB_TAG('E','g','y','p'),     /* HB_SCRIPT_EGYPTIAN_HIEROGLYPHS */
+  HB_TAG('A','r','m','i'),     /* HB_SCRIPT_IMPERIAL_ARAMAIC */
+  HB_TAG('P','h','l','i'),     /* HB_SCRIPT_INSCRIPTIONAL_PAHLAVI */
+  HB_TAG('P','r','t','i'),     /* HB_SCRIPT_INSCRIPTIONAL_PARTHIAN */
+  HB_TAG('J','a','v','a'),     /* HB_SCRIPT_JAVANESE */
+  HB_TAG('K','t','h','i'),     /* HB_SCRIPT_KAITHI */
+  HB_TAG('L','i','s','u'),     /* HB_SCRIPT_LISU */
+  HB_TAG('M','t','e','i'),     /* HB_SCRIPT_MEETEI_MAYEK */
+  HB_TAG('S','a','r','b'),     /* HB_SCRIPT_OLD_SOUTH_ARABIAN */
+  HB_TAG('O','r','k','h'),     /* HB_SCRIPT_OLD_TURKIC */
+  HB_TAG('S','a','m','r'),     /* HB_SCRIPT_SAMARITAN */
+  HB_TAG('L','a','n','a'),     /* HB_SCRIPT_TAI_THAM */
+  HB_TAG('T','a','v','t'),     /* HB_SCRIPT_TAI_VIET */
+
+  /* Unicode-6.0 additions */
+  HB_TAG('B','a','t','k'),     /* HB_SCRIPT_BATAK */
+  HB_TAG('B','r','a','h'),     /* HB_SCRIPT_BRAHMI */
+  HB_TAG('M','a','n','d')      /* HB_SCRIPT_MANDAIC */
+};
+
+/* One row of the ISO 15924 tag -> script lookup table.
+ *
+ * `tag` must remain the first member: the table is searched with bsearch()
+ * using a comparator that reads an hb_tag_t from the start of each element,
+ * and the table rows are written as positional { tag, script } initializers. */
+struct tag_script_pair {
+  hb_tag_t tag;
+  hb_script_t script;
+};
+/* ISO 15924 tag -> script lookup table.
+ *
+ * Rows MUST stay sorted by ascending tag value: the table is searched with
+ * bsearch() in hb_script_from_iso15924_tag().  Several tags may map to the
+ * same script (script variants such as Latf/Latg for Latin, and aliases).
+ *
+ * Both the registered codes 'Copt' / 'Zinh' and the older private-use
+ * aliases 'Qaac' / 'Qaai' are accepted for Coptic / Inherited. */
+static const struct tag_script_pair script_from_iso15924_tag[] =
+{
+  {HB_TAG('A','r','a','b'), HB_SCRIPT_ARABIC},
+  {HB_TAG('A','r','m','i'), HB_SCRIPT_IMPERIAL_ARAMAIC},
+  {HB_TAG('A','r','m','n'), HB_SCRIPT_ARMENIAN},
+  {HB_TAG('A','v','s','t'), HB_SCRIPT_AVESTAN},
+  {HB_TAG('B','a','l','i'), HB_SCRIPT_BALINESE},
+  {HB_TAG('B','a','m','u'), HB_SCRIPT_BAMUM},
+  {HB_TAG('B','a','t','k'), HB_SCRIPT_BATAK},
+  {HB_TAG('B','e','n','g'), HB_SCRIPT_BENGALI},
+  {HB_TAG('B','o','p','o'), HB_SCRIPT_BOPOMOFO},
+  {HB_TAG('B','r','a','h'), HB_SCRIPT_BRAHMI},
+  {HB_TAG('B','r','a','i'), HB_SCRIPT_BRAILLE},
+  {HB_TAG('B','u','g','i'), HB_SCRIPT_BUGINESE},
+  {HB_TAG('B','u','h','d'), HB_SCRIPT_BUHID},
+  {HB_TAG('C','a','n','s'), HB_SCRIPT_CANADIAN_ABORIGINAL},
+  {HB_TAG('C','a','r','i'), HB_SCRIPT_CARIAN},
+  {HB_TAG('C','h','a','m'), HB_SCRIPT_CHAM},
+  {HB_TAG('C','h','e','r'), HB_SCRIPT_CHEROKEE},
+  {HB_TAG('C','o','p','t'), HB_SCRIPT_COPTIC},   /* registered code; 'Qaac' is the older alias */
+  {HB_TAG('C','p','r','t'), HB_SCRIPT_CYPRIOT},
+  {HB_TAG('C','y','r','l'), HB_SCRIPT_CYRILLIC},
+  {HB_TAG('C','y','r','s'), HB_SCRIPT_CYRILLIC},
+  {HB_TAG('D','e','v','a'), HB_SCRIPT_DEVANAGARI},
+  {HB_TAG('D','s','r','t'), HB_SCRIPT_DESERET},
+  {HB_TAG('E','g','y','p'), HB_SCRIPT_EGYPTIAN_HIEROGLYPHS},
+  {HB_TAG('E','t','h','i'), HB_SCRIPT_ETHIOPIC},
+  {HB_TAG('G','e','o','a'), HB_SCRIPT_GEORGIAN},
+  {HB_TAG('G','e','o','n'), HB_SCRIPT_GEORGIAN},
+  {HB_TAG('G','e','o','r'), HB_SCRIPT_GEORGIAN},
+  {HB_TAG('G','l','a','g'), HB_SCRIPT_GLAGOLITIC},
+  {HB_TAG('G','o','t','h'), HB_SCRIPT_GOTHIC},
+  {HB_TAG('G','r','e','k'), HB_SCRIPT_GREEK},
+  {HB_TAG('G','u','j','r'), HB_SCRIPT_GUJARATI},
+  {HB_TAG('G','u','r','u'), HB_SCRIPT_GURMUKHI},
+  {HB_TAG('H','a','n','g'), HB_SCRIPT_HANGUL},
+  {HB_TAG('H','a','n','i'), HB_SCRIPT_HAN},
+  {HB_TAG('H','a','n','o'), HB_SCRIPT_HANUNOO},
+  {HB_TAG('H','e','b','r'), HB_SCRIPT_HEBREW},
+  {HB_TAG('H','i','r','a'), HB_SCRIPT_HIRAGANA},
+  {HB_TAG('I','t','a','l'), HB_SCRIPT_OLD_ITALIC},
+  {HB_TAG('J','a','v','a'), HB_SCRIPT_JAVANESE},
+  {HB_TAG('K','a','l','i'), HB_SCRIPT_KAYAH_LI},
+  {HB_TAG('K','a','n','a'), HB_SCRIPT_KATAKANA},
+  {HB_TAG('K','h','a','r'), HB_SCRIPT_KHAROSHTHI},
+  {HB_TAG('K','h','m','r'), HB_SCRIPT_KHMER},
+  {HB_TAG('K','n','d','a'), HB_SCRIPT_KANNADA},
+  {HB_TAG('K','t','h','i'), HB_SCRIPT_KAITHI},
+  {HB_TAG('L','a','n','a'), HB_SCRIPT_TAI_THAM},
+  {HB_TAG('L','a','o','o'), HB_SCRIPT_LAO},
+  {HB_TAG('L','a','t','f'), HB_SCRIPT_LATIN},
+  {HB_TAG('L','a','t','g'), HB_SCRIPT_LATIN},
+  {HB_TAG('L','a','t','n'), HB_SCRIPT_LATIN},
+  {HB_TAG('L','e','p','c'), HB_SCRIPT_LEPCHA},
+  {HB_TAG('L','i','m','b'), HB_SCRIPT_LIMBU},
+  {HB_TAG('L','i','n','b'), HB_SCRIPT_LINEAR_B},
+  {HB_TAG('L','i','s','u'), HB_SCRIPT_LISU},
+  {HB_TAG('L','y','c','i'), HB_SCRIPT_LYCIAN},
+  {HB_TAG('L','y','d','i'), HB_SCRIPT_LYDIAN},
+  {HB_TAG('M','a','n','d'), HB_SCRIPT_MANDAIC},
+  {HB_TAG('M','l','y','m'), HB_SCRIPT_MALAYALAM},
+  {HB_TAG('M','o','n','g'), HB_SCRIPT_MONGOLIAN},
+  {HB_TAG('M','t','e','i'), HB_SCRIPT_MEETEI_MAYEK},
+  {HB_TAG('M','y','m','r'), HB_SCRIPT_MYANMAR},
+  {HB_TAG('N','k','o','o'), HB_SCRIPT_NKO},
+  {HB_TAG('O','g','a','m'), HB_SCRIPT_OGHAM},
+  {HB_TAG('O','l','c','k'), HB_SCRIPT_OL_CHIKI},
+  {HB_TAG('O','r','k','h'), HB_SCRIPT_OLD_TURKIC},
+  {HB_TAG('O','r','y','a'), HB_SCRIPT_ORIYA},
+  {HB_TAG('O','s','m','a'), HB_SCRIPT_OSMANYA},
+  {HB_TAG('P','h','a','g'), HB_SCRIPT_PHAGS_PA},
+  {HB_TAG('P','h','l','i'), HB_SCRIPT_INSCRIPTIONAL_PAHLAVI},
+  {HB_TAG('P','h','n','x'), HB_SCRIPT_PHOENICIAN},
+  {HB_TAG('P','r','t','i'), HB_SCRIPT_INSCRIPTIONAL_PARTHIAN},
+  {HB_TAG('Q','a','a','c'), HB_SCRIPT_COPTIC},
+  {HB_TAG('Q','a','a','i'), HB_SCRIPT_INHERITED},
+  {HB_TAG('R','j','n','g'), HB_SCRIPT_REJANG},
+  {HB_TAG('R','u','n','r'), HB_SCRIPT_RUNIC},
+  {HB_TAG('S','a','m','r'), HB_SCRIPT_SAMARITAN},
+  {HB_TAG('S','a','r','b'), HB_SCRIPT_OLD_SOUTH_ARABIAN},
+  {HB_TAG('S','a','u','r'), HB_SCRIPT_SAURASHTRA},
+  {HB_TAG('S','h','a','w'), HB_SCRIPT_SHAVIAN},
+  {HB_TAG('S','i','n','h'), HB_SCRIPT_SINHALA},
+  {HB_TAG('S','u','n','d'), HB_SCRIPT_SUNDANESE},
+  {HB_TAG('S','y','l','o'), HB_SCRIPT_SYLOTI_NAGRI},
+  {HB_TAG('S','y','r','c'), HB_SCRIPT_SYRIAC},
+  {HB_TAG('S','y','r','e'), HB_SCRIPT_SYRIAC},
+  {HB_TAG('S','y','r','n'), HB_SCRIPT_SYRIAC},
+  {HB_TAG('T','a','g','b'), HB_SCRIPT_TAGBANWA},
+  {HB_TAG('T','a','l','e'), HB_SCRIPT_TAI_LE},
+  {HB_TAG('T','a','l','u'), HB_SCRIPT_NEW_TAI_LUE},
+  {HB_TAG('T','a','m','l'), HB_SCRIPT_TAMIL},
+  {HB_TAG('T','a','v','t'), HB_SCRIPT_TAI_VIET},
+  {HB_TAG('T','e','l','u'), HB_SCRIPT_TELUGU},
+  {HB_TAG('T','f','n','g'), HB_SCRIPT_TIFINAGH},
+  {HB_TAG('T','g','l','g'), HB_SCRIPT_TAGALOG},
+  {HB_TAG('T','h','a','a'), HB_SCRIPT_THAANA},
+  {HB_TAG('T','h','a','i'), HB_SCRIPT_THAI},
+  {HB_TAG('T','i','b','t'), HB_SCRIPT_TIBETAN},
+  {HB_TAG('U','g','a','r'), HB_SCRIPT_UGARITIC},
+  {HB_TAG('V','a','i','i'), HB_SCRIPT_VAI},
+  {HB_TAG('X','p','e','o'), HB_SCRIPT_OLD_PERSIAN},
+  {HB_TAG('X','s','u','x'), HB_SCRIPT_CUNEIFORM},
+  {HB_TAG('Y','i','i','i'), HB_SCRIPT_YI},
+  {HB_TAG('Z','i','n','h'), HB_SCRIPT_INHERITED}, /* registered code; 'Qaai' is the older alias */
+  {HB_TAG('Z','y','y','y'), HB_SCRIPT_COMMON},
+  {HB_TAG('Z','z','z','z'), HB_SCRIPT_UNKNOWN}
+};
+
+/* bsearch() comparator: three-way comparison of two hb_tag_t values.
+ *
+ * pa points at the search key (an hb_tag_t); pb points at a table element
+ * whose first member is an hb_tag_t, so both are read as hb_tag_t.
+ * Returns -1, 0 or +1 when *pa is less than, equal to or greater than *pb.
+ *
+ * The parameters are `const void *` so the function has exactly the type
+ * bsearch() calls it through; calling a function through a pointer to an
+ * incompatible function type is undefined behaviour. */
+static int
+_tag_cmp (const void *pa, const void *pb)
+{
+  hb_tag_t a = *(const hb_tag_t *) pa;
+  hb_tag_t b = *(const hb_tag_t *) pb;
+  return a < b ? -1 : a == b ? 0 : +1;
+}
+
+
+/* Converts an ISO 15924 tag into an hb_script_t.
+ *
+ * The tag's letter case is normalized first (bit 0x20 is cleared in every
+ * byte and then set in the three low bytes), so differently-cased input is
+ * accepted.  The normalized tag is looked up in script_from_iso15924_tag;
+ * on a hit the mapped script is returned.  On a miss, a tag that still has
+ * the "one capital, three small letters" bit pattern is returned as-is,
+ * cast to hb_script_t; anything else yields HB_SCRIPT_UNKNOWN. */
+hb_script_t
+hb_script_from_iso15924_tag (hb_tag_t tag)
+{
+  const struct tag_script_pair *match;
+
+  /* Normalize case: capital first letter, three small letters after it. */
+  tag &= 0xDFDFDFDF;
+  tag |= 0x00202020;
+
+  match = (const struct tag_script_pair *)
+          bsearch (&tag,
+                   script_from_iso15924_tag,
+                   ARRAY_LENGTH (script_from_iso15924_tag),
+                   sizeof (script_from_iso15924_tag[0]),
+                   (hb_compare_func_t) _tag_cmp);
+  if (match)
+    return match->script;
+
+  /* Not a known tag: reject anything that is not shaped like a script tag. */
+  if (((uint32_t) tag & 0xE0E0E0E0) != 0x40606060)
+    return HB_SCRIPT_UNKNOWN;
+
+  /* Well-formed but unknown to us: carry the tag itself as the script. */
+  return (hb_script_t) tag;
+}
+
+/* Converts an hb_script_t into an ISO 15924 tag.
+ *
+ * Script values that index into script_to_iso15924_tag are translated through
+ * that table.  Values outside the table that have the "one capital, three
+ * small letters" bit pattern are assumed to already be a tag and are returned
+ * unchanged.  Every other value maps to the tag stored for
+ * HB_SCRIPT_UNKNOWN. */
+hb_tag_t
+hb_script_to_iso15924_tag (hb_script_t script)
+{
+  unsigned int idx = (unsigned int) script;
+
+  if (likely (idx < ARRAY_LENGTH (script_to_iso15924_tag)))
+    return script_to_iso15924_tag[idx];
+
+  /* Not tag-shaped either: we don't know what that is. */
+  if (((uint32_t) script & 0xE0E0E0E0) != 0x40606060)
+    return script_to_iso15924_tag[HB_SCRIPT_UNKNOWN];
+
+  /* A non-enum script that is itself a tag: hand it back as is. */
+  return (hb_tag_t) script;
+}
+
 
 #define LTR HB_DIRECTION_LTR
 #define RTL HB_DIRECTION_RTL
index 7b13931..9d13ba4 100644 (file)
@@ -290,7 +290,13 @@ hb_unicode_get_eastasian_width (hb_unicode_funcs_t *ufuncs,
                                hb_codepoint_t unicode);
 
 
-/* Misc functions */
+/* Script functions */
+
+/* Returns the script for an ISO 15924 tag.  Letter case is normalized before
+ * lookup.  Tags that are not in the built-in table but have the expected
+ * shape (one capital letter followed by three small letters) are returned
+ * cast to hb_script_t; any other tag yields HB_SCRIPT_UNKNOWN. */
+hb_script_t
+hb_script_from_iso15924_tag (hb_tag_t tag);
+
+/* Returns the ISO 15924 tag for a script.  Script values outside the built-in
+ * table that are themselves tag-shaped (one capital letter followed by three
+ * small letters) are returned unchanged; any other value yields the tag of
+ * HB_SCRIPT_UNKNOWN. */
+hb_tag_t
+hb_script_to_iso15924_tag (hb_script_t script);
 
 hb_direction_t
 hb_script_get_horizontal_direction (hb_script_t script);
index b5fb670..fd9fec7 100644 (file)
@@ -55,6 +55,7 @@ static const char *text = NULL;
 static const char *font_file = NULL;
 static const char *out_file = "/dev/stdout";
 static const char *language = NULL;
+static const char *script = NULL;
 
 /* Ugh, global vars.  Ugly, but does the job */
 static int width = 0;
@@ -97,6 +98,7 @@ parse_opts (int argc, char **argv)
        {"foreground", 1, 0, 'F'},
        {"background", 1, 0, 'B'},
        {"language", 1, 0, 'L'},
+       {"script", 1, 0, 'S'},
        {"output", 1, 0, 'o'},
        {0, 0, 0, 0}
       };
@@ -143,6 +145,9 @@ parse_opts (int argc, char **argv)
        case 'L':
          language = optarg;
          break;
+       case 'S':
+         script = optarg;
+         break;
        case 'o':
          out_file = optarg;
          break;
@@ -184,7 +189,10 @@ _hb_cr_text_glyphs (cairo_t *cr,
   hb_buffer_set_unicode_funcs (hb_buffer, hb_glib_get_unicode_funcs ());
 
   hb_buffer_add_utf8 (hb_buffer, text, len, 0, len);
-  hb_buffer_set_script (hb_buffer, HB_SCRIPT_INVALID);
+  if (script)
+    hb_buffer_set_script (hb_buffer, hb_script_from_iso15924_tag (hb_tag_from_string (script)));
+  else
+    hb_buffer_set_script (hb_buffer, HB_SCRIPT_INVALID);
   hb_buffer_set_direction (hb_buffer, HB_DIRECTION_INVALID);
   hb_buffer_set_language (hb_buffer, hb_language_from_string (language));