src/hb-ot-tag.cc

   1 /*
   2  * Copyright © 2009  Red Hat, Inc.
   3  * Copyright © 2011  Google, Inc.
   4  *
   5  *  This is part of HarfBuzz, a text shaping library.
   6  *
   7  * Permission is hereby granted, without written agreement and without
   8  * license or royalty fees, to use, copy, modify, and distribute this
   9  * software and its documentation for any purpose, provided that the
  10  * above copyright notice and the following two paragraphs appear in
  11  * all copies of this software.
  12  *
  13  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  14  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  15  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  16  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  17  * DAMAGE.
  18  *
  19  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  20  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  21  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  22  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  23  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  24  *
  25  * Red Hat Author(s): Behdad Esfahbod
  26  * Google Author(s): Behdad Esfahbod, Roozbeh Pournader
  27  */
  28
  29 #include "hb.hh"
  30
  31
  32 /* hb_script_t */
  33
  34 static hb_tag_t
  35 hb_ot_old_tag_from_script (hb_script_t script)
  36 {
  37   /* This seems to be accurate as of end of 2012. */
  38
  39   switch ((hb_tag_t) script)
  40   {
  41     case HB_SCRIPT_INVALID:             return HB_OT_TAG_DEFAULT_SCRIPT;
  42
  43     /* KATAKANA and HIRAGANA both map to 'kana' */
  44     case HB_SCRIPT_HIRAGANA:            return HB_TAG('k','a','n','a');
  45
  46     /* Spaces at the end are preserved, unlike ISO 15924 */
  47     case HB_SCRIPT_LAO:                 return HB_TAG('l','a','o',' ');
  48     case HB_SCRIPT_YI:                  return HB_TAG('y','i',' ',' ');
  49     /* Unicode-5.0 additions */
  50     case HB_SCRIPT_NKO:                 return HB_TAG('n','k','o',' ');
  51     /* Unicode-5.1 additions */
  52     case HB_SCRIPT_VAI:                 return HB_TAG('v','a','i',' ');
  53   }
  54
  55   /* Else, just change first char to lowercase and return */
  56   return ((hb_tag_t) script) | 0x20000000u;
  57 }
  58
  59 static hb_script_t
  60 hb_ot_old_tag_to_script (hb_tag_t tag)
  61 {
  62   if (unlikely (tag == HB_OT_TAG_DEFAULT_SCRIPT))
  63     return HB_SCRIPT_INVALID;
  64
  65   /* This side of the conversion is fully algorithmic. */
  66
  67   /* Any spaces at the end of the tag are replaced by repeating the last
  68    * letter.  Eg 'nko ' -> 'Nkoo' */
  69   if (unlikely ((tag & 0x0000FF00u) == 0x00002000u))
  70     tag |= (tag >> 8) & 0x0000FF00u; /* Copy second letter to third */
  71   if (unlikely ((tag & 0x000000FFu) == 0x00000020u))
  72     tag |= (tag >> 8) & 0x000000FFu; /* Copy third letter to fourth */
  73
  74   /* Change first char to uppercase and return */
  75   return (hb_script_t) (tag & ~0x20000000u);
  76 }
  77
  78 static hb_tag_t
  79 hb_ot_new_tag_from_script (hb_script_t script)
  80 {
  81   switch ((hb_tag_t) script) {
  82     case HB_SCRIPT_BENGALI:             return HB_TAG('b','n','g','2');
  83     case HB_SCRIPT_DEVANAGARI:          return HB_TAG('d','e','v','2');
  84     case HB_SCRIPT_GUJARATI:            return HB_TAG('g','j','r','2');
  85     case HB_SCRIPT_GURMUKHI:            return HB_TAG('g','u','r','2');
  86     case HB_SCRIPT_KANNADA:             return HB_TAG('k','n','d','2');
  87     case HB_SCRIPT_MALAYALAM:           return HB_TAG('m','l','m','2');
  88     case HB_SCRIPT_ORIYA:               return HB_TAG('o','r','y','2');
  89     case HB_SCRIPT_TAMIL:               return HB_TAG('t','m','l','2');
  90     case HB_SCRIPT_TELUGU:              return HB_TAG('t','e','l','2');
  91     case HB_SCRIPT_MYANMAR:             return HB_TAG('m','y','m','2');
  92   }
  93
  94   return HB_OT_TAG_DEFAULT_SCRIPT;
  95 }
  96
  97 static hb_script_t
  98 hb_ot_new_tag_to_script (hb_tag_t tag)
  99 {
 100   switch (tag) {
 101     case HB_TAG('b','n','g','2'):       return HB_SCRIPT_BENGALI;
 102     case HB_TAG('d','e','v','2'):       return HB_SCRIPT_DEVANAGARI;
 103     case HB_TAG('g','j','r','2'):       return HB_SCRIPT_GUJARATI;
 104     case HB_TAG('g','u','r','2'):       return HB_SCRIPT_GURMUKHI;
 105     case HB_TAG('k','n','d','2'):       return HB_SCRIPT_KANNADA;
 106     case HB_TAG('m','l','m','2'):       return HB_SCRIPT_MALAYALAM;
 107     case HB_TAG('o','r','y','2'):       return HB_SCRIPT_ORIYA;
 108     case HB_TAG('t','m','l','2'):       return HB_SCRIPT_TAMIL;
 109     case HB_TAG('t','e','l','2'):       return HB_SCRIPT_TELUGU;
 110     case HB_TAG('m','y','m','2'):       return HB_SCRIPT_MYANMAR;
 111   }
 112
 113   return HB_SCRIPT_UNKNOWN;
 114 }
 115
 116 void
 117 hb_ot_tags_from_script (hb_script_t  script,
 118                         hb_tag_t    *script_tag_1,
 119                         hb_tag_t    *script_tag_2)
 120 {
 121   unsigned int count = 2;
 122   hb_tag_t tags[2];
 123   hb_ot_tags_from_script_and_language (script, HB_LANGUAGE_INVALID, &count, tags, nullptr, nullptr);
 124   *script_tag_1 = count > 0 ? tags[0] : HB_OT_TAG_DEFAULT_SCRIPT;
 125   *script_tag_2 = count > 1 ? tags[1] : HB_OT_TAG_DEFAULT_SCRIPT;
 126 }
 127
 128 /*
 129  * Complete list at:
 130  * https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
 131  *
 132  * Most of the script tags are the same as the ISO 15924 tag but lowercased.
 133  * So we just do that, and handle the exceptional cases in a switch.
 134  */
 135
 136 static void
 137 hb_ot_all_tags_from_script (hb_script_t   script,
 138                             unsigned int *count /* IN/OUT */,
 139                             hb_tag_t     *tags /* OUT */)
 140 {
 141   unsigned int i = 0;
 142
 143   hb_tag_t new_tag = hb_ot_new_tag_from_script (script);
 144   if (unlikely (new_tag != HB_OT_TAG_DEFAULT_SCRIPT))
 145   {
 146     tags[i++] = new_tag | '3';
 147     if (*count > i)
 148       tags[i++] = new_tag;
 149   }
 150
 151   if (*count > i)
 152   {
 153     hb_tag_t old_tag = hb_ot_old_tag_from_script (script);
 154     if (old_tag != HB_OT_TAG_DEFAULT_SCRIPT)
 155       tags[i++] = old_tag;
 156   }
 157
 158   *count = i;
 159 }
 160
 161 hb_script_t
 162 hb_ot_tag_to_script (hb_tag_t tag)
 163 {
 164   unsigned char digit = tag & 0x000000FFu;
 165   if (unlikely (digit == '2' || digit == '3'))
 166     return hb_ot_new_tag_to_script (tag & 0xFFFFFF32);
 167
 168   return hb_ot_old_tag_to_script (tag);
 169 }
 170
 171
 172 /* hb_language_t */
 173
 174 static bool
 175 subtag_matches (const char *lang_str,
 176                 const char *limit,
 177                 const char *subtag)
 178 {
 179   do {
 180     const char *s = strstr (lang_str, subtag);
 181     if (!s || s >= limit)
 182       return false;
 183     if (!ISALNUM (s[strlen (subtag)]))
 184       return true;
 185     lang_str = s + strlen (subtag);
 186   } while (true);
 187 }
 188
 189 static hb_bool_t
 190 lang_matches (const char *lang_str, const char *spec)
 191 {
 192   unsigned int len = strlen (spec);
 193
 194   return strncmp (lang_str, spec, len) == 0 &&
 195          (lang_str[len] == '\0' || lang_str[len] == '-');
 196 }
 197
 198 struct LangTag
 199 {
 200   char language[4];
 201   hb_tag_t tags[HB_OT_MAX_TAGS_PER_LANGUAGE];
 202
 203   int cmp (const char *a) const
 204   {
 205     const char *b = this->language;
 206     unsigned int da, db;
 207     const char *p;
 208
 209     p = strchr (a, '-');
 210     da = p ? (unsigned int) (p - a) : strlen (a);
 211
 212     p = strchr (b, '-');
 213     db = p ? (unsigned int) (p - b) : strlen (b);
 214
 215     return strncmp (a, b, MAX (da, db));
 216   }
 217   int cmp (const LangTag *that) const
 218   { return cmp (that->language); }
 219 };
 220
 221 #include "hb-ot-tag-table.hh"
 222
/* The language IDs corresponding to the following tags are unclear,
 * overlapping, or architecturally odd.  More research is needed. */
 225
 226 /*{"??",        {HB_TAG('B','C','R',' ')}},*/   /* Bible Cree */
 227 /*{"zh?",       {HB_TAG('C','H','N',' ')}},*/   /* Chinese (seen in Microsoft fonts) */
 228 /*{"ar-Syrc?",  {HB_TAG('G','A','R',' ')}},*/   /* Garshuni */
 229 /*{"??",        {HB_TAG('N','G','R',' ')}},*/   /* Nagari */
 230 /*{"??",        {HB_TAG('Y','I','C',' ')}},*/   /* Yi Classic */
 231 /*{"zh?",       {HB_TAG('Z','H','P',' ')}},*/   /* Chinese Phonetic */
 232
 233 hb_tag_t
 234 hb_ot_tag_from_language (hb_language_t language)
 235 {
 236   unsigned int count = 1;
 237   hb_tag_t tags[1];
 238   hb_ot_tags_from_script_and_language (HB_SCRIPT_UNKNOWN, language, nullptr, nullptr, &count, tags);
 239   return count > 0 ? tags[0] : HB_OT_TAG_DEFAULT_LANGUAGE;
 240 }
 241
 242 static void
 243 hb_ot_tags_from_language (const char   *lang_str,
 244                           const char   *limit,
 245                           unsigned int *count,
 246                           hb_tag_t     *tags)
 247 {
 248   const char *s;
 249
 250   /* Check for matches of multiple subtags. */
 251   if (hb_ot_tags_from_complex_language (lang_str, limit, count, tags))
 252     return;
 253
 254   /* Find a language matching in the first component. */
 255   s = strchr (lang_str, '-');
 256   {
 257     const LangTag *lang_tag;
 258     if (s && limit - lang_str >= 6)
 259     {
 260       const char *extlang_end = strchr (s + 1, '-');
 261       /* If there is an extended language tag, use it. */
 262       if (3 == (extlang_end ? extlang_end - s - 1 : strlen (s + 1)) &&
 263           ISALPHA (s[1]))
 264         lang_str = s + 1;
 265     }
 266     lang_tag = hb_sorted_array (ot_languages).bsearch (lang_str);
 267     if (lang_tag)
 268     {
 269       unsigned int i;
 270       for (i = 0; i < *count && lang_tag->tags[i] != HB_TAG_NONE; i++)
 271         tags[i] = lang_tag->tags[i];
 272       *count = i;
 273       return;
 274     }
 275   }
 276
 277   if (!s)
 278     s = lang_str + strlen (lang_str);
 279   if (s - lang_str == 3) {
 280     /* Assume it's ISO-639-3 and upper-case and use it. */
 281     tags[0] = hb_tag_from_string (lang_str, s - lang_str) & ~0x20202000u;
 282     *count = 1;
 283     return;
 284   }
 285
 286   *count = 0;
 287 }
 288
 289 static bool
 290 parse_private_use_subtag (const char     *private_use_subtag,
 291                           unsigned int   *count,
 292                           hb_tag_t       *tags,
 293                           const char     *prefix,
 294                           unsigned char (*normalize) (unsigned char))
 295 {
 296   if (private_use_subtag && count && tags && *count)
 297   {
 298     const char *s = strstr (private_use_subtag, prefix);
 299     if (s)
 300     {
 301       char tag[4];
 302       int i;
 303       s += strlen (prefix);
 304       for (i = 0; i < 4 && ISALNUM (s[i]); i++)
 305         tag[i] = normalize (s[i]);
 306       if (i)
 307       {
 308         for (; i < 4; i++)
 309           tag[i] = ' ';
 310         tags[0] = HB_TAG (tag[0], tag[1], tag[2], tag[3]);
 311         if ((tags[0] & 0xDFDFDFDF) == HB_OT_TAG_DEFAULT_SCRIPT)
 312           tags[0] ^= ~0xDFDFDFDF;
 313         *count = 1;
 314         return false;
 315       }
 316     }
 317   }
 318   return true;
 319 }
 320
 321 /**
 322  * hb_ot_tags_from_script_and_language:
 323  * @script: an #hb_script_t to convert.
 324  * @language: an #hb_language_t to convert.
 325  * @script_count: (allow-none): maximum number of script tags to retrieve (IN)
 326  * and actual number of script tags retrieved (OUT)
 327  * @script_tags: (out) (allow-none): array of size at least @script_count to store the
 328  * script tag results
 329  * @language_count: (allow-none): maximum number of language tags to retrieve
 330  * (IN) and actual number of language tags retrieved (OUT)
 331  * @language_tags: (out) (allow-none): array of size at least @language_count to store
 332  * the language tag results
 333  *
 334  * Converts an #hb_script_t and an #hb_language_t to script and language tags.
 335  *
 336  * Since: 2.0.0
 337  **/
 338 void
 339 hb_ot_tags_from_script_and_language (hb_script_t   script,
 340                                      hb_language_t language,
 341                                      unsigned int *script_count /* IN/OUT */,
 342                                      hb_tag_t     *script_tags /* OUT */,
 343                                      unsigned int *language_count /* IN/OUT */,
 344                                      hb_tag_t     *language_tags /* OUT */)
 345 {
 346   bool needs_script = true;
 347
 348   if (language == HB_LANGUAGE_INVALID)
 349   {
 350     if (language_count && language_tags && *language_count)
 351       *language_count = 0;
 352   }
 353   else
 354   {
 355     const char *lang_str, *s, *limit, *private_use_subtag;
 356     bool needs_language;
 357
 358     lang_str = hb_language_to_string (language);
 359     limit = nullptr;
 360     private_use_subtag = nullptr;
 361     if (lang_str[0] == 'x' && lang_str[1] == '-')
 362     {
 363       private_use_subtag = lang_str;
 364     } else {
 365       for (s = lang_str + 1; *s; s++)
 366       {
 367         if (s[-1] == '-' && s[1] == '-')
 368         {
 369           if (s[0] == 'x')
 370           {
 371             private_use_subtag = s;
 372             if (!limit)
 373               limit = s - 1;
 374             break;
 375           } else if (!limit)
 376           {
 377             limit = s - 1;
 378           }
 379         }
 380       }
 381       if (!limit)
 382         limit = s;
 383     }
 384
 385     needs_script = parse_private_use_subtag (private_use_subtag, script_count, script_tags, "-hbsc", TOLOWER);
 386     needs_language = parse_private_use_subtag (private_use_subtag, language_count, language_tags, "-hbot", TOUPPER);
 387
 388     if (needs_language && language_count && language_tags && *language_count)
 389       hb_ot_tags_from_language (lang_str, limit, language_count, language_tags);
 390   }
 391
 392   if (needs_script && script_count && script_tags && *script_count)
 393     hb_ot_all_tags_from_script (script, script_count, script_tags);
 394 }
 395
 396 /**
 397  * hb_ot_tag_to_language:
 398  *
 399  *
 400  *
 401  * Return value: (transfer none):
 402  *
 403  * Since: 0.9.2
 404  **/
 405 hb_language_t
 406 hb_ot_tag_to_language (hb_tag_t tag)
 407 {
 408   unsigned int i;
 409
 410   if (tag == HB_OT_TAG_DEFAULT_LANGUAGE)
 411     return nullptr;
 412
 413   {
 414     hb_language_t disambiguated_tag = hb_ot_ambiguous_tag_to_language (tag);
 415     if (disambiguated_tag != HB_LANGUAGE_INVALID)
 416       return disambiguated_tag;
 417   }
 418
 419   for (i = 0; i < ARRAY_LENGTH (ot_languages); i++)
 420     if (ot_languages[i].tags[0] == tag)
 421       return hb_language_from_string (ot_languages[i].language, -1);
 422
 423   /* Else return a custom language in the form of "x-hbotABCD" */
 424   {
 425     unsigned char buf[11] = "x-hbot";
 426     buf[6] = tag >> 24;
 427     buf[7] = (tag >> 16) & 0xFF;
 428     buf[8] = (tag >> 8) & 0xFF;
 429     buf[9] = tag & 0xFF;
 430     if (buf[9] == 0x20)
 431       buf[9] = '\0';
 432     buf[10] = '\0';
 433     return hb_language_from_string ((char *) buf, -1);
 434   }
 435 }
 436
 437 /**
 438  * hb_ot_tags_to_script_and_language:
 439  * @script_tag: a script tag
 440  * @language_tag: a language tag
 441  * @script: (allow-none): the #hb_script_t corresponding to @script_tag (OUT).
 442  * @language: (allow-none): the #hb_language_t corresponding to @script_tag and
 443  * @language_tag (OUT).
 444  *
 445  * Converts a script tag and a language tag to an #hb_script_t and an
 446  * #hb_language_t.
 447  *
 448  * Since: 2.0.0
 449  **/
 450 void
 451 hb_ot_tags_to_script_and_language (hb_tag_t       script_tag,
 452                                    hb_tag_t       language_tag,
 453                                    hb_script_t   *script /* OUT */,
 454                                    hb_language_t *language /* OUT */)
 455 {
 456   hb_script_t script_out = hb_ot_tag_to_script (script_tag);
 457   if (script)
 458     *script = script_out;
 459   if (language)
 460   {
 461     unsigned int script_count = 1;
 462     hb_tag_t primary_script_tag[1];
 463     hb_ot_tags_from_script_and_language (script_out,
 464                                          HB_LANGUAGE_INVALID,
 465                                          &script_count,
 466                                          primary_script_tag,
 467                                          nullptr, nullptr);
 468     *language = hb_ot_tag_to_language (language_tag);
 469     if (script_count == 0 || primary_script_tag[0] != script_tag)
 470     {
 471       unsigned char *buf;
 472       const char *lang_str = hb_language_to_string (*language);
 473       size_t len = strlen (lang_str);
 474       buf = (unsigned char *) malloc (len + 11);
 475       if (unlikely (!buf))
 476       {
 477         *language = nullptr;
 478       }
 479       else
 480       {
 481         memcpy (buf, lang_str, len);
 482         if (lang_str[0] != 'x' || lang_str[1] != '-') {
 483           buf[len++] = '-';
 484           buf[len++] = 'x';
 485         }
 486         buf[len++] = '-';
 487         buf[len++] = 'h';
 488         buf[len++] = 'b';
 489         buf[len++] = 's';
 490         buf[len++] = 'c';
 491         buf[len++] = script_tag >> 24;
 492         buf[len++] = (script_tag >> 16) & 0xFF;
 493         buf[len++] = (script_tag >> 8) & 0xFF;
 494         buf[len++] = script_tag & 0xFF;
 495         *language = hb_language_from_string ((char *) buf, len);
 496         free (buf);
 497       }
 498     }
 499   }
 500 }
 501
 502 #ifdef MAIN
 503 static inline void
 504 test_langs_sorted ()
 505 {
 506   for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages); i++)
 507   {
 508     int c = ot_languages[i].cmp (&ot_languages[i - 1]);
 509     if (c >= 0)
 510     {
 511       fprintf (stderr, "ot_languages not sorted at index %d: %s %d %s\n",
 512                i, ot_languages[i-1].language, c, ot_languages[i].language);
 513       abort();
 514     }
 515   }
 516 }
 517
 518 int
 519 main ()
 520 {
 521   test_langs_sorted ();
 522   return 0;
 523 }
 524
 525 #endif