src/gen-tag-table.py

   1 #!/usr/bin/env python3
   2
   3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
   4 versa.
   5
   6 It creates a ``const LangTag[]``, matching the tags from the OpenType
   7 languages system tag list to the language subtags of the BCP 47 language
   8 subtag registry, with some manual adjustments. The mappings are
   9 supplemented with macrolanguages' sublanguages and retired codes'
  10 replacements, according to BCP 47 and some manual additions where BCP 47
  11 omits a retired code entirely.
  12
  13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
  14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
  15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
  16 multiple BCP 47 tags) are listed here, except when the alphabetically
  17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
  18 case, the fallback behavior will choose the right tag anyway.
  19
  20 usage: ./gen-tag-table.py languagetags language-subtag-registry
  21
  22 Input files:
  23 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
  24 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
  25 """
  26
  27 import collections
  28 from html.parser import HTMLParser
  29 def write (s):
  30         sys.stdout.flush ()
  31         sys.stdout.buffer.write (s.encode ('utf-8'))
  32 import itertools
  33 import re
  34 import sys
  35 import unicodedata
  36
# Exactly two command-line arguments are required. Otherwise exit, using
# this module's docstring (which contains the usage text) as the exit message.
if len (sys.argv) != 3:
        sys.exit (__doc__)
  39
  40 from html import unescape
  41 def html_unescape (parser, entity):
  42         return unescape (entity)
  43
  44 def expect (condition, message=None):
  45         if not condition:
  46                 if message is None:
  47                         raise AssertionError
  48                 raise AssertionError (message)
  49
# Mapping from three-letter ISO 639-3 language codes to their two-letter
# equivalents. ``OpenTypeRegistryParser.handle_endtag`` looks every code of
# the OpenType registry up in this table and keeps codes that have no entry
# unchanged.
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
        'aar': 'aa',
        'abk': 'ab',
        'afr': 'af',
        'aka': 'ak',
        'amh': 'am',
        'ara': 'ar',
        'arg': 'an',
        'asm': 'as',
        'ava': 'av',
        'ave': 'ae',
        'aym': 'ay',
        'aze': 'az',
        'bak': 'ba',
        'bam': 'bm',
        'bel': 'be',
        'ben': 'bn',
        'bis': 'bi',
        'bod': 'bo',
        'bos': 'bs',
        'bre': 'br',
        'bul': 'bg',
        'cat': 'ca',
        'ces': 'cs',
        'cha': 'ch',
        'che': 'ce',
        'chu': 'cu',
        'chv': 'cv',
        'cor': 'kw',
        'cos': 'co',
        'cre': 'cr',
        'cym': 'cy',
        'dan': 'da',
        'deu': 'de',
        'div': 'dv',
        'dzo': 'dz',
        'ell': 'el',
        'eng': 'en',
        'epo': 'eo',
        'est': 'et',
        'eus': 'eu',
        'ewe': 'ee',
        'fao': 'fo',
        'fas': 'fa',
        'fij': 'fj',
        'fin': 'fi',
        'fra': 'fr',
        'fry': 'fy',
        'ful': 'ff',
        'gla': 'gd',
        'gle': 'ga',
        'glg': 'gl',
        'glv': 'gv',
        'grn': 'gn',
        'guj': 'gu',
        'hat': 'ht',
        'hau': 'ha',
        'hbs': 'sh',
        'heb': 'he',
        'her': 'hz',
        'hin': 'hi',
        'hmo': 'ho',
        'hrv': 'hr',
        'hun': 'hu',
        'hye': 'hy',
        'ibo': 'ig',
        'ido': 'io',
        'iii': 'ii',
        'iku': 'iu',
        'ile': 'ie',
        'ina': 'ia',
        'ind': 'id',
        'ipk': 'ik',
        'isl': 'is',
        'ita': 'it',
        'jav': 'jv',
        'jpn': 'ja',
        'kal': 'kl',
        'kan': 'kn',
        'kas': 'ks',
        'kat': 'ka',
        'kau': 'kr',
        'kaz': 'kk',
        'khm': 'km',
        'kik': 'ki',
        'kin': 'rw',
        'kir': 'ky',
        'kom': 'kv',
        'kon': 'kg',
        'kor': 'ko',
        'kua': 'kj',
        'kur': 'ku',
        'lao': 'lo',
        'lat': 'la',
        'lav': 'lv',
        'lim': 'li',
        'lin': 'ln',
        'lit': 'lt',
        'ltz': 'lb',
        'lub': 'lu',
        'lug': 'lg',
        'mah': 'mh',
        'mal': 'ml',
        'mar': 'mr',
        'mkd': 'mk',
        'mlg': 'mg',
        'mlt': 'mt',
        'mol': 'mo',
        'mon': 'mn',
        'mri': 'mi',
        'msa': 'ms',
        'mya': 'my',
        'nau': 'na',
        'nav': 'nv',
        'nbl': 'nr',
        'nde': 'nd',
        'ndo': 'ng',
        'nep': 'ne',
        'nld': 'nl',
        'nno': 'nn',
        'nob': 'nb',
        'nor': 'no',
        'nya': 'ny',
        'oci': 'oc',
        'oji': 'oj',
        'ori': 'or',
        'orm': 'om',
        'oss': 'os',
        'pan': 'pa',
        'pli': 'pi',
        'pol': 'pl',
        'por': 'pt',
        'pus': 'ps',
        'que': 'qu',
        'roh': 'rm',
        'ron': 'ro',
        'run': 'rn',
        'rus': 'ru',
        'sag': 'sg',
        'san': 'sa',
        'sin': 'si',
        'slk': 'sk',
        'slv': 'sl',
        'sme': 'se',
        'smo': 'sm',
        'sna': 'sn',
        'snd': 'sd',
        'som': 'so',
        'sot': 'st',
        'spa': 'es',
        'sqi': 'sq',
        'srd': 'sc',
        'srp': 'sr',
        'ssw': 'ss',
        'sun': 'su',
        'swa': 'sw',
        'swe': 'sv',
        'tah': 'ty',
        'tam': 'ta',
        'tat': 'tt',
        'tel': 'te',
        'tgk': 'tg',
        'tgl': 'tl',
        'tha': 'th',
        'tir': 'ti',
        'ton': 'to',
        'tsn': 'tn',
        'tso': 'ts',
        'tuk': 'tk',
        'tur': 'tr',
        'twi': 'tw',
        'uig': 'ug',
        'ukr': 'uk',
        'urd': 'ur',
        'uzb': 'uz',
        'ven': 've',
        'vie': 'vi',
        'vol': 'vo',
        'wln': 'wa',
        'wol': 'wo',
        'xho': 'xh',
        'yid': 'yi',
        'yor': 'yo',
        'zha': 'za',
        'zho': 'zh',
        'zul': 'zu',
}
 238
 239 class LanguageTag (object):
 240         """A BCP 47 language tag.
 241
 242         Attributes:
 243                 subtags (List[str]): The list of subtags in this tag.
 244                 grandfathered (bool): Whether this tag is grandfathered. If
 245                         ``true``, the entire lowercased tag is the ``language``
 246                         and the other subtag fields are empty.
 247                 language (str): The language subtag.
 248                 script (str): The script subtag.
 249                 region (str): The region subtag.
 250                 variant (str): The variant subtag.
 251
 252         Args:
 253                 tag (str): A BCP 47 language tag.
 254
 255         """
 256         def __init__ (self, tag):
 257                 global bcp_47
 258                 self.subtags = tag.lower ().split ('-')
 259                 self.grandfathered = tag.lower () in bcp_47.grandfathered
 260                 if self.grandfathered:
 261                         self.language = tag.lower ()
 262                         self.script = ''
 263                         self.region = ''
 264                         self.variant = ''
 265                 else:
 266                         self.language = self.subtags[0]
 267                         self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
 268                         self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
 269                         self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
 270
 271         def __str__(self):
 272                 return '-'.join(self.subtags)
 273
 274         def __repr__ (self):
 275                 return 'LanguageTag(%r)' % str(self)
 276
 277         @staticmethod
 278         def _find_first (function, sequence):
 279                 try:
 280                         return next (iter (filter (function, sequence)))
 281                 except StopIteration:
 282                         return None
 283
 284         def is_complex (self):
 285                 """Return whether this tag is too complex to represent as a
 286                 ``LangTag`` in the generated code.
 287
 288                 Complex tags need to be handled in
 289                 ``hb_ot_tags_from_complex_language``.
 290
 291                 Returns:
 292                         Whether this tag is complex.
 293                 """
 294                 return not (len (self.subtags) == 1
 295                         or self.grandfathered
 296                         and len (self.subtags[1]) != 3
 297                         and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
 298
 299         def get_group (self):
 300                 """Return the group into which this tag should be categorized in
 301                 ``hb_ot_tags_from_complex_language``.
 302
 303                 The group is the first letter of the tag, or ``'und'`` if this tag
 304                 should not be matched in a ``switch`` statement in the generated
 305                 code.
 306
 307                 Returns:
 308                         This tag's group.
 309                 """
 310                 return ('und'
 311                         if (self.language == 'und'
 312                                 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
 313                         else self.language[0])
 314
 315 class OpenTypeRegistryParser (HTMLParser):
 316         """A parser for the OpenType language system tag registry.
 317
 318         Attributes:
 319                 header (str): The "last updated" line of the registry.
 320                 names (Mapping[str, str]): A map of language system tags to the
 321                         names they are given in the registry.
 322                 ranks (DefaultDict[str, int]): A map of language system tags to
 323                         numbers. If a single BCP 47 tag corresponds to multiple
 324                         OpenType tags, the tags are ordered in increasing order by
 325                         rank. The rank is based on the number of BCP 47 tags
 326                         associated with a tag, though it may be manually modified.
 327                 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
 328                         OpenType language system tags to sets of BCP 47 tags.
 329                 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
 330                         inverted. Its values start as unsorted sets;
 331                         ``sort_languages`` converts them to sorted lists.
 332
 333         """
 334         def __init__ (self):
 335                 HTMLParser.__init__ (self)
 336                 self.header = ''
 337                 self.names = {}
 338                 self.ranks = collections.defaultdict (int)
 339                 self.to_bcp_47 = collections.defaultdict (set)
 340                 self.from_bcp_47 = collections.defaultdict (set)
 341                 # Whether the parser is in a <td> element
 342                 self._td = False
 343                 # The text of the <td> elements of the current <tr> element.
 344                 self._current_tr = []
 345
 346         def handle_starttag (self, tag, attrs):
 347                 if tag == 'meta':
 348                         for attr, value in attrs:
 349                                 if attr == 'name' and value == 'updated_at':
 350                                         self.header = self.get_starttag_text ()
 351                                         break
 352                 elif tag == 'td':
 353                         self._td = True
 354                         self._current_tr.append ('')
 355                 elif tag == 'tr':
 356                         self._current_tr = []
 357
 358         def handle_endtag (self, tag):
 359                 if tag == 'td':
 360                         self._td = False
 361                 elif tag == 'tr' and self._current_tr:
 362                         expect (2 <= len (self._current_tr) <= 3)
 363                         name = self._current_tr[0].strip ()
 364                         tag = self._current_tr[1].strip ("\t\n\v\f\r '")
 365                         rank = 0
 366                         if len (tag) > 4:
 367                                 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
 368                                 name += ' (deprecated)'
 369                                 tag = tag.split (' ')[0]
 370                                 rank = 1
 371                         self.names[tag] = re.sub (' languages$', '', name)
 372                         if not self._current_tr[2]:
 373                                 return
 374                         iso_codes = self._current_tr[2].strip ()
 375                         self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
 376                         rank += 2 * len (self.to_bcp_47[tag])
 377                         self.ranks[tag] = rank
 378
 379         def handle_data (self, data):
 380                 if self._td:
 381                         self._current_tr[-1] += data
 382
 383         def handle_charref (self, name):
 384                 self.handle_data (html_unescape (self, '&#%s;' % name))
 385
 386         def handle_entityref (self, name):
 387                 self.handle_data (html_unescape (self, '&%s;' % name))
 388
 389         def parse (self, filename):
 390                 """Parse the OpenType language system tag registry.
 391
 392                 Args:
 393                         filename (str): The file name of the registry.
 394                 """
 395                 with open (filename, encoding='utf-8') as f:
 396                         self.feed (f.read ())
 397                 expect (self.header)
 398                 for tag, iso_codes in self.to_bcp_47.items ():
 399                         for iso_code in iso_codes:
 400                                 self.from_bcp_47[iso_code].add (tag)
 401
 402         def add_language (self, bcp_47_tag, ot_tag):
 403                 """Add a language as if it were in the registry.
 404
 405                 Args:
 406                         bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
 407                                 a language subtag, and if the language subtag is a
 408                                 macrolanguage, then new languages are added corresponding
 409                                 to the macrolanguages' individual languages with the
 410                                 remainder of the tag appended.
 411                         ot_tag (str): An OpenType language system tag.
 412                 """
 413                 global bcp_47
 414                 self.to_bcp_47[ot_tag].add (bcp_47_tag)
 415                 self.from_bcp_47[bcp_47_tag].add (ot_tag)
 416                 if bcp_47_tag.lower () not in bcp_47.grandfathered:
 417                         try:
 418                                 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
 419                                 if macrolanguage in bcp_47.macrolanguages:
 420                                         s = set ()
 421                                         for language in bcp_47.macrolanguages[macrolanguage]:
 422                                                 if language.lower () not in bcp_47.grandfathered:
 423                                                         s.add ('%s-%s' % (language, suffix))
 424                                         bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
 425                         except ValueError:
 426                                 pass
 427
 428         @staticmethod
 429         def _remove_language (tag_1, dict_1, dict_2):
 430                 for tag_2 in dict_1.pop (tag_1):
 431                         dict_2[tag_2].remove (tag_1)
 432                         if not dict_2[tag_2]:
 433                                 del dict_2[tag_2]
 434
 435         def remove_language_ot (self, ot_tag):
 436                 """Remove an OpenType tag from the registry.
 437
 438                 Args:
 439                         ot_tag (str): An OpenType tag.
 440                 """
 441                 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
 442
 443         def remove_language_bcp_47 (self, bcp_47_tag):
 444                 """Remove a BCP 47 tag from the registry.
 445
 446                 Args:
 447                         bcp_47_tag (str): A BCP 47 tag.
 448                 """
 449                 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
 450
 451         def inherit_from_macrolanguages (self):
 452                 """Copy mappings from macrolanguages to individual languages.
 453
 454                 If a BCP 47 tag for an individual mapping has no OpenType
 455                 mapping but its macrolanguage does, the mapping is copied to
 456                 the individual language. For example, als (Tosk Albanian) has no
 457                 explicit mapping, so it inherits from sq (Albanian) the mapping
 458                 to SQI.
 459
 460                 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
 461                 all of its individual languages do and they all map to the same
 462                 tags, the mapping is copied to the macrolanguage.
 463                 """
 464                 global bcp_47
 465                 original_ot_from_bcp_47 = dict (self.from_bcp_47)
 466                 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
 467                         ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
 468                         if ot_macrolanguages:
 469                                 for ot_macrolanguage in ot_macrolanguages:
 470                                         for language in languages:
 471                                                 # Remove the following condition if e.g. nn should map to NYN,NOR
 472                                                 # instead of just NYN.
 473                                                 if language not in original_ot_from_bcp_47:
 474                                                         self.add_language (language, ot_macrolanguage)
 475                                                         self.ranks[ot_macrolanguage] += 1
 476                         else:
 477                                 for language in languages:
 478                                         if language in original_ot_from_bcp_47:
 479                                                 if ot_macrolanguages:
 480                                                         ml = original_ot_from_bcp_47[language]
 481                                                         if ml:
 482                                                                 ot_macrolanguages &= ml
 483                                                         else:
 484                                                                 pass
 485                                                 else:
 486                                                         ot_macrolanguages |= original_ot_from_bcp_47[language]
 487                                         else:
 488                                                 ot_macrolanguages.clear ()
 489                                         if not ot_macrolanguages:
 490                                                 break
 491                                 for ot_macrolanguage in ot_macrolanguages:
 492                                         self.add_language (macrolanguage, ot_macrolanguage)
 493
 494         def sort_languages (self):
 495                 """Sort the values of ``from_bcp_47`` in ascending rank order."""
 496                 for language, tags in self.from_bcp_47.items ():
 497                         self.from_bcp_47[language] = sorted (tags,
 498                                         key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
 499
# The global OpenType registry. It is created before ``BCP47Parser`` is
# defined because ``BCP47Parser._add_macrolanguage`` consults
# ``ot.from_bcp_47``; ``LanguageTag.is_complex`` reads it as well.
ot = OpenTypeRegistryParser ()
 501
 502 class BCP47Parser (object):
 503         """A parser for the BCP 47 subtag registry.
 504
 505         Attributes:
 506                 header (str): The "File-Date" line of the registry.
 507                 names (Mapping[str, str]): A map of subtags to the names they
 508                         are given in the registry. Each value is a
 509                         ``'\\n'``-separated list of names.
 510                 scopes (Mapping[str, str]): A map of language subtags to strings
 511                         suffixed to language names, including suffixes to explain
 512                         language scopes.
 513                 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
 514                         language subtags to the sets of language subtags which
 515                         inherit from them. See
 516                         ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
 517                 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
 518                         subtags to their prefixes.
 519                 grandfathered (AbstractSet[str]): The set of grandfathered tags,
 520                         normalized to lowercase.
 521
 522         """
 523         def __init__ (self):
 524                 self.header = ''
 525                 self.names = {}
 526                 self.scopes = {}
 527                 self.macrolanguages = collections.defaultdict (set)
 528                 self.prefixes = collections.defaultdict (set)
 529                 self.grandfathered = set ()
 530
 531         def parse (self, filename):
 532                 """Parse the BCP 47 subtag registry.
 533
 534                 Args:
 535                         filename (str): The file name of the registry.
 536                 """
 537                 with open (filename, encoding='utf-8') as f:
 538                         subtag_type = None
 539                         subtag = None
 540                         deprecated = False
 541                         has_preferred_value = False
 542                         line_buffer = ''
 543                         for line in itertools.chain (f, ['']):
 544                                 line = line.rstrip ()
 545                                 if line.startswith (' '):
 546                                         line_buffer += line[1:]
 547                                         continue
 548                                 line, line_buffer = line_buffer, line
 549                                 if line.startswith ('Type: '):
 550                                         subtag_type = line.split (' ')[1]
 551                                         deprecated = False
 552                                         has_preferred_value = False
 553                                 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
 554                                         subtag = line.split (' ')[1]
 555                                         if subtag_type == 'grandfathered':
 556                                                 self.grandfathered.add (subtag.lower ())
 557                                 elif line.startswith ('Description: '):
 558                                         description = line.split (' ', 1)[1].replace (' (individual language)', '')
 559                                         description = re.sub (' (\((individual |macro)language\)|languages)$', '',
 560                                                         description)
 561                                         if subtag in self.names:
 562                                                 self.names[subtag] += '\n' + description
 563                                         else:
 564                                                 self.names[subtag] = description
 565                                 elif subtag_type == 'language' or subtag_type == 'grandfathered':
 566                                         if line.startswith ('Scope: '):
 567                                                 scope = line.split (' ')[1]
 568                                                 if scope == 'macrolanguage':
 569                                                         scope = ' [macrolanguage]'
 570                                                 elif scope == 'collection':
 571                                                         scope = ' [family]'
 572                                                 else:
 573                                                         continue
 574                                                 self.scopes[subtag] = scope
 575                                         elif line.startswith ('Deprecated: '):
 576                                                 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
 577                                                 deprecated = True
 578                                         elif deprecated and line.startswith ('Comments: see '):
 579                                                 # If a subtag is split into multiple replacement subtags,
 580                                                 # it essentially represents a macrolanguage.
 581                                                 for language in line.replace (',', '').split (' ')[2:]:
 582                                                         self._add_macrolanguage (subtag, language)
 583                                         elif line.startswith ('Preferred-Value: '):
 584                                                 # If a subtag is deprecated in favor of a single replacement subtag,
 585                                                 # it is either a dialect or synonym of the preferred subtag. Either
 586                                                 # way, it is close enough to the truth to consider the replacement
 587                                                 # the macrolanguage of the deprecated language.
 588                                                 has_preferred_value = True
 589                                                 macrolanguage = line.split (' ')[1]
 590                                                 self._add_macrolanguage (macrolanguage, subtag)
 591                                         elif not has_preferred_value and line.startswith ('Macrolanguage: '):
 592                                                 self._add_macrolanguage (line.split (' ')[1], subtag)
 593                                 elif subtag_type == 'variant':
 594                                         if line.startswith ('Prefix: '):
 595                                                 self.prefixes[subtag].add (line.split (' ')[1])
 596                                 elif line.startswith ('File-Date: '):
 597                                         self.header = line
 598                 expect (self.header)
 599
 600         def _add_macrolanguage (self, macrolanguage, language):
 601                 global ot
 602                 if language not in ot.from_bcp_47:
 603                         for l in self.macrolanguages.get (language, set ()):
 604                                 self._add_macrolanguage (macrolanguage, l)
 605                 if macrolanguage not in ot.from_bcp_47:
 606                         for ls in list (self.macrolanguages.values ()):
 607                                 if macrolanguage in ls:
 608                                         ls.add (language)
 609                                         return
 610                 self.macrolanguages[macrolanguage].add (language)
 611
 612         def remove_extra_macrolanguages (self):
 613                 """Make every language have at most one macrolanguage."""
 614                 inverted = collections.defaultdict (list)
 615                 for macrolanguage, languages in self.macrolanguages.items ():
 616                         for language in languages:
 617                                 inverted[language].append (macrolanguage)
 618                 for language, macrolanguages in inverted.items ():
 619                         if len (macrolanguages) > 1:
 620                                 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
 621                                 biggest_macrolanguage = macrolanguages.pop ()
 622                                 for macrolanguage in macrolanguages:
 623                                         self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
 624
 625         def get_name (self, lt):
 626                 """Return the names of the subtags in a language tag.
 627
 628                 Args:
 629                         lt (LanguageTag): A BCP 47 language tag.
 630
 631                 Returns:
 632                         The name form of ``lt``.
 633                 """
 634                 name = self.names[lt.language].split ('\n')[0]
 635                 if lt.script:
 636                         name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
 637                 if lt.region:
 638                         name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
 639                 if lt.variant:
 640                         name += '; ' + self.names[lt.variant].split ('\n')[0]
 641                 return name
 642
# Create the BCP 47 registry parser and load both input files named on the
# command line (see the usage line in the module docstring): argv[1] is fed
# to the OpenType parser, argv[2] to the BCP 47 parser.
bcp_47 = BCP47Parser ()

ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Manual adjustments to the parsed data (the "manual adjustments" and "manual
# additions" mentioned in the module docstring). They are applied in source
# order, and some statements read values written by earlier ones, so do not
# reorder them casually.
# NOTE(review): ``add_language`` and ``remove_language_ot`` are defined
# outside this section; from the call sites they appear to take
# (BCP 47 tag, OpenType tag) and (OpenType tag) respectively -- confirm
# against their definitions.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

# Give BAL a rank one greater than KAR's.
ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace whatever was parsed for PGR with the single tag 'el-polyton'.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

# Hand-written registry entry for 'flm': name, scope suffix and the set
# {'cfm'} in ``macrolanguages``.
bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

# Give FNE a rank one greater than TNE's.
ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

# Replace the parsed entries for IRT and KGE with multi-subtag tags.
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

# GUK is added by hand, with its own name and a rank one greater than GMZ's.
ot.add_language ('guk', 'GUK')
ot.names['GUK'] = 'Gumuz (SIL fonts)'
ot.ranks['GUK'] = ot.ranks['GMZ'] + 1

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# KHN is added as a second OpenType tag for 'kht'. Both names are derived
# from KHT's original name, so KHN must be named before KHT is renamed.
# Likewise KHN takes KHT's original rank before KHT's rank is incremented.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
ot.ranks['KHN'] = ot.ranks['KHT']
ot.ranks['KHT'] += 1

# Give LCR a rank one greater than MCR's.
ot.ranks['LCR'] = ot.ranks['MCR'] + 1

# Rename MAL and increase MLR's rank by one.
ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

# Hand-written registry entry for 'mhv' (name and scope suffix only).
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Explicit assignments of the 'qu*'/'qv*'/'qw*'/'qx*' tags to the OpenType
# tags QUZ, QWH, QVI and QUH.
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Move 'mo' out of the 'ro' set and into the 'ro-MD' set.
# NOTE(review): the second line relies on ``macrolanguages['ro-MD']`` already
# existing or being created on access -- confirm the container type.
bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

# SGW is added by hand; its name and rank are derived from CHG's.
ot.add_language ('sgw', 'SGW')
ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
ot.ranks['SGW'] = ot.ranks['CHG'] + 1

# Replace the parsed entries for SYRE/SYRJ/SYRN with 'und-<script>' tags.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

# Hand-written registry entry for 'xst': name, scope suffix and the set
# {'stv', 'wle'} in ``macrolanguages``.
bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Rebuild the 'zh' mappings by hand: drop the parsed ZHH/ZHP/ZHT entries,
# take 'lzh' and 'yue' out of the 'zh' set, then add explicit script- and
# region-qualified tags (and 'lzh'/'yue' themselves) for ZHH, ZHS and ZHT.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
 774
 775 def rank_delta (bcp_47, ot):
 776         """Return a delta to apply to a BCP 47 tag's rank.
 777
 778         Most OpenType tags have a constant rank, but a few have ranks that
 779         depend on the BCP 47 tag.
 780
 781         Args:
 782                 bcp_47 (str): A BCP 47 tag.
 783                 ot (str): An OpenType tag to.
 784
 785         Returns:
 786                 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
 787                 OpenType equivalents.
 788         """
 789         if bcp_47 == 'ak' and ot == 'AKA':
 790                 return -1
 791         if bcp_47 == 'tw' and ot == 'TWI':
 792                 return -1
 793         return 0
 794
# Hand-maintained map from an OpenType language system tag to the BCP 47 tag
# that should be returned for it when the OpenType tag corresponds to several
# BCP 47 tags. ``verify_disambiguation_dict`` validates this map and also
# adds and removes entries before it is emitted.
disambiguation = {
        'ALT': 'alt',
        'ARK': 'rki',
        'BHI': 'bhb',
        'BLN': 'bjt',
        'BTI': 'beb',
        'CCHN': 'cco',
        'CMR': 'swb',
        'CPP': 'crp',
        'CRR': 'crx',
        'DUJ': 'dwu',
        'ECR': 'crj',
        'HAL': 'cfm',
        'HND': 'hnd',
        'KIS': 'kqs',
        'KUI': 'uki',
        'LRC': 'bqi',
        'NDB': 'nd',
        'NIS': 'njz',
        'PLG': 'pce',
        'PRO': 'pro',
        'QIN': 'bgr',
        'QUH': 'quh',
        'QVI': 'qvi',
        'QWH': 'qwh',
        'SIG': 'stv',
        'TNE': 'yrk',
        'ZHH': 'zh-HK',
        'ZHS': 'zh-Hans',
        'ZHT': 'zh-Hant',
}

# Post-process the parsed and manually adjusted data before any output is
# written.
# NOTE(review): ``inherit_from_macrolanguages`` is called both before and
# after ``remove_extra_macrolanguages``; the reason is not visible from this
# section, so keep the sequence as is unless the helpers say otherwise.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
ot.sort_languages ()
 831
 832 print ('/* == Start of generated table == */')
 833 print ('/*')
 834 print (' * The following table is generated by running:')
 835 print (' *')
 836 print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
 837 print (' *')
 838 print (' * on files with these headers:')
 839 print (' *')
 840 print (' * %s' % ot.header.strip ())
 841 print (' * %s' % bcp_47.header)
 842 print (' */')
 843 print ()
 844 print ('#ifndef HB_OT_TAG_TABLE_HH')
 845 print ('#define HB_OT_TAG_TABLE_HH')
 846 print ()
 847 print ('static const LangTag ot_languages[] = {')
 848
 849 def hb_tag (tag):
 850         """Convert a tag to ``HB_TAG`` form.
 851
 852         Args:
 853                 tag (str): An OpenType tag.
 854
 855         Returns:
 856                 A snippet of C++ representing ``tag``.
 857         """
 858         return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
 859
 860 def get_variant_set (name):
 861         """Return a set of variant language names from a name.
 862
 863         Args:
 864                 name (str): A list of language names from the BCP 47 registry,
 865                         joined on ``'\\n'``.
 866
 867         Returns:
 868                 A set of normalized language names.
 869         """
 870         return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'"))
 871                         .encode ('ASCII', 'ignore')
 872                         .strip ()
 873                         for n in re.split ('[\n(),]', name) if n)
 874
 875 def language_name_intersection (a, b):
 876         """Return the names in common between two language names.
 877
 878         Args:
 879                 a (str): A list of language names from the BCP 47 registry,
 880                         joined on ``'\\n'``.
 881                 b (str): A list of language names from the BCP 47 registry,
 882                         joined on ``'\\n'``.
 883
 884         Returns:
 885                 The normalized language names shared by ``a`` and ``b``.
 886         """
 887         return get_variant_set (a).intersection (get_variant_set (b))
 888
 889 def get_matching_language_name (intersection, candidates):
 890         return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c))))
 891
 892 def same_tag (bcp_47_tag, ot_tags):
 893         return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
 894
 895 for language, tags in sorted (ot.from_bcp_47.items ()):
 896         if language == '' or '-' in language:
 897                 continue
 898         commented_out = same_tag (language, tags)
 899         for i, tag in enumerate (tags, start=1):
 900                 print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else '  ', language, hb_tag (tag)), end='')
 901                 if commented_out:
 902                         print ('*/', end='')
 903                 print ('\t/* ', end='')
 904                 bcp_47_name = bcp_47.names.get (language, '')
 905                 bcp_47_name_candidates = bcp_47_name.split ('\n')
 906                 intersection = language_name_intersection (bcp_47_name, ot.names[tag])
 907                 scope = bcp_47.scopes.get (language, '')
 908                 if not intersection:
 909                         write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
 910                 else:
 911                         name = get_matching_language_name (intersection, bcp_47_name_candidates)
 912                         bcp_47.names[language] = name
 913                         write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
 914                 print (' */')
 915
 916 print ('};')
 917 print ()
 918
 919 print ('/**')
 920 print (' * hb_ot_tags_from_complex_language:')
 921 print (' * @lang_str: a BCP 47 language tag to convert.')
 922 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
 923 print (' * conversion.')
 924 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
 925 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
 926 print (' * @tags: array of size at least @language_count to store the language tag')
 927 print (' * results')
 928 print (' *')
 929 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
 930 print (' *')
 931 print (' * Return value: Whether any language systems were retrieved.')
 932 print (' **/')
 933 print ('static bool')
 934 print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
 935 print ('\t\t\t\t  const char   *limit,')
 936 print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
 937 print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
 938 print ('{')
 939
 940 def print_subtag_matches (subtag, new_line):
 941         if subtag:
 942                 if new_line:
 943                         print ()
 944                         print ('\t&& ', end='')
 945                 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
 946
 947 complex_tags = collections.defaultdict (list)
 948 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
 949                         (LanguageTag (language), tags)
 950                         for language, tags in sorted (ot.from_bcp_47.items (),
 951                                 key=lambda i: (-len (i[0]), i[0]))
 952                 ] if lt_tags[0].is_complex ()),
 953                 key=lambda lt_tags: lt_tags[0].get_group ()):
 954         complex_tags[initial] += group
 955
 956 for initial, items in sorted (complex_tags.items ()):
 957         if initial != 'und':
 958                 continue
 959         for lt, tags in items:
 960                 if lt.variant in bcp_47.prefixes:
 961                         expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
 962                                         '%s is not a valid prefix of %s' % (lt.language, lt.variant))
 963                 print ('  if (', end='')
 964                 print_subtag_matches (lt.script, False)
 965                 print_subtag_matches (lt.region, False)
 966                 print_subtag_matches (lt.variant, False)
 967                 print (')')
 968                 print ('  {')
 969                 write ('    /* %s */' % bcp_47.get_name (lt))
 970                 print ()
 971                 if len (tags) == 1:
 972                         write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
 973                         print ()
 974                         print ('    *count = 1;')
 975                 else:
 976                         print ('    hb_tag_t possible_tags[] = {')
 977                         for tag in tags:
 978                                 write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
 979                                 print ()
 980                         print ('    };')
 981                         print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
 982                         print ('      tags[i] = possible_tags[i];')
 983                         print ('    *count = i;')
 984                 print ('    return true;')
 985                 print ('  }')
 986
 987 print ('  switch (lang_str[0])')
 988 print ('  {')
 989 for initial, items in sorted (complex_tags.items ()):
 990         if initial == 'und':
 991                 continue
 992         print ("  case '%s':" % initial)
 993         for lt, tags in items:
 994                 print ('    if (', end='')
 995                 if lt.grandfathered:
 996                         print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
 997                 else:
 998                         string_literal = lt.language[1:] + '-'
 999                         if lt.script:
1000                                 string_literal += lt.script
1001                                 lt.script = None
1002                                 if lt.region:
1003                                         string_literal += '-' + lt.region
1004                                         lt.region = None
1005                         if string_literal[-1] == '-':
1006                                 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
1007                         else:
1008                                 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
1009                 print_subtag_matches (lt.script, True)
1010                 print_subtag_matches (lt.region, True)
1011                 print_subtag_matches (lt.variant, True)
1012                 print (')')
1013                 print ('    {')
1014                 write ('      /* %s */' % bcp_47.get_name (lt))
1015                 print ()
1016                 if len (tags) == 1:
1017                         write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1018                         print ()
1019                         print ('      *count = 1;')
1020                 else:
1021                         print ('      unsigned int i;')
1022                         print ('      hb_tag_t possible_tags[] = {')
1023                         for tag in tags:
1024                                 write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
1025                                 print ()
1026                         print ('      };')
1027                         print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
1028                         print ('\ttags[i] = possible_tags[i];')
1029                         print ('      *count = i;')
1030                 print ('      return true;')
1031                 print ('    }')
1032         print ('    break;')
1033
1034 print ('  }')
1035 print ('  return false;')
1036 print ('}')
1037 print ()
# Emit the documentation comment, the signature and the opening of the
# ``switch`` of the generated ``hb_ot_ambiguous_tag_to_language`` function.
ambiguous_tag_header = (
        '/**',
        ' * hb_ot_ambiguous_tag_to_language',
        ' * @tag: A language tag.',
        ' *',
        ' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
        ' * many language tags) and the best tag is not the alphabetically first, or if',
        ' * the best tag consists of multiple subtags, or if the best tag does not appear',
        ' * in #ot_languages.',
        ' *',
        ' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
        ' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
        ' **/',
        'static hb_language_t',
        'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
        '{',
        '  switch (tag)',
        '  {',
)
print ('\n'.join (ambiguous_tag_header))
1055
def verify_disambiguation_dict ():
        """Check ``disambiguation`` against the parsed data and normalize it.

        ``disambiguation`` maps ambiguous OpenType language system tags to
        the BCP 47 tag each should resolve to. For every OpenType tag this
        function looks at its "primary" BCP 47 tags: those that are not
        grandfathered and that list the OpenType tag first.

        * No primary tag, or exactly one: a hand-written entry is an
          error. A single primary tag containing a hyphen is recorded
          automatically.
        * Several primary tags: a unique macrolanguage, else a unique
          family, else a unique non-retired tag is chosen automatically
          unless an entry already exists. If none of these is unique, a
          hand-written entry is required and must be one of the OpenType
          tag's BCP 47 tags. The entry is then deleted again when it has no
          hyphen and equals the alphabetically first primary tag that
          differs from its OpenType tag, since the fallback would pick that
          tag anyway.

        Finally every remaining key must be a known OpenType tag.

        Raises:
                AssertionError: Verification failed.
        """
        global bcp_47
        global disambiguation
        global ot
        for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
                primary_tags = [
                        t for t in bcp_47_tags
                        if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag
                ]
                if len (primary_tags) == 0:
                        expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
                        continue
                if len (primary_tags) == 1:
                        expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
                        if '-' in primary_tags[0]:
                                disambiguation[ot_tag] = primary_tags[0]
                        continue

                # Several primary tags: try progressively weaker criteria
                # until exactly one candidate is left.
                preferred = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
                if len (preferred) != 1:
                        preferred = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]']
                if len (preferred) != 1:
                        preferred = [t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')]

                if len (preferred) == 1:
                        # A hand-written entry takes precedence over the
                        # automatic choice.
                        if ot_tag not in disambiguation:
                                disambiguation[ot_tag] = preferred[0]
                else:
                        expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (preferred)))
                        expect (disambiguation[ot_tag] in bcp_47_tags,
                                        '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))

                # Drop the entry when it only restates the fallback's choice.
                different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
                if different_primary_tags:
                        chosen = disambiguation[ot_tag]
                        if chosen == different_primary_tags[0] and '-' not in chosen:
                                del disambiguation[ot_tag]
        for ot_tag in disambiguation.keys ():
                expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1100                 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1101
# Validate and normalize ``disambiguation``, then emit one ``case`` per
# remaining entry, in sorted order, followed by the ``default`` branch, the
# end of the generated function, the include-guard terminator and the end
# marker.
verify_disambiguation_dict ()
for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
        case_label = '  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])
        write (case_label)
        print ()
        language_name = bcp_47.get_name (LanguageTag (bcp_47_tag))
        write ('    return hb_language_from_string ("%s", -1);  /* %s */' % (bcp_47_tag, language_name))
        print ()

print ('\n'.join ((
        '  default:',
        '    return HB_LANGUAGE_INVALID;',
        '  }',
        '}',
        '',
        '#endif /* HB_OT_TAG_TABLE_HH */',
        '',
        '/* == End of generated table == */',
)))
1118