src/gen-tag-table.py

   1 #!/usr/bin/python
   2
   3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
   4 versa.
   5
   6 It creates a ``const LangTag[]``, matching the tags from the OpenType
   7 languages system tag list to the language subtags of the BCP 47 language
   8 subtag registry, with some manual adjustments. The mappings are
   9 supplemented with macrolanguages' sublanguages and retired codes'
  10 replacements, according to BCP 47 and some manual additions where BCP 47
  11 omits a retired code entirely.
  12
  13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
  14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
  15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
  16 multiple BCP 47 tags) are listed here, except when the alphabetically
  17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
  18 case, the fallback behavior will choose the right tag anyway.
  19 """
  20
  21 from __future__ import absolute_import, division, print_function, unicode_literals
  22
  23 import collections
  24 try:
  25         from HTMLParser import HTMLParser
  26         def write (s):
  27                 print (s.encode ('utf-8'), end='')
  28 except ImportError:
  29         from html.parser import HTMLParser
  30         def write (s):
  31                 sys.stdout.flush ()
  32                 sys.stdout.buffer.write (s.encode ('utf-8'))
  33 import io
  34 import itertools
  35 import re
  36 import sys
  37 import unicodedata
  38
  39 if len (sys.argv) != 3:
  40         print ('usage: ./gen-tag-table.py languagetags language-subtag-registry', file=sys.stderr)
  41         sys.exit (1)
  42
  43 try:
  44         from html import unescape
  45         def html_unescape (parser, entity):
  46                 return unescape (entity)
  47 except ImportError:
  48         def html_unescape (parser, entity):
  49                 return parser.unescape (entity)
  50
  51 def expect (condition, message=None):
  52         if not condition:
  53                 if message is None:
  54                         raise AssertionError
  55                 raise AssertionError (message)
  56
# Map of three-letter ISO 639-3 language codes to their two-letter
# ISO 639-1 equivalents.
# Data from http://www-01.sil.org/iso639-3/iso-639-3.tab
#
# ``OpenTypeRegistryParser.handle_endtag`` looks every ISO code of the
# OpenType registry up in this table and keeps the code unchanged when it
# has no entry here.
ISO_639_3_TO_1 = {
        'aar': 'aa',
        'abk': 'ab',
        'afr': 'af',
        'aka': 'ak',
        'amh': 'am',
        'ara': 'ar',
        'arg': 'an',
        'asm': 'as',
        'ava': 'av',
        'ave': 'ae',
        'aym': 'ay',
        'aze': 'az',
        'bak': 'ba',
        'bam': 'bm',
        'bel': 'be',
        'ben': 'bn',
        'bis': 'bi',
        'bod': 'bo',
        'bos': 'bs',
        'bre': 'br',
        'bul': 'bg',
        'cat': 'ca',
        'ces': 'cs',
        'cha': 'ch',
        'che': 'ce',
        'chu': 'cu',
        'chv': 'cv',
        'cor': 'kw',
        'cos': 'co',
        'cre': 'cr',
        'cym': 'cy',
        'dan': 'da',
        'deu': 'de',
        'div': 'dv',
        'dzo': 'dz',
        'ell': 'el',
        'eng': 'en',
        'epo': 'eo',
        'est': 'et',
        'eus': 'eu',
        'ewe': 'ee',
        'fao': 'fo',
        'fas': 'fa',
        'fij': 'fj',
        'fin': 'fi',
        'fra': 'fr',
        'fry': 'fy',
        'ful': 'ff',
        'gla': 'gd',
        'gle': 'ga',
        'glg': 'gl',
        'glv': 'gv',
        'grn': 'gn',
        'guj': 'gu',
        'hat': 'ht',
        'hau': 'ha',
        'hbs': 'sh',
        'heb': 'he',
        'her': 'hz',
        'hin': 'hi',
        'hmo': 'ho',
        'hrv': 'hr',
        'hun': 'hu',
        'hye': 'hy',
        'ibo': 'ig',
        'ido': 'io',
        'iii': 'ii',
        'iku': 'iu',
        'ile': 'ie',
        'ina': 'ia',
        'ind': 'id',
        'ipk': 'ik',
        'isl': 'is',
        'ita': 'it',
        'jav': 'jv',
        'jpn': 'ja',
        'kal': 'kl',
        'kan': 'kn',
        'kas': 'ks',
        'kat': 'ka',
        'kau': 'kr',
        'kaz': 'kk',
        'khm': 'km',
        'kik': 'ki',
        'kin': 'rw',
        'kir': 'ky',
        'kom': 'kv',
        'kon': 'kg',
        'kor': 'ko',
        'kua': 'kj',
        'kur': 'ku',
        'lao': 'lo',
        'lat': 'la',
        'lav': 'lv',
        'lim': 'li',
        'lin': 'ln',
        'lit': 'lt',
        'ltz': 'lb',
        'lub': 'lu',
        'lug': 'lg',
        'mah': 'mh',
        'mal': 'ml',
        'mar': 'mr',
        'mkd': 'mk',
        'mlg': 'mg',
        'mlt': 'mt',
        'mol': 'mo',
        'mon': 'mn',
        'mri': 'mi',
        'msa': 'ms',
        'mya': 'my',
        'nau': 'na',
        'nav': 'nv',
        'nbl': 'nr',
        'nde': 'nd',
        'ndo': 'ng',
        'nep': 'ne',
        'nld': 'nl',
        'nno': 'nn',
        'nob': 'nb',
        'nor': 'no',
        'nya': 'ny',
        'oci': 'oc',
        'oji': 'oj',
        'ori': 'or',
        'orm': 'om',
        'oss': 'os',
        'pan': 'pa',
        'pli': 'pi',
        'pol': 'pl',
        'por': 'pt',
        'pus': 'ps',
        'que': 'qu',
        'roh': 'rm',
        'ron': 'ro',
        'run': 'rn',
        'rus': 'ru',
        'sag': 'sg',
        'san': 'sa',
        'sin': 'si',
        'slk': 'sk',
        'slv': 'sl',
        'sme': 'se',
        'smo': 'sm',
        'sna': 'sn',
        'snd': 'sd',
        'som': 'so',
        'sot': 'st',
        'spa': 'es',
        'sqi': 'sq',
        'srd': 'sc',
        'srp': 'sr',
        'ssw': 'ss',
        'sun': 'su',
        'swa': 'sw',
        'swe': 'sv',
        'tah': 'ty',
        'tam': 'ta',
        'tat': 'tt',
        'tel': 'te',
        'tgk': 'tg',
        'tgl': 'tl',
        'tha': 'th',
        'tir': 'ti',
        'ton': 'to',
        'tsn': 'tn',
        'tso': 'ts',
        'tuk': 'tk',
        'tur': 'tr',
        'twi': 'tw',
        'uig': 'ug',
        'ukr': 'uk',
        'urd': 'ur',
        'uzb': 'uz',
        'ven': 've',
        'vie': 'vi',
        'vol': 'vo',
        'wln': 'wa',
        'wol': 'wo',
        'xho': 'xh',
        'yid': 'yi',
        'yor': 'yo',
        'zha': 'za',
        'zho': 'zh',
        'zul': 'zu',
}
 245
 246 class LanguageTag (object):
 247         """A BCP 47 language tag.
 248
 249         Attributes:
 250                 subtags (List[str]): The list of subtags in this tag.
 251                 grandfathered (bool): Whether this tag is grandfathered. If
 252                         ``true``, the entire lowercased tag is the ``language``
 253                         and the other subtag fields are empty.
 254                 language (str): The language subtag.
 255                 script (str): The script subtag.
 256                 region (str): The region subtag.
 257                 variant (str): The variant subtag.
 258
 259         Args:
 260                 tag (str): A BCP 47 language tag.
 261
 262         """
 263         def __init__ (self, tag):
 264                 global bcp_47
 265                 self.subtags = tag.lower ().split ('-')
 266                 self.grandfathered = tag.lower () in bcp_47.grandfathered
 267                 if self.grandfathered:
 268                         self.language = tag.lower ()
 269                         self.script = ''
 270                         self.region = ''
 271                         self.variant = ''
 272                 else:
 273                         self.language = self.subtags[0]
 274                         self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
 275                         self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
 276                         self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
 277
 278         def __str__(self):
 279                 return '-'.join(self.subtags)
 280
 281         def __repr__ (self):
 282                 return 'LanguageTag(%r)' % str(self)
 283
 284         @staticmethod
 285         def _find_first (function, sequence):
 286                 try:
 287                         return next (iter (filter (function, sequence)))
 288                 except StopIteration:
 289                         return None
 290
 291         def is_complex (self):
 292                 """Return whether this tag is too complex to represent as a
 293                 ``LangTag`` in the generated code.
 294
 295                 Complex tags need to be handled in
 296                 ``hb_ot_tags_from_complex_language``.
 297
 298                 Returns:
 299                         Whether this tag is complex.
 300                 """
 301                 return not (len (self.subtags) == 1
 302                         or self.grandfathered
 303                         and len (self.subtags[1]) != 3
 304                         and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
 305
 306         def get_group (self):
 307                 """Return the group into which this tag should be categorized in
 308                 ``hb_ot_tags_from_complex_language``.
 309
 310                 The group is the first letter of the tag, or ``'und'`` if this tag
 311                 should not be matched in a ``switch`` statement in the generated
 312                 code.
 313
 314                 Returns:
 315                         This tag's group.
 316                 """
 317                 return ('und'
 318                         if (self.language == 'und'
 319                                 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
 320                         else self.language[0])
 321
 322 class OpenTypeRegistryParser (HTMLParser):
 323         """A parser for the OpenType language system tag registry.
 324
 325         Attributes:
 326                 header (str): The "last updated" line of the registry.
 327                 names (Mapping[str, str]): A map of language system tags to the
 328                         names they are given in the registry.
 329                 ranks (DefaultDict[str, int]): A map of language system tags to
 330                         numbers. If a single BCP 47 tag corresponds to multiple
 331                         OpenType tags, the tags are ordered in increasing order by
 332                         rank. The rank is based on the number of BCP 47 tags
 333                         associated with a tag, though it may be manually modified.
 334                 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
 335                         OpenType language system tags to sets of BCP 47 tags.
 336                 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
 337                         inverted. Its values start as unsorted sets;
 338                         ``sort_languages`` converts them to sorted lists.
 339
 340         """
 341         def __init__ (self):
 342                 HTMLParser.__init__ (self)
 343                 self.header = ''
 344                 self.names = {}
 345                 self.ranks = collections.defaultdict (int)
 346                 self.to_bcp_47 = collections.defaultdict (set)
 347                 self.from_bcp_47 = collections.defaultdict (set)
 348                 # Whether the parser is in a <td> element
 349                 self._td = False
 350                 # The text of the <td> elements of the current <tr> element.
 351                 self._current_tr = []
 352
 353         def handle_starttag (self, tag, attrs):
 354                 if tag == 'meta':
 355                         for attr, value in attrs:
 356                                 if attr == 'name' and value == 'updated_at':
 357                                         self.header = self.get_starttag_text ()
 358                                         break
 359                 elif tag == 'td':
 360                         self._td = True
 361                         self._current_tr.append ('')
 362                 elif tag == 'tr':
 363                         self._current_tr = []
 364
 365         def handle_endtag (self, tag):
 366                 if tag == 'td':
 367                         self._td = False
 368                 elif tag == 'tr' and self._current_tr:
 369                         expect (2 <= len (self._current_tr) <= 3)
 370                         name = self._current_tr[0].strip ()
 371                         tag = self._current_tr[1].strip ("\t\n\v\f\r '")
 372                         rank = 0
 373                         if len (tag) > 4:
 374                                 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
 375                                 name += ' (deprecated)'
 376                                 tag = tag.split (' ')[0]
 377                                 rank = 1
 378                         self.names[tag] = re.sub (' languages$', '', name)
 379                         if not self._current_tr[2]:
 380                                 return
 381                         iso_codes = self._current_tr[2].strip ()
 382                         self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
 383                         rank += 2 * len (self.to_bcp_47[tag])
 384                         self.ranks[tag] = rank
 385
 386         def handle_data (self, data):
 387                 if self._td:
 388                         self._current_tr[-1] += data
 389
 390         def handle_charref (self, name):
 391                 self.handle_data (html_unescape (self, '&#%s;' % name))
 392
 393         def handle_entityref (self, name):
 394                 self.handle_data (html_unescape (self, '&%s;' % name))
 395
 396         def parse (self, filename):
 397                 """Parse the OpenType language system tag registry.
 398
 399                 Args:
 400                         filename (str): The file name of the registry.
 401                 """
 402                 with io.open (filename, encoding='utf-8') as f:
 403                         self.feed (f.read ())
 404                 expect (self.header)
 405                 for tag, iso_codes in self.to_bcp_47.items ():
 406                         for iso_code in iso_codes:
 407                                 self.from_bcp_47[iso_code].add (tag)
 408
 409         def add_language (self, bcp_47_tag, ot_tag):
 410                 """Add a language as if it were in the registry.
 411
 412                 Args:
 413                         bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
 414                                 a language subtag, and if the language subtag is a
 415                                 macrolanguage, then new languages are added corresponding
 416                                 to the macrolanguages' individual languages with the
 417                                 remainder of the tag appended.
 418                         ot_tag (str): An OpenType language system tag.
 419                 """
 420                 global bcp_47
 421                 self.to_bcp_47[ot_tag].add (bcp_47_tag)
 422                 self.from_bcp_47[bcp_47_tag].add (ot_tag)
 423                 if bcp_47_tag.lower () not in bcp_47.grandfathered:
 424                         try:
 425                                 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
 426                                 if macrolanguage in bcp_47.macrolanguages:
 427                                         s = set ()
 428                                         for language in bcp_47.macrolanguages[macrolanguage]:
 429                                                 if language.lower () not in bcp_47.grandfathered:
 430                                                         s.add ('%s-%s' % (language, suffix))
 431                                         bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
 432                         except ValueError:
 433                                 pass
 434
 435         @staticmethod
 436         def _remove_language (tag_1, dict_1, dict_2):
 437                 for tag_2 in dict_1.pop (tag_1):
 438                         dict_2[tag_2].remove (tag_1)
 439                         if not dict_2[tag_2]:
 440                                 del dict_2[tag_2]
 441
 442         def remove_language_ot (self, ot_tag):
 443                 """Remove an OpenType tag from the registry.
 444
 445                 Args:
 446                         ot_tag (str): An OpenType tag.
 447                 """
 448                 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
 449
 450         def remove_language_bcp_47 (self, bcp_47_tag):
 451                 """Remove a BCP 47 tag from the registry.
 452
 453                 Args:
 454                         bcp_47_tag (str): A BCP 47 tag.
 455                 """
 456                 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
 457
 458         def inherit_from_macrolanguages (self):
 459                 """Copy mappings from macrolanguages to individual languages.
 460
 461                 If a BCP 47 tag for an individual mapping has no OpenType
 462                 mapping but its macrolanguage does, the mapping is copied to
 463                 the individual language. For example, als (Tosk Albanian) has no
 464                 explicit mapping, so it inherits from sq (Albanian) the mapping
 465                 to SQI.
 466
 467                 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
 468                 all of its individual languages do and they all map to the same
 469                 tags, the mapping is copied to the macrolanguage.
 470                 """
 471                 global bcp_47
 472                 original_ot_from_bcp_47 = dict (self.from_bcp_47)
 473                 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
 474                         ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
 475                         if ot_macrolanguages:
 476                                 for ot_macrolanguage in ot_macrolanguages:
 477                                         for language in languages:
 478                                                 # Remove the following condition if e.g. nn should map to NYN,NOR
 479                                                 # instead of just NYN.
 480                                                 if language not in original_ot_from_bcp_47:
 481                                                         self.add_language (language, ot_macrolanguage)
 482                                                         self.ranks[ot_macrolanguage] += 1
 483                         else:
 484                                 for language in languages:
 485                                         if language in original_ot_from_bcp_47:
 486                                                 if ot_macrolanguages:
 487                                                         ml = original_ot_from_bcp_47[language]
 488                                                         if ml:
 489                                                                 ot_macrolanguages &= ml
 490                                                         else:
 491                                                                 pass
 492                                                 else:
 493                                                         ot_macrolanguages |= original_ot_from_bcp_47[language]
 494                                         else:
 495                                                 ot_macrolanguages.clear ()
 496                                         if not ot_macrolanguages:
 497                                                 break
 498                                 for ot_macrolanguage in ot_macrolanguages:
 499                                         self.add_language (macrolanguage, ot_macrolanguage)
 500
 501         def sort_languages (self):
 502                 """Sort the values of ``from_bcp_47`` in ascending rank order."""
 503                 for language, tags in self.from_bcp_47.items ():
 504                         self.from_bcp_47[language] = sorted (tags,
 505                                         key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
 506
# The OpenType registry parser. It is created before ``BCP47Parser`` is used
# because ``LanguageTag.is_complex`` and ``BCP47Parser._add_macrolanguage``
# read it as a module-level global.
ot = OpenTypeRegistryParser ()
 508
 509 class BCP47Parser (object):
 510         """A parser for the BCP 47 subtag registry.
 511
 512         Attributes:
 513                 header (str): The "File-Date" line of the registry.
 514                 names (Mapping[str, str]): A map of subtags to the names they
 515                         are given in the registry. Each value is a
 516                         ``'\\n'``-separated list of names.
 517                 scopes (Mapping[str, str]): A map of language subtags to strings
 518                         suffixed to language names, including suffixes to explain
 519                         language scopes.
 520                 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
 521                         language subtags to the sets of language subtags which
 522                         inherit from them. See
 523                         ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
 524                 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
 525                         subtags to their prefixes.
 526                 grandfathered (AbstractSet[str]): The set of grandfathered tags,
 527                         normalized to lowercase.
 528
 529         """
 530         def __init__ (self):
 531                 self.header = ''
 532                 self.names = {}
 533                 self.scopes = {}
 534                 self.macrolanguages = collections.defaultdict (set)
 535                 self.prefixes = collections.defaultdict (set)
 536                 self.grandfathered = set ()
 537
 538         def parse (self, filename):
 539                 """Parse the BCP 47 subtag registry.
 540
 541                 Args:
 542                         filename (str): The file name of the registry.
 543                 """
 544                 with io.open (filename, encoding='utf-8') as f:
 545                         subtag_type = None
 546                         subtag = None
 547                         deprecated = False
 548                         has_preferred_value = False
 549                         line_buffer = ''
 550                         for line in itertools.chain (f, ['']):
 551                                 line = line.rstrip ()
 552                                 if line.startswith (' '):
 553                                         line_buffer += line[1:]
 554                                         continue
 555                                 line, line_buffer = line_buffer, line
 556                                 if line.startswith ('Type: '):
 557                                         subtag_type = line.split (' ')[1]
 558                                         deprecated = False
 559                                         has_preferred_value = False
 560                                 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
 561                                         subtag = line.split (' ')[1]
 562                                         if subtag_type == 'grandfathered':
 563                                                 self.grandfathered.add (subtag.lower ())
 564                                 elif line.startswith ('Description: '):
 565                                         description = line.split (' ', 1)[1].replace (' (individual language)', '')
 566                                         description = re.sub (' (\((individual |macro)language\)|languages)$', '',
 567                                                         description)
 568                                         if subtag in self.names:
 569                                                 self.names[subtag] += '\n' + description
 570                                         else:
 571                                                 self.names[subtag] = description
 572                                 elif subtag_type == 'language' or subtag_type == 'grandfathered':
 573                                         if line.startswith ('Scope: '):
 574                                                 scope = line.split (' ')[1]
 575                                                 if scope == 'macrolanguage':
 576                                                         scope = ' [macrolanguage]'
 577                                                 elif scope == 'collection':
 578                                                         scope = ' [family]'
 579                                                 else:
 580                                                         continue
 581                                                 self.scopes[subtag] = scope
 582                                         elif line.startswith ('Deprecated: '):
 583                                                 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
 584                                                 deprecated = True
 585                                         elif deprecated and line.startswith ('Comments: see '):
 586                                                 # If a subtag is split into multiple replacement subtags,
 587                                                 # it essentially represents a macrolanguage.
 588                                                 for language in line.replace (',', '').split (' ')[2:]:
 589                                                         self._add_macrolanguage (subtag, language)
 590                                         elif line.startswith ('Preferred-Value: '):
 591                                                 # If a subtag is deprecated in favor of a single replacement subtag,
 592                                                 # it is either a dialect or synonym of the preferred subtag. Either
 593                                                 # way, it is close enough to the truth to consider the replacement
 594                                                 # the macrolanguage of the deprecated language.
 595                                                 has_preferred_value = True
 596                                                 macrolanguage = line.split (' ')[1]
 597                                                 self._add_macrolanguage (macrolanguage, subtag)
 598                                         elif not has_preferred_value and line.startswith ('Macrolanguage: '):
 599                                                 self._add_macrolanguage (line.split (' ')[1], subtag)
 600                                 elif subtag_type == 'variant':
 601                                         if line.startswith ('Prefix: '):
 602                                                 self.prefixes[subtag].add (line.split (' ')[1])
 603                                 elif line.startswith ('File-Date: '):
 604                                         self.header = line
 605                 expect (self.header)
 606
 607         def _add_macrolanguage (self, macrolanguage, language):
 608                 global ot
 609                 if language not in ot.from_bcp_47:
 610                         for l in self.macrolanguages.get (language, set ()):
 611                                 self._add_macrolanguage (macrolanguage, l)
 612                 if macrolanguage not in ot.from_bcp_47:
 613                         for ls in list (self.macrolanguages.values ()):
 614                                 if macrolanguage in ls:
 615                                         ls.add (language)
 616                                         return
 617                 self.macrolanguages[macrolanguage].add (language)
 618
 619         def remove_extra_macrolanguages (self):
 620                 """Make every language have at most one macrolanguage."""
 621                 inverted = collections.defaultdict (list)
 622                 for macrolanguage, languages in self.macrolanguages.items ():
 623                         for language in languages:
 624                                 inverted[language].append (macrolanguage)
 625                 for language, macrolanguages in inverted.items ():
 626                         if len (macrolanguages) > 1:
 627                                 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
 628                                 biggest_macrolanguage = macrolanguages.pop ()
 629                                 for macrolanguage in macrolanguages:
 630                                         self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
 631
 632         def get_name (self, lt):
 633                 """Return the names of the subtags in a language tag.
 634
 635                 Args:
 636                         lt (LanguageTag): A BCP 47 language tag.
 637
 638                 Returns:
 639                         The name form of ``lt``.
 640                 """
 641                 name = self.names[lt.language].split ('\n')[0]
 642                 if lt.script:
 643                         name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
 644                 if lt.region:
 645                         name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
 646                 if lt.variant:
 647                         name += '; ' + self.names[lt.variant].split ('\n')[0]
 648                 return name
 649
# The BCP 47 registry parser. ``LanguageTag`` and ``OpenTypeRegistryParser``
# read it as a module-level global.
bcp_47 = BCP47Parser ()

# The first command-line argument names the OpenType language system tag
# registry; the second names the BCP 47 subtag registry.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])
 654
# Manual adjustments to the parsed registries (see the module docstring).
# Each group below adds, removes, renames or re-ranks the mappings of one
# language or OpenType tag. An assignment of the form
# ``ot.ranks[X] = ot.ranks[Y] + 1`` gives X a higher rank than Y;
# ``sort_languages`` orders tags by ascending rank.
ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Drop the registry's mappings for PGR and map it to el-polyton only.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

# Drop the registry's mappings for IRT and map it to ga-Latg only.
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

# Drop the registry's mappings for KGE and map it to und-Geok only.
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

ot.add_language ('guk', 'GUK')
ot.names['GUK'] = 'Gumuz (SIL fonts)'
ot.ranks['GUK'] = ot.ranks['GMZ'] + 1

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# kht maps to both KHN and KHT; the names tell the two tags apart, and KHN
# is given the lower rank.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
ot.ranks['KHN'] = ot.ranks['KHT']
ot.ranks['KHT'] += 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Explicit OpenType mappings for qu and for individual q.. codes.
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Move mo from the languages inheriting from ro to those inheriting from
# ro-MD.
bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

ot.add_language ('sgw', 'SGW')
ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
ot.ranks['SGW'] = ot.ranks['CHG'] + 1

# Drop the registry's mappings for SYRE, SYRJ and SYRN; SYRE is then mapped
# to und-Syre.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
 754 ot.add_language ('und-Syrj', 'SYRJ')
 755 ot.add_language ('und-Syrn', 'SYRN')
 756
 757 bcp_47.names['xst'] = u"Silt'e"
 758 bcp_47.scopes['xst'] = ' (retired code)'
 759 bcp_47.macrolanguages['xst'] = {'stv', 'wle'}
 760
 761 ot.add_language ('xwo', 'TOD')
 762
 763 ot.remove_language_ot ('ZHH')
 764 ot.remove_language_ot ('ZHP')
 765 ot.remove_language_ot ('ZHT')
 766 bcp_47.macrolanguages['zh'].remove ('lzh')
 767 bcp_47.macrolanguages['zh'].remove ('yue')
 768 ot.add_language ('zh-Hant-MO', 'ZHH')
 769 ot.add_language ('zh-Hant-HK', 'ZHH')
 770 ot.add_language ('zh-Hans', 'ZHS')
 771 ot.add_language ('zh-Hant', 'ZHT')
 772 ot.add_language ('zh-HK', 'ZHH')
 773 ot.add_language ('zh-MO', 'ZHH')
 774 ot.add_language ('zh-TW', 'ZHT')
 775 ot.add_language ('lzh', 'ZHT')
 776 ot.add_language ('lzh-Hans', 'ZHS')
 777 ot.add_language ('yue', 'ZHH')
 778 ot.add_language ('yue-Hans', 'ZHS')
 779
 780 bcp_47.macrolanguages['zom'] = {'yos'}
 781
 782 def rank_delta (bcp_47, ot):
 783         """Return a delta to apply to a BCP 47 tag's rank.
 784
 785         Most OpenType tags have a constant rank, but a few have ranks that
 786         depend on the BCP 47 tag.
 787
 788         Args:
 789                 bcp_47 (str): A BCP 47 tag.
 790                 ot (str): An OpenType tag to.
 791
 792         Returns:
 793                 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
 794                 OpenType equivalents.
 795         """
 796         if bcp_47 == 'ak' and ot == 'AKA':
 797                 return -1
 798         if bcp_47 == 'tw' and ot == 'TWI':
 799                 return -1
 800         return 0
 801
# Hand-picked BCP 47 tag for each ambiguous OpenType tag, i.e. one that
# corresponds to several BCP 47 tags. verify_disambiguation_dict checks these
# entries against the parsed data, removes the ones the fallback would pick
# anyway and adds the ones it can determine automatically.
disambiguation = {
        'ALT': 'alt',
        'ARK': 'rki',
        'BHI': 'bhb',
        'BLN': 'bjt',
        'BTI': 'beb',
        'CCHN': 'cco',
        'CMR': 'swb',
        'CPP': 'crp',
        'CRR': 'crx',
        'DUJ': 'dwu',
        'ECR': 'crj',
        'HAL': 'cfm',
        'HND': 'hnd',
        'KIS': 'kqs',
        'LRC': 'bqi',
        'NDB': 'nd',
        'NIS': 'njz',
        'PLG': 'pce',
        'PRO': 'pro',
        'QIN': 'bgr',
        'QUH': 'quh',
        'QVI': 'qvi',
        'QWH': 'qwh',
        'SIG': 'stv',
        'TNE': 'yrk',
        'ZHH': 'zh-HK',
        'ZHS': 'zh-Hans',
        'ZHT': 'zh-Hant',
}
 832
# Finalize the mappings before any output is produced.
# NOTE(review): inherit_from_macrolanguages runs both before and after
# remove_extra_macrolanguages; presumably the second pass picks up the
# pruned macrolanguage data -- confirm against the parser classes.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
ot.sort_languages ()

# Banner comment of the generated header: how it was produced and the header
# lines of the two input files.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
# Include guard, then the opening of the ot_languages array; its rows are
# printed by the loop that follows.
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')
 854
 855 def hb_tag (tag):
 856         """Convert a tag to ``HB_TAG`` form.
 857
 858         Args:
 859                 tag (str): An OpenType tag.
 860
 861         Returns:
 862                 A snippet of C++ representing ``tag``.
 863         """
 864         return u"HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
 865
 866 def get_variant_set (name):
 867         """Return a set of variant language names from a name.
 868
 869         Args:
 870                 name (str): A list of language names from the BCP 47 registry,
 871                         joined on ``'\\n'``.
 872
 873         Returns:
 874                 A set of normalized language names.
 875         """
 876         return set (unicodedata.normalize ('NFD', n.replace ('\u2019', u"'"))
 877                         .encode ('ASCII', 'ignore')
 878                         .strip ()
 879                         for n in re.split ('[\n(),]', name) if n)
 880
 881 def language_name_intersection (a, b):
 882         """Return the names in common between two language names.
 883
 884         Args:
 885                 a (str): A list of language names from the BCP 47 registry,
 886                         joined on ``'\\n'``.
 887                 b (str): A list of language names from the BCP 47 registry,
 888                         joined on ``'\\n'``.
 889
 890         Returns:
 891                 The normalized language names shared by ``a`` and ``b``.
 892         """
 893         return get_variant_set (a).intersection (get_variant_set (b))
 894
 895 def get_matching_language_name (intersection, candidates):
 896         return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c))))
 897
# Emit one ot_languages row per simple BCP 47 tag, with one line per OpenType
# tag, and track the largest number of OpenType tags any language has.
maximum_tags = 0
for language, tags in sorted (ot.from_bcp_47.items ()):
        # Empty tags and multi-subtag tags are not part of this table; the
        # multi-subtag ones are handled by the generated
        # hb_ot_tags_from_complex_language function.
        if language == '' or '-' in language:
                continue
        print ('  {\"%s\",\t{' % language, end='')
        maximum_tags = max (maximum_tags, len (tags))
        tag_count = len (tags)
        for i, tag in enumerate (tags, start=1):
                # Continuation lines are indented to line up under the first tag.
                if i > 1:
                        print ('\t\t ', end='')
                print (hb_tag (tag), end='')
                # The last tag closes both the tag list and the row.
                if i == tag_count:
                        print ('}}', end='')
                print (',\t/* ', end='')
                # Trailing comment: names the BCP 47 and OpenType sides have in
                # common decide how the row is described.
                bcp_47_name = bcp_47.names.get (language, '')
                bcp_47_name_candidates = bcp_47_name.split ('\n')
                intersection = language_name_intersection (bcp_47_name, ot.names[tag])
                scope = bcp_47.scopes.get (language, '')
                if not intersection:
                        # No shared name: show both sides as "BCP 47 -> OpenType".
                        write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
                else:
                        name = get_matching_language_name (intersection, bcp_47_name_candidates)
                        # Side effect: the registry name is narrowed to the matching
                        # candidate, which later lookups of bcp_47.names will see.
                        bcp_47.names[language] = name
                        # Show whichever of the two names is longer.
                        write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
                print (' */')

print ('};')
print ()
# The generated header asserts that HB_OT_MAX_TAGS_PER_LANGUAGE equals the
# maximum found above.
print ('static_assert (HB_OT_MAX_TAGS_PER_LANGUAGE == %iu, "");' % maximum_tags)
print ()
 928
 929 print ('/**')
 930 print (' * hb_ot_tags_from_complex_language:')
 931 print (' * @lang_str: a BCP 47 language tag to convert.')
 932 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
 933 print (' * conversion.')
 934 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
 935 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
 936 print (' * @tags: array of size at least @language_count to store the language tag')
 937 print (' * results')
 938 print (' *')
 939 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
 940 print (' *')
 941 print (' * Return value: Whether any language systems were retrieved.')
 942 print (' **/')
 943 print ('static bool')
 944 print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
 945 print ('\t\t\t\t  const char   *limit,')
 946 print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
 947 print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
 948 print ('{')
 949
 950 def print_subtag_matches (subtag, new_line):
 951         if subtag:
 952                 if new_line:
 953                         print ()
 954                         print ('\t&& ', end='')
 955                 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
 956
 957 complex_tags = collections.defaultdict (list)
 958 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
 959                         (LanguageTag (language), tags)
 960                         for language, tags in sorted (ot.from_bcp_47.items (),
 961                                 key=lambda i: (-len (i[0]), i[0]))
 962                 ] if lt_tags[0].is_complex ()),
 963                 key=lambda lt_tags: lt_tags[0].get_group ()):
 964         complex_tags[initial] += group
 965
 966 for initial, items in sorted (complex_tags.items ()):
 967         if initial != 'und':
 968                 continue
 969         for lt, tags in items:
 970                 if lt.variant in bcp_47.prefixes:
 971                         expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
 972                                         '%s is not a valid prefix of %s' % (lt.language, lt.variant))
 973                 print ('  if (', end='')
 974                 print_subtag_matches (lt.script, False)
 975                 print_subtag_matches (lt.region, False)
 976                 print_subtag_matches (lt.variant, False)
 977                 print (')')
 978                 print ('  {')
 979                 write ('    /* %s */' % bcp_47.get_name (lt))
 980                 print ()
 981                 if len (tags) == 1:
 982                         write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
 983                         print ()
 984                         print ('    *count = 1;')
 985                 else:
 986                         print ('    hb_tag_t possible_tags[] = {')
 987                         for tag in tags:
 988                                 write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
 989                                 print ()
 990                         print ('    };')
 991                         print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
 992                         print ('      tags[i] = possible_tags[i];')
 993                         print ('    *count = i;')
 994                 print ('    return true;')
 995                 print ('  }')
 996
# Emit the `switch` on the first character of the language string: one `case`
# per group of complex tags (the 'und' group is emitted separately), with one
# `if` block per tag inside it.
print ('  switch (lang_str[0])')
print ('  {')
for initial, items in sorted (complex_tags.items ()):
        if initial == 'und':
                continue
        # The group key is used as a character literal, and every comparison
        # below skips the first character of both the tag and lang_str.
        print ("  case '%s':" % initial)
        for lt, tags in items:
                print ('    if (', end='')
                if lt.grandfathered:
                        # Grandfathered tags are compared as whole strings.
                        print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
                else:
                        # Fold the script, and the region when a script is present,
                        # into the string literal. They are cleared on `lt` so that
                        # the print_subtag_matches calls below do not test them
                        # again; as a side effect the get_name comment further down
                        # no longer mentions them either.
                        string_literal = lt.language[1:] + '-'
                        if lt.script:
                                string_literal += lt.script
                                lt.script = None
                                if lt.region:
                                        string_literal += '-' + lt.region
                                        lt.region = None
                        if string_literal[-1] == '-':
                                # Language only: match it as a prefix, hyphen included.
                                print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
                        else:
                                print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
                # Remaining subtags become extra `&&` conditions on new lines.
                print_subtag_matches (lt.script, True)
                print_subtag_matches (lt.region, True)
                print_subtag_matches (lt.variant, True)
                print (')')
                print ('    {')
                write ('      /* %s */' % bcp_47.get_name (lt))
                print ()
                if len (tags) == 1:
                        write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
                        print ()
                        print ('      *count = 1;')
                else:
                        # Several OpenType tags: the generated code copies as many
                        # as fit within the caller's *count.
                        print ('      unsigned int i;')
                        print ('      hb_tag_t possible_tags[] = {')
                        for tag in tags:
                                write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
                                print ()
                        print ('      };')
                        print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
                        print ('\ttags[i] = possible_tags[i];')
                        print ('      *count = i;')
                print ('      return true;')
                print ('    }')
        print ('    break;')

# Close the switch and the function; anything unmatched returns false.
print ('  }')
print ('  return false;')
print ('}')
print ()
# Emit the documentation comment, signature and opening `switch` of the
# generated hb_ot_ambiguous_tag_to_language function. Its `case` labels are
# printed after the disambiguation table has been verified.
print ('/**')
print (' * hb_ot_ambiguous_tag_to_language')
print (' * @tag: A language tag.')
print (' *')
print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
print (' * many language tags) and the best tag is not the alphabetically first, or if')
print (' * the best tag consists of multiple subtags.')
print (' *')
print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
print (' **/')
print ('static hb_language_t')
print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
print ('{')
print ('  switch (tag)')
print ('  {')
1064
def verify_disambiguation_dict ():
        """Check ``disambiguation`` against the parsed data and normalize it.

        ``disambiguation`` maps ambiguous OpenType language system tags to
        the BCP 47 tag each one should convert back to. For every OpenType
        tag, the "primary" BCP 47 tags are those that are not grandfathered
        and whose first OpenType tag is the tag in question.

        * No primary tag, or exactly one: a manual entry is an error. A
          single primary tag that has several subtags is recorded
          automatically.
        * Several primary tags: the choice is automatic if exactly one of
          them is a macrolanguage, or else exactly one is a family, or else
          exactly one is not a retired code. Otherwise a valid manual entry
          is required. An existing manual entry is never overwritten.

        Entries that name the alphabetically first primary tag and have a
        single subtag are removed, because the fallback would return that
        tag anyway. Finally, every remaining key must be a known OpenType
        tag.

        The ``disambiguation`` dictionary is modified in place.

        Raises:
                AssertionError: Verification failed.
        """
        # Tried in order; the first one that singles out exactly one primary
        # tag wins. If none does, the last selection is kept for the error
        # message.
        selectors = (
                lambda t: bcp_47.scopes.get (t) == ' [macrolanguage]',
                lambda t: bcp_47.scopes.get (t) == ' [family]',
                lambda t: 'retired code' not in bcp_47.scopes.get (t, ''),
        )
        for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
                primary_tags = [t for t in bcp_47_tags
                                if t not in bcp_47.grandfathered
                                and ot.from_bcp_47.get (t)[0] == ot_tag]
                if not primary_tags:
                        expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
                        continue
                if len (primary_tags) == 1:
                        expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
                        if '-' in primary_tags[0]:
                                disambiguation[ot_tag] = primary_tags[0]
                        continue
                for selector in selectors:
                        macrolanguages = [t for t in primary_tags if selector (t)]
                        if len (macrolanguages) == 1:
                                break
                if len (macrolanguages) != 1:
                        expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
                        expect (disambiguation[ot_tag] in bcp_47_tags,
                                        '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
                elif ot_tag not in disambiguation:
                        disambiguation[ot_tag] = macrolanguages[0]
                chosen = disambiguation[ot_tag]
                # The fallback already returns the alphabetically first tag.
                if '-' not in chosen and chosen == sorted (primary_tags)[0]:
                        del disambiguation[ot_tag]
        for ot_tag in disambiguation:
                expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1109
# Verify and normalize the disambiguation table, then emit one `case` per
# remaining entry, in sorted order of OpenType tag. Each case returns the
# chosen BCP 47 tag, with the names of both tags as trailing comments.
verify_disambiguation_dict ()
for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
        write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
        print ()
        write ('    return hb_language_from_string (\"%s\", -1);  /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
        print ()

# Tags without an entry fall through to the default case.
print ('  default:')
print ('    return HB_LANGUAGE_INVALID;')
print ('  }')
print ('}')

# Close the include guard and the generated section.
print ()
print ('#endif /* HB_OT_TAG_TABLE_HH */')
print ()
print ('/* == End of generated table == */')
1126