src/gen-tag-table.py

   1 #!/usr/bin/python
   2
   3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
   4 versa.
   5
   6 It creates a ``const LangTag[]``, matching the tags from the OpenType
   7 languages system tag list to the language subtags of the BCP 47 language
   8 subtag registry, with some manual adjustments. The mappings are
   9 supplemented with macrolanguages' sublanguages and retired codes'
  10 replacements, according to BCP 47 and some manual additions where BCP 47
  11 omits a retired code entirely.
  12
  13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
  14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
  15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
  16 multiple BCP 47 tags) are listed here, except when the alphabetically
  17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
  18 case, the fallback behavior will choose the right tag anyway.
  19 """
  20
  21 from __future__ import absolute_import, division, print_function, unicode_literals
  22
  23 import collections
# Import ``HTMLParser`` from wherever this interpreter provides it and
# define ``write``, which emits text to standard output as UTF-8.
try:
        # Location of the module on Python 2.
        from HTMLParser import HTMLParser
        def write (s):
                # Encode explicitly and suppress the trailing newline that
                # ``print`` would otherwise add.
                print (s.encode ('utf-8'), end='')
except ImportError:
        # The ``HTMLParser`` module is unavailable; use ``html.parser``.
        from html.parser import HTMLParser
        def write (s):
                # Flush pending text output first, then write the encoded
                # bytes straight to the underlying binary buffer. ``sys`` is
                # bound by the module-level ``import sys`` below, which must
                # have run before this function is called.
                sys.stdout.flush ()
                sys.stdout.buffer.write (s.encode ('utf-8'))
  33 import io
  34 import itertools
  35 import re
  36 import sys
  37 import unicodedata
  38
  39 if len (sys.argv) != 3:
  40         print ('usage: ./gen-tag-table.py languagetags language-subtag-registry', file=sys.stderr)
  41         sys.exit (1)
  42
  43 try:
  44         from html import unescape
  45         def html_unescape (parser, entity):
  46                 return unescape (entity)
  47 except ImportError:
  48         def html_unescape (parser, entity):
  49                 return parser.unescape (entity)
  50
  51 def expect (condition, message=None):
  52         if not condition:
  53                 if message is None:
  54                         raise AssertionError
  55                 raise AssertionError (message)
  56
# Map of three-letter language codes to their two-letter equivalents
# (ISO 639-3 to ISO 639-1, going by the name and the source table).
# ``OpenTypeRegistryParser.handle_endtag`` looks registry codes up here and
# keeps any code that has no entry unchanged.
# from http://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
        'aar': 'aa',
        'abk': 'ab',
        'afr': 'af',
        'aka': 'ak',
        'amh': 'am',
        'ara': 'ar',
        'arg': 'an',
        'asm': 'as',
        'ava': 'av',
        'ave': 'ae',
        'aym': 'ay',
        'aze': 'az',
        'bak': 'ba',
        'bam': 'bm',
        'bel': 'be',
        'ben': 'bn',
        'bis': 'bi',
        'bod': 'bo',
        'bos': 'bs',
        'bre': 'br',
        'bul': 'bg',
        'cat': 'ca',
        'ces': 'cs',
        'cha': 'ch',
        'che': 'ce',
        'chu': 'cu',
        'chv': 'cv',
        'cor': 'kw',
        'cos': 'co',
        'cre': 'cr',
        'cym': 'cy',
        'dan': 'da',
        'deu': 'de',
        'div': 'dv',
        'dzo': 'dz',
        'ell': 'el',
        'eng': 'en',
        'epo': 'eo',
        'est': 'et',
        'eus': 'eu',
        'ewe': 'ee',
        'fao': 'fo',
        'fas': 'fa',
        'fij': 'fj',
        'fin': 'fi',
        'fra': 'fr',
        'fry': 'fy',
        'ful': 'ff',
        'gla': 'gd',
        'gle': 'ga',
        'glg': 'gl',
        'glv': 'gv',
        'grn': 'gn',
        'guj': 'gu',
        'hat': 'ht',
        'hau': 'ha',
        'hbs': 'sh',
        'heb': 'he',
        'her': 'hz',
        'hin': 'hi',
        'hmo': 'ho',
        'hrv': 'hr',
        'hun': 'hu',
        'hye': 'hy',
        'ibo': 'ig',
        'ido': 'io',
        'iii': 'ii',
        'iku': 'iu',
        'ile': 'ie',
        'ina': 'ia',
        'ind': 'id',
        'ipk': 'ik',
        'isl': 'is',
        'ita': 'it',
        'jav': 'jv',
        'jpn': 'ja',
        'kal': 'kl',
        'kan': 'kn',
        'kas': 'ks',
        'kat': 'ka',
        'kau': 'kr',
        'kaz': 'kk',
        'khm': 'km',
        'kik': 'ki',
        'kin': 'rw',
        'kir': 'ky',
        'kom': 'kv',
        'kon': 'kg',
        'kor': 'ko',
        'kua': 'kj',
        'kur': 'ku',
        'lao': 'lo',
        'lat': 'la',
        'lav': 'lv',
        'lim': 'li',
        'lin': 'ln',
        'lit': 'lt',
        'ltz': 'lb',
        'lub': 'lu',
        'lug': 'lg',
        'mah': 'mh',
        'mal': 'ml',
        'mar': 'mr',
        'mkd': 'mk',
        'mlg': 'mg',
        'mlt': 'mt',
        'mol': 'mo',
        'mon': 'mn',
        'mri': 'mi',
        'msa': 'ms',
        'mya': 'my',
        'nau': 'na',
        'nav': 'nv',
        'nbl': 'nr',
        'nde': 'nd',
        'ndo': 'ng',
        'nep': 'ne',
        'nld': 'nl',
        'nno': 'nn',
        'nob': 'nb',
        'nor': 'no',
        'nya': 'ny',
        'oci': 'oc',
        'oji': 'oj',
        'ori': 'or',
        'orm': 'om',
        'oss': 'os',
        'pan': 'pa',
        'pli': 'pi',
        'pol': 'pl',
        'por': 'pt',
        'pus': 'ps',
        'que': 'qu',
        'roh': 'rm',
        'ron': 'ro',
        'run': 'rn',
        'rus': 'ru',
        'sag': 'sg',
        'san': 'sa',
        'sin': 'si',
        'slk': 'sk',
        'slv': 'sl',
        'sme': 'se',
        'smo': 'sm',
        'sna': 'sn',
        'snd': 'sd',
        'som': 'so',
        'sot': 'st',
        'spa': 'es',
        'sqi': 'sq',
        'srd': 'sc',
        'srp': 'sr',
        'ssw': 'ss',
        'sun': 'su',
        'swa': 'sw',
        'swe': 'sv',
        'tah': 'ty',
        'tam': 'ta',
        'tat': 'tt',
        'tel': 'te',
        'tgk': 'tg',
        'tgl': 'tl',
        'tha': 'th',
        'tir': 'ti',
        'ton': 'to',
        'tsn': 'tn',
        'tso': 'ts',
        'tuk': 'tk',
        'tur': 'tr',
        'twi': 'tw',
        'uig': 'ug',
        'ukr': 'uk',
        'urd': 'ur',
        'uzb': 'uz',
        'ven': 've',
        'vie': 'vi',
        'vol': 'vo',
        'wln': 'wa',
        'wol': 'wo',
        'xho': 'xh',
        'yid': 'yi',
        'yor': 'yo',
        'zha': 'za',
        'zho': 'zh',
        'zul': 'zu',
}
 245
 246 class LanguageTag (object):
 247         """A BCP 47 language tag.
 248
 249         Attributes:
 250                 subtags (List[str]): The list of subtags in this tag.
 251                 grandfathered (bool): Whether this tag is grandfathered. If
 252                         ``true``, the entire lowercased tag is the ``language``
 253                         and the other subtag fields are empty.
 254                 language (str): The language subtag.
 255                 script (str): The script subtag.
 256                 region (str): The region subtag.
 257                 variant (str): The variant subtag.
 258
 259         Args:
 260                 tag (str): A BCP 47 language tag.
 261
 262         """
 263         def __init__ (self, tag):
 264                 global bcp_47
 265                 self.subtags = tag.lower ().split ('-')
 266                 self.grandfathered = tag.lower () in bcp_47.grandfathered
 267                 if self.grandfathered:
 268                         self.language = tag.lower ()
 269                         self.script = ''
 270                         self.region = ''
 271                         self.variant = ''
 272                 else:
 273                         self.language = self.subtags[0]
 274                         self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
 275                         self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
 276                         self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
 277
 278         def __str__(self):
 279                 return '-'.join(self.subtags)
 280
 281         def __repr__ (self):
 282                 return 'LanguageTag(%r)' % str(self)
 283
 284         @staticmethod
 285         def _find_first (function, sequence):
 286                 try:
 287                         return next (iter (filter (function, sequence)))
 288                 except StopIteration:
 289                         return None
 290
 291         def is_complex (self):
 292                 """Return whether this tag is too complex to represent as a
 293                 ``LangTag`` in the generated code.
 294
 295                 Complex tags need to be handled in
 296                 ``hb_ot_tags_from_complex_language``.
 297
 298                 Returns:
 299                         Whether this tag is complex.
 300                 """
 301                 return not (len (self.subtags) == 1
 302                         or self.grandfathered
 303                         and len (self.subtags[1]) != 3
 304                         and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
 305
 306         def get_group (self):
 307                 """Return the group into which this tag should be categorized in
 308                 ``hb_ot_tags_from_complex_language``.
 309
 310                 The group is the first letter of the tag, or ``'und'`` if this tag
 311                 should not be matched in a ``switch`` statement in the generated
 312                 code.
 313
 314                 Returns:
 315                         This tag's group.
 316                 """
 317                 return ('und'
 318                         if (self.language == 'und'
 319                                 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
 320                         else self.language[0])
 321
 322 class OpenTypeRegistryParser (HTMLParser):
 323         """A parser for the OpenType language system tag registry.
 324
 325         Attributes:
 326                 header (str): The "last updated" line of the registry.
 327                 names (Mapping[str, str]): A map of language system tags to the
 328                         names they are given in the registry.
 329                 ranks (DefaultDict[str, int]): A map of language system tags to
 330                         numbers. If a single BCP 47 tag corresponds to multiple
 331                         OpenType tags, the tags are ordered in increasing order by
 332                         rank. The rank is based on the number of BCP 47 tags
 333                         associated with a tag, though it may be manually modified.
 334                 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
 335                         OpenType language system tags to sets of BCP 47 tags.
 336                 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
 337                         inverted. Its values start as unsorted sets;
 338                         ``sort_languages`` converts them to sorted lists.
 339
 340         """
 341         def __init__ (self):
 342                 HTMLParser.__init__ (self)
 343                 self.header = ''
 344                 self.names = {}
 345                 self.ranks = collections.defaultdict (int)
 346                 self.to_bcp_47 = collections.defaultdict (set)
 347                 self.from_bcp_47 = collections.defaultdict (set)
 348                 # Whether the parser is in a <td> element
 349                 self._td = False
 350                 # The text of the <td> elements of the current <tr> element.
 351                 self._current_tr = []
 352
 353         def handle_starttag (self, tag, attrs):
 354                 if tag == 'meta':
 355                         for attr, value in attrs:
 356                                 if attr == 'name' and value == 'updated_at':
 357                                         self.header = self.get_starttag_text ()
 358                                         break
 359                 elif tag == 'td':
 360                         self._td = True
 361                         self._current_tr.append ('')
 362                 elif tag == 'tr':
 363                         self._current_tr = []
 364
 365         def handle_endtag (self, tag):
 366                 if tag == 'td':
 367                         self._td = False
 368                 elif tag == 'tr' and self._current_tr:
 369                         expect (2 <= len (self._current_tr) <= 3)
 370                         name = self._current_tr[0].strip ()
 371                         tag = self._current_tr[1].strip ("\t\n\v\f\r '")
 372                         rank = 0
 373                         if len (tag) > 4:
 374                                 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
 375                                 name += ' (deprecated)'
 376                                 tag = tag.split (' ')[0]
 377                                 rank = 1
 378                         self.names[tag] = re.sub (' languages$', '', name)
 379                         if not self._current_tr[2]:
 380                                 return
 381                         iso_codes = self._current_tr[2].strip ()
 382                         self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
 383                         rank += 2 * len (self.to_bcp_47[tag])
 384                         self.ranks[tag] = rank
 385
 386         def handle_data (self, data):
 387                 if self._td:
 388                         self._current_tr[-1] += data
 389
 390         def handle_charref (self, name):
 391                 self.handle_data (html_unescape (self, '&#%s;' % name))
 392
 393         def handle_entityref (self, name):
 394                 self.handle_data (html_unescape (self, '&%s;' % name))
 395
 396         def parse (self, filename):
 397                 """Parse the OpenType language system tag registry.
 398
 399                 Args:
 400                         filename (str): The file name of the registry.
 401                 """
 402                 with io.open (filename, encoding='utf-8') as f:
 403                         self.feed (f.read ())
 404                 expect (self.header)
 405                 for tag, iso_codes in self.to_bcp_47.items ():
 406                         for iso_code in iso_codes:
 407                                 self.from_bcp_47[iso_code].add (tag)
 408
 409         def add_language (self, bcp_47_tag, ot_tag):
 410                 """Add a language as if it were in the registry.
 411
 412                 Args:
 413                         bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
 414                                 a language subtag, and if the language subtag is a
 415                                 macrolanguage, then new languages are added corresponding
 416                                 to the macrolanguages' individual languages with the
 417                                 remainder of the tag appended.
 418                         ot_tag (str): An OpenType language system tag.
 419                 """
 420                 global bcp_47
 421                 self.to_bcp_47[ot_tag].add (bcp_47_tag)
 422                 self.from_bcp_47[bcp_47_tag].add (ot_tag)
 423                 if bcp_47_tag.lower () not in bcp_47.grandfathered:
 424                         try:
 425                                 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
 426                                 if macrolanguage in bcp_47.macrolanguages:
 427                                         s = set ()
 428                                         for language in bcp_47.macrolanguages[macrolanguage]:
 429                                                 if language.lower () not in bcp_47.grandfathered:
 430                                                         s.add ('%s-%s' % (language, suffix))
 431                                         bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
 432                         except ValueError:
 433                                 pass
 434
 435         @staticmethod
 436         def _remove_language (tag_1, dict_1, dict_2):
 437                 for tag_2 in dict_1.pop (tag_1):
 438                         dict_2[tag_2].remove (tag_1)
 439                         if not dict_2[tag_2]:
 440                                 del dict_2[tag_2]
 441
 442         def remove_language_ot (self, ot_tag):
 443                 """Remove an OpenType tag from the registry.
 444
 445                 Args:
 446                         ot_tag (str): An OpenType tag.
 447                 """
 448                 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
 449
 450         def remove_language_bcp_47 (self, bcp_47_tag):
 451                 """Remove a BCP 47 tag from the registry.
 452
 453                 Args:
 454                         bcp_47_tag (str): A BCP 47 tag.
 455                 """
 456                 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
 457
 458         def inherit_from_macrolanguages (self):
 459                 """Copy mappings from macrolanguages to individual languages.
 460
 461                 If a BCP 47 tag for an individual mapping has no OpenType
 462                 mapping but its macrolanguage does, the mapping is copied to
 463                 the individual language. For example, als (Tosk Albanian) has no
 464                 explicit mapping, so it inherits from sq (Albanian) the mapping
 465                 to SQI.
 466
 467                 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
 468                 all of its individual languages do and they all map to the same
 469                 tags, the mapping is copied to the macrolanguage.
 470                 """
 471                 global bcp_47
 472                 original_ot_from_bcp_47 = dict (self.from_bcp_47)
 473                 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
 474                         ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
 475                         if ot_macrolanguages:
 476                                 for ot_macrolanguage in ot_macrolanguages:
 477                                         for language in languages:
 478                                                 # Remove the following condition if e.g. nn should map to NYN,NOR
 479                                                 # instead of just NYN.
 480                                                 if language not in original_ot_from_bcp_47:
 481                                                         self.add_language (language, ot_macrolanguage)
 482                                                         self.ranks[ot_macrolanguage] += 1
 483                         else:
 484                                 for language in languages:
 485                                         if language in original_ot_from_bcp_47:
 486                                                 if ot_macrolanguages:
 487                                                         ml = original_ot_from_bcp_47[language]
 488                                                         if ml:
 489                                                                 ot_macrolanguages &= ml
 490                                                         else:
 491                                                                 pass
 492                                                 else:
 493                                                         ot_macrolanguages |= original_ot_from_bcp_47[language]
 494                                         else:
 495                                                 ot_macrolanguages.clear ()
 496                                         if not ot_macrolanguages:
 497                                                 break
 498                                 for ot_macrolanguage in ot_macrolanguages:
 499                                         self.add_language (macrolanguage, ot_macrolanguage)
 500
 501         def sort_languages (self):
 502                 """Sort the values of ``from_bcp_47`` in ascending rank order."""
 503                 for language, tags in self.from_bcp_47.items ():
 504                         self.from_bcp_47[language] = sorted (tags,
 505                                         key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
 506
# The shared OpenType registry parser. ``LanguageTag.is_complex`` and
# ``BCP47Parser._add_macrolanguage`` read it as a module-level global.
ot = OpenTypeRegistryParser ()
 508
 509 class BCP47Parser (object):
 510         """A parser for the BCP 47 subtag registry.
 511
 512         Attributes:
 513                 header (str): The "File-Date" line of the registry.
 514                 names (Mapping[str, str]): A map of subtags to the names they
 515                         are given in the registry. Each value is a
 516                         ``'\\n'``-separated list of names.
 517                 scopes (Mapping[str, str]): A map of language subtags to strings
 518                         suffixed to language names, including suffixes to explain
 519                         language scopes.
 520                 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
 521                         language subtags to the sets of language subtags which
 522                         inherit from them. See
 523                         ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
 524                 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
 525                         subtags to their prefixes.
 526                 grandfathered (AbstractSet[str]): The set of grandfathered tags,
 527                         normalized to lowercase.
 528
 529         """
 530         def __init__ (self):
 531                 self.header = ''
 532                 self.names = {}
 533                 self.scopes = {}
 534                 self.macrolanguages = collections.defaultdict (set)
 535                 self.prefixes = collections.defaultdict (set)
 536                 self.grandfathered = set ()
 537
 538         def parse (self, filename):
 539                 """Parse the BCP 47 subtag registry.
 540
 541                 Args:
 542                         filename (str): The file name of the registry.
 543                 """
 544                 with io.open (filename, encoding='utf-8') as f:
 545                         subtag_type = None
 546                         subtag = None
 547                         deprecated = False
 548                         has_preferred_value = False
 549                         line_buffer = ''
 550                         for line in itertools.chain (f, ['']):
 551                                 line = line.rstrip ()
 552                                 if line.startswith (' '):
 553                                         line_buffer += line[1:]
 554                                         continue
 555                                 line, line_buffer = line_buffer, line
 556                                 if line.startswith ('Type: '):
 557                                         subtag_type = line.split (' ')[1]
 558                                         deprecated = False
 559                                         has_preferred_value = False
 560                                 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
 561                                         subtag = line.split (' ')[1]
 562                                         if subtag_type == 'grandfathered':
 563                                                 self.grandfathered.add (subtag.lower ())
 564                                 elif line.startswith ('Description: '):
 565                                         description = line.split (' ', 1)[1].replace (' (individual language)', '')
 566                                         description = re.sub (' (\((individual |macro)language\)|languages)$', '',
 567                                                         description)
 568                                         if subtag in self.names:
 569                                                 self.names[subtag] += '\n' + description
 570                                         else:
 571                                                 self.names[subtag] = description
 572                                 elif subtag_type == 'language' or subtag_type == 'grandfathered':
 573                                         if line.startswith ('Scope: '):
 574                                                 scope = line.split (' ')[1]
 575                                                 if scope == 'macrolanguage':
 576                                                         scope = ' [macrolanguage]'
 577                                                 elif scope == 'collection':
 578                                                         scope = ' [family]'
 579                                                 else:
 580                                                         continue
 581                                                 self.scopes[subtag] = scope
 582                                         elif line.startswith ('Deprecated: '):
 583                                                 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
 584                                                 deprecated = True
 585                                         elif deprecated and line.startswith ('Comments: see '):
 586                                                 # If a subtag is split into multiple replacement subtags,
 587                                                 # it essentially represents a macrolanguage.
 588                                                 for language in line.replace (',', '').split (' ')[2:]:
 589                                                         self._add_macrolanguage (subtag, language)
 590                                         elif line.startswith ('Preferred-Value: '):
 591                                                 # If a subtag is deprecated in favor of a single replacement subtag,
 592                                                 # it is either a dialect or synonym of the preferred subtag. Either
 593                                                 # way, it is close enough to the truth to consider the replacement
 594                                                 # the macrolanguage of the deprecated language.
 595                                                 has_preferred_value = True
 596                                                 macrolanguage = line.split (' ')[1]
 597                                                 self._add_macrolanguage (macrolanguage, subtag)
 598                                         elif not has_preferred_value and line.startswith ('Macrolanguage: '):
 599                                                 self._add_macrolanguage (line.split (' ')[1], subtag)
 600                                 elif subtag_type == 'variant':
 601                                         if line.startswith ('Prefix: '):
 602                                                 self.prefixes[subtag].add (line.split (' ')[1])
 603                                 elif line.startswith ('File-Date: '):
 604                                         self.header = line
 605                 expect (self.header)
 606
 607         def _add_macrolanguage (self, macrolanguage, language):
 608                 global ot
 609                 if language not in ot.from_bcp_47:
 610                         for l in self.macrolanguages.get (language, set ()):
 611                                 self._add_macrolanguage (macrolanguage, l)
 612                 if macrolanguage not in ot.from_bcp_47:
 613                         for ls in list (self.macrolanguages.values ()):
 614                                 if macrolanguage in ls:
 615                                         ls.add (language)
 616                                         return
 617                 self.macrolanguages[macrolanguage].add (language)
 618
 619         def remove_extra_macrolanguages (self):
 620                 """Make every language have at most one macrolanguage."""
 621                 inverted = collections.defaultdict (list)
 622                 for macrolanguage, languages in self.macrolanguages.items ():
 623                         for language in languages:
 624                                 inverted[language].append (macrolanguage)
 625                 for language, macrolanguages in inverted.items ():
 626                         if len (macrolanguages) > 1:
 627                                 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
 628                                 biggest_macrolanguage = macrolanguages.pop ()
 629                                 for macrolanguage in macrolanguages:
 630                                         self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
 631
 632         def get_name (self, lt):
 633                 """Return the names of the subtags in a language tag.
 634
 635                 Args:
 636                         lt (LanguageTag): A BCP 47 language tag.
 637
 638                 Returns:
 639                         The name form of ``lt``.
 640                 """
 641                 name = self.names[lt.language].split ('\n')[0]
 642                 if lt.script:
 643                         name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
 644                 if lt.region:
 645                         name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
 646                 if lt.variant:
 647                         name += '; ' + self.names[lt.variant].split ('\n')[0]
 648                 return name
 649
# Parse the two input files named on the command line: the OpenType
# language system tag list first, then the BCP 47 subtag registry.
bcp_47 = BCP47Parser ()

ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Everything below is a manual adjustment applied on top of the parsed
# data. The statements are order-dependent: several of them read a value
# that an earlier statement has just changed, so do not reorder them.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

# Give BAL a rank one greater than KAR's.
ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace what was parsed for PGR with a single mapping to el-polyton.
# NOTE(review): presumably remove_language_ot drops every BCP 47 mapping
# of the given OpenType tag -- confirm against its definition.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

# flm is given a name, a "retired code" scope, and cfm as its
# sublanguage by hand.
bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

# Give FNE a rank one greater than TNE's.
ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

# Replace what was parsed for IRT with a single mapping to ga-Latg.
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

# Replace what was parsed for KGE with a single mapping to und-Geok.
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

# GUK is added by hand, named, and ranked one greater than GMZ.
ot.add_language ('guk', 'GUK')
ot.names['GUK'] = 'Gumuz (SIL fonts)'
ot.ranks['GUK'] = ot.ranks['GMZ'] + 1

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# kht maps to both KHN and KHT. Both names are derived from the original
# KHT name, so KHN must be named before KHT's name is extended. KHN takes
# over KHT's rank and KHT's rank is then increased by one.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
ot.ranks['KHN'] = ot.ranks['KHT']
ot.ranks['KHT'] += 1

# Give LCR a rank one greater than MCR's.
ot.ranks['LCR'] = ot.ranks['MCR'] + 1

# Rename MAL and increase MLR's rank by one.
ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

# mhv is given a name and a "retired code" scope by hand.
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Hand-written mappings from qu and the q?? codes to the OpenType tags
# QUZ, QWH, QVI and QUH.
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Move mo from under ro to under ro-MD.
bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

# SGW is added by hand; its name and rank are derived from CHG's.
ot.add_language ('sgw', 'SGW')
ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
ot.ranks['SGW'] = ot.ranks['CHG'] + 1

# Replace what was parsed for SYRE, SYRJ and SYRN with mappings to the
# corresponding und-<script> tags.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

# xst is given a name, a "retired code" scope, and stv and wle as its
# sublanguages by hand.
bcp_47.names['xst'] = u"Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Chinese: discard what was parsed for ZHH, ZHP and ZHT, take lzh and yue
# out from under zh, and then list the mappings to ZHH, ZHS and ZHT
# explicitly. lzh and yue get their own explicit mappings below.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
 781
 782 def rank_delta (bcp_47, ot):
 783         """Return a delta to apply to a BCP 47 tag's rank.
 784
 785         Most OpenType tags have a constant rank, but a few have ranks that
 786         depend on the BCP 47 tag.
 787
 788         Args:
 789                 bcp_47 (str): A BCP 47 tag.
 790                 ot (str): An OpenType tag to.
 791
 792         Returns:
 793                 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
 794                 OpenType equivalents.
 795         """
 796         if bcp_47 == 'ak' and ot == 'AKA':
 797                 return -1
 798         if bcp_47 == 'tw' and ot == 'TWI':
 799                 return -1
 800         return 0
 801
# Hand-picked BCP 47 tag for each OpenType tag listed here. The keys are
# OpenType language system tags and the values are BCP 47 tags.
# verify_disambiguation_dict () checks these entries against the parsed
# data and also adds and removes entries, so this must stay a mutable
# dict.
disambiguation = {
        'ALT': 'alt',
        'ARK': 'rki',
        'BHI': 'bhb',
        'BLN': 'bjt',
        'BTI': 'beb',
        'CCHN': 'cco',
        'CMR': 'swb',
        'CPP': 'crp',
        'CRR': 'crx',
        'DUJ': 'dwu',
        'ECR': 'crj',
        'HAL': 'cfm',
        'HND': 'hnd',
        'KIS': 'kqs',
        'LRC': 'bqi',
        'NDB': 'nd',
        'NIS': 'njz',
        'PLG': 'pce',
        'PRO': 'pro',
        'QIN': 'bgr',
        'QUH': 'quh',
        'QVI': 'qvi',
        'QWH': 'qwh',
        'SIG': 'stv',
        'TNE': 'yrk',
        'ZHH': 'zh-HK',
        'ZHS': 'zh-Hans',
        'ZHT': 'zh-Hant',
}
 832
 833 ot.inherit_from_macrolanguages ()
 834 bcp_47.remove_extra_macrolanguages ()
 835 ot.inherit_from_macrolanguages ()
 836 ot.sort_languages ()
 837
 838 print ('/* == Start of generated table == */')
 839 print ('/*')
 840 print (' * The following table is generated by running:')
 841 print (' *')
 842 print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
 843 print (' *')
 844 print (' * on files with these headers:')
 845 print (' *')
 846 print (' * %s' % ot.header.strip ())
 847 print (' * %s' % bcp_47.header)
 848 print (' */')
 849 print ()
 850 print ('#ifndef HB_OT_TAG_TABLE_HH')
 851 print ('#define HB_OT_TAG_TABLE_HH')
 852 print ()
 853 print ('static const LangTag ot_languages[] = {')
 854
 855 def hb_tag (tag):
 856         """Convert a tag to ``HB_TAG`` form.
 857
 858         Args:
 859                 tag (str): An OpenType tag.
 860
 861         Returns:
 862                 A snippet of C++ representing ``tag``.
 863         """
 864         return u"HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
 865
 866 def get_variant_set (name):
 867         """Return a set of variant language names from a name.
 868
 869         Args:
 870                 name (str): A list of language names from the BCP 47 registry,
 871                         joined on ``'\\n'``.
 872
 873         Returns:
 874                 A set of normalized language names.
 875         """
 876         return set (unicodedata.normalize ('NFD', n.replace ('\u2019', u"'"))
 877                         .encode ('ASCII', 'ignore')
 878                         .strip ()
 879                         for n in re.split ('[\n(),]', name) if n)
 880
 881 def language_name_intersection (a, b):
 882         """Return the names in common between two language names.
 883
 884         Args:
 885                 a (str): A list of language names from the BCP 47 registry,
 886                         joined on ``'\\n'``.
 887                 b (str): A list of language names from the BCP 47 registry,
 888                         joined on ``'\\n'``.
 889
 890         Returns:
 891                 The normalized language names shared by ``a`` and ``b``.
 892         """
 893         return get_variant_set (a).intersection (get_variant_set (b))
 894
 895 def get_matching_language_name (intersection, candidates):
 896         return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c))))
 897
 898 def same_tag (bcp_47_tag, ot_tags):
 899         return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
 900
# Emit one ``ot_languages`` entry per (BCP 47 language, OpenType tag)
# pair. Only single-subtag languages are listed here; the empty tag and
# tags containing '-' are skipped.
for language, tags in sorted (ot.from_bcp_47.items ()):
        if language == '' or '-' in language:
                continue
        # An entry whose OpenType tag is just the upper-cased language tag
        # is still written, but inside a C comment.
        commented_out = same_tag (language, tags)
        # NOTE(review): the index ``i`` is never used in this loop.
        for i, tag in enumerate (tags, start=1):
                print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else '  ', language, hb_tag (tag)), end='')
                if commented_out:
                        print ('*/', end='')
                print ('\t/* ', end='')
                # The names are re-read for every tag because the ``else``
                # branch below overwrites ``bcp_47.names[language]``.
                bcp_47_name = bcp_47.names.get (language, '')
                bcp_47_name_candidates = bcp_47_name.split ('\n')
                intersection = language_name_intersection (bcp_47_name, ot.names[tag])
                scope = bcp_47.scopes.get (language, '')
                if not intersection:
                        # No name in common: show both names.
                        write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
                else:
                        # A name in common: keep only the matching BCP 47
                        # name from here on, and show the longer of it and
                        # the OpenType name.
                        name = get_matching_language_name (intersection, bcp_47_name_candidates)
                        bcp_47.names[language] = name
                        write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
                print (' */')

# Close the ``ot_languages`` initializer.
print ('};')
print ()
 924
 925 print ('/**')
 926 print (' * hb_ot_tags_from_complex_language:')
 927 print (' * @lang_str: a BCP 47 language tag to convert.')
 928 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
 929 print (' * conversion.')
 930 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
 931 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
 932 print (' * @tags: array of size at least @language_count to store the language tag')
 933 print (' * results')
 934 print (' *')
 935 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
 936 print (' *')
 937 print (' * Return value: Whether any language systems were retrieved.')
 938 print (' **/')
 939 print ('static bool')
 940 print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
 941 print ('\t\t\t\t  const char   *limit,')
 942 print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
 943 print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
 944 print ('{')
 945
 946 def print_subtag_matches (subtag, new_line):
 947         if subtag:
 948                 if new_line:
 949                         print ()
 950                         print ('\t&& ', end='')
 951                 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
 952
 953 complex_tags = collections.defaultdict (list)
 954 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
 955                         (LanguageTag (language), tags)
 956                         for language, tags in sorted (ot.from_bcp_47.items (),
 957                                 key=lambda i: (-len (i[0]), i[0]))
 958                 ] if lt_tags[0].is_complex ()),
 959                 key=lambda lt_tags: lt_tags[0].get_group ()):
 960         complex_tags[initial] += group
 961
 962 for initial, items in sorted (complex_tags.items ()):
 963         if initial != 'und':
 964                 continue
 965         for lt, tags in items:
 966                 if lt.variant in bcp_47.prefixes:
 967                         expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
 968                                         '%s is not a valid prefix of %s' % (lt.language, lt.variant))
 969                 print ('  if (', end='')
 970                 print_subtag_matches (lt.script, False)
 971                 print_subtag_matches (lt.region, False)
 972                 print_subtag_matches (lt.variant, False)
 973                 print (')')
 974                 print ('  {')
 975                 write ('    /* %s */' % bcp_47.get_name (lt))
 976                 print ()
 977                 if len (tags) == 1:
 978                         write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
 979                         print ()
 980                         print ('    *count = 1;')
 981                 else:
 982                         print ('    hb_tag_t possible_tags[] = {')
 983                         for tag in tags:
 984                                 write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
 985                                 print ()
 986                         print ('    };')
 987                         print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
 988                         print ('      tags[i] = possible_tags[i];')
 989                         print ('    *count = i;')
 990                 print ('    return true;')
 991                 print ('  }')
 992
 993 print ('  switch (lang_str[0])')
 994 print ('  {')
 995 for initial, items in sorted (complex_tags.items ()):
 996         if initial == 'und':
 997                 continue
 998         print ("  case '%s':" % initial)
 999         for lt, tags in items:
1000                 print ('    if (', end='')
1001                 if lt.grandfathered:
1002                         print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
1003                 else:
1004                         string_literal = lt.language[1:] + '-'
1005                         if lt.script:
1006                                 string_literal += lt.script
1007                                 lt.script = None
1008                                 if lt.region:
1009                                         string_literal += '-' + lt.region
1010                                         lt.region = None
1011                         if string_literal[-1] == '-':
1012                                 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
1013                         else:
1014                                 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
1015                 print_subtag_matches (lt.script, True)
1016                 print_subtag_matches (lt.region, True)
1017                 print_subtag_matches (lt.variant, True)
1018                 print (')')
1019                 print ('    {')
1020                 write ('      /* %s */' % bcp_47.get_name (lt))
1021                 print ()
1022                 if len (tags) == 1:
1023                         write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1024                         print ()
1025                         print ('      *count = 1;')
1026                 else:
1027                         print ('      unsigned int i;')
1028                         print ('      hb_tag_t possible_tags[] = {')
1029                         for tag in tags:
1030                                 write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
1031                                 print ()
1032                         print ('      };')
1033                         print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
1034                         print ('\ttags[i] = possible_tags[i];')
1035                         print ('      *count = i;')
1036                 print ('      return true;')
1037                 print ('    }')
1038         print ('    break;')
1039
# Close the ``switch`` and the body of
# ``hb_ot_tags_from_complex_language``, then open
# ``hb_ot_ambiguous_tag_to_language`` up to the start of its ``switch``.
for boundary_line in (
        '  }',
        '  return false;',
        '}',
        '',
        '/**',
        ' * hb_ot_ambiguous_tag_to_language',
        ' * @tag: A language tag.',
        ' *',
        ' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
        ' * many language tags) and the best tag is not the alphabetically first, or if',
        ' * the best tag consists of multiple subtags, or if the best tag does not appear',
        ' * in #ot_languages.',
        ' *',
        ' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
        ' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
        ' **/',
        'static hb_language_t',
        'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
        '{',
        '  switch (tag)',
        '  {',
):
        print (boundary_line)
1061
def verify_disambiguation_dict ():
        """Verify and normalize ``disambiguation``.

        ``disambiguation`` is a map of ambiguous OpenType language system
        tags to the particular BCP 47 tags they correspond to. This function
        checks that all its keys really are ambiguous and that each key's
        value is valid for that key. It checks that no ambiguous tag is
        missing, except when it can figure out which BCP 47 tag is the best
        by itself.

        It modifies ``disambiguation`` to remove keys whose values are the
        same as those that the fallback would return anyway, and to add
        ambiguous keys whose disambiguations it determined automatically.

        Raises:
                AssertionError: Verification failed.
        """
        global bcp_47
        global disambiguation
        global ot
        for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
                # The primary tags of ``ot_tag`` are the non-grandfathered
                # BCP 47 tags whose first OpenType tag is ``ot_tag``.
                primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
                if len (primary_tags) == 1:
                        # Exactly one primary tag: a manual entry is an
                        # error. An entry is recorded automatically only
                        # when that tag has more than one subtag.
                        expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
                        if '-' in primary_tags[0]:
                                disambiguation[ot_tag] = primary_tags[0]
                elif len (primary_tags) == 0:
                        # No primary tag: a manual entry is an error.
                        expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
                else:
                        # Several primary tags. Look for a single obvious
                        # choice: the only macrolanguage, failing that the
                        # only family, failing that the only tag whose
                        # scope does not say 'retired code'.
                        macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
                        if len (macrolanguages) != 1:
                                macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
                        if len (macrolanguages) != 1:
                                macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
                        if len (macrolanguages) != 1:
                                # No obvious choice: a manual entry is
                                # required and must be one of the BCP 47
                                # tags of ``ot_tag``.
                                expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
                                expect (disambiguation[ot_tag] in bcp_47_tags,
                                                '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
                        elif ot_tag not in disambiguation:
                                # An obvious choice and no manual entry:
                                # record the choice. A manual entry, when
                                # present, is left as it is.
                                disambiguation[ot_tag] = macrolanguages[0]
                        # Drop the entry again when it is a single-subtag
                        # tag equal to the alphabetically first primary tag
                        # for which ``same_tag`` is false.
                        different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
                        if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
                                del disambiguation[ot_tag]
        # Every remaining key must be an OpenType tag known to ``ot``.
        for ot_tag in disambiguation.keys ():
                expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1107
# Check and normalize ``disambiguation``, then emit one ``case`` per
# remaining entry, in sorted order of OpenType tag. The calls to
# ``write`` and ``print`` alternate on purpose: ``write`` emits the text
# and the ``print`` that follows it ends the line.
verify_disambiguation_dict ()
for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
        write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
        print ()
        write ('    return hb_language_from_string (\"%s\", -1);  /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
        print ()

# Every other tag falls through to the default case.
print ('  default:')
print ('    return HB_LANGUAGE_INVALID;')
print ('  }')
print ('}')

# Close the include guard and the generated section.
print ()
print ('#endif /* HB_OT_TAG_TABLE_HH */')
print ()
print ('/* == End of generated table == */')
1124