3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
6 It creates a ``const LangTag[]``, matching the tags from the OpenType
7 languages system tag list to the language subtags of the BCP 47 language
8 subtag registry, with some manual adjustments. The mappings are
9 supplemented with macrolanguages' sublanguages and retired codes'
10 replacements, according to BCP 47 and some manual additions where BCP 47
11 omits a retired code entirely.
13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16 multiple BCP 47 tags) are listed here, except when the alphabetically
17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
18 case, the fallback behavior will choose the right tag anyway.
20 usage: ./gen-tag-table.py languagetags language-subtag-registry
23 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
29 from html.parser import HTMLParser
35 if len (sys.argv) != 3:
38 def expect (condition, message=None):
42 raise AssertionError (message)
46 sys.stdout.buffer.write (s.encode ('utf-8'))
48 DEFAULT_LANGUAGE_SYSTEM = ''
50 # from https://www-01.sil.org/iso639-3/iso-639-3.tab
239 class LanguageTag (object):
240 """A BCP 47 language tag.
243 subtags (List[str]): The list of subtags in this tag.
244 grandfathered (bool): Whether this tag is grandfathered. If
``True``, the entire lowercased tag is the ``language``
246 and the other subtag fields are empty.
247 language (str): The language subtag.
248 script (str): The script subtag.
249 region (str): The region subtag.
250 variant (str): The variant subtag.
253 tag (str): A BCP 47 language tag.
256 def __init__ (self, tag):
258 self.subtags = tag.lower ().split ('-')
259 self.grandfathered = tag.lower () in bcp_47.grandfathered
260 if self.grandfathered:
261 self.language = tag.lower ()
266 self.language = self.subtags[0]
267 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
268 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
269 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
272 return '-'.join(self.subtags)
275 return 'LanguageTag(%r)' % str(self)
278 def _find_first (function, sequence):
280 return next (iter (filter (function, sequence)))
281 except StopIteration:
284 def is_complex (self):
285 """Return whether this tag is too complex to represent as a
286 ``LangTag`` in the generated code.
288 Complex tags need to be handled in
289 ``hb_ot_tags_from_complex_language``.
292 Whether this tag is complex.
294 return not (len (self.subtags) == 1
295 or self.grandfathered
296 and len (self.subtags[1]) != 3
297 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
299 def get_group (self):
300 """Return the group into which this tag should be categorized in
301 ``hb_ot_tags_from_complex_language``.
303 The group is the first letter of the tag, or ``'und'`` if this tag
304 should not be matched in a ``switch`` statement in the generated
311 if (self.language == 'und'
312 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
313 else self.language[0])
315 class OpenTypeRegistryParser (HTMLParser):
316 """A parser for the OpenType language system tag registry.
319 header (str): The "last updated" line of the registry.
320 names (Mapping[str, str]): A map of language system tags to the
321 names they are given in the registry.
322 ranks (DefaultDict[str, int]): A map of language system tags to
323 numbers. If a single BCP 47 tag corresponds to multiple
324 OpenType tags, the tags are ordered in increasing order by
325 rank. The rank is based on the number of BCP 47 tags
326 associated with a tag, though it may be manually modified.
327 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
328 OpenType language system tags to sets of BCP 47 tags.
329 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
330 inverted. Its values start as unsorted sets;
331 ``sort_languages`` converts them to sorted lists.
332 from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
333 A copy of ``from_bcp_47``. It starts as ``None`` and is
334 populated at the beginning of the first call to
335 ``inherit_from_macrolanguages``.
339 HTMLParser.__init__ (self)
342 self.ranks = collections.defaultdict (int)
343 self.to_bcp_47 = collections.defaultdict (set)
344 self.from_bcp_47 = collections.defaultdict (set)
345 self.from_bcp_47_uninherited = None
346 # Whether the parser is in a <td> element
348 # Whether the parser is after a <br> element within the current <tr> element
350 # The text of the <td> elements of the current <tr> element.
351 self._current_tr = []
353 def handle_starttag (self, tag, attrs):
357 for attr, value in attrs:
358 if attr == 'name' and value == 'updated_at':
359 self.header = self.get_starttag_text ()
363 self._current_tr.append ('')
366 self._current_tr = []
368 def handle_endtag (self, tag):
371 elif tag == 'tr' and self._current_tr:
372 expect (2 <= len (self._current_tr) <= 3)
373 name = self._current_tr[0].strip ()
374 tag = self._current_tr[1].strip ("\t\n\v\f\r '")
377 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
378 name += ' (deprecated)'
379 tag = tag.split (' ')[0]
381 self.names[tag] = re.sub (' languages$', '', name)
382 if not self._current_tr[2]:
384 iso_codes = self._current_tr[2].strip ()
385 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
386 rank += 2 * len (self.to_bcp_47[tag])
387 self.ranks[tag] = rank
389 def handle_data (self, data):
390 if self._td and not self._br:
391 self._current_tr[-1] += data
393 def handle_charref (self, name):
394 self.handle_data (html.unescape ('&#%s;' % name))
396 def handle_entityref (self, name):
397 self.handle_data (html.unescape ('&%s;' % name))
399 def parse (self, filename):
400 """Parse the OpenType language system tag registry.
403 filename (str): The file name of the registry.
405 with open (filename, encoding='utf-8') as f:
406 self.feed (f.read ())
408 for tag, iso_codes in self.to_bcp_47.items ():
409 for iso_code in iso_codes:
410 self.from_bcp_47[iso_code].add (tag)
412 def add_language (self, bcp_47_tag, ot_tag):
413 """Add a language as if it were in the registry.
416 bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
417 a language subtag, and if the language subtag is a
418 macrolanguage, then new languages are added corresponding
419 to the macrolanguages' individual languages with the
420 remainder of the tag appended.
421 ot_tag (str): An OpenType language system tag.
424 self.to_bcp_47[ot_tag].add (bcp_47_tag)
425 self.from_bcp_47[bcp_47_tag].add (ot_tag)
426 if bcp_47_tag.lower () not in bcp_47.grandfathered:
428 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
429 if macrolanguage in bcp_47.macrolanguages:
431 for language in bcp_47.macrolanguages[macrolanguage]:
432 if language.lower () not in bcp_47.grandfathered:
433 s.add ('%s-%s' % (language, suffix))
434 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
439 def _remove_language (tag_1, dict_1, dict_2):
440 for tag_2 in dict_1.pop (tag_1):
441 dict_2[tag_2].remove (tag_1)
442 if not dict_2[tag_2]:
445 def remove_language_ot (self, ot_tag):
446 """Remove an OpenType tag from the registry.
449 ot_tag (str): An OpenType tag.
451 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
453 def remove_language_bcp_47 (self, bcp_47_tag):
454 """Remove a BCP 47 tag from the registry.
457 bcp_47_tag (str): A BCP 47 tag.
459 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
461 def inherit_from_macrolanguages (self):
462 """Copy mappings from macrolanguages to individual languages.
464 If a BCP 47 tag for an individual mapping has no OpenType
465 mapping but its macrolanguage does, the mapping is copied to
466 the individual language. For example, als (Tosk Albanian) has no
467 explicit mapping, so it inherits from sq (Albanian) the mapping
470 However, if an OpenType tag maps to a BCP 47 macrolanguage and
471 some but not all of its individual languages, the mapping is not
472 inherited from the macrolanguage to the missing individual
473 languages. For example, INUK (Nunavik Inuktitut) is mapped to
474 ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
475 ikt (Inuinnaqtun, which is an individual language of iu), so
476 this method does not add a mapping from ikt to INUK.
478 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
479 some of its individual languages do, their mappings are copied
480 to the macrolanguage.
483 first_time = self.from_bcp_47_uninherited is None
485 self.from_bcp_47_uninherited = dict (self.from_bcp_47)
486 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
487 ot_macrolanguages = {
488 ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
490 blocked_ot_macrolanguages = set ()
491 if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
492 for ot_macrolanguage in ot_macrolanguages:
493 round_trip_macrolanguages = {
494 l for l in self.to_bcp_47[ot_macrolanguage]
495 if 'retired code' not in bcp_47.scopes.get (l, '')
497 round_trip_languages = {
499 if 'retired code' not in bcp_47.scopes.get (l, '')
501 intersection = round_trip_macrolanguages & round_trip_languages
502 if intersection and intersection != round_trip_languages:
503 blocked_ot_macrolanguages.add (ot_macrolanguage)
504 if ot_macrolanguages:
505 for ot_macrolanguage in ot_macrolanguages:
506 if ot_macrolanguage not in blocked_ot_macrolanguages:
507 for language in languages:
508 self.add_language (language, ot_macrolanguage)
509 if not blocked_ot_macrolanguages:
510 self.ranks[ot_macrolanguage] += 1
512 for language in languages:
513 if language in self.from_bcp_47_uninherited:
514 ot_macrolanguages |= self.from_bcp_47_uninherited[language]
516 ot_macrolanguages.clear ()
517 if not ot_macrolanguages:
519 for ot_macrolanguage in ot_macrolanguages:
520 self.add_language (macrolanguage, ot_macrolanguage)
522 def sort_languages (self):
523 """Sort the values of ``from_bcp_47`` in ascending rank order."""
524 for language, tags in self.from_bcp_47.items ():
525 self.from_bcp_47[language] = sorted (tags,
526 key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
528 ot = OpenTypeRegistryParser ()
530 class BCP47Parser (object):
531 """A parser for the BCP 47 subtag registry.
534 header (str): The "File-Date" line of the registry.
535 names (Mapping[str, str]): A map of subtags to the names they
536 are given in the registry. Each value is a
537 ``'\\n'``-separated list of names.
538 scopes (Mapping[str, str]): A map of language subtags to strings
539 suffixed to language names, including suffixes to explain
541 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
542 language subtags to the sets of language subtags which
543 inherit from them. See
544 ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
545 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
546 subtags to their prefixes.
547 grandfathered (AbstractSet[str]): The set of grandfathered tags,
548 normalized to lowercase.
555 self.macrolanguages = collections.defaultdict (set)
556 self.prefixes = collections.defaultdict (set)
557 self.grandfathered = set ()
559 def parse (self, filename):
560 """Parse the BCP 47 subtag registry.
563 filename (str): The file name of the registry.
565 with open (filename, encoding='utf-8') as f:
569 has_preferred_value = False
571 for line in itertools.chain (f, ['']):
572 line = line.rstrip ()
573 if line.startswith (' '):
574 line_buffer += line[1:]
576 line, line_buffer = line_buffer, line
577 if line.startswith ('Type: '):
578 subtag_type = line.split (' ')[1]
580 has_preferred_value = False
581 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
582 subtag = line.split (' ')[1]
583 if subtag_type == 'grandfathered':
584 self.grandfathered.add (subtag.lower ())
585 elif line.startswith ('Description: '):
586 description = line.split (' ', 1)[1].replace (' (individual language)', '')
587 description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '',
589 if subtag in self.names:
590 self.names[subtag] += '\n' + description
592 self.names[subtag] = description
593 elif subtag_type == 'language' or subtag_type == 'grandfathered':
594 if line.startswith ('Scope: '):
595 scope = line.split (' ')[1]
596 if scope == 'macrolanguage':
597 scope = ' [macrolanguage]'
598 elif scope == 'collection':
599 scope = ' [collection]'
602 self.scopes[subtag] = scope
603 elif line.startswith ('Deprecated: '):
604 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
606 elif deprecated and line.startswith ('Comments: see '):
607 # If a subtag is split into multiple replacement subtags,
608 # it essentially represents a macrolanguage.
609 for language in line.replace (',', '').split (' ')[2:]:
610 self._add_macrolanguage (subtag, language)
611 elif line.startswith ('Preferred-Value: '):
612 # If a subtag is deprecated in favor of a single replacement subtag,
613 # it is either a dialect or synonym of the preferred subtag. Either
614 # way, it is close enough to the truth to consider the replacement
615 # the macrolanguage of the deprecated language.
616 has_preferred_value = True
617 macrolanguage = line.split (' ')[1]
618 self._add_macrolanguage (macrolanguage, subtag)
619 elif not has_preferred_value and line.startswith ('Macrolanguage: '):
620 self._add_macrolanguage (line.split (' ')[1], subtag)
621 elif subtag_type == 'variant':
622 if line.startswith ('Deprecated: '):
623 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
624 elif line.startswith ('Prefix: '):
625 self.prefixes[subtag].add (line.split (' ')[1])
626 elif line.startswith ('File-Date: '):
630 def _add_macrolanguage (self, macrolanguage, language):
632 if language not in ot.from_bcp_47:
633 for l in self.macrolanguages.get (language, set ()):
634 self._add_macrolanguage (macrolanguage, l)
635 if macrolanguage not in ot.from_bcp_47:
636 for ls in list (self.macrolanguages.values ()):
637 if macrolanguage in ls:
640 self.macrolanguages[macrolanguage].add (language)
642 def remove_extra_macrolanguages (self):
643 """Make every language have at most one macrolanguage."""
644 inverted = collections.defaultdict (list)
645 for macrolanguage, languages in self.macrolanguages.items ():
646 for language in languages:
647 inverted[language].append (macrolanguage)
648 for language, macrolanguages in inverted.items ():
649 if len (macrolanguages) > 1:
650 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
651 biggest_macrolanguage = macrolanguages.pop ()
652 for macrolanguage in macrolanguages:
653 self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
655 def _get_name_piece (self, subtag):
656 """Return the first name of a subtag plus its scope suffix.
659 subtag (str): A BCP 47 subtag.
662 The name form of ``subtag``.
664 return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')
666 def get_name (self, lt):
667 """Return the names of the subtags in a language tag.
670 lt (LanguageTag): A BCP 47 language tag.
673 The name form of ``lt``.
675 name = self._get_name_piece (lt.language)
677 name += '; ' + self._get_name_piece (lt.script.title ())
679 name += '; ' + self._get_name_piece (lt.region.upper ())
681 name += '; ' + self._get_name_piece (lt.variant)
684 bcp_47 = BCP47Parser ()
686 ot.parse (sys.argv[1])
687 bcp_47.parse (sys.argv[2])
689 ot.add_language ('ary', 'MOR')
691 ot.add_language ('ath', 'ATH')
693 ot.add_language ('bai', 'BML')
695 ot.ranks['BAL'] = ot.ranks['KAR'] + 1
697 ot.add_language ('ber', 'BBR')
699 ot.remove_language_ot ('PGR')
700 ot.add_language ('el-polyton', 'PGR')
702 bcp_47.macrolanguages['et'] = {'ekk'}
704 bcp_47.names['flm'] = 'Falam Chin'
705 bcp_47.scopes['flm'] = ' (retired code)'
706 bcp_47.macrolanguages['flm'] = {'cfm'}
708 ot.ranks['FNE'] = ot.ranks['TNE'] + 1
710 ot.add_language ('und-fonipa', 'IPPH')
712 ot.add_language ('und-fonnapa', 'APPH')
714 ot.remove_language_ot ('IRT')
715 ot.add_language ('ga-Latg', 'IRT')
717 ot.add_language ('hy-arevmda', 'HYE')
719 ot.remove_language_ot ('KGE')
720 ot.add_language ('und-Geok', 'KGE')
722 bcp_47.macrolanguages['id'] = {'in'}
724 bcp_47.macrolanguages['ijo'] = {'ijc'}
726 ot.add_language ('kht', 'KHN')
727 ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
728 ot.ranks['KHN'] = ot.ranks['KHT'] + 1
730 ot.ranks['LCR'] = ot.ranks['MCR'] + 1
732 ot.names['MAL'] = 'Malayalam Traditional'
735 bcp_47.names['mhv'] = 'Arakanese'
736 bcp_47.scopes['mhv'] = ' (retired code)'
738 ot.add_language ('mnw-TH', 'MONT')
740 ot.add_language ('no', 'NOR')
742 ot.add_language ('oc-provenc', 'PRO')
744 ot.remove_language_ot ('QUZ')
745 ot.add_language ('qu', 'QUZ')
746 ot.add_language ('qub', 'QWH')
747 ot.add_language ('qud', 'QVI')
748 ot.add_language ('qug', 'QVI')
749 ot.add_language ('qul', 'QUH')
750 ot.add_language ('qup', 'QVI')
751 ot.add_language ('qur', 'QWH')
752 ot.add_language ('qus', 'QUH')
753 ot.add_language ('quw', 'QVI')
754 ot.add_language ('qux', 'QWH')
755 ot.add_language ('qva', 'QWH')
756 ot.add_language ('qvh', 'QWH')
757 ot.add_language ('qvj', 'QVI')
758 ot.add_language ('qvl', 'QWH')
759 ot.add_language ('qvm', 'QWH')
760 ot.add_language ('qvn', 'QWH')
761 ot.add_language ('qvo', 'QVI')
762 ot.add_language ('qvp', 'QWH')
763 ot.add_language ('qvw', 'QWH')
764 ot.add_language ('qvz', 'QVI')
765 ot.add_language ('qwa', 'QWH')
766 ot.add_language ('qws', 'QWH')
767 ot.add_language ('qxa', 'QWH')
768 ot.add_language ('qxc', 'QWH')
769 ot.add_language ('qxh', 'QWH')
770 ot.add_language ('qxl', 'QVI')
771 ot.add_language ('qxn', 'QWH')
772 ot.add_language ('qxo', 'QWH')
773 ot.add_language ('qxr', 'QVI')
774 ot.add_language ('qxt', 'QWH')
775 ot.add_language ('qxw', 'QWH')
777 bcp_47.macrolanguages['ro-MD'].add ('mo')
779 ot.remove_language_ot ('SYRE')
780 ot.remove_language_ot ('SYRJ')
781 ot.remove_language_ot ('SYRN')
782 ot.add_language ('und-Syre', 'SYRE')
783 ot.add_language ('und-Syrj', 'SYRJ')
784 ot.add_language ('und-Syrn', 'SYRN')
786 bcp_47.names['xst'] = "Silt'e"
787 bcp_47.scopes['xst'] = ' (retired code)'
788 bcp_47.macrolanguages['xst'] = {'stv', 'wle'}
790 ot.add_language ('xwo', 'TOD')
792 ot.remove_language_ot ('ZHH')
793 ot.remove_language_ot ('ZHP')
794 ot.remove_language_ot ('ZHT')
795 ot.remove_language_ot ('ZHTM')
796 bcp_47.macrolanguages['zh'].remove ('lzh')
797 bcp_47.macrolanguages['zh'].remove ('yue')
798 ot.add_language ('zh-Hant-MO', 'ZHH')
799 ot.add_language ('zh-Hant-MO', 'ZHTM')
800 ot.add_language ('zh-Hant-HK', 'ZHH')
801 ot.add_language ('zh-Hans', 'ZHS')
802 ot.add_language ('zh-Hant', 'ZHT')
803 ot.add_language ('zh-HK', 'ZHH')
804 ot.add_language ('zh-MO', 'ZHH')
805 ot.add_language ('zh-MO', 'ZHTM')
806 ot.add_language ('zh-TW', 'ZHT')
807 ot.add_language ('lzh', 'ZHT')
808 ot.add_language ('lzh-Hans', 'ZHS')
809 ot.add_language ('yue', 'ZHH')
810 ot.add_language ('yue-Hans', 'ZHS')
812 bcp_47.macrolanguages['zom'] = {'yos'}
814 def rank_delta (bcp_47, ot):
815 """Return a delta to apply to a BCP 47 tag's rank.
817 Most OpenType tags have a constant rank, but a few have ranks that
818 depend on the BCP 47 tag.
821 bcp_47 (str): A BCP 47 tag.
ot (str): An OpenType tag.
825 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
826 OpenType equivalents.
828 if bcp_47 == 'ak' and ot == 'AKA':
830 if bcp_47 == 'tw' and ot == 'TWI':
870 ot.inherit_from_macrolanguages ()
871 bcp_47.remove_extra_macrolanguages ()
872 ot.inherit_from_macrolanguages ()
873 ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
874 ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
875 for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
876 possible_bcp_47_tag = tricky_ot_tag.lower ()
877 if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
878 ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
879 bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
882 print ('/* == Start of generated table == */')
884 print (' * The following table is generated by running:')
886 print (' * %s languagetags language-subtag-registry' % sys.argv[0])
888 print (' * on files with these headers:')
890 print (' * %s' % ot.header.strip ())
891 print (' * %s' % bcp_47.header)
894 print ('#ifndef HB_OT_TAG_TABLE_HH')
895 print ('#define HB_OT_TAG_TABLE_HH')
897 print ('static const LangTag ot_languages[] = {')
900 """Convert a tag to ``HB_TAG`` form.
903 tag (str): An OpenType tag.
906 A snippet of C++ representing ``tag``.
908 if tag == DEFAULT_LANGUAGE_SYSTEM:
909 return 'HB_TAG_NONE\t '
910 return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
912 def get_variant_set (name):
913 """Return a set of variant language names from a name.
916 name (str): A list of language names from the BCP 47 registry,
920 A set of normalized language names.
922 return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'"))
923 .encode ('ASCII', 'ignore')
925 for n in re.split ('[\n(),]', name) if n)
def language_name_intersection (a, b):
    """Return the names in common between two language names.

    Args:
        a (str): A list of language names from the BCP 47 registry.
        b (str): A list of language names from the BCP 47 registry.

    Returns:
        The normalized language names, as produced by
        ``get_variant_set``, shared by ``a`` and ``b``.
    """
    names_a = get_variant_set (a)
    names_b = get_variant_set (b)
    return names_a.intersection (names_b)
def get_matching_language_name (intersection, candidates):
    """Return the first candidate that shares a name with ``intersection``.

    Args:
        intersection (AbstractSet): A set of normalized language names.
        candidates (Iterable[str]): Language names to test, in order.

    Returns:
        The first candidate whose ``get_variant_set`` is not disjoint
        from ``intersection``.

    Raises:
        StopIteration: No candidate matches.
    """
    for candidate in candidates:
        if not intersection.isdisjoint (get_variant_set (candidate)):
            return candidate
    raise StopIteration
def same_tag (bcp_47_tag, ot_tags):
    """Return whether a BCP 47 tag maps only to its own uppercase form.

    Args:
        bcp_47_tag (str): A BCP 47 tag.
        ot_tags (Sequence[str]): The OpenType tags for ``bcp_47_tag``.

    Returns:
        ``True`` if ``bcp_47_tag`` has three characters and
        ``ot_tags`` holds exactly one tag that equals it when
        lowercased.
    """
    if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
        return False
    return ot_tags[0].lower () == bcp_47_tag
947 for language, tags in sorted (ot.from_bcp_47.items ()):
948 if language == '' or '-' in language:
950 commented_out = same_tag (language, tags)
951 for i, tag in enumerate (tags, start=1):
952 print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
955 print ('\t/* ', end='')
956 bcp_47_name = bcp_47.names.get (language, '')
957 bcp_47_name_candidates = bcp_47_name.split ('\n')
958 ot_name = ot.names[tag]
959 scope = bcp_47.scopes.get (language, '')
960 if tag == DEFAULT_LANGUAGE_SYSTEM:
961 write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
963 intersection = language_name_intersection (bcp_47_name, ot_name)
965 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
967 name = get_matching_language_name (intersection, bcp_47_name_candidates)
968 bcp_47.names[language] = name
969 write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
976 print (' * hb_ot_tags_from_complex_language:')
977 print (' * @lang_str: a BCP 47 language tag to convert.')
978 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
979 print (' * conversion.')
980 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
981 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
982 print (' * @tags: array of size at least @language_count to store the language tag')
985 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
987 print (' * Return value: Whether any language systems were retrieved.')
989 print ('static bool')
990 print ('hb_ot_tags_from_complex_language (const char *lang_str,')
991 print ('\t\t\t\t const char *limit,')
992 print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
993 print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
996 def print_subtag_matches (subtag, new_line):
1000 print ('\t&& ', end='')
1001 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
1003 complex_tags = collections.defaultdict (list)
1004 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
1005 (LanguageTag (language), tags)
1006 for language, tags in sorted (ot.from_bcp_47.items (),
1007 key=lambda i: (-len (i[0]), i[0]))
1008 ] if lt_tags[0].is_complex ()),
1009 key=lambda lt_tags: lt_tags[0].get_group ()):
1010 complex_tags[initial] += group
1012 for initial, items in sorted (complex_tags.items ()):
1013 if initial != 'und':
1015 for lt, tags in items:
1018 if lt.variant in bcp_47.prefixes:
1019 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
1020 '%s is not a valid prefix of %s' % (lt.language, lt.variant))
1021 print (' if (', end='')
1022 print_subtag_matches (lt.script, False)
1023 print_subtag_matches (lt.region, False)
1024 print_subtag_matches (lt.variant, False)
1027 write (' /* %s */' % bcp_47.get_name (lt))
1030 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1032 print (' *count = 1;')
1034 print (' hb_tag_t possible_tags[] = {')
1036 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
1039 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
1040 print (' tags[i] = possible_tags[i];')
1041 print (' *count = i;')
1042 print (' return true;')
1045 print (' switch (lang_str[0])')
1047 for initial, items in sorted (complex_tags.items ()):
1048 if initial == 'und':
1050 print (" case '%s':" % initial)
1051 for lt, tags in items:
1054 print (' if (', end='')
1057 if lt.grandfathered:
1058 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
1060 string_literal = lt.language[1:] + '-'
1062 string_literal += script
1065 string_literal += '-' + region
1067 if string_literal[-1] == '-':
1068 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
1070 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
1071 print_subtag_matches (script, True)
1072 print_subtag_matches (region, True)
1073 print_subtag_matches (lt.variant, True)
1076 write (' /* %s */' % bcp_47.get_name (lt))
1079 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1081 print (' *count = 1;')
1083 print (' unsigned int i;')
1084 print (' hb_tag_t possible_tags[] = {')
1086 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
1089 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
1090 print ('\ttags[i] = possible_tags[i];')
1091 print (' *count = i;')
1092 print (' return true;')
1097 print (' return false;')
1101 print (' * hb_ot_ambiguous_tag_to_language')
1102 print (' * @tag: A language tag.')
1104 print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
1105 print (' * many language tags) and the best tag is not the alphabetically first, or if')
1106 print (' * the best tag consists of multiple subtags, or if the best tag does not appear')
1107 print (' * in #ot_languages.')
1109 print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
1110 print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
1112 print ('static hb_language_t')
1113 print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
1115 print (' switch (tag)')
1118 def verify_disambiguation_dict ():
1119 """Verify and normalize ``disambiguation``.
1121 ``disambiguation`` is a map of ambiguous OpenType language system
1122 tags to the particular BCP 47 tags they correspond to. This function
1123 checks that all its keys really are ambiguous and that each key's
1124 value is valid for that key. It checks that no ambiguous tag is
1125 missing, except when it can figure out which BCP 47 tag is the best
1128 It modifies ``disambiguation`` to remove keys whose values are the
1129 same as those that the fallback would return anyway, and to add
1130 ambiguous keys whose disambiguations it determined automatically.
1133 AssertionError: Verification failed.
1136 global disambiguation
1138 for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
1139 if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
1142 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
1143 if len (primary_tags) == 1:
1144 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
1145 if '-' in primary_tags[0]:
1146 disambiguation[ot_tag] = primary_tags[0]
1148 first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0]
1149 if primary_tags[0] != first_tag:
1150 disambiguation[ot_tag] = primary_tags[0]
1151 elif len (primary_tags) == 0:
1152 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
1154 original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
1155 if len (original_languages) == 1:
1156 macrolanguages = original_languages
1158 macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
1159 if len (macrolanguages) != 1:
1160 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]')
1161 if len (macrolanguages) != 1:
1162 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
1163 if len (macrolanguages) != 1:
1164 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
1165 expect (disambiguation[ot_tag] in bcp_47_tags,
1166 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
1167 elif ot_tag not in disambiguation:
1168 disambiguation[ot_tag] = macrolanguages[0]
1169 different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
1170 if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]:
1171 del disambiguation[ot_tag]
1172 for ot_tag in disambiguation.keys ():
1173 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1175 verify_disambiguation_dict ()
1176 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
1177 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
1179 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
1183 print (' return HB_LANGUAGE_INVALID;')
1188 print ('#endif /* HB_OT_TAG_TABLE_HH */')
1190 print ('/* == End of generated table == */')