3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
6 It creates a ``const LangTag[]``, matching the tags from the OpenType
7 languages system tag list to the language subtags of the BCP 47 language
8 subtag registry, with some manual adjustments. The mappings are
9 supplemented with macrolanguages' sublanguages and retired codes'
10 replacements, according to BCP 47 and some manual additions where BCP 47
11 omits a retired code entirely.
13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16 multiple BCP 47 tags) are listed here, except when the alphabetically
17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
18 case, the fallback behavior will choose the right tag anyway.
21 from __future__ import absolute_import, division, print_function, unicode_literals
25 from HTMLParser import HTMLParser
27 print (s.encode ('utf-8'), end='')
29 from html.parser import HTMLParser
32 sys.stdout.buffer.write (s.encode ('utf-8'))
39 if len (sys.argv) != 3:
40 print ('usage: ./gen-tag-table.py languagetags language-subtag-registry', file=sys.stderr)
44 from html import unescape
def html_unescape (parser, entity):
    """Decode a single HTML character or entity reference.

    Args:
        parser (HTMLParser): Not used by this implementation; it is
            accepted so that callers can always pass the parser.
        entity (str): A reference such as ``'&amp;'`` or ``'&#38;'``.

    Returns:
        The text that ``entity`` stands for.
    """
    decoded = unescape (entity)
    return decoded
def html_unescape (parser, entity):
    """Decode a single HTML character or entity reference.

    This variant delegates to the ``unescape`` attribute of the object
    passed as ``parser``.

    Args:
        parser (HTMLParser): The object whose ``unescape`` is called.
        entity (str): A reference such as ``'&amp;'`` or ``'&#38;'``.

    Returns:
        Whatever ``parser.unescape`` returns for ``entity``.
    """
    decode = parser.unescape
    return decode (entity)
51 def expect (condition, message=None):
55 raise AssertionError (message)
57 # from http://www-01.sil.org/iso639-3/iso-639-3.tab
246 class LanguageTag (object):
247 """A BCP 47 language tag.
250 subtags (List[str]): The list of subtags in this tag.
251 grandfathered (bool): Whether this tag is grandfathered. If
``True``, the entire lowercased tag is the ``language``
253 and the other subtag fields are empty.
254 language (str): The language subtag.
255 script (str): The script subtag.
256 region (str): The region subtag.
257 variant (str): The variant subtag.
260 tag (str): A BCP 47 language tag.
263 def __init__ (self, tag):
265 self.subtags = tag.lower ().split ('-')
266 self.grandfathered = tag.lower () in bcp_47.grandfathered
267 if self.grandfathered:
268 self.language = tag.lower ()
273 self.language = self.subtags[0]
274 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
275 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
276 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
279 return '-'.join(self.subtags)
282 return 'LanguageTag(%r)' % str(self)
285 def _find_first (function, sequence):
287 return next (iter (filter (function, sequence)))
288 except StopIteration:
291 def is_complex (self):
292 """Return whether this tag is too complex to represent as a
293 ``LangTag`` in the generated code.
295 Complex tags need to be handled in
296 ``hb_ot_tags_from_complex_language``.
299 Whether this tag is complex.
301 return not (len (self.subtags) == 1
302 or self.grandfathered
303 and len (self.subtags[1]) != 3
304 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
306 def get_group (self):
307 """Return the group into which this tag should be categorized in
308 ``hb_ot_tags_from_complex_language``.
310 The group is the first letter of the tag, or ``'und'`` if this tag
311 should not be matched in a ``switch`` statement in the generated
318 if (self.language == 'und'
319 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
320 else self.language[0])
322 class OpenTypeRegistryParser (HTMLParser):
323 """A parser for the OpenType language system tag registry.
326 header (str): The "last updated" line of the registry.
327 names (Mapping[str, str]): A map of language system tags to the
328 names they are given in the registry.
329 ranks (DefaultDict[str, int]): A map of language system tags to
330 numbers. If a single BCP 47 tag corresponds to multiple
331 OpenType tags, the tags are ordered in increasing order by
332 rank. The rank is based on the number of BCP 47 tags
333 associated with a tag, though it may be manually modified.
334 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
335 OpenType language system tags to sets of BCP 47 tags.
336 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
337 inverted. Its values start as unsorted sets;
338 ``sort_languages`` converts them to sorted lists.
342 HTMLParser.__init__ (self)
345 self.ranks = collections.defaultdict (int)
346 self.to_bcp_47 = collections.defaultdict (set)
347 self.from_bcp_47 = collections.defaultdict (set)
348 # Whether the parser is in a <td> element
350 # The text of the <td> elements of the current <tr> element.
351 self._current_tr = []
353 def handle_starttag (self, tag, attrs):
355 for attr, value in attrs:
356 if attr == 'name' and value == 'updated_at':
357 self.header = self.get_starttag_text ()
361 self._current_tr.append ('')
363 self._current_tr = []
365 def handle_endtag (self, tag):
368 elif tag == 'tr' and self._current_tr:
369 expect (2 <= len (self._current_tr) <= 3)
370 name = self._current_tr[0].strip ()
371 tag = self._current_tr[1].strip ("\t\n\v\f\r '")
374 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
375 name += ' (deprecated)'
376 tag = tag.split (' ')[0]
378 self.names[tag] = re.sub (' languages$', '', name)
379 if not self._current_tr[2]:
381 iso_codes = self._current_tr[2].strip ()
382 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
383 rank += 2 * len (self.to_bcp_47[tag])
384 self.ranks[tag] = rank
386 def handle_data (self, data):
388 self._current_tr[-1] += data
def handle_charref (self, name):
    """Treat a numeric character reference as ordinary text.

    The reference is rebuilt as ``&#name;``, decoded with
    ``html_unescape`` and passed on to ``handle_data``.

    Args:
        name (str): The part of the reference between ``&#`` and ``;``.
    """
    reference = '&#%s;' % name
    decoded = html_unescape (self, reference)
    self.handle_data (decoded)
def handle_entityref (self, name):
    """Treat a named entity reference as ordinary text.

    The reference is rebuilt as ``&name;``, decoded with
    ``html_unescape`` and passed on to ``handle_data``.

    Args:
        name (str): The part of the reference between ``&`` and ``;``.
    """
    reference = '&%s;' % name
    decoded = html_unescape (self, reference)
    self.handle_data (decoded)
396 def parse (self, filename):
397 """Parse the OpenType language system tag registry.
400 filename (str): The file name of the registry.
402 with io.open (filename, encoding='utf-8') as f:
403 self.feed (f.read ())
405 for tag, iso_codes in self.to_bcp_47.items ():
406 for iso_code in iso_codes:
407 self.from_bcp_47[iso_code].add (tag)
409 def add_language (self, bcp_47_tag, ot_tag):
410 """Add a language as if it were in the registry.
413 bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
414 a language subtag, and if the language subtag is a
415 macrolanguage, then new languages are added corresponding
416 to the macrolanguages' individual languages with the
417 remainder of the tag appended.
418 ot_tag (str): An OpenType language system tag.
421 self.to_bcp_47[ot_tag].add (bcp_47_tag)
422 self.from_bcp_47[bcp_47_tag].add (ot_tag)
423 if bcp_47_tag.lower () not in bcp_47.grandfathered:
425 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
426 if macrolanguage in bcp_47.macrolanguages:
428 for language in bcp_47.macrolanguages[macrolanguage]:
429 if language.lower () not in bcp_47.grandfathered:
430 s.add ('%s-%s' % (language, suffix))
431 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
436 def _remove_language (tag_1, dict_1, dict_2):
437 for tag_2 in dict_1.pop (tag_1):
438 dict_2[tag_2].remove (tag_1)
439 if not dict_2[tag_2]:
442 def remove_language_ot (self, ot_tag):
443 """Remove an OpenType tag from the registry.
446 ot_tag (str): An OpenType tag.
448 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
450 def remove_language_bcp_47 (self, bcp_47_tag):
451 """Remove a BCP 47 tag from the registry.
454 bcp_47_tag (str): A BCP 47 tag.
456 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
458 def inherit_from_macrolanguages (self):
459 """Copy mappings from macrolanguages to individual languages.
461 If a BCP 47 tag for an individual mapping has no OpenType
462 mapping but its macrolanguage does, the mapping is copied to
463 the individual language. For example, als (Tosk Albanian) has no
464 explicit mapping, so it inherits from sq (Albanian) the mapping
467 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
468 all of its individual languages do and they all map to the same
469 tags, the mapping is copied to the macrolanguage.
472 original_ot_from_bcp_47 = dict (self.from_bcp_47)
473 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
474 ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
475 if ot_macrolanguages:
476 for ot_macrolanguage in ot_macrolanguages:
477 for language in languages:
478 # Remove the following condition if e.g. nn should map to NYN,NOR
479 # instead of just NYN.
480 if language not in original_ot_from_bcp_47:
481 self.add_language (language, ot_macrolanguage)
482 self.ranks[ot_macrolanguage] += 1
484 for language in languages:
485 if language in original_ot_from_bcp_47:
486 if ot_macrolanguages:
487 ml = original_ot_from_bcp_47[language]
489 ot_macrolanguages &= ml
493 ot_macrolanguages |= original_ot_from_bcp_47[language]
495 ot_macrolanguages.clear ()
496 if not ot_macrolanguages:
498 for ot_macrolanguage in ot_macrolanguages:
499 self.add_language (macrolanguage, ot_macrolanguage)
def sort_languages (self):
    """Replace every value of ``from_bcp_47`` with a rank-ordered list.

    For each BCP 47 tag, its OpenType tags are sorted in ascending
    order of ``ranks[tag] + rank_delta (language, tag)``; ties are
    broken by the OpenType tag itself.
    """
    for language, ot_tags in self.from_bcp_47.items ():
        # Bind ``language`` now so the key function is tied to this
        # iteration only.
        def sort_key (ot_tag, language=language):
            adjusted_rank = self.ranks[ot_tag] + rank_delta (language, ot_tag)
            return (adjusted_rank, ot_tag)
        # Only the value of an existing key is replaced, so the mapping
        # keeps its size while it is being iterated.
        self.from_bcp_47[language] = sorted (ot_tags, key=sort_key)
507 ot = OpenTypeRegistryParser ()
509 class BCP47Parser (object):
510 """A parser for the BCP 47 subtag registry.
513 header (str): The "File-Date" line of the registry.
514 names (Mapping[str, str]): A map of subtags to the names they
515 are given in the registry. Each value is a
516 ``'\\n'``-separated list of names.
517 scopes (Mapping[str, str]): A map of language subtags to strings
518 suffixed to language names, including suffixes to explain
520 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
521 language subtags to the sets of language subtags which
522 inherit from them. See
523 ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
524 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
525 subtags to their prefixes.
526 grandfathered (AbstractSet[str]): The set of grandfathered tags,
527 normalized to lowercase.
534 self.macrolanguages = collections.defaultdict (set)
535 self.prefixes = collections.defaultdict (set)
536 self.grandfathered = set ()
538 def parse (self, filename):
539 """Parse the BCP 47 subtag registry.
542 filename (str): The file name of the registry.
544 with io.open (filename, encoding='utf-8') as f:
548 has_preferred_value = False
550 for line in itertools.chain (f, ['']):
551 line = line.rstrip ()
552 if line.startswith (' '):
553 line_buffer += line[1:]
555 line, line_buffer = line_buffer, line
556 if line.startswith ('Type: '):
557 subtag_type = line.split (' ')[1]
559 has_preferred_value = False
560 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
561 subtag = line.split (' ')[1]
562 if subtag_type == 'grandfathered':
563 self.grandfathered.add (subtag.lower ())
564 elif line.startswith ('Description: '):
565 description = line.split (' ', 1)[1].replace (' (individual language)', '')
566 description = re.sub (' (\((individual |macro)language\)|languages)$', '',
568 if subtag in self.names:
569 self.names[subtag] += '\n' + description
571 self.names[subtag] = description
572 elif subtag_type == 'language' or subtag_type == 'grandfathered':
573 if line.startswith ('Scope: '):
574 scope = line.split (' ')[1]
575 if scope == 'macrolanguage':
576 scope = ' [macrolanguage]'
577 elif scope == 'collection':
581 self.scopes[subtag] = scope
582 elif line.startswith ('Deprecated: '):
583 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
585 elif deprecated and line.startswith ('Comments: see '):
586 # If a subtag is split into multiple replacement subtags,
587 # it essentially represents a macrolanguage.
588 for language in line.replace (',', '').split (' ')[2:]:
589 self._add_macrolanguage (subtag, language)
590 elif line.startswith ('Preferred-Value: '):
591 # If a subtag is deprecated in favor of a single replacement subtag,
592 # it is either a dialect or synonym of the preferred subtag. Either
593 # way, it is close enough to the truth to consider the replacement
594 # the macrolanguage of the deprecated language.
595 has_preferred_value = True
596 macrolanguage = line.split (' ')[1]
597 self._add_macrolanguage (macrolanguage, subtag)
598 elif not has_preferred_value and line.startswith ('Macrolanguage: '):
599 self._add_macrolanguage (line.split (' ')[1], subtag)
600 elif subtag_type == 'variant':
601 if line.startswith ('Prefix: '):
602 self.prefixes[subtag].add (line.split (' ')[1])
603 elif line.startswith ('File-Date: '):
607 def _add_macrolanguage (self, macrolanguage, language):
609 if language not in ot.from_bcp_47:
610 for l in self.macrolanguages.get (language, set ()):
611 self._add_macrolanguage (macrolanguage, l)
612 if macrolanguage not in ot.from_bcp_47:
613 for ls in list (self.macrolanguages.values ()):
614 if macrolanguage in ls:
617 self.macrolanguages[macrolanguage].add (language)
def remove_extra_macrolanguages (self):
    """Make every language have at most one macrolanguage.

    Builds the inverse of ``macrolanguages`` (language -> list of its
    macrolanguages). For each language listed under more than one
    macrolanguage, the macrolanguage with the most members is selected
    and every other one is registered beneath it through
    ``_add_macrolanguage``.
    """
    parents_of = collections.defaultdict (list)
    for parent, members in self.macrolanguages.items ():
        for member in members:
            parents_of[member].append (parent)

    for parents in parents_of.values ():
        if len (parents) <= 1:
            continue
        # Sizes are read at sort time, so they reflect any additions
        # made while handling earlier languages.
        parents.sort (key=lambda ml: len (self.macrolanguages[ml]))
        biggest = parents.pop ()
        for smaller in parents:
            self._add_macrolanguage (biggest, smaller)
632 def get_name (self, lt):
633 """Return the names of the subtags in a language tag.
636 lt (LanguageTag): A BCP 47 language tag.
639 The name form of ``lt``.
641 name = self.names[lt.language].split ('\n')[0]
643 name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
645 name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
647 name += '; ' + self.names[lt.variant].split ('\n')[0]
650 bcp_47 = BCP47Parser ()
652 ot.parse (sys.argv[1])
653 bcp_47.parse (sys.argv[2])
655 ot.add_language ('ary', 'MOR')
657 ot.add_language ('ath', 'ATH')
659 ot.add_language ('bai', 'BML')
661 ot.ranks['BAL'] = ot.ranks['KAR'] + 1
663 ot.add_language ('ber', 'BBR')
665 ot.remove_language_ot ('PGR')
666 ot.add_language ('el-polyton', 'PGR')
668 bcp_47.macrolanguages['et'] = {'ekk'}
670 bcp_47.names['flm'] = 'Falam Chin'
671 bcp_47.scopes['flm'] = ' (retired code)'
672 bcp_47.macrolanguages['flm'] = {'cfm'}
674 ot.ranks['FNE'] = ot.ranks['TNE'] + 1
676 ot.add_language ('und-fonipa', 'IPPH')
678 ot.add_language ('und-fonnapa', 'APPH')
680 ot.remove_language_ot ('IRT')
681 ot.add_language ('ga-Latg', 'IRT')
683 ot.remove_language_ot ('KGE')
684 ot.add_language ('und-Geok', 'KGE')
686 ot.add_language ('guk', 'GUK')
687 ot.names['GUK'] = 'Gumuz (SIL fonts)'
688 ot.ranks['GUK'] = ot.ranks['GMZ'] + 1
690 bcp_47.macrolanguages['id'] = {'in'}
692 bcp_47.macrolanguages['ijo'] = {'ijc'}
694 ot.add_language ('kht', 'KHN')
695 ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
696 ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
697 ot.ranks['KHN'] = ot.ranks['KHT']
700 ot.ranks['LCR'] = ot.ranks['MCR'] + 1
702 ot.names['MAL'] = 'Malayalam Traditional'
705 bcp_47.names['mhv'] = 'Arakanese'
706 bcp_47.scopes['mhv'] = ' (retired code)'
708 ot.add_language ('no', 'NOR')
710 ot.add_language ('oc-provenc', 'PRO')
712 ot.add_language ('qu', 'QUZ')
713 ot.add_language ('qub', 'QWH')
714 ot.add_language ('qud', 'QVI')
715 ot.add_language ('qug', 'QVI')
716 ot.add_language ('qup', 'QVI')
717 ot.add_language ('qur', 'QWH')
718 ot.add_language ('qus', 'QUH')
719 ot.add_language ('quw', 'QVI')
720 ot.add_language ('qux', 'QWH')
721 ot.add_language ('qva', 'QWH')
722 ot.add_language ('qvh', 'QWH')
723 ot.add_language ('qvj', 'QVI')
724 ot.add_language ('qvl', 'QWH')
725 ot.add_language ('qvm', 'QWH')
726 ot.add_language ('qvn', 'QWH')
727 ot.add_language ('qvo', 'QVI')
728 ot.add_language ('qvp', 'QWH')
729 ot.add_language ('qvw', 'QWH')
730 ot.add_language ('qvz', 'QVI')
731 ot.add_language ('qwa', 'QWH')
732 ot.add_language ('qws', 'QWH')
733 ot.add_language ('qxa', 'QWH')
734 ot.add_language ('qxc', 'QWH')
735 ot.add_language ('qxh', 'QWH')
736 ot.add_language ('qxl', 'QVI')
737 ot.add_language ('qxn', 'QWH')
738 ot.add_language ('qxo', 'QWH')
739 ot.add_language ('qxr', 'QVI')
740 ot.add_language ('qxt', 'QWH')
741 ot.add_language ('qxw', 'QWH')
743 bcp_47.macrolanguages['ro'].remove ('mo')
744 bcp_47.macrolanguages['ro-MD'].add ('mo')
746 ot.add_language ('sgw', 'SGW')
747 ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
748 ot.ranks['SGW'] = ot.ranks['CHG'] + 1
750 ot.remove_language_ot ('SYRE')
751 ot.remove_language_ot ('SYRJ')
752 ot.remove_language_ot ('SYRN')
753 ot.add_language ('und-Syre', 'SYRE')
754 ot.add_language ('und-Syrj', 'SYRJ')
755 ot.add_language ('und-Syrn', 'SYRN')
757 bcp_47.names['xst'] = u"Silt'e"
758 bcp_47.scopes['xst'] = ' (retired code)'
759 bcp_47.macrolanguages['xst'] = {'stv', 'wle'}
761 ot.add_language ('xwo', 'TOD')
763 ot.remove_language_ot ('ZHH')
764 ot.remove_language_ot ('ZHP')
765 ot.remove_language_ot ('ZHT')
766 bcp_47.macrolanguages['zh'].remove ('lzh')
767 bcp_47.macrolanguages['zh'].remove ('yue')
768 ot.add_language ('zh-Hant-MO', 'ZHH')
769 ot.add_language ('zh-Hant-HK', 'ZHH')
770 ot.add_language ('zh-Hans', 'ZHS')
771 ot.add_language ('zh-Hant', 'ZHT')
772 ot.add_language ('zh-HK', 'ZHH')
773 ot.add_language ('zh-MO', 'ZHH')
774 ot.add_language ('zh-TW', 'ZHT')
775 ot.add_language ('lzh', 'ZHT')
776 ot.add_language ('lzh-Hans', 'ZHS')
777 ot.add_language ('yue', 'ZHH')
778 ot.add_language ('yue-Hans', 'ZHS')
780 bcp_47.macrolanguages['zom'] = {'yos'}
782 def rank_delta (bcp_47, ot):
783 """Return a delta to apply to a BCP 47 tag's rank.
785 Most OpenType tags have a constant rank, but a few have ranks that
786 depend on the BCP 47 tag.
789 bcp_47 (str): A BCP 47 tag.
ot (str): An OpenType tag.
793 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
794 OpenType equivalents.
796 if bcp_47 == 'ak' and ot == 'AKA':
798 if bcp_47 == 'tw' and ot == 'TWI':
833 ot.inherit_from_macrolanguages ()
834 bcp_47.remove_extra_macrolanguages ()
835 ot.inherit_from_macrolanguages ()
838 print ('/* == Start of generated table == */')
840 print (' * The following table is generated by running:')
842 print (' * %s languagetags language-subtag-registry' % sys.argv[0])
844 print (' * on files with these headers:')
846 print (' * %s' % ot.header.strip ())
847 print (' * %s' % bcp_47.header)
850 print ('#ifndef HB_OT_TAG_TABLE_HH')
851 print ('#define HB_OT_TAG_TABLE_HH')
853 print ('static const LangTag ot_languages[] = {')
856 """Convert a tag to ``HB_TAG`` form.
859 tag (str): An OpenType tag.
862 A snippet of C++ representing ``tag``.
864 return u"HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
866 def get_variant_set (name):
867 """Return a set of variant language names from a name.
870 name (str): A list of language names from the BCP 47 registry,
874 A set of normalized language names.
876 return set (unicodedata.normalize ('NFD', n.replace ('\u2019', u"'"))
877 .encode ('ASCII', 'ignore')
879 for n in re.split ('[\n(),]', name) if n)
881 def language_name_intersection (a, b):
882 """Return the names in common between two language names.
885 a (str): A list of language names from the BCP 47 registry,
887 b (str): A list of language names from the BCP 47 registry,
891 The normalized language names shared by ``a`` and ``b``.
893 return get_variant_set (a).intersection (get_variant_set (b))
def get_matching_language_name (intersection, candidates):
    """Return the first candidate that shares a name with ``intersection``.

    Args:
        intersection (AbstractSet): A set of normalized language names.
        candidates (Iterable[str]): Language names to test, in order.

    Returns:
        The first element of ``candidates`` whose ``get_variant_set``
        result has at least one member in common with ``intersection``.

    Raises:
        StopIteration: No candidate matches.
    """
    matches = (candidate for candidate in candidates
               if not intersection.isdisjoint (get_variant_set (candidate)))
    return next (matches)
def same_tag (bcp_47_tag, ot_tags):
    """Return whether a BCP 47 tag maps only to its own uppercase form.

    Args:
        bcp_47_tag (str): A BCP 47 tag.
        ot_tags (List[str]): The OpenType tags mapped to ``bcp_47_tag``.

    Returns:
        ``True`` if ``bcp_47_tag`` is three characters long and
        ``ot_tags`` holds exactly one tag which, lowercased, equals
        ``bcp_47_tag``; ``False`` otherwise.
    """
    if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
        return False
    only_ot_tag = ot_tags[0]
    return only_ot_tag.lower () == bcp_47_tag
901 for language, tags in sorted (ot.from_bcp_47.items ()):
902 if language == '' or '-' in language:
904 commented_out = same_tag (language, tags)
905 for i, tag in enumerate (tags, start=1):
906 print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
909 print ('\t/* ', end='')
910 bcp_47_name = bcp_47.names.get (language, '')
911 bcp_47_name_candidates = bcp_47_name.split ('\n')
912 intersection = language_name_intersection (bcp_47_name, ot.names[tag])
913 scope = bcp_47.scopes.get (language, '')
915 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
917 name = get_matching_language_name (intersection, bcp_47_name_candidates)
918 bcp_47.names[language] = name
919 write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
926 print (' * hb_ot_tags_from_complex_language:')
927 print (' * @lang_str: a BCP 47 language tag to convert.')
928 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
929 print (' * conversion.')
930 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
931 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
932 print (' * @tags: array of size at least @language_count to store the language tag')
935 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
937 print (' * Return value: Whether any language systems were retrieved.')
939 print ('static bool')
940 print ('hb_ot_tags_from_complex_language (const char *lang_str,')
941 print ('\t\t\t\t const char *limit,')
942 print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
943 print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
946 def print_subtag_matches (subtag, new_line):
950 print ('\t&& ', end='')
951 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
953 complex_tags = collections.defaultdict (list)
954 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
955 (LanguageTag (language), tags)
956 for language, tags in sorted (ot.from_bcp_47.items (),
957 key=lambda i: (-len (i[0]), i[0]))
958 ] if lt_tags[0].is_complex ()),
959 key=lambda lt_tags: lt_tags[0].get_group ()):
960 complex_tags[initial] += group
962 for initial, items in sorted (complex_tags.items ()):
965 for lt, tags in items:
966 if lt.variant in bcp_47.prefixes:
967 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
968 '%s is not a valid prefix of %s' % (lt.language, lt.variant))
969 print (' if (', end='')
970 print_subtag_matches (lt.script, False)
971 print_subtag_matches (lt.region, False)
972 print_subtag_matches (lt.variant, False)
975 write (' /* %s */' % bcp_47.get_name (lt))
978 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
980 print (' *count = 1;')
982 print (' hb_tag_t possible_tags[] = {')
984 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
987 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
988 print (' tags[i] = possible_tags[i];')
989 print (' *count = i;')
990 print (' return true;')
993 print (' switch (lang_str[0])')
995 for initial, items in sorted (complex_tags.items ()):
998 print (" case '%s':" % initial)
999 for lt, tags in items:
1000 print (' if (', end='')
1001 if lt.grandfathered:
1002 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
1004 string_literal = lt.language[1:] + '-'
1006 string_literal += lt.script
1009 string_literal += '-' + lt.region
1011 if string_literal[-1] == '-':
1012 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
1014 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
1015 print_subtag_matches (lt.script, True)
1016 print_subtag_matches (lt.region, True)
1017 print_subtag_matches (lt.variant, True)
1020 write (' /* %s */' % bcp_47.get_name (lt))
1023 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1025 print (' *count = 1;')
1027 print (' unsigned int i;')
1028 print (' hb_tag_t possible_tags[] = {')
1030 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
1033 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
1034 print ('\ttags[i] = possible_tags[i];')
1035 print (' *count = i;')
1036 print (' return true;')
1041 print (' return false;')
1045 print (' * hb_ot_ambiguous_tag_to_language')
1046 print (' * @tag: A language tag.')
1048 print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
1049 print (' * many language tags) and the best tag is not the alphabetically first, or if')
1050 print (' * the best tag consists of multiple subtags, or if the best tag does not appear')
1051 print (' * in #ot_languages.')
1053 print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
1054 print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
1056 print ('static hb_language_t')
1057 print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
1059 print (' switch (tag)')
1062 def verify_disambiguation_dict ():
1063 """Verify and normalize ``disambiguation``.
1065 ``disambiguation`` is a map of ambiguous OpenType language system
1066 tags to the particular BCP 47 tags they correspond to. This function
1067 checks that all its keys really are ambiguous and that each key's
1068 value is valid for that key. It checks that no ambiguous tag is
1069 missing, except when it can figure out which BCP 47 tag is the best
1072 It modifies ``disambiguation`` to remove keys whose values are the
1073 same as those that the fallback would return anyway, and to add
1074 ambiguous keys whose disambiguations it determined automatically.
1077 AssertionError: Verification failed.
1080 global disambiguation
1082 for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
1083 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
1084 if len (primary_tags) == 1:
1085 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
1086 if '-' in primary_tags[0]:
1087 disambiguation[ot_tag] = primary_tags[0]
1088 elif len (primary_tags) == 0:
1089 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
1091 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
1092 if len (macrolanguages) != 1:
1093 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
1094 if len (macrolanguages) != 1:
1095 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
1096 if len (macrolanguages) != 1:
1097 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
1098 expect (disambiguation[ot_tag] in bcp_47_tags,
1099 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
1100 elif ot_tag not in disambiguation:
1101 disambiguation[ot_tag] = macrolanguages[0]
1102 different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
1103 if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
1104 del disambiguation[ot_tag]
1105 for ot_tag in disambiguation.keys ():
1106 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1108 verify_disambiguation_dict ()
1109 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
1110 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
1112 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
1116 print (' return HB_LANGUAGE_INVALID;')
1121 print ('#endif /* HB_OT_TAG_TABLE_HH */')
1123 print ('/* == End of generated table == */')