3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
6 It creates a ``const LangTag[]``, matching the tags from the OpenType
7 languages system tag list to the language subtags of the BCP 47 language
8 subtag registry, with some manual adjustments. The mappings are
9 supplemented with macrolanguages' sublanguages and retired codes'
10 replacements, according to BCP 47 and some manual additions where BCP 47
11 omits a retired code entirely.
13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16 multiple BCP 47 tags) are listed here, except when the alphabetically
17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
18 case, the fallback behavior will choose the right tag anyway.
21 from __future__ import absolute_import, division, print_function, unicode_literals
25 from HTMLParser import HTMLParser
27 print (s.encode ('utf-8'), end='')
29 from html.parser import HTMLParser
32 sys.stdout.buffer.write (s.encode ('utf-8'))
39 if len (sys.argv) != 3:
40 print ('usage: ./gen-tag-table.py languagetags language-subtag-registry', file=sys.stderr)
44 from html import unescape
def html_unescape (parser, entity):
    """Decode a single HTML character or entity reference.

    Args:
        parser: Not used by this definition; the other definition of
            ``html_unescape`` in this file needs it, so it is accepted
            here to keep the call signature the same.
        entity (str): The reference to decode, e.g. ``'&amp;'``.

    Returns:
        The result of ``html.unescape`` applied to ``entity``.
    """
    decoded = unescape (entity)
    return decoded
def html_unescape (parser, entity):
    """Decode a single HTML reference with the parser's own method.

    Args:
        parser: An object providing an ``unescape`` method.
        entity (str): The reference to decode, e.g. ``'&amp;'``.

    Returns:
        Whatever ``parser.unescape`` returns for ``entity``.
    """
    decode = parser.unescape
    return decode (entity)
51 def expect (condition, message=None):
55 raise AssertionError (message)
57 # from http://www-01.sil.org/iso639-3/iso-639-3.tab
246 class LanguageTag (object):
247 """A BCP 47 language tag.
250 subtags (List[str]): The list of subtags in this tag.
251 grandfathered (bool): Whether this tag is grandfathered. If
252 ``True``, the entire lowercased tag is the ``language``
253 and the other subtag fields are empty.
254 language (str): The language subtag.
255 script (str): The script subtag.
256 region (str): The region subtag.
257 variant (str): The variant subtag.
260 tag (str): A BCP 47 language tag.
263 def __init__ (self, tag):
265 self.subtags = tag.lower ().split ('-')
266 self.grandfathered = tag.lower () in bcp_47.grandfathered
267 if self.grandfathered:
268 self.language = tag.lower ()
273 self.language = self.subtags[0]
274 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
275 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
276 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
279 return '-'.join(self.subtags)
282 return 'LanguageTag(%r)' % str(self)
285 def _find_first (function, sequence):
287 return next (iter (filter (function, sequence)))
288 except StopIteration:
291 def is_complex (self):
292 """Return whether this tag is too complex to represent as a
293 ``LangTag`` in the generated code.
295 Complex tags need to be handled in
296 ``hb_ot_tags_from_complex_language``.
299 Whether this tag is complex.
301 return not (len (self.subtags) == 1
302 or self.grandfathered
303 and len (self.subtags[1]) != 3
304 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
306 def get_group (self):
307 """Return the group into which this tag should be categorized in
308 ``hb_ot_tags_from_complex_language``.
310 The group is the first letter of the tag, or ``'und'`` if this tag
311 should not be matched in a ``switch`` statement in the generated
318 if (self.language == 'und'
319 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
320 else self.language[0])
322 class OpenTypeRegistryParser (HTMLParser):
323 """A parser for the OpenType language system tag registry.
326 header (str): The "last updated" line of the registry.
327 names (Mapping[str, str]): A map of language system tags to the
328 names they are given in the registry.
329 ranks (DefaultDict[str, int]): A map of language system tags to
330 numbers. If a single BCP 47 tag corresponds to multiple
331 OpenType tags, the tags are ordered in increasing order by
332 rank. The rank is based on the number of BCP 47 tags
333 associated with a tag, though it may be manually modified.
334 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
335 OpenType language system tags to sets of BCP 47 tags.
336 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
337 inverted. Its values start as unsorted sets;
338 ``sort_languages`` converts them to sorted lists.
342 HTMLParser.__init__ (self)
345 self.ranks = collections.defaultdict (int)
346 self.to_bcp_47 = collections.defaultdict (set)
347 self.from_bcp_47 = collections.defaultdict (set)
348 # Whether the parser is in a <td> element
350 # The text of the <td> elements of the current <tr> element.
351 self._current_tr = []
353 def handle_starttag (self, tag, attrs):
355 for attr, value in attrs:
356 if attr == 'name' and value == 'updated_at':
357 self.header = self.get_starttag_text ()
361 self._current_tr.append ('')
363 self._current_tr = []
365 def handle_endtag (self, tag):
368 elif tag == 'tr' and self._current_tr:
369 expect (2 <= len (self._current_tr) <= 3)
370 name = self._current_tr[0].strip ()
371 tag = self._current_tr[1].strip ("\t\n\v\f\r '")
374 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
375 name += ' (deprecated)'
376 tag = tag.split (' ')[0]
378 self.names[tag] = re.sub (' languages$', '', name)
379 if not self._current_tr[2]:
381 iso_codes = self._current_tr[2].strip ()
382 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
383 rank += 2 * len (self.to_bcp_47[tag])
384 self.ranks[tag] = rank
386 def handle_data (self, data):
388 self._current_tr[-1] += data
def handle_charref (self, name):
    """Treat a numeric character reference as ordinary cell text.

    The reference is rebuilt as ``&#<name>;``, decoded with
    ``html_unescape`` and passed on to ``handle_data``.

    Args:
        name (str): The part of the reference between ``&#`` and ``;``.
    """
    reference = '&#%s;' % name
    text = html_unescape (self, reference)
    self.handle_data (text)
def handle_entityref (self, name):
    """Treat a named entity reference as ordinary cell text.

    The reference is rebuilt as ``&<name>;``, decoded with
    ``html_unescape`` and passed on to ``handle_data``.

    Args:
        name (str): The part of the reference between ``&`` and ``;``.
    """
    reference = '&%s;' % name
    text = html_unescape (self, reference)
    self.handle_data (text)
396 def parse (self, filename):
397 """Parse the OpenType language system tag registry.
400 filename (str): The file name of the registry.
402 with io.open (filename, encoding='utf-8') as f:
403 self.feed (f.read ())
405 for tag, iso_codes in self.to_bcp_47.items ():
406 for iso_code in iso_codes:
407 self.from_bcp_47[iso_code].add (tag)
409 def add_language (self, bcp_47_tag, ot_tag):
410 """Add a language as if it were in the registry.
413 bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
414 a language subtag, and if the language subtag is a
415 macrolanguage, then new languages are added corresponding
416 to the macrolanguages' individual languages with the
417 remainder of the tag appended.
418 ot_tag (str): An OpenType language system tag.
421 self.to_bcp_47[ot_tag].add (bcp_47_tag)
422 self.from_bcp_47[bcp_47_tag].add (ot_tag)
423 if bcp_47_tag.lower () not in bcp_47.grandfathered:
425 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
426 if macrolanguage in bcp_47.macrolanguages:
428 for language in bcp_47.macrolanguages[macrolanguage]:
429 if language.lower () not in bcp_47.grandfathered:
430 s.add ('%s-%s' % (language, suffix))
431 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
436 def _remove_language (tag_1, dict_1, dict_2):
437 for tag_2 in dict_1.pop (tag_1):
438 dict_2[tag_2].remove (tag_1)
439 if not dict_2[tag_2]:
442 def remove_language_ot (self, ot_tag):
443 """Remove an OpenType tag from the registry.
446 ot_tag (str): An OpenType tag.
448 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
450 def remove_language_bcp_47 (self, bcp_47_tag):
451 """Remove a BCP 47 tag from the registry.
454 bcp_47_tag (str): A BCP 47 tag.
456 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
458 def inherit_from_macrolanguages (self):
459 """Copy mappings from macrolanguages to individual languages.
461 If a BCP 47 tag for an individual mapping has no OpenType
462 mapping but its macrolanguage does, the mapping is copied to
463 the individual language. For example, als (Tosk Albanian) has no
464 explicit mapping, so it inherits from sq (Albanian) the mapping
467 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
468 all of its individual languages do and they all map to the same
469 tags, the mapping is copied to the macrolanguage.
472 original_ot_from_bcp_47 = dict (self.from_bcp_47)
473 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
474 ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
475 if ot_macrolanguages:
476 for ot_macrolanguage in ot_macrolanguages:
477 for language in languages:
478 # Remove the following condition if e.g. nn should map to NYN,NOR
479 # instead of just NYN.
480 if language not in original_ot_from_bcp_47:
481 self.add_language (language, ot_macrolanguage)
482 self.ranks[ot_macrolanguage] += 1
484 for language in languages:
485 if language in original_ot_from_bcp_47:
486 if ot_macrolanguages:
487 ml = original_ot_from_bcp_47[language]
489 ot_macrolanguages &= ml
493 ot_macrolanguages |= original_ot_from_bcp_47[language]
495 ot_macrolanguages.clear ()
496 if not ot_macrolanguages:
498 for ot_macrolanguage in ot_macrolanguages:
499 self.add_language (macrolanguage, ot_macrolanguage)
def sort_languages (self):
    """Sort the values of ``from_bcp_47`` in ascending rank order.

    Every value is replaced by a list. Tags are ordered by their entry
    in ``ranks`` plus ``rank_delta`` for the language being sorted;
    equal ranks are ordered by the tag itself.
    """
    for language, tags in self.from_bcp_47.items ():
        # Bind the current language now so the key does not depend on
        # the loop variable's later values.
        def sort_key (tag, language=language):
            return (self.ranks[tag] + rank_delta (language, tag), tag)

        ordered = list (tags)
        ordered.sort (key=sort_key)
        # Only existing keys are reassigned, so the dict does not change
        # size while it is being iterated.
        self.from_bcp_47[language] = ordered
507 ot = OpenTypeRegistryParser ()
509 class BCP47Parser (object):
510 """A parser for the BCP 47 subtag registry.
513 header (str): The "File-Date" line of the registry.
514 names (Mapping[str, str]): A map of subtags to the names they
515 are given in the registry. Each value is a
516 ``'\\n'``-separated list of names.
517 scopes (Mapping[str, str]): A map of language subtags to strings
518 suffixed to language names, including suffixes to explain
520 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
521 language subtags to the sets of language subtags which
522 inherit from them. See
523 ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
524 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
525 subtags to their prefixes.
526 grandfathered (AbstractSet[str]): The set of grandfathered tags,
527 normalized to lowercase.
534 self.macrolanguages = collections.defaultdict (set)
535 self.prefixes = collections.defaultdict (set)
536 self.grandfathered = set ()
538 def parse (self, filename):
539 """Parse the BCP 47 subtag registry.
542 filename (str): The file name of the registry.
544 with io.open (filename, encoding='utf-8') as f:
548 has_preferred_value = False
550 for line in itertools.chain (f, ['']):
551 line = line.rstrip ()
552 if line.startswith (' '):
553 line_buffer += line[1:]
555 line, line_buffer = line_buffer, line
556 if line.startswith ('Type: '):
557 subtag_type = line.split (' ')[1]
559 has_preferred_value = False
560 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
561 subtag = line.split (' ')[1]
562 if subtag_type == 'grandfathered':
563 self.grandfathered.add (subtag.lower ())
564 elif line.startswith ('Description: '):
565 description = line.split (' ', 1)[1].replace (' (individual language)', '')
566 description = re.sub (' (\((individual |macro)language\)|languages)$', '',
568 if subtag in self.names:
569 self.names[subtag] += '\n' + description
571 self.names[subtag] = description
572 elif subtag_type == 'language' or subtag_type == 'grandfathered':
573 if line.startswith ('Scope: '):
574 scope = line.split (' ')[1]
575 if scope == 'macrolanguage':
576 scope = ' [macrolanguage]'
577 elif scope == 'collection':
581 self.scopes[subtag] = scope
582 elif line.startswith ('Deprecated: '):
583 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
585 elif deprecated and line.startswith ('Comments: see '):
586 # If a subtag is split into multiple replacement subtags,
587 # it essentially represents a macrolanguage.
588 for language in line.replace (',', '').split (' ')[2:]:
589 self._add_macrolanguage (subtag, language)
590 elif line.startswith ('Preferred-Value: '):
591 # If a subtag is deprecated in favor of a single replacement subtag,
592 # it is either a dialect or synonym of the preferred subtag. Either
593 # way, it is close enough to the truth to consider the replacement
594 # the macrolanguage of the deprecated language.
595 has_preferred_value = True
596 macrolanguage = line.split (' ')[1]
597 self._add_macrolanguage (macrolanguage, subtag)
598 elif not has_preferred_value and line.startswith ('Macrolanguage: '):
599 self._add_macrolanguage (line.split (' ')[1], subtag)
600 elif subtag_type == 'variant':
601 if line.startswith ('Prefix: '):
602 self.prefixes[subtag].add (line.split (' ')[1])
603 elif line.startswith ('File-Date: '):
607 def _add_macrolanguage (self, macrolanguage, language):
609 if language not in ot.from_bcp_47:
610 for l in self.macrolanguages.get (language, set ()):
611 self._add_macrolanguage (macrolanguage, l)
612 if macrolanguage not in ot.from_bcp_47:
613 for ls in list (self.macrolanguages.values ()):
614 if macrolanguage in ls:
617 self.macrolanguages[macrolanguage].add (language)
619 def remove_extra_macrolanguages (self):
620 """Make every language have at most one macrolanguage."""
621 inverted = collections.defaultdict (list)
622 for macrolanguage, languages in self.macrolanguages.items ():
623 for language in languages:
624 inverted[language].append (macrolanguage)
625 for language, macrolanguages in inverted.items ():
626 if len (macrolanguages) > 1:
627 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
628 biggest_macrolanguage = macrolanguages.pop ()
629 for macrolanguage in macrolanguages:
630 self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
632 def get_name (self, lt):
633 """Return the names of the subtags in a language tag.
636 lt (LanguageTag): A BCP 47 language tag.
639 The name form of ``lt``.
641 name = self.names[lt.language].split ('\n')[0]
643 name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
645 name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
647 name += '; ' + self.names[lt.variant].split ('\n')[0]
650 bcp_47 = BCP47Parser ()
652 ot.parse (sys.argv[1])
653 bcp_47.parse (sys.argv[2])
655 ot.add_language ('ary', 'MOR')
657 ot.add_language ('ath', 'ATH')
659 ot.add_language ('bai', 'BML')
661 ot.ranks['BAL'] = ot.ranks['KAR'] + 1
663 ot.add_language ('ber', 'BBR')
665 ot.remove_language_ot ('PGR')
666 ot.add_language ('el-polyton', 'PGR')
668 bcp_47.macrolanguages['et'] = {'ekk'}
670 bcp_47.names['flm'] = 'Falam Chin'
671 bcp_47.scopes['flm'] = ' (retired code)'
672 bcp_47.macrolanguages['flm'] = {'cfm'}
674 ot.ranks['FNE'] = ot.ranks['TNE'] + 1
676 ot.add_language ('und-fonipa', 'IPPH')
678 ot.add_language ('und-fonnapa', 'APPH')
680 ot.remove_language_ot ('IRT')
681 ot.add_language ('ga-Latg', 'IRT')
683 ot.remove_language_ot ('KGE')
684 ot.add_language ('und-Geok', 'KGE')
686 ot.add_language ('guk', 'GUK')
687 ot.names['GUK'] = 'Gumuz (SIL fonts)'
688 ot.ranks['GUK'] = ot.ranks['GMZ'] + 1
690 bcp_47.macrolanguages['id'] = {'in'}
692 bcp_47.macrolanguages['ijo'] = {'ijc'}
694 ot.add_language ('kht', 'KHN')
695 ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
696 ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
697 ot.ranks['KHN'] = ot.ranks['KHT']
700 ot.ranks['LCR'] = ot.ranks['MCR'] + 1
702 ot.names['MAL'] = 'Malayalam Traditional'
705 bcp_47.names['mhv'] = 'Arakanese'
706 bcp_47.scopes['mhv'] = ' (retired code)'
708 ot.add_language ('no', 'NOR')
710 ot.add_language ('oc-provenc', 'PRO')
712 ot.add_language ('qu', 'QUZ')
713 ot.add_language ('qub', 'QWH')
714 ot.add_language ('qud', 'QVI')
715 ot.add_language ('qug', 'QVI')
716 ot.add_language ('qup', 'QVI')
717 ot.add_language ('qur', 'QWH')
718 ot.add_language ('qus', 'QUH')
719 ot.add_language ('quw', 'QVI')
720 ot.add_language ('qux', 'QWH')
721 ot.add_language ('qva', 'QWH')
722 ot.add_language ('qvh', 'QWH')
723 ot.add_language ('qvj', 'QVI')
724 ot.add_language ('qvl', 'QWH')
725 ot.add_language ('qvm', 'QWH')
726 ot.add_language ('qvn', 'QWH')
727 ot.add_language ('qvo', 'QVI')
728 ot.add_language ('qvp', 'QWH')
729 ot.add_language ('qvw', 'QWH')
730 ot.add_language ('qvz', 'QVI')
731 ot.add_language ('qwa', 'QWH')
732 ot.add_language ('qws', 'QWH')
733 ot.add_language ('qxa', 'QWH')
734 ot.add_language ('qxc', 'QWH')
735 ot.add_language ('qxh', 'QWH')
736 ot.add_language ('qxl', 'QVI')
737 ot.add_language ('qxn', 'QWH')
738 ot.add_language ('qxo', 'QWH')
739 ot.add_language ('qxr', 'QVI')
740 ot.add_language ('qxt', 'QWH')
741 ot.add_language ('qxw', 'QWH')
743 bcp_47.macrolanguages['ro'].remove ('mo')
744 bcp_47.macrolanguages['ro-MD'].add ('mo')
746 ot.add_language ('sgw', 'SGW')
747 ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
748 ot.ranks['SGW'] = ot.ranks['CHG'] + 1
750 ot.remove_language_ot ('SYRE')
751 ot.remove_language_ot ('SYRJ')
752 ot.remove_language_ot ('SYRN')
753 ot.add_language ('und-Syre', 'SYRE')
754 ot.add_language ('und-Syrj', 'SYRJ')
755 ot.add_language ('und-Syrn', 'SYRN')
757 bcp_47.names['xst'] = u"Silt'e"
758 bcp_47.scopes['xst'] = ' (retired code)'
759 bcp_47.macrolanguages['xst'] = {'stv', 'wle'}
761 ot.add_language ('xwo', 'TOD')
763 ot.remove_language_ot ('ZHH')
764 ot.remove_language_ot ('ZHP')
765 ot.remove_language_ot ('ZHT')
766 bcp_47.macrolanguages['zh'].remove ('lzh')
767 bcp_47.macrolanguages['zh'].remove ('yue')
768 ot.add_language ('zh-Hant-MO', 'ZHH')
769 ot.add_language ('zh-Hant-HK', 'ZHH')
770 ot.add_language ('zh-Hans', 'ZHS')
771 ot.add_language ('zh-Hant', 'ZHT')
772 ot.add_language ('zh-HK', 'ZHH')
773 ot.add_language ('zh-MO', 'ZHH')
774 ot.add_language ('zh-TW', 'ZHT')
775 ot.add_language ('lzh', 'ZHT')
776 ot.add_language ('lzh-Hans', 'ZHS')
777 ot.add_language ('yue', 'ZHH')
778 ot.add_language ('yue-Hans', 'ZHS')
780 bcp_47.macrolanguages['zom'] = {'yos'}
782 def rank_delta (bcp_47, ot):
783 """Return a delta to apply to a BCP 47 tag's rank.
785 Most OpenType tags have a constant rank, but a few have ranks that
786 depend on the BCP 47 tag.
789 bcp_47 (str): A BCP 47 tag.
790 ot (str): An OpenType tag.
793 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
794 OpenType equivalents.
796 if bcp_47 == 'ak' and ot == 'AKA':
798 if bcp_47 == 'tw' and ot == 'TWI':
833 ot.inherit_from_macrolanguages ()
834 bcp_47.remove_extra_macrolanguages ()
835 ot.inherit_from_macrolanguages ()
838 print ('/* == Start of generated table == */')
840 print (' * The following table is generated by running:')
842 print (' * %s languagetags language-subtag-registry' % sys.argv[0])
844 print (' * on files with these headers:')
846 print (' * %s' % ot.header.strip ())
847 print (' * %s' % bcp_47.header)
850 print ('#ifndef HB_OT_TAG_TABLE_HH')
851 print ('#define HB_OT_TAG_TABLE_HH')
853 print ('static const LangTag ot_languages[] = {')
856 """Convert a tag to ``HB_TAG`` form.
859 tag (str): An OpenType tag.
862 A snippet of C++ representing ``tag``.
864 return u"HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
866 def get_variant_set (name):
867 """Return a set of variant language names from a name.
870 name (str): A list of language names from the BCP 47 registry,
874 A set of normalized language names.
876 return set (unicodedata.normalize ('NFD', n.replace ('\u2019', u"'"))
877 .encode ('ASCII', 'ignore')
879 for n in re.split ('[\n(),]', name) if n)
def language_name_intersection (a, b):
    """Return the names in common between two language names.

    Both arguments are normalized with ``get_variant_set`` before they
    are compared.

    Args:
        a (str): A list of language names from the BCP 47 registry.
        b (str): A list of language names from the BCP 47 registry.

    Returns:
        The normalized language names shared by ``a`` and ``b``.
    """
    names_a = get_variant_set (a)
    names_b = get_variant_set (b)
    return names_a & names_b
def get_matching_language_name (intersection, candidates):
    """Return the first candidate that shares a name with ``intersection``.

    Args:
        intersection: A set of normalized names; it must provide
            ``isdisjoint``.
        candidates: The names to try, in order. Each is normalized
            with ``get_variant_set`` before the comparison.

    Returns:
        The first element of ``candidates`` whose variant set is not
        disjoint from ``intersection``.

    Raises:
        StopIteration: No candidate matches.
    """
    def shares_a_name (candidate):
        return not intersection.isdisjoint (get_variant_set (candidate))

    return next (iter (filter (shares_a_name, candidates)))
899 for language, tags in sorted (ot.from_bcp_47.items ()):
900 if language == '' or '-' in language:
902 print (' {\"%s\",\t{' % language, end='')
903 maximum_tags = max (maximum_tags, len (tags))
904 tag_count = len (tags)
905 for i, tag in enumerate (tags, start=1):
907 print ('\t\t ', end='')
908 print (hb_tag (tag), end='')
911 print (',\t/* ', end='')
912 bcp_47_name = bcp_47.names.get (language, '')
913 bcp_47_name_candidates = bcp_47_name.split ('\n')
914 intersection = language_name_intersection (bcp_47_name, ot.names[tag])
915 scope = bcp_47.scopes.get (language, '')
917 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
919 name = get_matching_language_name (intersection, bcp_47_name_candidates)
920 bcp_47.names[language] = name
921 write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
926 print ('static_assert (HB_OT_MAX_TAGS_PER_LANGUAGE == %iu, "");' % maximum_tags)
930 print (' * hb_ot_tags_from_complex_language:')
931 print (' * @lang_str: a BCP 47 language tag to convert.')
932 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
933 print (' * conversion.')
934 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
935 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
936 print (' * @tags: array of size at least @language_count to store the language tag')
939 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
941 print (' * Return value: Whether any language systems were retrieved.')
943 print ('static bool')
944 print ('hb_ot_tags_from_complex_language (const char *lang_str,')
945 print ('\t\t\t\t const char *limit,')
946 print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
947 print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
950 def print_subtag_matches (subtag, new_line):
954 print ('\t&& ', end='')
955 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
957 complex_tags = collections.defaultdict (list)
958 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
959 (LanguageTag (language), tags)
960 for language, tags in sorted (ot.from_bcp_47.items (),
961 key=lambda i: (-len (i[0]), i[0]))
962 ] if lt_tags[0].is_complex ()),
963 key=lambda lt_tags: lt_tags[0].get_group ()):
964 complex_tags[initial] += group
966 for initial, items in sorted (complex_tags.items ()):
969 for lt, tags in items:
970 if lt.variant in bcp_47.prefixes:
971 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
972 '%s is not a valid prefix of %s' % (lt.language, lt.variant))
973 print (' if (', end='')
974 print_subtag_matches (lt.script, False)
975 print_subtag_matches (lt.region, False)
976 print_subtag_matches (lt.variant, False)
979 write (' /* %s */' % bcp_47.get_name (lt))
982 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
984 print (' *count = 1;')
986 print (' hb_tag_t possible_tags[] = {')
988 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
991 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
992 print (' tags[i] = possible_tags[i];')
993 print (' *count = i;')
994 print (' return true;')
997 print (' switch (lang_str[0])')
999 for initial, items in sorted (complex_tags.items ()):
1000 if initial == 'und':
1002 print (" case '%s':" % initial)
1003 for lt, tags in items:
1004 print (' if (', end='')
1005 if lt.grandfathered:
1006 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
1008 string_literal = lt.language[1:] + '-'
1010 string_literal += lt.script
1013 string_literal += '-' + lt.region
1015 if string_literal[-1] == '-':
1016 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
1018 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
1019 print_subtag_matches (lt.script, True)
1020 print_subtag_matches (lt.region, True)
1021 print_subtag_matches (lt.variant, True)
1024 write (' /* %s */' % bcp_47.get_name (lt))
1027 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1029 print (' *count = 1;')
1031 print (' unsigned int i;')
1032 print (' hb_tag_t possible_tags[] = {')
1034 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
1037 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
1038 print ('\ttags[i] = possible_tags[i];')
1039 print (' *count = i;')
1040 print (' return true;')
1045 print (' return false;')
1049 print (' * hb_ot_ambiguous_tag_to_language')
1050 print (' * @tag: A language tag.')
1052 print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
1053 print (' * many language tags) and the best tag is not the alphabetically first, or if')
1054 print (' * the best tag consists of multiple subtags.')
1056 print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
1057 print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
1059 print ('static hb_language_t')
1060 print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
1062 print (' switch (tag)')
1065 def verify_disambiguation_dict ():
1066 """Verify and normalize ``disambiguation``.
1068 ``disambiguation`` is a map of ambiguous OpenType language system
1069 tags to the particular BCP 47 tags they correspond to. This function
1070 checks that all its keys really are ambiguous and that each key's
1071 value is valid for that key. It checks that no ambiguous tag is
1072 missing, except when it can figure out which BCP 47 tag is the best
1075 It modifies ``disambiguation`` to remove keys whose values are the
1076 same as those that the fallback would return anyway, and to add
1077 ambiguous keys whose disambiguations it determined automatically.
1080 AssertionError: Verification failed.
1083 global disambiguation
1085 for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
1086 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
1087 if len (primary_tags) == 1:
1088 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
1089 if '-' in primary_tags[0]:
1090 disambiguation[ot_tag] = primary_tags[0]
1091 elif len (primary_tags) == 0:
1092 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
1094 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
1095 if len (macrolanguages) != 1:
1096 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
1097 if len (macrolanguages) != 1:
1098 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
1099 if len (macrolanguages) != 1:
1100 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
1101 expect (disambiguation[ot_tag] in bcp_47_tags,
1102 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
1103 elif ot_tag not in disambiguation:
1104 disambiguation[ot_tag] = macrolanguages[0]
1105 if disambiguation[ot_tag] == sorted (primary_tags)[0] and '-' not in disambiguation[ot_tag]:
1106 del disambiguation[ot_tag]
1107 for ot_tag in disambiguation.keys ():
1108 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1110 verify_disambiguation_dict ()
1111 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
1112 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
1114 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
1118 print (' return HB_LANGUAGE_INVALID;')
1123 print ('#endif /* HB_OT_TAG_TABLE_HH */')
1125 print ('/* == End of generated table == */')