3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
6 It creates a ``const LangTag[]``, matching the tags from the OpenType
7 languages system tag list to the language subtags of the BCP 47 language
8 subtag registry, with some manual adjustments. The mappings are
9 supplemented with macrolanguages' sublanguages and retired codes'
10 replacements, according to BCP 47 and some manual additions where BCP 47
11 omits a retired code entirely.
13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16 multiple BCP 47 tags) are listed here, except when the alphabetically
17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
18 case, the fallback behavior will choose the right tag anyway.
20 usage: ./gen-tag-table.py languagetags language-subtag-registry
23 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
28 from html.parser import HTMLParser
31 sys.stdout.buffer.write (s.encode ('utf-8'))
37 if len (sys.argv) != 3:
40 from html import unescape
def html_unescape (parser, entity):
    """Decode the HTML character or entity reference in ``entity``.

    Args:
        parser: Unused. It is accepted only so that callers which pass
            the parser instance as the first argument keep working.
        entity (str): Text containing a reference such as ``'&amp;'``
            or ``'&#65;'``.

    Returns:
        ``entity`` with its references replaced by the characters they
        denote, as computed by ``html.unescape``.
    """
    return unescape (entity)
def expect (condition, message=None):
    """Raise ``AssertionError`` unless ``condition`` is truthy.

    Args:
        condition: The value to check.
        message (str): Optional text for the exception. When omitted,
            the exception is raised without arguments.

    Raises:
        AssertionError: ``condition`` is falsy.
    """
    # NOTE(review): the guard below is reconstructed; only the header and
    # the final ``raise`` were present, which would have raised on every
    # call even though callers pass a condition that is normally true.
    if not condition:
        if message is None:
            raise AssertionError
        raise AssertionError (message)
50 # from https://www-01.sil.org/iso639-3/iso-639-3.tab
239 class LanguageTag (object):
240 """A BCP 47 language tag.
243 subtags (List[str]): The list of subtags in this tag.
244 grandfathered (bool): Whether this tag is grandfathered. If
245 ``true``, the entire lowercased tag is the ``language``
246 and the other subtag fields are empty.
247 language (str): The language subtag.
248 script (str): The script subtag.
249 region (str): The region subtag.
250 variant (str): The variant subtag.
253 tag (str): A BCP 47 language tag.
256 def __init__ (self, tag):
258 self.subtags = tag.lower ().split ('-')
259 self.grandfathered = tag.lower () in bcp_47.grandfathered
260 if self.grandfathered:
261 self.language = tag.lower ()
266 self.language = self.subtags[0]
267 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
268 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
269 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
272 return '-'.join(self.subtags)
275 return 'LanguageTag(%r)' % str(self)
278 def _find_first (function, sequence):
280 return next (iter (filter (function, sequence)))
281 except StopIteration:
284 def is_complex (self):
285 """Return whether this tag is too complex to represent as a
286 ``LangTag`` in the generated code.
288 Complex tags need to be handled in
289 ``hb_ot_tags_from_complex_language``.
292 Whether this tag is complex.
294 return not (len (self.subtags) == 1
295 or self.grandfathered
296 and len (self.subtags[1]) != 3
297 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
299 def get_group (self):
300 """Return the group into which this tag should be categorized in
301 ``hb_ot_tags_from_complex_language``.
303 The group is the first letter of the tag, or ``'und'`` if this tag
304 should not be matched in a ``switch`` statement in the generated
311 if (self.language == 'und'
312 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
313 else self.language[0])
315 class OpenTypeRegistryParser (HTMLParser):
316 """A parser for the OpenType language system tag registry.
319 header (str): The "last updated" line of the registry.
320 names (Mapping[str, str]): A map of language system tags to the
321 names they are given in the registry.
322 ranks (DefaultDict[str, int]): A map of language system tags to
323 numbers. If a single BCP 47 tag corresponds to multiple
324 OpenType tags, the tags are ordered in increasing order by
325 rank. The rank is based on the number of BCP 47 tags
326 associated with a tag, though it may be manually modified.
327 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
328 OpenType language system tags to sets of BCP 47 tags.
329 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
330 inverted. Its values start as unsorted sets;
331 ``sort_languages`` converts them to sorted lists.
335 HTMLParser.__init__ (self)
338 self.ranks = collections.defaultdict (int)
339 self.to_bcp_47 = collections.defaultdict (set)
340 self.from_bcp_47 = collections.defaultdict (set)
341 # Whether the parser is in a <td> element
343 # The text of the <td> elements of the current <tr> element.
344 self._current_tr = []
346 def handle_starttag (self, tag, attrs):
348 for attr, value in attrs:
349 if attr == 'name' and value == 'updated_at':
350 self.header = self.get_starttag_text ()
354 self._current_tr.append ('')
356 self._current_tr = []
358 def handle_endtag (self, tag):
361 elif tag == 'tr' and self._current_tr:
362 expect (2 <= len (self._current_tr) <= 3)
363 name = self._current_tr[0].strip ()
364 tag = self._current_tr[1].strip ("\t\n\v\f\r '")
367 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
368 name += ' (deprecated)'
369 tag = tag.split (' ')[0]
371 self.names[tag] = re.sub (' languages$', '', name)
372 if not self._current_tr[2]:
374 iso_codes = self._current_tr[2].strip ()
375 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
376 rank += 2 * len (self.to_bcp_47[tag])
377 self.ranks[tag] = rank
379 def handle_data (self, data):
381 self._current_tr[-1] += data
383 def handle_charref (self, name):
384 self.handle_data (html_unescape (self, '&#%s;' % name))
386 def handle_entityref (self, name):
387 self.handle_data (html_unescape (self, '&%s;' % name))
389 def parse (self, filename):
390 """Parse the OpenType language system tag registry.
393 filename (str): The file name of the registry.
395 with open (filename, encoding='utf-8') as f:
396 self.feed (f.read ())
398 for tag, iso_codes in self.to_bcp_47.items ():
399 for iso_code in iso_codes:
400 self.from_bcp_47[iso_code].add (tag)
402 def add_language (self, bcp_47_tag, ot_tag):
403 """Add a language as if it were in the registry.
406 bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
407 a language subtag, and if the language subtag is a
408 macrolanguage, then new languages are added corresponding
409 to the macrolanguages' individual languages with the
410 remainder of the tag appended.
411 ot_tag (str): An OpenType language system tag.
414 self.to_bcp_47[ot_tag].add (bcp_47_tag)
415 self.from_bcp_47[bcp_47_tag].add (ot_tag)
416 if bcp_47_tag.lower () not in bcp_47.grandfathered:
418 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
419 if macrolanguage in bcp_47.macrolanguages:
421 for language in bcp_47.macrolanguages[macrolanguage]:
422 if language.lower () not in bcp_47.grandfathered:
423 s.add ('%s-%s' % (language, suffix))
424 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
429 def _remove_language (tag_1, dict_1, dict_2):
430 for tag_2 in dict_1.pop (tag_1):
431 dict_2[tag_2].remove (tag_1)
432 if not dict_2[tag_2]:
435 def remove_language_ot (self, ot_tag):
436 """Remove an OpenType tag from the registry.
439 ot_tag (str): An OpenType tag.
441 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
443 def remove_language_bcp_47 (self, bcp_47_tag):
444 """Remove a BCP 47 tag from the registry.
447 bcp_47_tag (str): A BCP 47 tag.
449 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
451 def inherit_from_macrolanguages (self):
452 """Copy mappings from macrolanguages to individual languages.
454 If a BCP 47 tag for an individual mapping has no OpenType
455 mapping but its macrolanguage does, the mapping is copied to
456 the individual language. For example, als (Tosk Albanian) has no
457 explicit mapping, so it inherits from sq (Albanian) the mapping
460 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
461 all of its individual languages do and they all map to the same
462 tags, the mapping is copied to the macrolanguage.
465 original_ot_from_bcp_47 = dict (self.from_bcp_47)
466 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
467 ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
468 if ot_macrolanguages:
469 for ot_macrolanguage in ot_macrolanguages:
470 for language in languages:
471 # Remove the following condition if e.g. nn should map to NYN,NOR
472 # instead of just NYN.
473 if language not in original_ot_from_bcp_47:
474 self.add_language (language, ot_macrolanguage)
475 self.ranks[ot_macrolanguage] += 1
477 for language in languages:
478 if language in original_ot_from_bcp_47:
479 if ot_macrolanguages:
480 ml = original_ot_from_bcp_47[language]
482 ot_macrolanguages &= ml
486 ot_macrolanguages |= original_ot_from_bcp_47[language]
488 ot_macrolanguages.clear ()
489 if not ot_macrolanguages:
491 for ot_macrolanguage in ot_macrolanguages:
492 self.add_language (macrolanguage, ot_macrolanguage)
494 def sort_languages (self):
495 """Sort the values of ``from_bcp_47`` in ascending rank order."""
496 for language, tags in self.from_bcp_47.items ():
497 self.from_bcp_47[language] = sorted (tags,
498 key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
500 ot = OpenTypeRegistryParser ()
502 class BCP47Parser (object):
503 """A parser for the BCP 47 subtag registry.
506 header (str): The "File-Date" line of the registry.
507 names (Mapping[str, str]): A map of subtags to the names they
508 are given in the registry. Each value is a
509 ``'\\n'``-separated list of names.
510 scopes (Mapping[str, str]): A map of language subtags to strings
511 suffixed to language names, including suffixes to explain
513 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
514 language subtags to the sets of language subtags which
515 inherit from them. See
516 ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
517 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
518 subtags to their prefixes.
519 grandfathered (AbstractSet[str]): The set of grandfathered tags,
520 normalized to lowercase.
527 self.macrolanguages = collections.defaultdict (set)
528 self.prefixes = collections.defaultdict (set)
529 self.grandfathered = set ()
531 def parse (self, filename):
532 """Parse the BCP 47 subtag registry.
535 filename (str): The file name of the registry.
537 with open (filename, encoding='utf-8') as f:
541 has_preferred_value = False
543 for line in itertools.chain (f, ['']):
544 line = line.rstrip ()
545 if line.startswith (' '):
546 line_buffer += line[1:]
548 line, line_buffer = line_buffer, line
549 if line.startswith ('Type: '):
550 subtag_type = line.split (' ')[1]
552 has_preferred_value = False
553 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
554 subtag = line.split (' ')[1]
555 if subtag_type == 'grandfathered':
556 self.grandfathered.add (subtag.lower ())
557 elif line.startswith ('Description: '):
558 description = line.split (' ', 1)[1].replace (' (individual language)', '')
559 description = re.sub (' (\((individual |macro)language\)|languages)$', '',
561 if subtag in self.names:
562 self.names[subtag] += '\n' + description
564 self.names[subtag] = description
565 elif subtag_type == 'language' or subtag_type == 'grandfathered':
566 if line.startswith ('Scope: '):
567 scope = line.split (' ')[1]
568 if scope == 'macrolanguage':
569 scope = ' [macrolanguage]'
570 elif scope == 'collection':
574 self.scopes[subtag] = scope
575 elif line.startswith ('Deprecated: '):
576 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
578 elif deprecated and line.startswith ('Comments: see '):
579 # If a subtag is split into multiple replacement subtags,
580 # it essentially represents a macrolanguage.
581 for language in line.replace (',', '').split (' ')[2:]:
582 self._add_macrolanguage (subtag, language)
583 elif line.startswith ('Preferred-Value: '):
584 # If a subtag is deprecated in favor of a single replacement subtag,
585 # it is either a dialect or synonym of the preferred subtag. Either
586 # way, it is close enough to the truth to consider the replacement
587 # the macrolanguage of the deprecated language.
588 has_preferred_value = True
589 macrolanguage = line.split (' ')[1]
590 self._add_macrolanguage (macrolanguage, subtag)
591 elif not has_preferred_value and line.startswith ('Macrolanguage: '):
592 self._add_macrolanguage (line.split (' ')[1], subtag)
593 elif subtag_type == 'variant':
594 if line.startswith ('Prefix: '):
595 self.prefixes[subtag].add (line.split (' ')[1])
596 elif line.startswith ('File-Date: '):
600 def _add_macrolanguage (self, macrolanguage, language):
602 if language not in ot.from_bcp_47:
603 for l in self.macrolanguages.get (language, set ()):
604 self._add_macrolanguage (macrolanguage, l)
605 if macrolanguage not in ot.from_bcp_47:
606 for ls in list (self.macrolanguages.values ()):
607 if macrolanguage in ls:
610 self.macrolanguages[macrolanguage].add (language)
612 def remove_extra_macrolanguages (self):
613 """Make every language have at most one macrolanguage."""
614 inverted = collections.defaultdict (list)
615 for macrolanguage, languages in self.macrolanguages.items ():
616 for language in languages:
617 inverted[language].append (macrolanguage)
618 for language, macrolanguages in inverted.items ():
619 if len (macrolanguages) > 1:
620 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
621 biggest_macrolanguage = macrolanguages.pop ()
622 for macrolanguage in macrolanguages:
623 self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
625 def get_name (self, lt):
626 """Return the names of the subtags in a language tag.
629 lt (LanguageTag): A BCP 47 language tag.
632 The name form of ``lt``.
634 name = self.names[lt.language].split ('\n')[0]
636 name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
638 name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
640 name += '; ' + self.names[lt.variant].split ('\n')[0]
643 bcp_47 = BCP47Parser ()
645 ot.parse (sys.argv[1])
646 bcp_47.parse (sys.argv[2])
648 ot.add_language ('ary', 'MOR')
650 ot.add_language ('ath', 'ATH')
652 ot.add_language ('bai', 'BML')
654 ot.ranks['BAL'] = ot.ranks['KAR'] + 1
656 ot.add_language ('ber', 'BBR')
658 ot.remove_language_ot ('PGR')
659 ot.add_language ('el-polyton', 'PGR')
661 bcp_47.macrolanguages['et'] = {'ekk'}
663 bcp_47.names['flm'] = 'Falam Chin'
664 bcp_47.scopes['flm'] = ' (retired code)'
665 bcp_47.macrolanguages['flm'] = {'cfm'}
667 ot.ranks['FNE'] = ot.ranks['TNE'] + 1
669 ot.add_language ('und-fonipa', 'IPPH')
671 ot.add_language ('und-fonnapa', 'APPH')
673 ot.remove_language_ot ('IRT')
674 ot.add_language ('ga-Latg', 'IRT')
676 ot.remove_language_ot ('KGE')
677 ot.add_language ('und-Geok', 'KGE')
679 ot.add_language ('guk', 'GUK')
680 ot.names['GUK'] = 'Gumuz (SIL fonts)'
681 ot.ranks['GUK'] = ot.ranks['GMZ'] + 1
683 bcp_47.macrolanguages['id'] = {'in'}
685 bcp_47.macrolanguages['ijo'] = {'ijc'}
687 ot.add_language ('kht', 'KHN')
688 ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
689 ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
690 ot.ranks['KHN'] = ot.ranks['KHT']
693 ot.ranks['LCR'] = ot.ranks['MCR'] + 1
695 ot.names['MAL'] = 'Malayalam Traditional'
698 bcp_47.names['mhv'] = 'Arakanese'
699 bcp_47.scopes['mhv'] = ' (retired code)'
701 ot.add_language ('no', 'NOR')
703 ot.add_language ('oc-provenc', 'PRO')
705 ot.add_language ('qu', 'QUZ')
706 ot.add_language ('qub', 'QWH')
707 ot.add_language ('qud', 'QVI')
708 ot.add_language ('qug', 'QVI')
709 ot.add_language ('qup', 'QVI')
710 ot.add_language ('qur', 'QWH')
711 ot.add_language ('qus', 'QUH')
712 ot.add_language ('quw', 'QVI')
713 ot.add_language ('qux', 'QWH')
714 ot.add_language ('qva', 'QWH')
715 ot.add_language ('qvh', 'QWH')
716 ot.add_language ('qvj', 'QVI')
717 ot.add_language ('qvl', 'QWH')
718 ot.add_language ('qvm', 'QWH')
719 ot.add_language ('qvn', 'QWH')
720 ot.add_language ('qvo', 'QVI')
721 ot.add_language ('qvp', 'QWH')
722 ot.add_language ('qvw', 'QWH')
723 ot.add_language ('qvz', 'QVI')
724 ot.add_language ('qwa', 'QWH')
725 ot.add_language ('qws', 'QWH')
726 ot.add_language ('qxa', 'QWH')
727 ot.add_language ('qxc', 'QWH')
728 ot.add_language ('qxh', 'QWH')
729 ot.add_language ('qxl', 'QVI')
730 ot.add_language ('qxn', 'QWH')
731 ot.add_language ('qxo', 'QWH')
732 ot.add_language ('qxr', 'QVI')
733 ot.add_language ('qxt', 'QWH')
734 ot.add_language ('qxw', 'QWH')
736 bcp_47.macrolanguages['ro'].remove ('mo')
737 bcp_47.macrolanguages['ro-MD'].add ('mo')
739 ot.add_language ('sgw', 'SGW')
740 ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
741 ot.ranks['SGW'] = ot.ranks['CHG'] + 1
743 ot.remove_language_ot ('SYRE')
744 ot.remove_language_ot ('SYRJ')
745 ot.remove_language_ot ('SYRN')
746 ot.add_language ('und-Syre', 'SYRE')
747 ot.add_language ('und-Syrj', 'SYRJ')
748 ot.add_language ('und-Syrn', 'SYRN')
750 bcp_47.names['xst'] = "Silt'e"
751 bcp_47.scopes['xst'] = ' (retired code)'
752 bcp_47.macrolanguages['xst'] = {'stv', 'wle'}
754 ot.add_language ('xwo', 'TOD')
756 ot.remove_language_ot ('ZHH')
757 ot.remove_language_ot ('ZHP')
758 ot.remove_language_ot ('ZHT')
759 bcp_47.macrolanguages['zh'].remove ('lzh')
760 bcp_47.macrolanguages['zh'].remove ('yue')
761 ot.add_language ('zh-Hant-MO', 'ZHH')
762 ot.add_language ('zh-Hant-HK', 'ZHH')
763 ot.add_language ('zh-Hans', 'ZHS')
764 ot.add_language ('zh-Hant', 'ZHT')
765 ot.add_language ('zh-HK', 'ZHH')
766 ot.add_language ('zh-MO', 'ZHH')
767 ot.add_language ('zh-TW', 'ZHT')
768 ot.add_language ('lzh', 'ZHT')
769 ot.add_language ('lzh-Hans', 'ZHS')
770 ot.add_language ('yue', 'ZHH')
771 ot.add_language ('yue-Hans', 'ZHS')
773 bcp_47.macrolanguages['zom'] = {'yos'}
775 def rank_delta (bcp_47, ot):
776 """Return a delta to apply to a BCP 47 tag's rank.
778 Most OpenType tags have a constant rank, but a few have ranks that
779 depend on the BCP 47 tag.
782 bcp_47 (str): A BCP 47 tag.
783 ot (str): An OpenType tag.
786 A number to add to ``ot``'s rank when sorting ``bcp_47``'s
787 OpenType equivalents.
789 if bcp_47 == 'ak' and ot == 'AKA':
791 if bcp_47 == 'tw' and ot == 'TWI':
827 ot.inherit_from_macrolanguages ()
828 bcp_47.remove_extra_macrolanguages ()
829 ot.inherit_from_macrolanguages ()
832 print ('/* == Start of generated table == */')
834 print (' * The following table is generated by running:')
836 print (' * %s languagetags language-subtag-registry' % sys.argv[0])
838 print (' * on files with these headers:')
840 print (' * %s' % ot.header.strip ())
841 print (' * %s' % bcp_47.header)
844 print ('#ifndef HB_OT_TAG_TABLE_HH')
845 print ('#define HB_OT_TAG_TABLE_HH')
847 print ('static const LangTag ot_languages[] = {')
def hb_tag (tag):
    """Convert a tag to ``HB_TAG`` form.

    Args:
        tag (str): An OpenType tag.

    Returns:
        A snippet of C++ representing ``tag``.
    """
    # '%-4s' left-justifies the tag, padding it with spaces to four
    # characters; the slice then guarantees exactly four arguments for
    # the format string even if a longer tag is passed.
    return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
def get_variant_set (name):
    """Return a set of variant language names from a name.

    The input is split at newlines, parentheses and commas. Each
    non-empty piece is then normalized: typographic apostrophes (U+2019)
    become ASCII apostrophes, accented letters lose their diacritics,
    any remaining non-ASCII characters are dropped, and surrounding
    whitespace is removed.

    Args:
        name (str): One or more language names, separated by newlines,
            parentheses or commas.

    Returns:
        A set of normalized language names, each an ASCII ``bytes``
        object.
    """
    # NFD splits an accented letter into base letter + combining mark, so
    # encoding with errors='ignore' discards only the mark.
    # The strip is required: splitting 'A, B' leaves ' B', which would
    # otherwise never compare equal to 'B' from another name.
    return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'"))
            .encode ('ASCII', 'ignore')
            .strip ()
            for n in re.split ('[\n(),]', name) if n)
def language_name_intersection (a, b):
    """Return the names in common between two language names.

    Both arguments are normalized with ``get_variant_set`` before being
    compared, so the comparison is done on that function's normalized
    forms rather than on the raw strings.

    Args:
        a (str): One or more language names in a form accepted by
            ``get_variant_set``.
        b (str): One or more language names in a form accepted by
            ``get_variant_set``.

    Returns:
        The normalized language names shared by ``a`` and ``b``.
    """
    return get_variant_set (a).intersection (get_variant_set (b))
def get_matching_language_name (intersection, candidates):
    """Return the first candidate that shares a name with ``intersection``.

    Args:
        intersection (AbstractSet): A set of normalized names, in the
            form returned by ``get_variant_set``.
        candidates (Iterable[str]): Language names to try, in order.

    Returns:
        The first element of ``candidates`` whose ``get_variant_set``
        has at least one member in ``intersection``.

    Raises:
        StopIteration: No candidate shares a name with ``intersection``.
    """
    return next (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))
def same_tag (bcp_47_tag, ot_tags):
    """Return whether a BCP 47 tag and its only OpenType tag are spelled alike.

    Args:
        bcp_47_tag (str): A BCP 47 tag.
        ot_tags (Sequence[str]): The OpenType tags ``bcp_47_tag`` maps to.

    Returns:
        ``True`` if ``bcp_47_tag`` is three characters long, ``ot_tags``
        has exactly one element, and that element lowercased equals
        ``bcp_47_tag``; ``False`` otherwise.
    """
    # The length checks come first so that an empty ``ot_tags`` is never
    # indexed.
    return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
895 for language, tags in sorted (ot.from_bcp_47.items ()):
896 if language == '' or '-' in language:
898 commented_out = same_tag (language, tags)
899 for i, tag in enumerate (tags, start=1):
900 print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
903 print ('\t/* ', end='')
904 bcp_47_name = bcp_47.names.get (language, '')
905 bcp_47_name_candidates = bcp_47_name.split ('\n')
906 intersection = language_name_intersection (bcp_47_name, ot.names[tag])
907 scope = bcp_47.scopes.get (language, '')
909 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
911 name = get_matching_language_name (intersection, bcp_47_name_candidates)
912 bcp_47.names[language] = name
913 write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
920 print (' * hb_ot_tags_from_complex_language:')
921 print (' * @lang_str: a BCP 47 language tag to convert.')
922 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
923 print (' * conversion.')
924 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
925 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
926 print (' * @tags: array of size at least @language_count to store the language tag')
929 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
931 print (' * Return value: Whether any language systems were retrieved.')
933 print ('static bool')
934 print ('hb_ot_tags_from_complex_language (const char *lang_str,')
935 print ('\t\t\t\t const char *limit,')
936 print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
937 print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
940 def print_subtag_matches (subtag, new_line):
944 print ('\t&& ', end='')
945 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
947 complex_tags = collections.defaultdict (list)
948 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
949 (LanguageTag (language), tags)
950 for language, tags in sorted (ot.from_bcp_47.items (),
951 key=lambda i: (-len (i[0]), i[0]))
952 ] if lt_tags[0].is_complex ()),
953 key=lambda lt_tags: lt_tags[0].get_group ()):
954 complex_tags[initial] += group
956 for initial, items in sorted (complex_tags.items ()):
959 for lt, tags in items:
960 if lt.variant in bcp_47.prefixes:
961 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
962 '%s is not a valid prefix of %s' % (lt.language, lt.variant))
963 print (' if (', end='')
964 print_subtag_matches (lt.script, False)
965 print_subtag_matches (lt.region, False)
966 print_subtag_matches (lt.variant, False)
969 write (' /* %s */' % bcp_47.get_name (lt))
972 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
974 print (' *count = 1;')
976 print (' hb_tag_t possible_tags[] = {')
978 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
981 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
982 print (' tags[i] = possible_tags[i];')
983 print (' *count = i;')
984 print (' return true;')
987 print (' switch (lang_str[0])')
989 for initial, items in sorted (complex_tags.items ()):
992 print (" case '%s':" % initial)
993 for lt, tags in items:
994 print (' if (', end='')
996 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
998 string_literal = lt.language[1:] + '-'
1000 string_literal += lt.script
1003 string_literal += '-' + lt.region
1005 if string_literal[-1] == '-':
1006 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
1008 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
1009 print_subtag_matches (lt.script, True)
1010 print_subtag_matches (lt.region, True)
1011 print_subtag_matches (lt.variant, True)
1014 write (' /* %s */' % bcp_47.get_name (lt))
1017 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
1019 print (' *count = 1;')
1021 print (' unsigned int i;')
1022 print (' hb_tag_t possible_tags[] = {')
1024 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
1027 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
1028 print ('\ttags[i] = possible_tags[i];')
1029 print (' *count = i;')
1030 print (' return true;')
1035 print (' return false;')
1039 print (' * hb_ot_ambiguous_tag_to_language')
1040 print (' * @tag: A language tag.')
1042 print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
1043 print (' * many language tags) and the best tag is not the alphabetically first, or if')
1044 print (' * the best tag consists of multiple subtags, or if the best tag does not appear')
1045 print (' * in #ot_languages.')
1047 print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
1048 print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
1050 print ('static hb_language_t')
1051 print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
1053 print (' switch (tag)')
1056 def verify_disambiguation_dict ():
1057 """Verify and normalize ``disambiguation``.
1059 ``disambiguation`` is a map of ambiguous OpenType language system
1060 tags to the particular BCP 47 tags they correspond to. This function
1061 checks that all its keys really are ambiguous and that each key's
1062 value is valid for that key. It checks that no ambiguous tag is
1063 missing, except when it can figure out which BCP 47 tag is the best
1066 It modifies ``disambiguation`` to remove keys whose values are the
1067 same as those that the fallback would return anyway, and to add
1068 ambiguous keys whose disambiguations it determined automatically.
1071 AssertionError: Verification failed.
1074 global disambiguation
1076 for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
1077 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
1078 if len (primary_tags) == 1:
1079 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
1080 if '-' in primary_tags[0]:
1081 disambiguation[ot_tag] = primary_tags[0]
1082 elif len (primary_tags) == 0:
1083 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
1085 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
1086 if len (macrolanguages) != 1:
1087 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
1088 if len (macrolanguages) != 1:
1089 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
1090 if len (macrolanguages) != 1:
1091 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
1092 expect (disambiguation[ot_tag] in bcp_47_tags,
1093 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
1094 elif ot_tag not in disambiguation:
1095 disambiguation[ot_tag] = macrolanguages[0]
1096 different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
1097 if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
1098 del disambiguation[ot_tag]
1099 for ot_tag in disambiguation.keys ():
1100 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1102 verify_disambiguation_dict ()
1103 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
1104 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
1106 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
1110 print (' return HB_LANGUAGE_INVALID;')
1115 print ('#endif /* HB_OT_TAG_TABLE_HH */')
1117 print ('/* == End of generated table == */')