Imported Upstream version 3.4.0
[platform/upstream/harfbuzz.git] / src / gen-tag-table.py
index 401f4ca..f8fb05f 100755 (executable)
@@ -25,10 +25,8 @@ Input files:
 """
 
 import collections
+import html
 from html.parser import HTMLParser
-def write (s):
-       sys.stdout.flush ()
-       sys.stdout.buffer.write (s.encode ('utf-8'))
 import itertools
 import re
 import sys
@@ -37,16 +35,18 @@ import unicodedata
 if len (sys.argv) != 3:
        sys.exit (__doc__)
 
-from html import unescape
-def html_unescape (parser, entity):
-       return unescape (entity)
-
 def expect (condition, message=None):
        if not condition:
                if message is None:
                        raise AssertionError
                raise AssertionError (message)
 
+def write (s):
+       sys.stdout.flush ()
+       sys.stdout.buffer.write (s.encode ('utf-8'))
+
+DEFAULT_LANGUAGE_SYSTEM = ''
+
 # from https://www-01.sil.org/iso639-3/iso-639-3.tab
 ISO_639_3_TO_1 = {
        'aar': 'aa',
@@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser):
                from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
                        inverted. Its values start as unsorted sets;
                        ``sort_languages`` converts them to sorted lists.
+               from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
+                       A copy of ``from_bcp_47``. It starts as ``None`` and is
+                       populated at the beginning of the first call to
+                       ``inherit_from_macrolanguages``.
 
        """
        def __init__ (self):
@@ -338,13 +342,18 @@ class OpenTypeRegistryParser (HTMLParser):
                self.ranks = collections.defaultdict (int)
                self.to_bcp_47 = collections.defaultdict (set)
                self.from_bcp_47 = collections.defaultdict (set)
+               self.from_bcp_47_uninherited = None
                # Whether the parser is in a <td> element
                self._td = False
+               # Whether the parser is after a <br> element within the current <tr> element
+               self._br = False
                # The text of the <td> elements of the current <tr> element.
                self._current_tr = []
 
        def handle_starttag (self, tag, attrs):
-               if tag == 'meta':
+               if tag == 'br':
+                       self._br = True
+               elif tag == 'meta':
                        for attr, value in attrs:
                                if attr == 'name' and value == 'updated_at':
                                        self.header = self.get_starttag_text ()
@@ -353,6 +362,7 @@ class OpenTypeRegistryParser (HTMLParser):
                        self._td = True
                        self._current_tr.append ('')
                elif tag == 'tr':
+                       self._br = False
                        self._current_tr = []
 
        def handle_endtag (self, tag):
@@ -377,14 +387,14 @@ class OpenTypeRegistryParser (HTMLParser):
                        self.ranks[tag] = rank
 
        def handle_data (self, data):
-               if self._td:
+               if self._td and not self._br:
                        self._current_tr[-1] += data
 
        def handle_charref (self, name):
-               self.handle_data (html_unescape (self, '&#%s;' % name))
+               self.handle_data (html.unescape ('&#%s;' % name))
 
        def handle_entityref (self, name):
-               self.handle_data (html_unescape (self, '&%s;' % name))
+               self.handle_data (html.unescape ('&%s;' % name))
 
        def parse (self, filename):
                """Parse the OpenType language system tag registry.
@@ -457,33 +467,51 @@ class OpenTypeRegistryParser (HTMLParser):
                explicit mapping, so it inherits from sq (Albanian) the mapping
                to SQI.
 
+               However, if an OpenType tag maps to a BCP 47 macrolanguage and
+               some but not all of its individual languages, the mapping is not
+               inherited from the macrolanguage to the missing individual
+               languages. For example, INUK (Nunavik Inuktitut) is mapped to
+               ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
+               ikt (Inuinnaqtun, which is an individual language of iu), so
+               this method does not add a mapping from ikt to INUK.
+
                If a BCP 47 tag for a macrolanguage has no OpenType mapping but
-               all of its individual languages do and they all map to the same
-               tags, the mapping is copied to the macrolanguage.
+               some of its individual languages do, their mappings are copied
+               to the macrolanguage.
                """
                global bcp_47
-               original_ot_from_bcp_47 = dict (self.from_bcp_47)
+               first_time = self.from_bcp_47_uninherited is None
+               if first_time:
+                       self.from_bcp_47_uninherited = dict (self.from_bcp_47)
                for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
-                       ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
+                       ot_macrolanguages = {
+                               ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
+                       }
+                       blocked_ot_macrolanguages = set ()
+                       if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
+                               for ot_macrolanguage in ot_macrolanguages:
+                                       round_trip_macrolanguages = {
+                                               l for l in self.to_bcp_47[ot_macrolanguage]
+                                               if 'retired code' not in bcp_47.scopes.get (l, '')
+                                       }
+                                       round_trip_languages = {
+                                               l for l in languages
+                                               if 'retired code' not in bcp_47.scopes.get (l, '')
+                                       }
+                                       intersection = round_trip_macrolanguages & round_trip_languages
+                                       if intersection and intersection != round_trip_languages:
+                                               blocked_ot_macrolanguages.add (ot_macrolanguage)
                        if ot_macrolanguages:
                                for ot_macrolanguage in ot_macrolanguages:
-                                       for language in languages:
-                                               # Remove the following condition if e.g. nn should map to NYN,NOR
-                                               # instead of just NYN.
-                                               if language not in original_ot_from_bcp_47:
+                                       if ot_macrolanguage not in blocked_ot_macrolanguages:
+                                               for language in languages:
                                                        self.add_language (language, ot_macrolanguage)
-                                                       self.ranks[ot_macrolanguage] += 1
-                       else:
+                                                       if not blocked_ot_macrolanguages:
+                                                               self.ranks[ot_macrolanguage] += 1
+                       elif first_time:
                                for language in languages:
-                                       if language in original_ot_from_bcp_47:
-                                               if ot_macrolanguages:
-                                                       ml = original_ot_from_bcp_47[language]
-                                                       if ml:
-                                                               ot_macrolanguages &= ml
-                                                       else:
-                                                               pass
-                                               else:
-                                                       ot_macrolanguages |= original_ot_from_bcp_47[language]
+                                       if language in self.from_bcp_47_uninherited:
+                                               ot_macrolanguages |= self.from_bcp_47_uninherited[language]
                                        else:
                                                ot_macrolanguages.clear ()
                                        if not ot_macrolanguages:
@@ -556,7 +584,7 @@ class BCP47Parser (object):
                                                self.grandfathered.add (subtag.lower ())
                                elif line.startswith ('Description: '):
                                        description = line.split (' ', 1)[1].replace (' (individual language)', '')
-                                       description = re.sub (' (\((individual |macro)language\)|languages)$', '',
+                                       description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '',
                                                        description)
                                        if subtag in self.names:
                                                self.names[subtag] += '\n' + description
@@ -568,7 +596,7 @@ class BCP47Parser (object):
                                                if scope == 'macrolanguage':
                                                        scope = ' [macrolanguage]'
                                                elif scope == 'collection':
-                                                       scope = ' [family]'
+                                                       scope = ' [collection]'
                                                else:
                                                        continue
                                                self.scopes[subtag] = scope
@@ -591,7 +619,9 @@ class BCP47Parser (object):
                                        elif not has_preferred_value and line.startswith ('Macrolanguage: '):
                                                self._add_macrolanguage (line.split (' ')[1], subtag)
                                elif subtag_type == 'variant':
-                                       if line.startswith ('Prefix: '):
+                                       if line.startswith ('Deprecated: '):
+                                               self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
+                                       elif line.startswith ('Prefix: '):
                                                self.prefixes[subtag].add (line.split (' ')[1])
                                elif line.startswith ('File-Date: '):
                                        self.header = line
@@ -622,6 +652,17 @@ class BCP47Parser (object):
                                for macrolanguage in macrolanguages:
                                        self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
 
+       def _get_name_piece (self, subtag):
+               """Return the first name of a subtag plus its scope suffix.
+
+               Args:
+                       subtag (str): A BCP 47 subtag.
+
+               Returns:
+                       The name form of ``subtag``.
+               """
+               return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')
+
        def get_name (self, lt):
                """Return the names of the subtags in a language tag.
 
@@ -631,13 +672,13 @@ class BCP47Parser (object):
                Returns:
                        The name form of ``lt``.
                """
-               name = self.names[lt.language].split ('\n')[0]
+               name = self._get_name_piece (lt.language)
                if lt.script:
-                       name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
+                       name += '; ' + self._get_name_piece (lt.script.title ())
                if lt.region:
-                       name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
+                       name += '; ' + self._get_name_piece (lt.region.upper ())
                if lt.variant:
-                       name += '; ' + self.names[lt.variant].split ('\n')[0]
+                       name += '; ' + self._get_name_piece (lt.variant)
                return name
 
 bcp_47 = BCP47Parser ()
@@ -673,22 +714,18 @@ ot.add_language ('und-fonnapa', 'APPH')
 ot.remove_language_ot ('IRT')
 ot.add_language ('ga-Latg', 'IRT')
 
+ot.add_language ('hy-arevmda', 'HYE')
+
 ot.remove_language_ot ('KGE')
 ot.add_language ('und-Geok', 'KGE')
 
-ot.add_language ('guk', 'GUK')
-ot.names['GUK'] = 'Gumuz (SIL fonts)'
-ot.ranks['GUK'] = ot.ranks['GMZ'] + 1
-
 bcp_47.macrolanguages['id'] = {'in'}
 
 bcp_47.macrolanguages['ijo'] = {'ijc'}
 
 ot.add_language ('kht', 'KHN')
 ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
-ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
-ot.ranks['KHN'] = ot.ranks['KHT']
-ot.ranks['KHT'] += 1
+ot.ranks['KHN'] = ot.ranks['KHT'] + 1
 
 ot.ranks['LCR'] = ot.ranks['MCR'] + 1
 
@@ -698,14 +735,18 @@ ot.ranks['MLR'] += 1
 bcp_47.names['mhv'] = 'Arakanese'
 bcp_47.scopes['mhv'] = ' (retired code)'
 
+ot.add_language ('mnw-TH', 'MONT')
+
 ot.add_language ('no', 'NOR')
 
 ot.add_language ('oc-provenc', 'PRO')
 
+ot.remove_language_ot ('QUZ')
 ot.add_language ('qu', 'QUZ')
 ot.add_language ('qub', 'QWH')
 ot.add_language ('qud', 'QVI')
 ot.add_language ('qug', 'QVI')
+ot.add_language ('qul', 'QUH')
 ot.add_language ('qup', 'QVI')
 ot.add_language ('qur', 'QWH')
 ot.add_language ('qus', 'QUH')
@@ -733,13 +774,8 @@ ot.add_language ('qxr', 'QVI')
 ot.add_language ('qxt', 'QWH')
 ot.add_language ('qxw', 'QWH')
 
-bcp_47.macrolanguages['ro'].remove ('mo')
 bcp_47.macrolanguages['ro-MD'].add ('mo')
 
-ot.add_language ('sgw', 'SGW')
-ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
-ot.ranks['SGW'] = ot.ranks['CHG'] + 1
-
 ot.remove_language_ot ('SYRE')
 ot.remove_language_ot ('SYRJ')
 ot.remove_language_ot ('SYRN')
@@ -756,14 +792,17 @@ ot.add_language ('xwo', 'TOD')
 ot.remove_language_ot ('ZHH')
 ot.remove_language_ot ('ZHP')
 ot.remove_language_ot ('ZHT')
+ot.remove_language_ot ('ZHTM')
 bcp_47.macrolanguages['zh'].remove ('lzh')
 bcp_47.macrolanguages['zh'].remove ('yue')
 ot.add_language ('zh-Hant-MO', 'ZHH')
+ot.add_language ('zh-Hant-MO', 'ZHTM')
 ot.add_language ('zh-Hant-HK', 'ZHH')
 ot.add_language ('zh-Hans', 'ZHS')
 ot.add_language ('zh-Hant', 'ZHT')
 ot.add_language ('zh-HK', 'ZHH')
 ot.add_language ('zh-MO', 'ZHH')
+ot.add_language ('zh-MO', 'ZHTM')
 ot.add_language ('zh-TW', 'ZHT')
 ot.add_language ('lzh', 'ZHT')
 ot.add_language ('lzh-Hans', 'ZHS')
@@ -795,6 +834,7 @@ def rank_delta (bcp_47, ot):
 disambiguation = {
        'ALT': 'alt',
        'ARK': 'rki',
+       'ATH': 'ath',
        'BHI': 'bhb',
        'BLN': 'bjt',
        'BTI': 'beb',
@@ -806,6 +846,7 @@ disambiguation = {
        'ECR': 'crj',
        'HAL': 'cfm',
        'HND': 'hnd',
+       'HYE': 'hyw',
        'KIS': 'kqs',
        'KUI': 'uki',
        'LRC': 'bqi',
@@ -818,15 +859,24 @@ disambiguation = {
        'QVI': 'qvi',
        'QWH': 'qwh',
        'SIG': 'stv',
-       'TNE': 'yrk',
+       'SRB': 'sr',
+       'SXT': 'xnj',
        'ZHH': 'zh-HK',
        'ZHS': 'zh-Hans',
        'ZHT': 'zh-Hant',
+       'ZHTM': 'zh-MO',
 }
 
 ot.inherit_from_macrolanguages ()
 bcp_47.remove_extra_macrolanguages ()
 ot.inherit_from_macrolanguages ()
+ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
+ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
+for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
+       possible_bcp_47_tag = tricky_ot_tag.lower ()
+       if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
+               ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
+               bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
 ot.sort_languages ()
 
 print ('/* == Start of generated table == */')
@@ -855,6 +905,8 @@ def hb_tag (tag):
        Returns:
                A snippet of C++ representing ``tag``.
        """
+       if tag == DEFAULT_LANGUAGE_SYSTEM:
+               return 'HB_TAG_NONE\t       '
        return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
 
 def get_variant_set (name):
@@ -903,14 +955,18 @@ for language, tags in sorted (ot.from_bcp_47.items ()):
                print ('\t/* ', end='')
                bcp_47_name = bcp_47.names.get (language, '')
                bcp_47_name_candidates = bcp_47_name.split ('\n')
-               intersection = language_name_intersection (bcp_47_name, ot.names[tag])
+               ot_name = ot.names[tag]
                scope = bcp_47.scopes.get (language, '')
-               if not intersection:
-                       write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
+               if tag == DEFAULT_LANGUAGE_SYSTEM:
+                       write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
                else:
-                       name = get_matching_language_name (intersection, bcp_47_name_candidates)
-                       bcp_47.names[language] = name
-                       write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
+                       intersection = language_name_intersection (bcp_47_name, ot_name)
+                       if not intersection:
+                               write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
+                       else:
+                               name = get_matching_language_name (intersection, bcp_47_name_candidates)
+                               bcp_47.names[language] = name
+                               write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
                print (' */')
 
 print ('};')
@@ -957,6 +1013,8 @@ for initial, items in sorted (complex_tags.items ()):
        if initial != 'und':
                continue
        for lt, tags in items:
+               if not tags:
+                       continue
                if lt.variant in bcp_47.prefixes:
                        expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
                                        '%s is not a valid prefix of %s' % (lt.language, lt.variant))
@@ -991,23 +1049,27 @@ for initial, items in sorted (complex_tags.items ()):
                continue
        print ("  case '%s':" % initial)
        for lt, tags in items:
+               if not tags:
+                       continue
                print ('    if (', end='')
+               script = lt.script
+               region = lt.region
                if lt.grandfathered:
                        print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
                else:
                        string_literal = lt.language[1:] + '-'
-                       if lt.script:
-                               string_literal += lt.script
-                               lt.script = None
-                               if lt.region:
-                                       string_literal += '-' + lt.region
-                                       lt.region = None
+                       if script:
+                               string_literal += script
+                               script = None
+                               if region:
+                                       string_literal += '-' + region
+                                       region = None
                        if string_literal[-1] == '-':
                                print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
                        else:
                                print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
-               print_subtag_matches (lt.script, True)
-               print_subtag_matches (lt.region, True)
+               print_subtag_matches (script, True)
+               print_subtag_matches (region, True)
                print_subtag_matches (lt.variant, True)
                print (')')
                print ('    {')
@@ -1074,17 +1136,28 @@ def verify_disambiguation_dict ():
        global disambiguation
        global ot
        for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
-               primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
+               if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
+                       primary_tags = []
+               else:
+                       primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
                if len (primary_tags) == 1:
                        expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
                        if '-' in primary_tags[0]:
                                disambiguation[ot_tag] = primary_tags[0]
+                       else:
+                               first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0]
+                               if primary_tags[0] != first_tag:
+                                       disambiguation[ot_tag] = primary_tags[0]
                elif len (primary_tags) == 0:
                        expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
                else:
-                       macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
+                       original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
+                       if len (original_languages) == 1:
+                               macrolanguages = original_languages
+                       else:
+                               macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
                        if len (macrolanguages) != 1:
-                               macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
+                               macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]')
                        if len (macrolanguages) != 1:
                                macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
                        if len (macrolanguages) != 1:
@@ -1093,8 +1166,8 @@ def verify_disambiguation_dict ():
                                                '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
                        elif ot_tag not in disambiguation:
                                disambiguation[ot_tag] = macrolanguages[0]
-                       different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
-                       if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
+                       different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
+                       if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]:
                                del disambiguation[ot_tag]
        for ot_tag in disambiguation.keys ():
                expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)