Imported Upstream version 3.4.0

[platform/upstream/harfbuzz.git] / src / gen-tag-table.py
diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py

index 401f4ca..f8fb05f 100755 (executable)
--- a/src/gen-tag-table.py
+++ b/src/gen-tag-table.py
@@ -25,10 +25,8 @@ Input files:
  """
  
  import collections
+import html
  from html.parser import HTMLParser
-def write (s):
-       sys.stdout.flush ()
-       sys.stdout.buffer.write (s.encode ('utf-8'))
  import itertools
  import re
  import sys
@@ -37,16 +35,18 @@ import unicodedata
  if len (sys.argv) != 3:
         sys.exit (__doc__)
  
-from html import unescape
-def html_unescape (parser, entity):
-       return unescape (entity)
-
  def expect (condition, message=None):
         if not condition:
                 if message is None:
                         raise AssertionError
                 raise AssertionError (message)
  
+def write (s):
+       sys.stdout.flush ()
+       sys.stdout.buffer.write (s.encode ('utf-8'))
+
+DEFAULT_LANGUAGE_SYSTEM = ''
+
  # from https://www-01.sil.org/iso639-3/iso-639-3.tab
  ISO_639_3_TO_1 = {
         'aar': 'aa',
@@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser):
                 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
                         inverted. Its values start as unsorted sets;
                         ``sort_languages`` converts them to sorted lists.
+               from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
+                       A shallow copy of ``from_bcp_47``. It starts as ``None``
+                       and is populated at the beginning of the first call to
+                       ``inherit_from_macrolanguages``.
  
         """
         def __init__ (self):
@@ -338,13 +342,18 @@ class OpenTypeRegistryParser (HTMLParser):
                 self.ranks = collections.defaultdict (int)
                 self.to_bcp_47 = collections.defaultdict (set)
                 self.from_bcp_47 = collections.defaultdict (set)
+               self.from_bcp_47_uninherited = None
                 # Whether the parser is in a <td> element
                 self._td = False
+               # Whether a <br> element has been seen in the current <tr> element; cell text after it is ignored
+               self._br = False
                 # The text of the <td> elements of the current <tr> element.
                 self._current_tr = []
  
         def handle_starttag (self, tag, attrs):
-               if tag == 'meta':
+               if tag == 'br':
+                       self._br = True
+               elif tag == 'meta':
                         for attr, value in attrs:
                                 if attr == 'name' and value == 'updated_at':
                                         self.header = self.get_starttag_text ()
@@ -353,6 +362,7 @@ class OpenTypeRegistryParser (HTMLParser):
                         self._td = True
                         self._current_tr.append ('')
                 elif tag == 'tr':
+                       self._br = False
                         self._current_tr = []
  
         def handle_endtag (self, tag):
@@ -377,14 +387,14 @@ class OpenTypeRegistryParser (HTMLParser):
                         self.ranks[tag] = rank
  
         def handle_data (self, data):
-               if self._td:
+               if self._td and not self._br:
                         self._current_tr[-1] += data
  
         def handle_charref (self, name):
-               self.handle_data (html_unescape (self, '&#%s;' % name))
+               self.handle_data (html.unescape ('&#%s;' % name))
  
         def handle_entityref (self, name):
-               self.handle_data (html_unescape (self, '&%s;' % name))
+               self.handle_data (html.unescape ('&%s;' % name))
  
         def parse (self, filename):
                 """Parse the OpenType language system tag registry.
@@ -457,33 +467,51 @@ class OpenTypeRegistryParser (HTMLParser):
                 explicit mapping, so it inherits from sq (Albanian) the mapping
                 to SQI.
  
+               However, if an OpenType tag maps to a BCP 47 macrolanguage and
+               some but not all of its individual languages, the mapping is not
+               inherited from the macrolanguage to the missing individual
+               languages. For example, INUK (Nunavik Inuktitut) is mapped to
+               ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
+               ikt (Inuinnaqtun, which is an individual language of iu), so
+               this method does not add a mapping from ikt to INUK.
+
                 If a BCP 47 tag for a macrolanguage has no OpenType mapping but
-               all of its individual languages do and they all map to the same
-               tags, the mapping is copied to the macrolanguage.
+               some of its individual languages do, their mappings are copied
+               to the macrolanguage.
                 """
                 global bcp_47
-               original_ot_from_bcp_47 = dict (self.from_bcp_47)
+               first_time = self.from_bcp_47_uninherited is None
+               if first_time:
+                       self.from_bcp_47_uninherited = dict (self.from_bcp_47)
                 for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
-                       ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
+                       ot_macrolanguages = {
+                               ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
+                       }
+                       blocked_ot_macrolanguages = set ()
+                       if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
+                               for ot_macrolanguage in ot_macrolanguages:
+                                       round_trip_macrolanguages = {
+                                               l for l in self.to_bcp_47[ot_macrolanguage]
+                                               if 'retired code' not in bcp_47.scopes.get (l, '')
+                                       }
+                                       round_trip_languages = {
+                                               l for l in languages
+                                               if 'retired code' not in bcp_47.scopes.get (l, '')
+                                       }
+                                       intersection = round_trip_macrolanguages & round_trip_languages
+                                       if intersection and intersection != round_trip_languages:
+                                               blocked_ot_macrolanguages.add (ot_macrolanguage)
                         if ot_macrolanguages:
                                 for ot_macrolanguage in ot_macrolanguages:
-                                       for language in languages:
-                                               # Remove the following condition if e.g. nn should map to NYN,NOR
-                                               # instead of just NYN.
-                                               if language not in original_ot_from_bcp_47:
+                                       if ot_macrolanguage not in blocked_ot_macrolanguages:
+                                               for language in languages:
                                                         self.add_language (language, ot_macrolanguage)
-                                                       self.ranks[ot_macrolanguage] += 1
-                       else:
+                                                       if not blocked_ot_macrolanguages:
+                                                               self.ranks[ot_macrolanguage] += 1
+                       elif first_time:
                                 for language in languages:
-                                       if language in original_ot_from_bcp_47:
-                                               if ot_macrolanguages:
-                                                       ml = original_ot_from_bcp_47[language]
-                                                       if ml:
-                                                               ot_macrolanguages &= ml
-                                                       else:
-                                                               pass
-                                               else:
-                                                       ot_macrolanguages |= original_ot_from_bcp_47[language]
+                                       if language in self.from_bcp_47_uninherited:
+                                               ot_macrolanguages |= self.from_bcp_47_uninherited[language]
                                         else:
                                                 ot_macrolanguages.clear ()
                                         if not ot_macrolanguages:
@@ -556,7 +584,7 @@ class BCP47Parser (object):
                                                 self.grandfathered.add (subtag.lower ())
                                 elif line.startswith ('Description: '):
                                         description = line.split (' ', 1)[1].replace (' (individual language)', '')
-                                       description = re.sub (' (\((individual |macro)language\)|languages)$', '',
+                                       description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '',
                                                         description)
                                         if subtag in self.names:
                                                 self.names[subtag] += '\n' + description
@@ -568,7 +596,7 @@ class BCP47Parser (object):
                                                 if scope == 'macrolanguage':
                                                         scope = ' [macrolanguage]'
                                                 elif scope == 'collection':
-                                                       scope = ' [family]'
+                                                       scope = ' [collection]'
                                                 else:
                                                         continue
                                                 self.scopes[subtag] = scope
@@ -591,7 +619,9 @@ class BCP47Parser (object):
                                         elif not has_preferred_value and line.startswith ('Macrolanguage: '):
                                                 self._add_macrolanguage (line.split (' ')[1], subtag)
                                 elif subtag_type == 'variant':
-                                       if line.startswith ('Prefix: '):
+                                       if line.startswith ('Deprecated: '):
+                                               self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
+                                       elif line.startswith ('Prefix: '):
                                                 self.prefixes[subtag].add (line.split (' ')[1])
                                 elif line.startswith ('File-Date: '):
                                         self.header = line
@@ -622,6 +652,17 @@ class BCP47Parser (object):
                                 for macrolanguage in macrolanguages:
                                         self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
  
+       def _get_name_piece (self, subtag):
+               """Return the first name of a subtag plus its scope suffix.
+
+               Args:
+                       subtag (str): A BCP 47 subtag.
+
+               Returns:
+                       The name form of ``subtag``.
+               """
+               return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')
+
         def get_name (self, lt):
                 """Return the names of the subtags in a language tag.
  
@@ -631,13 +672,13 @@ class BCP47Parser (object):
                 Returns:
                         The name form of ``lt``.
                 """
-               name = self.names[lt.language].split ('\n')[0]
+               name = self._get_name_piece (lt.language)
                 if lt.script:
-                       name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
+                       name += '; ' + self._get_name_piece (lt.script.title ())
                 if lt.region:
-                       name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
+                       name += '; ' + self._get_name_piece (lt.region.upper ())
                 if lt.variant:
-                       name += '; ' + self.names[lt.variant].split ('\n')[0]
+                       name += '; ' + self._get_name_piece (lt.variant)
                 return name
  
  bcp_47 = BCP47Parser ()
@@ -673,22 +714,18 @@ ot.add_language ('und-fonnapa', 'APPH')
  ot.remove_language_ot ('IRT')
  ot.add_language ('ga-Latg', 'IRT')
  
+ot.add_language ('hy-arevmda', 'HYE')
+
  ot.remove_language_ot ('KGE')
  ot.add_language ('und-Geok', 'KGE')
  
-ot.add_language ('guk', 'GUK')
-ot.names['GUK'] = 'Gumuz (SIL fonts)'
-ot.ranks['GUK'] = ot.ranks['GMZ'] + 1
-
  bcp_47.macrolanguages['id'] = {'in'}
  
  bcp_47.macrolanguages['ijo'] = {'ijc'}
  
  ot.add_language ('kht', 'KHN')
  ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
-ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
-ot.ranks['KHN'] = ot.ranks['KHT']
-ot.ranks['KHT'] += 1
+ot.ranks['KHN'] = ot.ranks['KHT'] + 1
  
  ot.ranks['LCR'] = ot.ranks['MCR'] + 1
  
@@ -698,14 +735,18 @@ ot.ranks['MLR'] += 1
  bcp_47.names['mhv'] = 'Arakanese'
  bcp_47.scopes['mhv'] = ' (retired code)'
  
+ot.add_language ('mnw-TH', 'MONT')
+
  ot.add_language ('no', 'NOR')
  
  ot.add_language ('oc-provenc', 'PRO')
  
+ot.remove_language_ot ('QUZ')
  ot.add_language ('qu', 'QUZ')
  ot.add_language ('qub', 'QWH')
  ot.add_language ('qud', 'QVI')
  ot.add_language ('qug', 'QVI')
+ot.add_language ('qul', 'QUH')
  ot.add_language ('qup', 'QVI')
  ot.add_language ('qur', 'QWH')
  ot.add_language ('qus', 'QUH')
@@ -733,13 +774,8 @@ ot.add_language ('qxr', 'QVI')
  ot.add_language ('qxt', 'QWH')
  ot.add_language ('qxw', 'QWH')
  
-bcp_47.macrolanguages['ro'].remove ('mo')
  bcp_47.macrolanguages['ro-MD'].add ('mo')
  
-ot.add_language ('sgw', 'SGW')
-ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
-ot.ranks['SGW'] = ot.ranks['CHG'] + 1
-
  ot.remove_language_ot ('SYRE')
  ot.remove_language_ot ('SYRJ')
  ot.remove_language_ot ('SYRN')
@@ -756,14 +792,17 @@ ot.add_language ('xwo', 'TOD')
  ot.remove_language_ot ('ZHH')
  ot.remove_language_ot ('ZHP')
  ot.remove_language_ot ('ZHT')
+ot.remove_language_ot ('ZHTM')
  bcp_47.macrolanguages['zh'].remove ('lzh')
  bcp_47.macrolanguages['zh'].remove ('yue')
  ot.add_language ('zh-Hant-MO', 'ZHH')
+ot.add_language ('zh-Hant-MO', 'ZHTM')
  ot.add_language ('zh-Hant-HK', 'ZHH')
  ot.add_language ('zh-Hans', 'ZHS')
  ot.add_language ('zh-Hant', 'ZHT')
  ot.add_language ('zh-HK', 'ZHH')
  ot.add_language ('zh-MO', 'ZHH')
+ot.add_language ('zh-MO', 'ZHTM')
  ot.add_language ('zh-TW', 'ZHT')
  ot.add_language ('lzh', 'ZHT')
  ot.add_language ('lzh-Hans', 'ZHS')
@@ -795,6 +834,7 @@ def rank_delta (bcp_47, ot):
  disambiguation = {
         'ALT': 'alt',
         'ARK': 'rki',
+       'ATH': 'ath',
         'BHI': 'bhb',
         'BLN': 'bjt',
         'BTI': 'beb',
@@ -806,6 +846,7 @@ disambiguation = {
         'ECR': 'crj',
         'HAL': 'cfm',
         'HND': 'hnd',
+       'HYE': 'hyw',
         'KIS': 'kqs',
         'KUI': 'uki',
         'LRC': 'bqi',
@@ -818,15 +859,24 @@ disambiguation = {
         'QVI': 'qvi',
         'QWH': 'qwh',
         'SIG': 'stv',
-       'TNE': 'yrk',
+       'SRB': 'sr',
+       'SXT': 'xnj',
         'ZHH': 'zh-HK',
         'ZHS': 'zh-Hans',
         'ZHT': 'zh-Hant',
+       'ZHTM': 'zh-MO',
  }
  
  ot.inherit_from_macrolanguages ()
  bcp_47.remove_extra_macrolanguages ()
  ot.inherit_from_macrolanguages ()
+ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
+ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
+for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
+       possible_bcp_47_tag = tricky_ot_tag.lower ()
+       if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
+               ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
+               bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
  ot.sort_languages ()
  
  print ('/* == Start of generated table == */')
@@ -855,6 +905,8 @@ def hb_tag (tag):
         Returns:
                 A snippet of C++ representing ``tag``.
         """
+       if tag == DEFAULT_LANGUAGE_SYSTEM:
+               return 'HB_TAG_NONE\t       '
         return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
  
  def get_variant_set (name):
@@ -903,14 +955,18 @@ for language, tags in sorted (ot.from_bcp_47.items ()):
                 print ('\t/* ', end='')
                 bcp_47_name = bcp_47.names.get (language, '')
                 bcp_47_name_candidates = bcp_47_name.split ('\n')
-               intersection = language_name_intersection (bcp_47_name, ot.names[tag])
+               ot_name = ot.names[tag]
                 scope = bcp_47.scopes.get (language, '')
-               if not intersection:
-                       write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
+               if tag == DEFAULT_LANGUAGE_SYSTEM:
+                       write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
                 else:
-                       name = get_matching_language_name (intersection, bcp_47_name_candidates)
-                       bcp_47.names[language] = name
-                       write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
+                       intersection = language_name_intersection (bcp_47_name, ot_name)
+                       if not intersection:
+                               write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
+                       else:
+                               name = get_matching_language_name (intersection, bcp_47_name_candidates)
+                               bcp_47.names[language] = name
+                               write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
                 print (' */')
  
  print ('};')
@@ -957,6 +1013,8 @@ for initial, items in sorted (complex_tags.items ()):
         if initial != 'und':
                 continue
         for lt, tags in items:
+               if not tags:
+                       continue
                 if lt.variant in bcp_47.prefixes:
                         expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
                                         '%s is not a valid prefix of %s' % (lt.language, lt.variant))
@@ -991,23 +1049,27 @@ for initial, items in sorted (complex_tags.items ()):
                 continue
         print ("  case '%s':" % initial)
         for lt, tags in items:
+               if not tags:
+                       continue
                 print ('    if (', end='')
+               script = lt.script
+               region = lt.region
                 if lt.grandfathered:
                         print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
                 else:
                         string_literal = lt.language[1:] + '-'
-                       if lt.script:
-                               string_literal += lt.script
-                               lt.script = None
-                               if lt.region:
-                                       string_literal += '-' + lt.region
-                                       lt.region = None
+                       if script:
+                               string_literal += script
+                               script = None
+                               if region:
+                                       string_literal += '-' + region
+                                       region = None
                         if string_literal[-1] == '-':
                                 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
                         else:
                                 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
-               print_subtag_matches (lt.script, True)
-               print_subtag_matches (lt.region, True)
+               print_subtag_matches (script, True)
+               print_subtag_matches (region, True)
                 print_subtag_matches (lt.variant, True)
                 print (')')
                 print ('    {')
@@ -1074,17 +1136,28 @@ def verify_disambiguation_dict ():
         global disambiguation
         global ot
         for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
-               primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
+               if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
+                       primary_tags = []
+               else:
+                       primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
                 if len (primary_tags) == 1:
                         expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
                         if '-' in primary_tags[0]:
                                 disambiguation[ot_tag] = primary_tags[0]
+                       else:
+                               first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0]
+                               if primary_tags[0] != first_tag:
+                                       disambiguation[ot_tag] = primary_tags[0]
                 elif len (primary_tags) == 0:
                         expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
                 else:
-                       macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
+                       original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
+                       if len (original_languages) == 1:
+                               macrolanguages = original_languages
+                       else:
+                               macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
                         if len (macrolanguages) != 1:
-                               macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
+                               macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]')
                         if len (macrolanguages) != 1:
                                 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
                         if len (macrolanguages) != 1:
@@ -1093,8 +1166,8 @@ def verify_disambiguation_dict ():
                                                 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
                         elif ot_tag not in disambiguation:
                                 disambiguation[ot_tag] = macrolanguages[0]
-                       different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
-                       if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
+                       different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
+                       if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]:
                                 del disambiguation[ot_tag]
         for ot_tag in disambiguation.keys ():
                 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)