"""
import collections
+import html
from html.parser import HTMLParser
-def write (s):
- sys.stdout.flush ()
- sys.stdout.buffer.write (s.encode ('utf-8'))
import itertools
import re
import sys
if len (sys.argv) != 3:
sys.exit (__doc__)
-from html import unescape
-def html_unescape (parser, entity):
- return unescape (entity)
-
def expect (condition, message=None):
if not condition:
if message is None:
raise AssertionError
raise AssertionError (message)
+def write (s):
+ sys.stdout.flush ()
+ sys.stdout.buffer.write (s.encode ('utf-8'))
+
+DEFAULT_LANGUAGE_SYSTEM = ''
+
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
'aar': 'aa',
from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
inverted. Its values start as unsorted sets;
``sort_languages`` converts them to sorted lists.
+ from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
+ A copy of ``from_bcp_47``. It starts as ``None`` and is
+ populated at the beginning of the first call to
+ ``inherit_from_macrolanguages``.
"""
def __init__ (self):
self.ranks = collections.defaultdict (int)
self.to_bcp_47 = collections.defaultdict (set)
self.from_bcp_47 = collections.defaultdict (set)
+ self.from_bcp_47_uninherited = None
# Whether the parser is in a <td> element
self._td = False
+ # Whether the parser is after a <br> element within the current <tr> element
+ self._br = False
# The text of the <td> elements of the current <tr> element.
self._current_tr = []
def handle_starttag (self, tag, attrs):
- if tag == 'meta':
+ if tag == 'br':
+ self._br = True
+ elif tag == 'meta':
for attr, value in attrs:
if attr == 'name' and value == 'updated_at':
self.header = self.get_starttag_text ()
self._td = True
self._current_tr.append ('')
elif tag == 'tr':
+ self._br = False
self._current_tr = []
def handle_endtag (self, tag):
self.ranks[tag] = rank
def handle_data (self, data):
- if self._td:
+ if self._td and not self._br:
self._current_tr[-1] += data
def handle_charref (self, name):
- self.handle_data (html_unescape (self, '&#%s;' % name))
+ self.handle_data (html.unescape ('&#%s;' % name))
def handle_entityref (self, name):
- self.handle_data (html_unescape (self, '&%s;' % name))
+ self.handle_data (html.unescape ('&%s;' % name))
def parse (self, filename):
"""Parse the OpenType language system tag registry.
explicit mapping, so it inherits from sq (Albanian) the mapping
to SQI.
+ However, if an OpenType tag maps to a BCP 47 macrolanguage and
+ some but not all of its individual languages, the mapping is not
+ inherited from the macrolanguage to the missing individual
+ languages. For example, INUK (Nunavik Inuktitut) is mapped to
+ ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
+ ikt (Inuinnaqtun, which is an individual language of iu), so
+ this method does not add a mapping from ikt to INUK.
+
If a BCP 47 tag for a macrolanguage has no OpenType mapping but
- all of its individual languages do and they all map to the same
- tags, the mapping is copied to the macrolanguage.
+ some of its individual languages do, their mappings are copied
+ to the macrolanguage.
"""
global bcp_47
- original_ot_from_bcp_47 = dict (self.from_bcp_47)
+ first_time = self.from_bcp_47_uninherited is None
+ if first_time:
+ self.from_bcp_47_uninherited = dict (self.from_bcp_47)
for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
- ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
+ ot_macrolanguages = {
+ ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
+ }
+ blocked_ot_macrolanguages = set ()
+ if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
+ for ot_macrolanguage in ot_macrolanguages:
+ round_trip_macrolanguages = {
+ l for l in self.to_bcp_47[ot_macrolanguage]
+ if 'retired code' not in bcp_47.scopes.get (l, '')
+ }
+ round_trip_languages = {
+ l for l in languages
+ if 'retired code' not in bcp_47.scopes.get (l, '')
+ }
+ intersection = round_trip_macrolanguages & round_trip_languages
+ if intersection and intersection != round_trip_languages:
+ blocked_ot_macrolanguages.add (ot_macrolanguage)
if ot_macrolanguages:
for ot_macrolanguage in ot_macrolanguages:
- for language in languages:
- # Remove the following condition if e.g. nn should map to NYN,NOR
- # instead of just NYN.
- if language not in original_ot_from_bcp_47:
+ if ot_macrolanguage not in blocked_ot_macrolanguages:
+ for language in languages:
self.add_language (language, ot_macrolanguage)
- self.ranks[ot_macrolanguage] += 1
- else:
+ if not blocked_ot_macrolanguages:
+ self.ranks[ot_macrolanguage] += 1
+ elif first_time:
for language in languages:
- if language in original_ot_from_bcp_47:
- if ot_macrolanguages:
- ml = original_ot_from_bcp_47[language]
- if ml:
- ot_macrolanguages &= ml
- else:
- pass
- else:
- ot_macrolanguages |= original_ot_from_bcp_47[language]
+ if language in self.from_bcp_47_uninherited:
+ ot_macrolanguages |= self.from_bcp_47_uninherited[language]
else:
ot_macrolanguages.clear ()
if not ot_macrolanguages:
self.grandfathered.add (subtag.lower ())
elif line.startswith ('Description: '):
description = line.split (' ', 1)[1].replace (' (individual language)', '')
- description = re.sub (' (\((individual |macro)language\)|languages)$', '',
+ description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '',
description)
if subtag in self.names:
self.names[subtag] += '\n' + description
if scope == 'macrolanguage':
scope = ' [macrolanguage]'
elif scope == 'collection':
- scope = ' [family]'
+ scope = ' [collection]'
else:
continue
self.scopes[subtag] = scope
elif not has_preferred_value and line.startswith ('Macrolanguage: '):
self._add_macrolanguage (line.split (' ')[1], subtag)
elif subtag_type == 'variant':
- if line.startswith ('Prefix: '):
+ if line.startswith ('Deprecated: '):
+ self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
+ elif line.startswith ('Prefix: '):
self.prefixes[subtag].add (line.split (' ')[1])
elif line.startswith ('File-Date: '):
self.header = line
for macrolanguage in macrolanguages:
self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
+ def _get_name_piece (self, subtag):
+ """Return the first name of a subtag plus its scope suffix.
+
+ Args:
+ subtag (str): A BCP 47 subtag.
+
+ Returns:
+ The name form of ``subtag``.
+ """
+ return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')
+
def get_name (self, lt):
"""Return the names of the subtags in a language tag.
Returns:
The name form of ``lt``.
"""
- name = self.names[lt.language].split ('\n')[0]
+ name = self._get_name_piece (lt.language)
if lt.script:
- name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
+ name += '; ' + self._get_name_piece (lt.script.title ())
if lt.region:
- name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
+ name += '; ' + self._get_name_piece (lt.region.upper ())
if lt.variant:
- name += '; ' + self.names[lt.variant].split ('\n')[0]
+ name += '; ' + self._get_name_piece (lt.variant)
return name
bcp_47 = BCP47Parser ()
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')
+ot.add_language ('hy-arevmda', 'HYE')
+
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')
-ot.add_language ('guk', 'GUK')
-ot.names['GUK'] = 'Gumuz (SIL fonts)'
-ot.ranks['GUK'] = ot.ranks['GMZ'] + 1
-
bcp_47.macrolanguages['id'] = {'in'}
bcp_47.macrolanguages['ijo'] = {'ijc'}
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
-ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
-ot.ranks['KHN'] = ot.ranks['KHT']
-ot.ranks['KHT'] += 1
+ot.ranks['KHN'] = ot.ranks['KHT'] + 1
ot.ranks['LCR'] = ot.ranks['MCR'] + 1
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'
+ot.add_language ('mnw-TH', 'MONT')
+
ot.add_language ('no', 'NOR')
ot.add_language ('oc-provenc', 'PRO')
+ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
+ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')
-bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')
-ot.add_language ('sgw', 'SGW')
-ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
-ot.ranks['SGW'] = ot.ranks['CHG'] + 1
-
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
+ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
+ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
+ot.add_language ('zh-MO', 'ZHTM')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
disambiguation = {
'ALT': 'alt',
'ARK': 'rki',
+ 'ATH': 'ath',
'BHI': 'bhb',
'BLN': 'bjt',
'BTI': 'beb',
'ECR': 'crj',
'HAL': 'cfm',
'HND': 'hnd',
+ 'HYE': 'hyw',
'KIS': 'kqs',
'KUI': 'uki',
'LRC': 'bqi',
'QVI': 'qvi',
'QWH': 'qwh',
'SIG': 'stv',
- 'TNE': 'yrk',
+ 'SRB': 'sr',
+ 'SXT': 'xnj',
'ZHH': 'zh-HK',
'ZHS': 'zh-Hans',
'ZHT': 'zh-Hant',
+ 'ZHTM': 'zh-MO',
}
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
+ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
+ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
+for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
+ possible_bcp_47_tag = tricky_ot_tag.lower ()
+ if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
+ ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
+ bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
ot.sort_languages ()
print ('/* == Start of generated table == */')
Returns:
A snippet of C++ representing ``tag``.
"""
+ if tag == DEFAULT_LANGUAGE_SYSTEM:
+ return 'HB_TAG_NONE\t '
return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
def get_variant_set (name):
print ('\t/* ', end='')
bcp_47_name = bcp_47.names.get (language, '')
bcp_47_name_candidates = bcp_47_name.split ('\n')
- intersection = language_name_intersection (bcp_47_name, ot.names[tag])
+ ot_name = ot.names[tag]
scope = bcp_47.scopes.get (language, '')
- if not intersection:
- write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
+ if tag == DEFAULT_LANGUAGE_SYSTEM:
+ write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
else:
- name = get_matching_language_name (intersection, bcp_47_name_candidates)
- bcp_47.names[language] = name
- write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
+ intersection = language_name_intersection (bcp_47_name, ot_name)
+ if not intersection:
+ write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
+ else:
+ name = get_matching_language_name (intersection, bcp_47_name_candidates)
+ bcp_47.names[language] = name
+ write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
print (' */')
print ('};')
if initial != 'und':
continue
for lt, tags in items:
+ if not tags:
+ continue
if lt.variant in bcp_47.prefixes:
expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
'%s is not a valid prefix of %s' % (lt.language, lt.variant))
continue
print (" case '%s':" % initial)
for lt, tags in items:
+ if not tags:
+ continue
print (' if (', end='')
+ script = lt.script
+ region = lt.region
if lt.grandfathered:
print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
else:
string_literal = lt.language[1:] + '-'
- if lt.script:
- string_literal += lt.script
- lt.script = None
- if lt.region:
- string_literal += '-' + lt.region
- lt.region = None
+ if script:
+ string_literal += script
+ script = None
+ if region:
+ string_literal += '-' + region
+ region = None
if string_literal[-1] == '-':
print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
else:
print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
- print_subtag_matches (lt.script, True)
- print_subtag_matches (lt.region, True)
+ print_subtag_matches (script, True)
+ print_subtag_matches (region, True)
print_subtag_matches (lt.variant, True)
print (')')
print (' {')
global disambiguation
global ot
for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
- primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
+ if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
+ primary_tags = []
+ else:
+ primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
if len (primary_tags) == 1:
expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
if '-' in primary_tags[0]:
disambiguation[ot_tag] = primary_tags[0]
+ else:
+ first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0]
+ if primary_tags[0] != first_tag:
+ disambiguation[ot_tag] = primary_tags[0]
elif len (primary_tags) == 0:
expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
else:
- macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
+ original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
+ if len (original_languages) == 1:
+ macrolanguages = original_languages
+ else:
+ macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
if len (macrolanguages) != 1:
- macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
+ macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]')
if len (macrolanguages) != 1:
macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
if len (macrolanguages) != 1:
'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
elif ot_tag not in disambiguation:
disambiguation[ot_tag] = macrolanguages[0]
- different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
- if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
+ different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
+ if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]:
del disambiguation[ot_tag]
for ot_tag in disambiguation.keys ():
expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)