src/gen-use-table.py

   1 #!/usr/bin/env python
   2 # flake8: noqa
   3
   4 from __future__ import print_function, division, absolute_import
   5
   6 import io
   7 import sys
   8
   9 if len (sys.argv) != 5:
  10         print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
  11         sys.exit (1)
  12
  13 BLACKLISTED_BLOCKS = ["Thai", "Lao"]
  14
  15 files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]]
  16
  17 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  18 headers.append (["UnicodeData.txt does not have a header."])
  19
  20 data = [{} for f in files]
  21 values = [{} for f in files]
  22 for i, f in enumerate (files):
  23         for line in f:
  24
  25                 j = line.find ('#')
  26                 if j >= 0:
  27                         line = line[:j]
  28
  29                 fields = [x.strip () for x in line.split (';')]
  30                 if len (fields) == 1:
  31                         continue
  32
  33                 uu = fields[0].split ('..')
  34                 start = int (uu[0], 16)
  35                 if len (uu) == 1:
  36                         end = start
  37                 else:
  38                         end = int (uu[1], 16)
  39
  40                 t = fields[1 if i != 2 else 2]
  41
  42                 for u in range (start, end + 1):
  43                         data[i][u] = t
  44                 values[i][t] = values[i].get (t, 0) + end - start + 1
  45
  46 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  47
  48 # TODO Characters that are not in Unicode Indic files, but used in USE
  49 data[0][0x034F] = defaults[0]
  50 data[0][0x2060] = defaults[0]
  51 data[0][0x20F0] = defaults[0]
  52 # TODO https://github.com/roozbehp/unicode-data/issues/9
  53 data[0][0x11C44] = 'Consonant_Placeholder'
  54 data[0][0x11C45] = 'Consonant_Placeholder'
  55 # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
  56 data[0][0x111C8] = 'Consonant_Placeholder'
  57 for u in range (0xFE00, 0xFE0F + 1):
  58         data[0][u] = defaults[0]
  59
  60 # Merge data into one dict:
  61 for i,v in enumerate (defaults):
  62         values[i][v] = values[i].get (v, 0) + 1
  63 combined = {}
  64 for i,d in enumerate (data):
  65         for u,v in d.items ():
  66                 if i >= 2 and not u in combined:
  67                         continue
  68                 if not u in combined:
  69                         combined[u] = list (defaults)
  70                 combined[u][i] = v
  71 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  72 data = combined
  73 del combined
  74 num = len (data)
  75
  76
  77 property_names = [
  78         # General_Category
  79         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  80         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  81         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
  82         # Indic_Syllabic_Category
  83         'Other',
  84         'Bindu',
  85         'Visarga',
  86         'Avagraha',
  87         'Nukta',
  88         'Virama',
  89         'Pure_Killer',
  90         'Invisible_Stacker',
  91         'Vowel_Independent',
  92         'Vowel_Dependent',
  93         'Vowel',
  94         'Consonant_Placeholder',
  95         'Consonant',
  96         'Consonant_Dead',
  97         'Consonant_With_Stacker',
  98         'Consonant_Prefixed',
  99         'Consonant_Preceding_Repha',
 100         'Consonant_Succeeding_Repha',
 101         'Consonant_Subjoined',
 102         'Consonant_Medial',
 103         'Consonant_Final',
 104         'Consonant_Head_Letter',
 105         'Consonant_Initial_Postfixed',
 106         'Modifying_Letter',
 107         'Tone_Letter',
 108         'Tone_Mark',
 109         'Gemination_Mark',
 110         'Cantillation_Mark',
 111         'Register_Shifter',
 112         'Syllable_Modifier',
 113         'Consonant_Killer',
 114         'Non_Joiner',
 115         'Joiner',
 116         'Number_Joiner',
 117         'Number',
 118         'Brahmi_Joining_Number',
 119         # Indic_Positional_Category
 120         'Not_Applicable',
 121         'Right',
 122         'Left',
 123         'Visual_Order_Left',
 124         'Left_And_Right',
 125         'Top',
 126         'Bottom',
 127         'Top_And_Bottom',
 128         'Top_And_Right',
 129         'Top_And_Left',
 130         'Top_And_Left_And_Right',
 131         'Bottom_And_Left',
 132         'Bottom_And_Right',
 133         'Top_And_Bottom_And_Right',
 134         'Overstruck',
 135 ]
 136
 137 try:
 138         basestring
 139 except NameError:
 140         basestring = str
 141
 142 class PropertyValue(object):
 143         def __init__(self, name_):
 144                 self.name = name_
 145         def __str__(self):
 146                 return self.name
 147         def __eq__(self, other):
 148                 return self.name == (other if isinstance(other, basestring) else other.name)
 149         def __ne__(self, other):
 150                 return not (self == other)
 151         def __hash__(self):
 152                 return hash(str(self))
 153
 154 property_values = {}
 155
 156 for name in property_names:
 157         value = PropertyValue(name)
 158         assert value not in property_values
 159         assert value not in globals()
 160         property_values[name] = value
 161 globals().update(property_values)
 162
 163
 164 def is_BASE(U, UISC, UGC):
 165         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 166                         #SPEC-DRAFT Consonant_Placeholder,
 167                         Tone_Letter,
 168                         Vowel_Independent #SPEC-DRAFT
 169                         ] or
 170                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 171                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 172 def is_BASE_IND(U, UISC, UGC):
 173         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 174         return (UISC in [Consonant_Dead, Modifying_Letter] or
 175                 (UGC == Po and not U in [0x104B, 0x104E, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or
 176                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 177                 )
 178 def is_BASE_NUM(U, UISC, UGC):
 179         return UISC == Brahmi_Joining_Number
 180 def is_BASE_OTHER(U, UISC, UGC):
 181         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 182         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 183         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 184 def is_CGJ(U, UISC, UGC):
 185         return U == 0x034F
 186 def is_CONS_FINAL(U, UISC, UGC):
 187         # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
 188         return ((UISC == Consonant_Final and UGC != Lo) or
 189                 UISC == Consonant_Initial_Postfixed or
 190                 UISC == Consonant_Succeeding_Repha)
 191 def is_CONS_FINAL_MOD(U, UISC, UGC):
 192         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 193         return  UISC == Syllable_Modifier
 194 def is_CONS_MED(U, UISC, UGC):
 195         return UISC == Consonant_Medial and UGC != Lo
 196 def is_CONS_MOD(U, UISC, UGC):
 197         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 198 def is_CONS_SUB(U, UISC, UGC):
 199         #SPEC-DRAFT return UISC == Consonant_Subjoined
 200         return UISC == Consonant_Subjoined and UGC != Lo
 201 def is_CONS_WITH_STACKER(U, UISC, UGC):
 202         return UISC == Consonant_With_Stacker
 203 def is_HALANT(U, UISC, UGC):
 204         return UISC in [Virama, Invisible_Stacker] and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
 205 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
 206         # https://github.com/harfbuzz/harfbuzz/issues/1102
 207         # https://github.com/harfbuzz/harfbuzz/issues/1379
 208         return U in [0x11046, 0x1134D]
 209 def is_HALANT_NUM(U, UISC, UGC):
 210         return UISC == Number_Joiner
 211 def is_ZWNJ(U, UISC, UGC):
 212         return UISC == Non_Joiner
 213 def is_ZWJ(U, UISC, UGC):
 214         return UISC == Joiner
 215 def is_Word_Joiner(U, UISC, UGC):
 216         return U == 0x2060
 217 def is_OTHER(U, UISC, UGC):
 218         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 219         return (UISC == Other
 220                 and not is_SYM_MOD(U, UISC, UGC)
 221                 and not is_CGJ(U, UISC, UGC)
 222                 and not is_Word_Joiner(U, UISC, UGC)
 223                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 224         )
 225 def is_Reserved(U, UISC, UGC):
 226         return UGC == 'Cn'
 227 def is_REPHA(U, UISC, UGC):
 228         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 229 def is_SYM(U, UISC, UGC):
 230         if U == 0x25CC: return False #SPEC-DRAFT
 231         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 232         return UGC in [So, Sc]
 233 def is_SYM_MOD(U, UISC, UGC):
 234         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 235 def is_VARIATION_SELECTOR(U, UISC, UGC):
 236         return 0xFE00 <= U <= 0xFE0F
 237 def is_VOWEL(U, UISC, UGC):
 238         # https://github.com/roozbehp/unicode-data/issues/6
 239         return (UISC == Pure_Killer or
 240                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
 241 def is_VOWEL_MOD(U, UISC, UGC):
 242         # https://github.com/roozbehp/unicode-data/issues/6
 243         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 244                 (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 245
 246 use_mapping = {
 247         'B':    is_BASE,
 248         'IND':  is_BASE_IND,
 249         'N':    is_BASE_NUM,
 250         'GB':   is_BASE_OTHER,
 251         'CGJ':  is_CGJ,
 252         'F':    is_CONS_FINAL,
 253         'FM':   is_CONS_FINAL_MOD,
 254         'M':    is_CONS_MED,
 255         'CM':   is_CONS_MOD,
 256         'SUB':  is_CONS_SUB,
 257         'CS':   is_CONS_WITH_STACKER,
 258         'H':    is_HALANT,
 259         'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
 260         'HN':   is_HALANT_NUM,
 261         'ZWNJ': is_ZWNJ,
 262         'ZWJ':  is_ZWJ,
 263         'WJ':   is_Word_Joiner,
 264         'O':    is_OTHER,
 265         'Rsv':  is_Reserved,
 266         'R':    is_REPHA,
 267         'S':    is_SYM,
 268         'SM':   is_SYM_MOD,
 269         'VS':   is_VARIATION_SELECTOR,
 270         'V':    is_VOWEL,
 271         'VM':   is_VOWEL_MOD,
 272 }
 273
 274 use_positions = {
 275         'F': {
 276                 'Abv': [Top],
 277                 'Blw': [Bottom],
 278                 'Pst': [Right],
 279         },
 280         'M': {
 281                 'Abv': [Top],
 282                 'Blw': [Bottom, Bottom_And_Left],
 283                 'Pst': [Right],
 284                 'Pre': [Left],
 285         },
 286         'CM': {
 287                 'Abv': [Top],
 288                 'Blw': [Bottom],
 289         },
 290         'V': {
 291                 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
 292                 'Blw': [Bottom, Overstruck, Bottom_And_Right],
 293                 'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
 294                 'Pre': [Left],
 295         },
 296         'VM': {
 297                 'Abv': [Top],
 298                 'Blw': [Bottom, Overstruck],
 299                 'Pst': [Right],
 300                 'Pre': [Left],
 301         },
 302         'SM': {
 303                 'Abv': [Top],
 304                 'Blw': [Bottom],
 305         },
 306         'H': None,
 307         'HVM': None,
 308         'B': None,
 309         'FM': None,
 310         'SUB': None,
 311 }
 312
 313 def map_to_use(data):
 314         out = {}
 315         items = use_mapping.items()
 316         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 317
 318                 # Resolve Indic_Syllabic_Category
 319
 320                 # TODO: These don't have UISC assigned in Unicode 8.0, but have UIPC
 321                 if U == 0x17DD: UISC = Vowel_Dependent
 322                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 323
 324                 # Tibetan:
 325                 # TODO: These don't have UISC assigned in Unicode 11.0, but have UIPC
 326                 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
 327                 if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
 328                 # Overrides to allow NFC order matching syllable
 329                 # https://github.com/harfbuzz/harfbuzz/issues/1012
 330                 if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
 331                         if UIPC == Top:
 332                                 UIPC = Bottom
 333
 334                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
 335                 # also  https://github.com/harfbuzz/harfbuzz/issues/1012
 336                 if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
 337                         if UIPC == Top:
 338                                 UIPC = Bottom
 339                         elif UIPC == Bottom:
 340                                 UIPC = Top
 341
 342                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
 343                 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom
 344
 345                 # TODO: U+1CED should only be allowed after some of
 346                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 347                 if U == 0x1CED: UISC = Tone_Mark
 348
 349                 # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
 350                 if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom
 351
 352                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/609
 353                 if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top
 354
 355                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/626
 356                 if U == 0xA8B4: UISC = Consonant_Medial
 357
 358                 # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
 359                 if U == 0x11134: UISC = Gemination_Mark
 360
 361                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1399
 362                 if U == 0x111C9: UISC = Consonant_Final
 363
 364                 values = [k for k,v in items if v(U,UISC,UGC)]
 365                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 366                 USE = values[0]
 367
 368                 # Resolve Indic_Positional_Category
 369
 370                 # TODO: Not in Unicode 8.0 yet, but in spec.
 371                 if U == 0x1B6C: UIPC = Bottom
 372
 373                 # TODO: These should die, but have UIPC in Unicode 8.0
 374                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 375
 376                 # TODO: In USE's override list but not in Unicode 11.0
 377                 if U == 0x103C: UIPC = Left
 378
 379                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0
 380                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 381                 if U == 0x111CA: UIPC = Bottom
 382                 if U == 0x11300: UIPC = Top
 383                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
 384                 if U == 0x11302: UIPC = Top
 385                 if U == 0x1133C: UIPC = Bottom
 386                 if U == 0x1171E: UIPC = Left # Correct?!
 387                 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
 388                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 389                 # https://github.com/roozbehp/unicode-data/issues/8
 390                 if U == 0x0A51: UIPC = Bottom
 391
 392                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 393                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 394
 395                 pos_mapping = use_positions.get(USE, None)
 396                 if pos_mapping:
 397                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 398                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 399                         USE = USE + values[0]
 400
 401                 out[U] = (USE, UBlock)
 402         return out
 403
 404 defaults = ('O', 'No_Block')
 405 data = map_to_use(data)
 406
 407 print ("/* == Start of generated table == */")
 408 print ("/*")
 409 print (" * The following table is generated by running:")
 410 print (" *")
 411 print (" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
 412 print (" *")
 413 print (" * on files with these headers:")
 414 print (" *")
 415 for h in headers:
 416         for l in h:
 417                 print (" * %s" % (l.strip()))
 418 print (" */")
 419 print ()
 420 print ('#include "hb-ot-shape-complex-use.hh"')
 421 print ()
 422
 423 total = 0
 424 used = 0
 425 last_block = None
 426 def print_block (block, start, end, data):
 427         global total, used, last_block
 428         if block and block != last_block:
 429                 print ()
 430                 print ()
 431                 print ("  /* %s */" % block)
 432                 if start % 16:
 433                         print (' ' * (20 + (start % 16 * 6)), end='')
 434         num = 0
 435         assert start % 8 == 0
 436         assert (end+1) % 8 == 0
 437         for u in range (start, end+1):
 438                 if u % 16 == 0:
 439                         print ()
 440                         print ("  /* %04X */" % u, end='')
 441                 if u in data:
 442                         num += 1
 443                 d = data.get (u, defaults)
 444                 print ("%6s," % d[0], end='')
 445
 446         total += end - start + 1
 447         used += num
 448         if block:
 449                 last_block = block
 450
 451 uu = sorted (data.keys ())
 452
 453 last = -100000
 454 num = 0
 455 offset = 0
 456 starts = []
 457 ends = []
 458 print ('#pragma GCC diagnostic push')
 459 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
 460 for k,v in sorted(use_mapping.items()):
 461         if k in use_positions and use_positions[k]: continue
 462         print ("#define %s      USE_%s  /* %s */" % (k, k, v.__name__[3:]))
 463 for k,v in sorted(use_positions.items()):
 464         if not v: continue
 465         for suf in v.keys():
 466                 tag = k + suf
 467                 print ("#define %s      USE_%s" % (tag, tag))
 468 print ('#pragma GCC diagnostic pop')
 469 print ("")
 470 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
 471 for u in uu:
 472         if u <= last:
 473                 continue
 474         block = data[u][1]
 475
 476         start = u//8*8
 477         end = start+1
 478         while end in uu and block == data[end][1]:
 479                 end += 1
 480         end = (end-1)//8*8 + 7
 481
 482         if start != last + 1:
 483                 if start - last <= 1+16*3:
 484                         print_block (None, last+1, start-1, data)
 485                         last = start-1
 486                 else:
 487                         if last >= 0:
 488                                 ends.append (last + 1)
 489                                 offset += ends[-1] - starts[-1]
 490                         print ()
 491                         print ()
 492                         print ("#define use_offset_0x%04xu %d" % (start, offset))
 493                         starts.append (start)
 494
 495         print_block (block, start, end, data)
 496         last = end
 497 ends.append (last + 1)
 498 offset += ends[-1] - starts[-1]
 499 print ()
 500 print ()
 501 occupancy = used * 100. / total
 502 page_bits = 12
 503 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 504 print ()
 505 print ("USE_TABLE_ELEMENT_TYPE")
 506 print ("hb_use_get_category (hb_codepoint_t u)")
 507 print ("{")
 508 print ("  switch (u >> %d)" % page_bits)
 509 print ("  {")
 510 pages = set([u>>page_bits for u in starts+ends])
 511 for p in sorted(pages):
 512         print ("    case 0x%0Xu:" % p)
 513         for (start,end) in zip (starts, ends):
 514                 if p not in [start>>page_bits, end>>page_bits]: continue
 515                 offset = "use_offset_0x%04xu" % start
 516                 print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
 517         print ("      break;")
 518         print ("")
 519 print ("    default:")
 520 print ("      break;")
 521 print ("  }")
 522 print ("  return USE_O;")
 523 print ("}")
 524 print ()
 525 for k in sorted(use_mapping.keys()):
 526         if k in use_positions and use_positions[k]: continue
 527         print ("#undef %s" % k)
 528 for k,v in sorted(use_positions.items()):
 529         if not v: continue
 530         for suf in v.keys():
 531                 tag = k + suf
 532                 print ("#undef %s" % tag)
 533 print ()
 534 print ("/* == End of generated table == */")
 535
 536 # Maintain at least 50% occupancy in the table */
 537 if occupancy < 50:
 538         raise Exception ("Table too sparse, please investigate: ", occupancy)