src/gen-use-table.py

   1 #!/usr/bin/env python
   2 # flake8: noqa
   3
   4 from __future__ import print_function, division, absolute_import
   5
   6 import io
   7 import sys
   8
   9 if len (sys.argv) != 5:
  10         print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
  11         sys.exit (1)
  12
  13 BLACKLISTED_BLOCKS = ["Thai", "Lao"]
  14
  15 files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]]
  16
  17 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  18 headers.append (["UnicodeData.txt does not have a header."])
  19
  20 data = [{} for f in files]
  21 values = [{} for f in files]
  22 for i, f in enumerate (files):
  23         for line in f:
  24
  25                 j = line.find ('#')
  26                 if j >= 0:
  27                         line = line[:j]
  28
  29                 fields = [x.strip () for x in line.split (';')]
  30                 if len (fields) == 1:
  31                         continue
  32
  33                 uu = fields[0].split ('..')
  34                 start = int (uu[0], 16)
  35                 if len (uu) == 1:
  36                         end = start
  37                 else:
  38                         end = int (uu[1], 16)
  39
  40                 t = fields[1 if i != 2 else 2]
  41
  42                 for u in range (start, end + 1):
  43                         data[i][u] = t
  44                 values[i][t] = values[i].get (t, 0) + end - start + 1
  45
  46 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  47
  48 # TODO Characters that are not in Unicode Indic files, but used in USE
  49 data[0][0x034F] = defaults[0]
  50 data[0][0x2060] = defaults[0]
  51 # TODO https://github.com/roozbehp/unicode-data/issues/9
  52 data[0][0x11C44] = 'Consonant_Placeholder'
  53 data[0][0x11C45] = 'Consonant_Placeholder'
  54 # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
  55 data[0][0x111C8] = 'Consonant_Placeholder'
  56 for u in range (0xFE00, 0xFE0F + 1):
  57         data[0][u] = defaults[0]
  58
  59 # Merge data into one dict:
  60 for i,v in enumerate (defaults):
  61         values[i][v] = values[i].get (v, 0) + 1
  62 combined = {}
  63 for i,d in enumerate (data):
  64         for u,v in d.items ():
  65                 if i >= 2 and not u in combined:
  66                         continue
  67                 if not u in combined:
  68                         combined[u] = list (defaults)
  69                 combined[u][i] = v
  70 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  71 data = combined
  72 del combined
  73 num = len (data)
  74
  75
# Names of every property value the script knows about, grouped by the
# Unicode property they belong to.  Each name is turned into a PropertyValue
# and injected into the module globals further down, so the names must be
# unique across all three groups.
property_names = [
        # General_Category
        'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
        'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
        'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
        # Indic_Syllabic_Category
        'Other',
        'Bindu',
        'Visarga',
        'Avagraha',
        'Nukta',
        'Virama',
        'Pure_Killer',
        'Invisible_Stacker',
        'Vowel_Independent',
        'Vowel_Dependent',
        'Vowel',
        'Consonant_Placeholder',
        'Consonant',
        'Consonant_Dead',
        'Consonant_With_Stacker',
        'Consonant_Prefixed',
        'Consonant_Preceding_Repha',
        'Consonant_Succeeding_Repha',
        'Consonant_Subjoined',
        'Consonant_Medial',
        'Consonant_Final',
        'Consonant_Head_Letter',
        'Consonant_Initial_Postfixed',
        'Modifying_Letter',
        'Tone_Letter',
        'Tone_Mark',
        'Gemination_Mark',
        'Cantillation_Mark',
        'Register_Shifter',
        'Syllable_Modifier',
        'Consonant_Killer',
        'Non_Joiner',
        'Joiner',
        'Number_Joiner',
        'Number',
        'Brahmi_Joining_Number',
        # Indic_Positional_Category
        'Not_Applicable',
        'Right',
        'Left',
        'Visual_Order_Left',
        'Left_And_Right',
        'Top',
        'Bottom',
        'Top_And_Bottom',
        'Top_And_Right',
        'Top_And_Left',
        'Top_And_Left_And_Right',
        'Bottom_And_Left',
        'Bottom_And_Right',
        'Top_And_Bottom_And_Right',
        'Overstruck',
]
 135
 136 try:
 137         basestring
 138 except NameError:
 139         basestring = str
 140
 141 class PropertyValue(object):
 142         def __init__(self, name_):
 143                 self.name = name_
 144         def __str__(self):
 145                 return self.name
 146         def __eq__(self, other):
 147                 return self.name == (other if isinstance(other, basestring) else other.name)
 148         def __ne__(self, other):
 149                 return not (self == other)
 150         def __hash__(self):
 151                 return hash(str(self))
 152
# Registry of every PropertyValue, keyed by its name.
property_values = {}

for name in property_names:
        value = PropertyValue(name)
        # PropertyValue hashes and compares like its name, so these checks
        # reject duplicate names and names that would shadow an existing
        # module-level identifier.
        assert value not in property_values
        assert value not in globals()
        property_values[name] = value
# Publish every property value as a module-level name (Virama, Lo, ...), which
# is how the is_* predicates and tables below refer to them.
globals().update(property_values)
 161
 162
 163 def is_BASE(U, UISC, UGC):
 164         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 165                         #SPEC-DRAFT Consonant_Placeholder,
 166                         Tone_Letter,
 167                         Vowel_Independent #SPEC-DRAFT
 168                         ] or
 169                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 170                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 171 def is_BASE_IND(U, UISC, UGC):
 172         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 173         return (UISC in [Consonant_Dead, Modifying_Letter] or
 174                 (UGC == Po and not U in [0x104B, 0x104E, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or
 175                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 176                 )
 177 def is_BASE_NUM(U, UISC, UGC):
 178         return UISC == Brahmi_Joining_Number
 179 def is_BASE_OTHER(U, UISC, UGC):
 180         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 181         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 182         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 183 def is_CGJ(U, UISC, UGC):
 184         return U == 0x034F
 185 def is_CONS_FINAL(U, UISC, UGC):
 186         # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
 187         return ((UISC == Consonant_Final and UGC != Lo) or
 188                 UISC == Consonant_Initial_Postfixed or
 189                 UISC == Consonant_Succeeding_Repha)
 190 def is_CONS_FINAL_MOD(U, UISC, UGC):
 191         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 192         return  UISC == Syllable_Modifier
 193 def is_CONS_MED(U, UISC, UGC):
 194         return UISC == Consonant_Medial and UGC != Lo
 195 def is_CONS_MOD(U, UISC, UGC):
 196         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 197 def is_CONS_SUB(U, UISC, UGC):
 198         #SPEC-DRAFT return UISC == Consonant_Subjoined
 199         return UISC == Consonant_Subjoined and UGC != Lo
 200 def is_CONS_WITH_STACKER(U, UISC, UGC):
 201         return UISC == Consonant_With_Stacker
 202 def is_HALANT(U, UISC, UGC):
 203         return UISC in [Virama, Invisible_Stacker] and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
 204 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
 205         # https://github.com/harfbuzz/harfbuzz/issues/1102
 206         # https://github.com/harfbuzz/harfbuzz/issues/1379
 207         return U in [0x11046, 0x1134D]
 208 def is_HALANT_NUM(U, UISC, UGC):
 209         return UISC == Number_Joiner
 210 def is_ZWNJ(U, UISC, UGC):
 211         return UISC == Non_Joiner
 212 def is_ZWJ(U, UISC, UGC):
 213         return UISC == Joiner
 214 def is_Word_Joiner(U, UISC, UGC):
 215         return U == 0x2060
 216 def is_OTHER(U, UISC, UGC):
 217         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 218         return (UISC == Other
 219                 and not is_SYM_MOD(U, UISC, UGC)
 220                 and not is_CGJ(U, UISC, UGC)
 221                 and not is_Word_Joiner(U, UISC, UGC)
 222                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 223         )
 224 def is_Reserved(U, UISC, UGC):
 225         return UGC == 'Cn'
 226 def is_REPHA(U, UISC, UGC):
 227         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 228 def is_SYM(U, UISC, UGC):
 229         if U == 0x25CC: return False #SPEC-DRAFT
 230         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 231         return UGC in [So, Sc]
 232 def is_SYM_MOD(U, UISC, UGC):
 233         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 234 def is_VARIATION_SELECTOR(U, UISC, UGC):
 235         return 0xFE00 <= U <= 0xFE0F
 236 def is_VOWEL(U, UISC, UGC):
 237         # https://github.com/roozbehp/unicode-data/issues/6
 238         return (UISC == Pure_Killer or
 239                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
 240 def is_VOWEL_MOD(U, UISC, UGC):
 241         # https://github.com/roozbehp/unicode-data/issues/6
 242         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 243                 (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 244
# Maps each USE category name to the predicate that recognises it.  Every
# predicate takes (U, UISC, UGC); map_to_use() asserts that exactly one of
# them accepts any given code point.  The keys are also emitted as C macros
# (#define <key> USE_<key>) in the generated table.
use_mapping = {
        'B':    is_BASE,
        'IND':  is_BASE_IND,
        'N':    is_BASE_NUM,
        'GB':   is_BASE_OTHER,
        'CGJ':  is_CGJ,
        'F':    is_CONS_FINAL,
        'FM':   is_CONS_FINAL_MOD,
        'M':    is_CONS_MED,
        'CM':   is_CONS_MOD,
        'SUB':  is_CONS_SUB,
        'CS':   is_CONS_WITH_STACKER,
        'H':    is_HALANT,
        'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
        'HN':   is_HALANT_NUM,
        'ZWNJ': is_ZWNJ,
        'ZWJ':  is_ZWJ,
        'WJ':   is_Word_Joiner,
        'O':    is_OTHER,
        'Rsv':  is_Reserved,
        'R':    is_REPHA,
        'S':    is_SYM,
        'SM':   is_SYM_MOD,
        'VS':   is_VARIATION_SELECTOR,
        'V':    is_VOWEL,
        'VM':   is_VOWEL_MOD,
}
 272
# For each USE category that may carry an Indic_Positional_Category, maps a
# suffix ('Abv', 'Blw', 'Pst', 'Pre') to the positional values that select it;
# map_to_use() appends the matching suffix to the category name (e.g. 'VAbv').
# A value of None means the category is allowed to have a positional category
# but receives no suffix.  Categories absent from this dict must have a
# positional category of Not_Applicable or Visual_Order_Left.
use_positions = {
        'F': {
                'Abv': [Top],
                'Blw': [Bottom],
                'Pst': [Right],
        },
        'M': {
                'Abv': [Top],
                'Blw': [Bottom, Bottom_And_Left],
                'Pst': [Right],
                'Pre': [Left],
        },
        'CM': {
                'Abv': [Top],
                'Blw': [Bottom],
        },
        'V': {
                'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
                'Blw': [Bottom, Overstruck, Bottom_And_Right],
                'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
                'Pre': [Left],
        },
        'VM': {
                'Abv': [Top],
                'Blw': [Bottom, Overstruck],
                'Pst': [Right],
                'Pre': [Left],
        },
        'SM': {
                'Abv': [Top],
                'Blw': [Bottom],
        },
        'H': None,
        'HVM': None,
        'B': None,
        'FM': None,
        'SUB': None,
}
 311
def map_to_use(data):
        """Translate merged Unicode property data into USE categories.

        data maps a code point to a (UISC, UIPC, UGC, UBlock) sequence.  For
        every code point a number of hard-coded overrides are applied first,
        then the single matching use_mapping predicate picks the category and,
        where use_positions provides a suffix table, the positional suffix is
        appended to it.

        Returns a dict mapping code point -> (USE category string, UBlock).
        Raises AssertionError when a code point matches zero or several
        categories, or when its positional category cannot be resolved.
        """
        out = {}
        items = use_mapping.items()
        for U,(UISC,UIPC,UGC,UBlock) in data.items():

                # Resolve Indic_Syllabic_Category

                # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
                if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

                # Tibetan:
                # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
                if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
                if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
                # Overrides to allow NFC order matching syllable
                # https://github.com/harfbuzz/harfbuzz/issues/1012
                if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
                        if UIPC == Top:
                                UIPC = Bottom

                # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
                # also  https://github.com/harfbuzz/harfbuzz/issues/1012
                # Chakma vowels have Top and Bottom swapped.
                if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
                        if UIPC == Top:
                                UIPC = Bottom
                        elif UIPC == Bottom:
                                UIPC = Top

                # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
                if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom

                # TODO: U+1CED should only be allowed after some of
                # the nasalization marks, maybe only for U+1CE9..U+1CF1.
                if U == 0x1CED: UISC = Tone_Mark

                # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
                if U == 0x1A7F: UISC = Consonant_Final

                # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
                if U == 0x11134: UISC = Gemination_Mark

                # TODO: https://github.com/harfbuzz/harfbuzz/pull/1399
                if U == 0x111C9: UISC = Consonant_Final

                # Exactly one predicate in use_mapping must accept the character.
                values = [k for k,v in items if v(U,UISC,UGC)]
                assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
                USE = values[0]

                # Resolve Indic_Positional_Category

                # TODO: These should die, but have UIPC in Unicode 12.0
                if U in [0x953, 0x954]: UIPC = Not_Applicable

                # TODO: In USE's override list but not in Unicode 12.0
                if U == 0x103C: UIPC = Left

                # TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
                if 0xA926 <= U <= 0xA92A: UIPC = Top
                # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
                #  and https://github.com/harfbuzz/harfbuzz/issues/1631
                if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
                if U == 0x1171E: UIPC = Left
                if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

                # A real positional category is only acceptable for categories
                # listed in use_positions.
                assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                        USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

                # Append the positional suffix when the category has a suffix
                # table; categories mapped to None keep their bare name.
                pos_mapping = use_positions.get(USE, None)
                if pos_mapping:
                        values = [k for k,v in pos_mapping.items() if v and UIPC in v]
                        assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
                        USE = USE + values[0]

                out[U] = (USE, UBlock)
        return out
 387
 388 defaults = ('O', 'No_Block')
 389 data = map_to_use(data)
 390
 391 print ("/* == Start of generated table == */")
 392 print ("/*")
 393 print (" * The following table is generated by running:")
 394 print (" *")
 395 print (" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
 396 print (" *")
 397 print (" * on files with these headers:")
 398 print (" *")
 399 for h in headers:
 400         for l in h:
 401                 print (" * %s" % (l.strip()))
 402 print (" */")
 403 print ()
 404 print ('#include "hb-ot-shape-complex-use.hh"')
 405 print ()
 406
 407 total = 0
 408 used = 0
 409 last_block = None
 410 def print_block (block, start, end, data):
 411         global total, used, last_block
 412         if block and block != last_block:
 413                 print ()
 414                 print ()
 415                 print ("  /* %s */" % block)
 416                 if start % 16:
 417                         print (' ' * (20 + (start % 16 * 6)), end='')
 418         num = 0
 419         assert start % 8 == 0
 420         assert (end+1) % 8 == 0
 421         for u in range (start, end+1):
 422                 if u % 16 == 0:
 423                         print ()
 424                         print ("  /* %04X */" % u, end='')
 425                 if u in data:
 426                         num += 1
 427                 d = data.get (u, defaults)
 428                 print ("%6s," % d[0], end='')
 429
 430         total += end - start + 1
 431         used += num
 432         if block:
 433                 last_block = block
 434
 435 uu = sorted (data.keys ())
 436
 437 last = -100000
 438 num = 0
 439 offset = 0
 440 starts = []
 441 ends = []
 442 print ('#pragma GCC diagnostic push')
 443 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
 444 for k,v in sorted(use_mapping.items()):
 445         if k in use_positions and use_positions[k]: continue
 446         print ("#define %s      USE_%s  /* %s */" % (k, k, v.__name__[3:]))
 447 for k,v in sorted(use_positions.items()):
 448         if not v: continue
 449         for suf in v.keys():
 450                 tag = k + suf
 451                 print ("#define %s      USE_%s" % (tag, tag))
 452 print ('#pragma GCC diagnostic pop')
 453 print ("")
 454 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
 455 for u in uu:
 456         if u <= last:
 457                 continue
 458         block = data[u][1]
 459
 460         start = u//8*8
 461         end = start+1
 462         while end in uu and block == data[end][1]:
 463                 end += 1
 464         end = (end-1)//8*8 + 7
 465
 466         if start != last + 1:
 467                 if start - last <= 1+16*3:
 468                         print_block (None, last+1, start-1, data)
 469                         last = start-1
 470                 else:
 471                         if last >= 0:
 472                                 ends.append (last + 1)
 473                                 offset += ends[-1] - starts[-1]
 474                         print ()
 475                         print ()
 476                         print ("#define use_offset_0x%04xu %d" % (start, offset))
 477                         starts.append (start)
 478
 479         print_block (block, start, end, data)
 480         last = end
 481 ends.append (last + 1)
 482 offset += ends[-1] - starts[-1]
 483 print ()
 484 print ()
 485 occupancy = used * 100. / total
 486 page_bits = 12
 487 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 488 print ()
 489 print ("USE_TABLE_ELEMENT_TYPE")
 490 print ("hb_use_get_category (hb_codepoint_t u)")
 491 print ("{")
 492 print ("  switch (u >> %d)" % page_bits)
 493 print ("  {")
 494 pages = set([u>>page_bits for u in starts+ends])
 495 for p in sorted(pages):
 496         print ("    case 0x%0Xu:" % p)
 497         for (start,end) in zip (starts, ends):
 498                 if p not in [start>>page_bits, end>>page_bits]: continue
 499                 offset = "use_offset_0x%04xu" % start
 500                 print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
 501         print ("      break;")
 502         print ("")
 503 print ("    default:")
 504 print ("      break;")
 505 print ("  }")
 506 print ("  return USE_O;")
 507 print ("}")
 508 print ()
 509 for k in sorted(use_mapping.keys()):
 510         if k in use_positions and use_positions[k]: continue
 511         print ("#undef %s" % k)
 512 for k,v in sorted(use_positions.items()):
 513         if not v: continue
 514         for suf in v.keys():
 515                 tag = k + suf
 516                 print ("#undef %s" % tag)
 517 print ()
 518 print ("/* == End of generated table == */")
 519
 520 # Maintain at least 50% occupancy in the table */
 521 if occupancy < 50:
 522         raise Exception ("Table too sparse, please investigate: ", occupancy)