src/gen-use-table.py

   1 #!/usr/bin/env python3
   2 # flake8: noqa: F821
   3
   4 """usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt
   5
   6 Input file:
   7 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
   8 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
   9 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
  10 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
  11 """
  12
  13 import sys
  14
  15 if len (sys.argv) != 5:
  16         sys.exit (__doc__)
  17
  18 BLACKLISTED_BLOCKS = ["Thai", "Lao"]
  19
  20 files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
  21
  22 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  23 headers.append (["UnicodeData.txt does not have a header."])
  24
  25 data = [{} for f in files]
  26 values = [{} for f in files]
  27 for i, f in enumerate (files):
  28         for line in f:
  29
  30                 j = line.find ('#')
  31                 if j >= 0:
  32                         line = line[:j]
  33
  34                 fields = [x.strip () for x in line.split (';')]
  35                 if len (fields) == 1:
  36                         continue
  37
  38                 uu = fields[0].split ('..')
  39                 start = int (uu[0], 16)
  40                 if len (uu) == 1:
  41                         end = start
  42                 else:
  43                         end = int (uu[1], 16)
  44
  45                 t = fields[1 if i != 2 else 2]
  46
  47                 for u in range (start, end + 1):
  48                         data[i][u] = t
  49                 values[i][t] = values[i].get (t, 0) + end - start + 1
  50
  51 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  52
  53 # TODO Characters that are not in Unicode Indic files, but used in USE
  54 data[0][0x034F] = defaults[0]
  55 data[0][0x1B61] = defaults[0]
  56 data[0][0x1B63] = defaults[0]
  57 data[0][0x1B64] = defaults[0]
  58 data[0][0x1B65] = defaults[0]
  59 data[0][0x1B66] = defaults[0]
  60 data[0][0x1B67] = defaults[0]
  61 data[0][0x1B69] = defaults[0]
  62 data[0][0x1B6A] = defaults[0]
  63 data[0][0x2060] = defaults[0]
  64 # TODO https://github.com/harfbuzz/harfbuzz/pull/1685
  65 data[0][0x1B5B] = 'Consonant_Placeholder'
  66 data[0][0x1B5C] = 'Consonant_Placeholder'
  67 data[0][0x1B5F] = 'Consonant_Placeholder'
  68 data[0][0x1B62] = 'Consonant_Placeholder'
  69 data[0][0x1B68] = 'Consonant_Placeholder'
  70 # TODO https://github.com/harfbuzz/harfbuzz/issues/1035
  71 data[0][0x11C44] = 'Consonant_Placeholder'
  72 data[0][0x11C45] = 'Consonant_Placeholder'
  73 # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
  74 data[0][0x111C8] = 'Consonant_Placeholder'
  75 for u in range (0xFE00, 0xFE0F + 1):
  76         data[0][u] = defaults[0]
  77
  78 # Merge data into one dict:
  79 for i,v in enumerate (defaults):
  80         values[i][v] = values[i].get (v, 0) + 1
  81 combined = {}
  82 for i,d in enumerate (data):
  83         for u,v in d.items ():
  84                 if i >= 2 and not u in combined:
  85                         continue
  86                 if not u in combined:
  87                         combined[u] = list (defaults)
  88                 combined[u][i] = v
  89 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  90 data = combined
  91 del combined
  92 num = len (data)
  93
  94
  95 property_names = [
  96         # General_Category
  97         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  98         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  99         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
 100         # Indic_Syllabic_Category
 101         'Other',
 102         'Bindu',
 103         'Visarga',
 104         'Avagraha',
 105         'Nukta',
 106         'Virama',
 107         'Pure_Killer',
 108         'Invisible_Stacker',
 109         'Vowel_Independent',
 110         'Vowel_Dependent',
 111         'Vowel',
 112         'Consonant_Placeholder',
 113         'Consonant',
 114         'Consonant_Dead',
 115         'Consonant_With_Stacker',
 116         'Consonant_Prefixed',
 117         'Consonant_Preceding_Repha',
 118         'Consonant_Succeeding_Repha',
 119         'Consonant_Subjoined',
 120         'Consonant_Medial',
 121         'Consonant_Final',
 122         'Consonant_Head_Letter',
 123         'Consonant_Initial_Postfixed',
 124         'Modifying_Letter',
 125         'Tone_Letter',
 126         'Tone_Mark',
 127         'Gemination_Mark',
 128         'Cantillation_Mark',
 129         'Register_Shifter',
 130         'Syllable_Modifier',
 131         'Consonant_Killer',
 132         'Non_Joiner',
 133         'Joiner',
 134         'Number_Joiner',
 135         'Number',
 136         'Brahmi_Joining_Number',
 137         # Indic_Positional_Category
 138         'Not_Applicable',
 139         'Right',
 140         'Left',
 141         'Visual_Order_Left',
 142         'Left_And_Right',
 143         'Top',
 144         'Bottom',
 145         'Top_And_Bottom',
 146         'Top_And_Bottom_And_Left',
 147         'Top_And_Right',
 148         'Top_And_Left',
 149         'Top_And_Left_And_Right',
 150         'Bottom_And_Left',
 151         'Bottom_And_Right',
 152         'Top_And_Bottom_And_Right',
 153         'Overstruck',
 154 ]
 155
 156 class PropertyValue(object):
 157         def __init__(self, name_):
 158                 self.name = name_
 159         def __str__(self):
 160                 return self.name
 161         def __eq__(self, other):
 162                 return self.name == (other if isinstance(other, str) else other.name)
 163         def __ne__(self, other):
 164                 return not (self == other)
 165         def __hash__(self):
 166                 return hash(str(self))
 167
 168 property_values = {}
 169
 170 for name in property_names:
 171         value = PropertyValue(name)
 172         assert value not in property_values
 173         assert value not in globals()
 174         property_values[name] = value
 175 globals().update(property_values)
 176
 177
 178 def is_BASE(U, UISC, UGC):
 179         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 180                         #SPEC-DRAFT Consonant_Placeholder,
 181                         Tone_Letter,
 182                         Vowel_Independent #SPEC-DRAFT
 183                         ] or
 184                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 185                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 186 def is_BASE_IND(U, UISC, UGC):
 187         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 188         return (UISC in [Consonant_Dead, Modifying_Letter] or
 189                 (UGC == Po and not U in [0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or
 190                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 191                 )
 192 def is_BASE_NUM(U, UISC, UGC):
 193         return UISC == Brahmi_Joining_Number
 194 def is_BASE_OTHER(U, UISC, UGC):
 195         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 196         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 197         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 198 def is_CGJ(U, UISC, UGC):
 199         return U == 0x034F
 200 def is_CONS_FINAL(U, UISC, UGC):
 201         return ((UISC == Consonant_Final and UGC != Lo) or
 202                 UISC == Consonant_Succeeding_Repha)
 203 def is_CONS_FINAL_MOD(U, UISC, UGC):
 204         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 205         return  UISC == Syllable_Modifier
 206 def is_CONS_MED(U, UISC, UGC):
 207         # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
 208         return (UISC == Consonant_Medial and UGC != Lo or
 209                 UISC == Consonant_Initial_Postfixed)
 210 def is_CONS_MOD(U, UISC, UGC):
 211         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 212 def is_CONS_SUB(U, UISC, UGC):
 213         #SPEC-DRAFT return UISC == Consonant_Subjoined
 214         return UISC == Consonant_Subjoined and UGC != Lo
 215 def is_CONS_WITH_STACKER(U, UISC, UGC):
 216         return UISC == Consonant_With_Stacker
 217 def is_HALANT(U, UISC, UGC):
 218         return (UISC in [Virama, Invisible_Stacker]
 219                 and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
 220                 and not is_SAKOT(U, UISC, UGC))
 221 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
 222         # https://github.com/harfbuzz/harfbuzz/issues/1102
 223         # https://github.com/harfbuzz/harfbuzz/issues/1379
 224         return U in [0x11046, 0x1134D]
 225 def is_HALANT_NUM(U, UISC, UGC):
 226         return UISC == Number_Joiner
 227 def is_ZWNJ(U, UISC, UGC):
 228         return UISC == Non_Joiner
 229 def is_ZWJ(U, UISC, UGC):
 230         return UISC == Joiner
 231 def is_Word_Joiner(U, UISC, UGC):
 232         return U == 0x2060
 233 def is_OTHER(U, UISC, UGC):
 234         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 235         return (UISC == Other
 236                 and not is_SYM(U, UISC, UGC)
 237                 and not is_SYM_MOD(U, UISC, UGC)
 238                 and not is_CGJ(U, UISC, UGC)
 239                 and not is_Word_Joiner(U, UISC, UGC)
 240                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 241         )
 242 def is_Reserved(U, UISC, UGC):
 243         return UGC == 'Cn'
 244 def is_REPHA(U, UISC, UGC):
 245         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 246 def is_SAKOT(U, UISC, UGC):
 247         return U == 0x1A60
 248 def is_SYM(U, UISC, UGC):
 249         if U == 0x25CC: return False #SPEC-DRAFT
 250         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 251         return UGC in [So, Sc] and U not in [0x1B62, 0x1B68]
 252 def is_SYM_MOD(U, UISC, UGC):
 253         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 254 def is_VARIATION_SELECTOR(U, UISC, UGC):
 255         return 0xFE00 <= U <= 0xFE0F
 256 def is_VOWEL(U, UISC, UGC):
 257         # https://github.com/harfbuzz/harfbuzz/issues/376
 258         return (UISC == Pure_Killer or
 259                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
 260 def is_VOWEL_MOD(U, UISC, UGC):
 261         # https://github.com/harfbuzz/harfbuzz/issues/376
 262         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 263                 (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 264
 265 use_mapping = {
 266         'B':    is_BASE,
 267         'IND':  is_BASE_IND,
 268         'N':    is_BASE_NUM,
 269         'GB':   is_BASE_OTHER,
 270         'CGJ':  is_CGJ,
 271         'F':    is_CONS_FINAL,
 272         'FM':   is_CONS_FINAL_MOD,
 273         'M':    is_CONS_MED,
 274         'CM':   is_CONS_MOD,
 275         'SUB':  is_CONS_SUB,
 276         'CS':   is_CONS_WITH_STACKER,
 277         'H':    is_HALANT,
 278         'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
 279         'HN':   is_HALANT_NUM,
 280         'ZWNJ': is_ZWNJ,
 281         'ZWJ':  is_ZWJ,
 282         'WJ':   is_Word_Joiner,
 283         'O':    is_OTHER,
 284         'Rsv':  is_Reserved,
 285         'R':    is_REPHA,
 286         'S':    is_SYM,
 287         'Sk':   is_SAKOT,
 288         'SM':   is_SYM_MOD,
 289         'VS':   is_VARIATION_SELECTOR,
 290         'V':    is_VOWEL,
 291         'VM':   is_VOWEL_MOD,
 292 }
 293
 294 use_positions = {
 295         'F': {
 296                 'Abv': [Top],
 297                 'Blw': [Bottom],
 298                 'Pst': [Right],
 299         },
 300         'M': {
 301                 'Abv': [Top],
 302                 'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
 303                 'Pst': [Right],
 304                 'Pre': [Left, Top_And_Bottom_And_Left],
 305         },
 306         'CM': {
 307                 'Abv': [Top],
 308                 'Blw': [Bottom],
 309         },
 310         'V': {
 311                 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
 312                 'Blw': [Bottom, Overstruck, Bottom_And_Right],
 313                 'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
 314                 'Pre': [Left],
 315         },
 316         'VM': {
 317                 'Abv': [Top],
 318                 'Blw': [Bottom, Overstruck],
 319                 'Pst': [Right],
 320                 'Pre': [Left],
 321         },
 322         'SM': {
 323                 'Abv': [Top],
 324                 'Blw': [Bottom],
 325         },
 326         'H': None,
 327         'HVM': None,
 328         'B': None,
 329         'FM': {
 330                 'Abv': [Top],
 331                 'Blw': [Bottom],
 332                 'Pst': [Not_Applicable],
 333         },
 334         'SUB': None,
 335 }
 336
 337 def map_to_use(data):
 338         out = {}
 339         items = use_mapping.items()
 340         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 341
 342                 # Resolve Indic_Syllabic_Category
 343
 344                 # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
 345                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 346
 347                 # Tibetan:
 348                 # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
 349                 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
 350                 if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
 351                 # Overrides to allow NFC order matching syllable
 352                 # https://github.com/harfbuzz/harfbuzz/issues/1012
 353                 if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
 354                         if UIPC == Top:
 355                                 UIPC = Bottom
 356
 357                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
 358                 # also  https://github.com/harfbuzz/harfbuzz/issues/1012
 359                 if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
 360                         if UIPC == Top:
 361                                 UIPC = Bottom
 362                         elif UIPC == Bottom:
 363                                 UIPC = Top
 364
 365                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
 366                 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom
 367
 368                 # TODO: U+1CED should only be allowed after some of
 369                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 370                 if U == 0x1CED: UISC = Tone_Mark
 371
 372                 # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
 373                 if U == 0x11134: UISC = Gemination_Mark
 374
 375                 values = [k for k,v in items if v(U,UISC,UGC)]
 376                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 377                 USE = values[0]
 378
 379                 # Resolve Indic_Positional_Category
 380
 381                 # TODO: These should die, but have UIPC in Unicode 13.0.0
 382                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 383
 384                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/2012
 385                 if U == 0x1C29: UIPC = Left
 386
 387                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
 388                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 389                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
 390                 #  and https://github.com/harfbuzz/harfbuzz/issues/1631
 391                 if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
 392                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 393
 394                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 395                         USE == 'R' or
 396                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 397
 398                 pos_mapping = use_positions.get(USE, None)
 399                 if pos_mapping:
 400                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 401                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 402                         USE = USE + values[0]
 403
 404                 out[U] = (USE, UBlock)
 405         return out
 406
 407 defaults = ('O', 'No_Block')
 408 data = map_to_use(data)
 409
 410 print ("/* == Start of generated table == */")
 411 print ("/*")
 412 print (" * The following table is generated by running:")
 413 print (" *")
 414 print (" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
 415 print (" *")
 416 print (" * on files with these headers:")
 417 print (" *")
 418 for h in headers:
 419         for l in h:
 420                 print (" * %s" % (l.strip()))
 421 print (" */")
 422 print ()
 423 print ('#include "hb.hh"')
 424 print ()
 425 print ('#ifndef HB_NO_OT_SHAPE')
 426 print ()
 427 print ('#include "hb-ot-shape-complex-use.hh"')
 428 print ()
 429
 430 total = 0
 431 used = 0
 432 last_block = None
 433 def print_block (block, start, end, data):
 434         global total, used, last_block
 435         if block and block != last_block:
 436                 print ()
 437                 print ()
 438                 print ("  /* %s */" % block)
 439                 if start % 16:
 440                         print (' ' * (20 + (start % 16 * 6)), end='')
 441         num = 0
 442         assert start % 8 == 0
 443         assert (end+1) % 8 == 0
 444         for u in range (start, end+1):
 445                 if u % 16 == 0:
 446                         print ()
 447                         print ("  /* %04X */" % u, end='')
 448                 if u in data:
 449                         num += 1
 450                 d = data.get (u, defaults)
 451                 print ("%6s," % d[0], end='')
 452
 453         total += end - start + 1
 454         used += num
 455         if block:
 456                 last_block = block
 457
 458 uu = sorted (data.keys ())
 459
 460 last = -100000
 461 num = 0
 462 offset = 0
 463 starts = []
 464 ends = []
 465 print ('#pragma GCC diagnostic push')
 466 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
 467 for k,v in sorted(use_mapping.items()):
 468         if k in use_positions and use_positions[k]: continue
 469         print ("#define %s      USE_%s  /* %s */" % (k, k, v.__name__[3:]))
 470 for k,v in sorted(use_positions.items()):
 471         if not v: continue
 472         for suf in v.keys():
 473                 tag = k + suf
 474                 print ("#define %s      USE_%s" % (tag, tag))
 475 print ('#pragma GCC diagnostic pop')
 476 print ("")
 477 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
 478 for u in uu:
 479         if u <= last:
 480                 continue
 481         block = data[u][1]
 482
 483         start = u//8*8
 484         end = start+1
 485         while end in uu and block == data[end][1]:
 486                 end += 1
 487         end = (end-1)//8*8 + 7
 488
 489         if start != last + 1:
 490                 if start - last <= 1+16*3:
 491                         print_block (None, last+1, start-1, data)
 492                         last = start-1
 493                 else:
 494                         if last >= 0:
 495                                 ends.append (last + 1)
 496                                 offset += ends[-1] - starts[-1]
 497                         print ()
 498                         print ()
 499                         print ("#define use_offset_0x%04xu %d" % (start, offset))
 500                         starts.append (start)
 501
 502         print_block (block, start, end, data)
 503         last = end
 504 ends.append (last + 1)
 505 offset += ends[-1] - starts[-1]
 506 print ()
 507 print ()
 508 occupancy = used * 100. / total
 509 page_bits = 12
 510 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 511 print ()
 512 print ("USE_TABLE_ELEMENT_TYPE")
 513 print ("hb_use_get_category (hb_codepoint_t u)")
 514 print ("{")
 515 print ("  switch (u >> %d)" % page_bits)
 516 print ("  {")
 517 pages = set([u>>page_bits for u in starts+ends])
 518 for p in sorted(pages):
 519         print ("    case 0x%0Xu:" % p)
 520         for (start,end) in zip (starts, ends):
 521                 if p not in [start>>page_bits, end>>page_bits]: continue
 522                 offset = "use_offset_0x%04xu" % start
 523                 print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
 524         print ("      break;")
 525         print ("")
 526 print ("    default:")
 527 print ("      break;")
 528 print ("  }")
 529 print ("  return USE_O;")
 530 print ("}")
 531 print ()
 532 for k in sorted(use_mapping.keys()):
 533         if k in use_positions and use_positions[k]: continue
 534         print ("#undef %s" % k)
 535 for k,v in sorted(use_positions.items()):
 536         if not v: continue
 537         for suf in v.keys():
 538                 tag = k + suf
 539                 print ("#undef %s" % tag)
 540 print ()
 541 print ()
 542 print ('#endif')
 543 print ("/* == End of generated table == */")
 544
 545 # Maintain at least 50% occupancy in the table */
 546 if occupancy < 50:
 547         raise Exception ("Table too sparse, please investigate: ", occupancy)