src/gen-use-table.py

   1 #!/usr/bin/env python
   2 # flake8: noqa
   3
   4 from __future__ import print_function, division, absolute_import
   5
   6 import io
   7 import sys
   8
   9 if len (sys.argv) != 5:
  10         print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
  11         sys.exit (1)
  12
  13 BLACKLISTED_BLOCKS = ["Thai", "Lao"]
  14
  15 files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]]
  16
  17 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  18 headers.append (["UnicodeData.txt does not have a header."])
  19
  20 data = [{} for f in files]
  21 values = [{} for f in files]
  22 for i, f in enumerate (files):
  23         for line in f:
  24
  25                 j = line.find ('#')
  26                 if j >= 0:
  27                         line = line[:j]
  28
  29                 fields = [x.strip () for x in line.split (';')]
  30                 if len (fields) == 1:
  31                         continue
  32
  33                 uu = fields[0].split ('..')
  34                 start = int (uu[0], 16)
  35                 if len (uu) == 1:
  36                         end = start
  37                 else:
  38                         end = int (uu[1], 16)
  39
  40                 t = fields[1 if i != 2 else 2]
  41
  42                 for u in range (start, end + 1):
  43                         data[i][u] = t
  44                 values[i][t] = values[i].get (t, 0) + end - start + 1
  45
  46 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  47
  48 # TODO Characters that are not in Unicode Indic files, but used in USE
  49 data[0][0x034F] = defaults[0]
  50 data[0][0x1B61] = defaults[0]
  51 data[0][0x1B63] = defaults[0]
  52 data[0][0x1B64] = defaults[0]
  53 data[0][0x1B65] = defaults[0]
  54 data[0][0x1B66] = defaults[0]
  55 data[0][0x1B67] = defaults[0]
  56 data[0][0x1B69] = defaults[0]
  57 data[0][0x1B6A] = defaults[0]
  58 data[0][0x2060] = defaults[0]
  59 # TODO https://github.com/harfbuzz/harfbuzz/pull/1685
  60 data[0][0x1B5B] = 'Consonant_Placeholder'
  61 data[0][0x1B5C] = 'Consonant_Placeholder'
  62 data[0][0x1B5F] = 'Consonant_Placeholder'
  63 data[0][0x1B62] = 'Consonant_Placeholder'
  64 data[0][0x1B68] = 'Consonant_Placeholder'
  65 # TODO https://github.com/harfbuzz/harfbuzz/issues/1035
  66 data[0][0x11C44] = 'Consonant_Placeholder'
  67 data[0][0x11C45] = 'Consonant_Placeholder'
  68 # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
  69 data[0][0x111C8] = 'Consonant_Placeholder'
  70 for u in range (0xFE00, 0xFE0F + 1):
  71         data[0][u] = defaults[0]
  72
  73 # Merge data into one dict:
  74 for i,v in enumerate (defaults):
  75         values[i][v] = values[i].get (v, 0) + 1
  76 combined = {}
  77 for i,d in enumerate (data):
  78         for u,v in d.items ():
  79                 if i >= 2 and not u in combined:
  80                         continue
  81                 if not u in combined:
  82                         combined[u] = list (defaults)
  83                 combined[u][i] = v
  84 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  85 data = combined
  86 del combined
  87 num = len (data)
  88
  89
  90 property_names = [
  91         # General_Category
  92         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  93         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  94         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
  95         # Indic_Syllabic_Category
  96         'Other',
  97         'Bindu',
  98         'Visarga',
  99         'Avagraha',
 100         'Nukta',
 101         'Virama',
 102         'Pure_Killer',
 103         'Invisible_Stacker',
 104         'Vowel_Independent',
 105         'Vowel_Dependent',
 106         'Vowel',
 107         'Consonant_Placeholder',
 108         'Consonant',
 109         'Consonant_Dead',
 110         'Consonant_With_Stacker',
 111         'Consonant_Prefixed',
 112         'Consonant_Preceding_Repha',
 113         'Consonant_Succeeding_Repha',
 114         'Consonant_Subjoined',
 115         'Consonant_Medial',
 116         'Consonant_Final',
 117         'Consonant_Head_Letter',
 118         'Consonant_Initial_Postfixed',
 119         'Modifying_Letter',
 120         'Tone_Letter',
 121         'Tone_Mark',
 122         'Gemination_Mark',
 123         'Cantillation_Mark',
 124         'Register_Shifter',
 125         'Syllable_Modifier',
 126         'Consonant_Killer',
 127         'Non_Joiner',
 128         'Joiner',
 129         'Number_Joiner',
 130         'Number',
 131         'Brahmi_Joining_Number',
 132         # Indic_Positional_Category
 133         'Not_Applicable',
 134         'Right',
 135         'Left',
 136         'Visual_Order_Left',
 137         'Left_And_Right',
 138         'Top',
 139         'Bottom',
 140         'Top_And_Bottom',
 141         'Top_And_Right',
 142         'Top_And_Left',
 143         'Top_And_Left_And_Right',
 144         'Bottom_And_Left',
 145         'Bottom_And_Right',
 146         'Top_And_Bottom_And_Right',
 147         'Overstruck',
 148 ]
 149
 150 try:
 151         basestring
 152 except NameError:
 153         basestring = str
 154
 155 class PropertyValue(object):
 156         def __init__(self, name_):
 157                 self.name = name_
 158         def __str__(self):
 159                 return self.name
 160         def __eq__(self, other):
 161                 return self.name == (other if isinstance(other, basestring) else other.name)
 162         def __ne__(self, other):
 163                 return not (self == other)
 164         def __hash__(self):
 165                 return hash(str(self))
 166
 167 property_values = {}
 168
 169 for name in property_names:
 170         value = PropertyValue(name)
 171         assert value not in property_values
 172         assert value not in globals()
 173         property_values[name] = value
 174 globals().update(property_values)
 175
 176
 177 def is_BASE(U, UISC, UGC):
 178         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 179                         #SPEC-DRAFT Consonant_Placeholder,
 180                         Tone_Letter,
 181                         Vowel_Independent #SPEC-DRAFT
 182                         ] or
 183                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 184                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 185 def is_BASE_IND(U, UISC, UGC):
 186         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 187         return (UISC in [Consonant_Dead, Modifying_Letter] or
 188                 (UGC == Po and not U in [0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or
 189                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 190                 )
 191 def is_BASE_NUM(U, UISC, UGC):
 192         return UISC == Brahmi_Joining_Number
 193 def is_BASE_OTHER(U, UISC, UGC):
 194         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 195         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 196         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 197 def is_CGJ(U, UISC, UGC):
 198         return U == 0x034F
 199 def is_CONS_FINAL(U, UISC, UGC):
 200         return ((UISC == Consonant_Final and UGC != Lo) or
 201                 UISC == Consonant_Succeeding_Repha)
 202 def is_CONS_FINAL_MOD(U, UISC, UGC):
 203         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 204         return  UISC == Syllable_Modifier
 205 def is_CONS_MED(U, UISC, UGC):
 206         # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
 207         return (UISC == Consonant_Medial and UGC != Lo or
 208                 UISC == Consonant_Initial_Postfixed)
 209 def is_CONS_MOD(U, UISC, UGC):
 210         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 211 def is_CONS_SUB(U, UISC, UGC):
 212         #SPEC-DRAFT return UISC == Consonant_Subjoined
 213         return UISC == Consonant_Subjoined and UGC != Lo
 214 def is_CONS_WITH_STACKER(U, UISC, UGC):
 215         return UISC == Consonant_With_Stacker
 216 def is_HALANT(U, UISC, UGC):
 217         return (UISC in [Virama, Invisible_Stacker]
 218                 and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
 219                 and not is_SAKOT(U, UISC, UGC))
 220 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
 221         # https://github.com/harfbuzz/harfbuzz/issues/1102
 222         # https://github.com/harfbuzz/harfbuzz/issues/1379
 223         return U in [0x11046, 0x1134D]
 224 def is_HALANT_NUM(U, UISC, UGC):
 225         return UISC == Number_Joiner
 226 def is_ZWNJ(U, UISC, UGC):
 227         return UISC == Non_Joiner
 228 def is_ZWJ(U, UISC, UGC):
 229         return UISC == Joiner
 230 def is_Word_Joiner(U, UISC, UGC):
 231         return U == 0x2060
 232 def is_OTHER(U, UISC, UGC):
 233         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 234         return (UISC == Other
 235                 and not is_SYM(U, UISC, UGC)
 236                 and not is_SYM_MOD(U, UISC, UGC)
 237                 and not is_CGJ(U, UISC, UGC)
 238                 and not is_Word_Joiner(U, UISC, UGC)
 239                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 240         )
 241 def is_Reserved(U, UISC, UGC):
 242         return UGC == 'Cn'
 243 def is_REPHA(U, UISC, UGC):
 244         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 245 def is_SAKOT(U, UISC, UGC):
 246         return U == 0x1A60
 247 def is_SYM(U, UISC, UGC):
 248         if U == 0x25CC: return False #SPEC-DRAFT
 249         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 250         return UGC in [So, Sc] and U not in [0x1B62, 0x1B68]
 251 def is_SYM_MOD(U, UISC, UGC):
 252         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 253 def is_VARIATION_SELECTOR(U, UISC, UGC):
 254         return 0xFE00 <= U <= 0xFE0F
 255 def is_VOWEL(U, UISC, UGC):
 256         # https://github.com/harfbuzz/harfbuzz/issues/376
 257         return (UISC == Pure_Killer or
 258                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
 259 def is_VOWEL_MOD(U, UISC, UGC):
 260         # https://github.com/harfbuzz/harfbuzz/issues/376
 261         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 262                 (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 263
 264 use_mapping = {
 265         'B':    is_BASE,
 266         'IND':  is_BASE_IND,
 267         'N':    is_BASE_NUM,
 268         'GB':   is_BASE_OTHER,
 269         'CGJ':  is_CGJ,
 270         'F':    is_CONS_FINAL,
 271         'FM':   is_CONS_FINAL_MOD,
 272         'M':    is_CONS_MED,
 273         'CM':   is_CONS_MOD,
 274         'SUB':  is_CONS_SUB,
 275         'CS':   is_CONS_WITH_STACKER,
 276         'H':    is_HALANT,
 277         'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
 278         'HN':   is_HALANT_NUM,
 279         'ZWNJ': is_ZWNJ,
 280         'ZWJ':  is_ZWJ,
 281         'WJ':   is_Word_Joiner,
 282         'O':    is_OTHER,
 283         'Rsv':  is_Reserved,
 284         'R':    is_REPHA,
 285         'S':    is_SYM,
 286         'Sk':   is_SAKOT,
 287         'SM':   is_SYM_MOD,
 288         'VS':   is_VARIATION_SELECTOR,
 289         'V':    is_VOWEL,
 290         'VM':   is_VOWEL_MOD,
 291 }
 292
 293 use_positions = {
 294         'F': {
 295                 'Abv': [Top],
 296                 'Blw': [Bottom],
 297                 'Pst': [Right],
 298         },
 299         'M': {
 300                 'Abv': [Top],
 301                 'Blw': [Bottom, Bottom_And_Left],
 302                 'Pst': [Right],
 303                 'Pre': [Left],
 304         },
 305         'CM': {
 306                 'Abv': [Top],
 307                 'Blw': [Bottom],
 308         },
 309         'V': {
 310                 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
 311                 'Blw': [Bottom, Overstruck, Bottom_And_Right],
 312                 'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
 313                 'Pre': [Left],
 314         },
 315         'VM': {
 316                 'Abv': [Top],
 317                 'Blw': [Bottom, Overstruck],
 318                 'Pst': [Right],
 319                 'Pre': [Left],
 320         },
 321         'SM': {
 322                 'Abv': [Top],
 323                 'Blw': [Bottom],
 324         },
 325         'H': None,
 326         'HVM': None,
 327         'B': None,
 328         'FM': {
 329                 'Abv': [Top],
 330                 'Blw': [Bottom],
 331                 'Pst': [Not_Applicable],
 332         },
 333         'SUB': None,
 334 }
 335
 336 def map_to_use(data):
 337         out = {}
 338         items = use_mapping.items()
 339         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 340
 341                 # Resolve Indic_Syllabic_Category
 342
 343                 # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
 344                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 345
 346                 # Tibetan:
 347                 # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
 348                 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
 349                 if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
 350                 # Overrides to allow NFC order matching syllable
 351                 # https://github.com/harfbuzz/harfbuzz/issues/1012
 352                 if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
 353                         if UIPC == Top:
 354                                 UIPC = Bottom
 355
 356                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
 357                 # also  https://github.com/harfbuzz/harfbuzz/issues/1012
 358                 if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
 359                         if UIPC == Top:
 360                                 UIPC = Bottom
 361                         elif UIPC == Bottom:
 362                                 UIPC = Top
 363
 364                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
 365                 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom
 366
 367                 # TODO: U+1CED should only be allowed after some of
 368                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 369                 if U == 0x1CED: UISC = Tone_Mark
 370
 371                 # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
 372                 if U == 0x11134: UISC = Gemination_Mark
 373
 374                 values = [k for k,v in items if v(U,UISC,UGC)]
 375                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 376                 USE = values[0]
 377
 378                 # Resolve Indic_Positional_Category
 379
 380                 # TODO: These should die, but have UIPC in Unicode 12.0
 381                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 382
 383                 # TODO: In USE's override list but not in Unicode 12.0
 384                 if U == 0x103C: UIPC = Left
 385
 386                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/2012
 387                 if U == 0x1C29: UIPC = Left
 388
 389                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
 390                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 391                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
 392                 #  and https://github.com/harfbuzz/harfbuzz/issues/1631
 393                 if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
 394                 if U == 0x1171E: UIPC = Left
 395                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 396
 397                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 398                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 399
 400                 pos_mapping = use_positions.get(USE, None)
 401                 if pos_mapping:
 402                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 403                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 404                         USE = USE + values[0]
 405
 406                 out[U] = (USE, UBlock)
 407         return out
 408
 409 defaults = ('O', 'No_Block')
 410 data = map_to_use(data)
 411
 412 print ("/* == Start of generated table == */")
 413 print ("/*")
 414 print (" * The following table is generated by running:")
 415 print (" *")
 416 print (" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
 417 print (" *")
 418 print (" * on files with these headers:")
 419 print (" *")
 420 for h in headers:
 421         for l in h:
 422                 print (" * %s" % (l.strip()))
 423 print (" */")
 424 print ()
 425 print ('#include "hb.hh"')
 426 print ()
 427 print ('#ifndef HB_NO_OT_SHAPE')
 428 print ()
 429 print ('#include "hb-ot-shape-complex-use.hh"')
 430 print ()
 431
 432 total = 0
 433 used = 0
 434 last_block = None
 435 def print_block (block, start, end, data):
 436         global total, used, last_block
 437         if block and block != last_block:
 438                 print ()
 439                 print ()
 440                 print ("  /* %s */" % block)
 441                 if start % 16:
 442                         print (' ' * (20 + (start % 16 * 6)), end='')
 443         num = 0
 444         assert start % 8 == 0
 445         assert (end+1) % 8 == 0
 446         for u in range (start, end+1):
 447                 if u % 16 == 0:
 448                         print ()
 449                         print ("  /* %04X */" % u, end='')
 450                 if u in data:
 451                         num += 1
 452                 d = data.get (u, defaults)
 453                 print ("%6s," % d[0], end='')
 454
 455         total += end - start + 1
 456         used += num
 457         if block:
 458                 last_block = block
 459
 460 uu = sorted (data.keys ())
 461
 462 last = -100000
 463 num = 0
 464 offset = 0
 465 starts = []
 466 ends = []
 467 print ('#pragma GCC diagnostic push')
 468 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
 469 for k,v in sorted(use_mapping.items()):
 470         if k in use_positions and use_positions[k]: continue
 471         print ("#define %s      USE_%s  /* %s */" % (k, k, v.__name__[3:]))
 472 for k,v in sorted(use_positions.items()):
 473         if not v: continue
 474         for suf in v.keys():
 475                 tag = k + suf
 476                 print ("#define %s      USE_%s" % (tag, tag))
 477 print ('#pragma GCC diagnostic pop')
 478 print ("")
 479 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
 480 for u in uu:
 481         if u <= last:
 482                 continue
 483         block = data[u][1]
 484
 485         start = u//8*8
 486         end = start+1
 487         while end in uu and block == data[end][1]:
 488                 end += 1
 489         end = (end-1)//8*8 + 7
 490
 491         if start != last + 1:
 492                 if start - last <= 1+16*3:
 493                         print_block (None, last+1, start-1, data)
 494                         last = start-1
 495                 else:
 496                         if last >= 0:
 497                                 ends.append (last + 1)
 498                                 offset += ends[-1] - starts[-1]
 499                         print ()
 500                         print ()
 501                         print ("#define use_offset_0x%04xu %d" % (start, offset))
 502                         starts.append (start)
 503
 504         print_block (block, start, end, data)
 505         last = end
 506 ends.append (last + 1)
 507 offset += ends[-1] - starts[-1]
 508 print ()
 509 print ()
 510 occupancy = used * 100. / total
 511 page_bits = 12
 512 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 513 print ()
 514 print ("USE_TABLE_ELEMENT_TYPE")
 515 print ("hb_use_get_category (hb_codepoint_t u)")
 516 print ("{")
 517 print ("  switch (u >> %d)" % page_bits)
 518 print ("  {")
 519 pages = set([u>>page_bits for u in starts+ends])
 520 for p in sorted(pages):
 521         print ("    case 0x%0Xu:" % p)
 522         for (start,end) in zip (starts, ends):
 523                 if p not in [start>>page_bits, end>>page_bits]: continue
 524                 offset = "use_offset_0x%04xu" % start
 525                 print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
 526         print ("      break;")
 527         print ("")
 528 print ("    default:")
 529 print ("      break;")
 530 print ("  }")
 531 print ("  return USE_O;")
 532 print ("}")
 533 print ()
 534 for k in sorted(use_mapping.keys()):
 535         if k in use_positions and use_positions[k]: continue
 536         print ("#undef %s" % k)
 537 for k,v in sorted(use_positions.items()):
 538         if not v: continue
 539         for suf in v.keys():
 540                 tag = k + suf
 541                 print ("#undef %s" % tag)
 542 print ()
 543 print ()
 544 print ('#endif')
 545 print ("/* == End of generated table == */")
 546
 547 # Maintain at least 50% occupancy in the table */
 548 if occupancy < 50:
 549         raise Exception ("Table too sparse, please investigate: ", occupancy)