src/gen-use-table.py

   1 #!/usr/bin/env python
   2
   3 from __future__ import print_function, division, absolute_import
   4
   5 import io, sys
   6
   7 if len (sys.argv) != 5:
   8         print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
   9         sys.exit (1)
  10
  11 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
  12
  13 files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]]
  14
  15 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  16 headers.append (["UnicodeData.txt does not have a header."])
  17
  18 data = [{} for f in files]
  19 values = [{} for f in files]
  20 for i, f in enumerate (files):
  21         for line in f:
  22
  23                 j = line.find ('#')
  24                 if j >= 0:
  25                         line = line[:j]
  26
  27                 fields = [x.strip () for x in line.split (';')]
  28                 if len (fields) == 1:
  29                         continue
  30
  31                 uu = fields[0].split ('..')
  32                 start = int (uu[0], 16)
  33                 if len (uu) == 1:
  34                         end = start
  35                 else:
  36                         end = int (uu[1], 16)
  37
  38                 t = fields[1 if i != 2 else 2]
  39
  40                 for u in range (start, end + 1):
  41                         data[i][u] = t
  42                 values[i][t] = values[i].get (t, 0) + end - start + 1
  43
  44 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  45
  46 # TODO Characters that are not in Unicode Indic files, but used in USE
  47 data[0][0x034F] = defaults[0]
  48 data[0][0x2060] = defaults[0]
  49 data[0][0x20F0] = defaults[0]
  50 for u in range (0xFE00, 0xFE0F + 1):
  51         data[0][u] = defaults[0]
  52
  53 # Merge data into one dict:
  54 for i,v in enumerate (defaults):
  55         values[i][v] = values[i].get (v, 0) + 1
  56 combined = {}
  57 for i,d in enumerate (data):
  58         for u,v in d.items ():
  59                 if i >= 2 and not u in combined:
  60                         continue
  61                 if not u in combined:
  62                         combined[u] = list (defaults)
  63                 combined[u][i] = v
  64 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  65 data = combined
  66 del combined
  67 num = len (data)
  68
  69
  70 property_names = [
  71         # General_Category
  72         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  73         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  74         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
  75         # Indic_Syllabic_Category
  76         'Other',
  77         'Bindu',
  78         'Visarga',
  79         'Avagraha',
  80         'Nukta',
  81         'Virama',
  82         'Pure_Killer',
  83         'Invisible_Stacker',
  84         'Vowel_Independent',
  85         'Vowel_Dependent',
  86         'Vowel',
  87         'Consonant_Placeholder',
  88         'Consonant',
  89         'Consonant_Dead',
  90         'Consonant_With_Stacker',
  91         'Consonant_Prefixed',
  92         'Consonant_Preceding_Repha',
  93         'Consonant_Succeeding_Repha',
  94         'Consonant_Subjoined',
  95         'Consonant_Medial',
  96         'Consonant_Final',
  97         'Consonant_Head_Letter',
  98         'Consonant_Initial_Postfixed',
  99         'Modifying_Letter',
 100         'Tone_Letter',
 101         'Tone_Mark',
 102         'Gemination_Mark',
 103         'Cantillation_Mark',
 104         'Register_Shifter',
 105         'Syllable_Modifier',
 106         'Consonant_Killer',
 107         'Non_Joiner',
 108         'Joiner',
 109         'Number_Joiner',
 110         'Number',
 111         'Brahmi_Joining_Number',
 112         # Indic_Positional_Category
 113         'Not_Applicable',
 114         'Right',
 115         'Left',
 116         'Visual_Order_Left',
 117         'Left_And_Right',
 118         'Top',
 119         'Bottom',
 120         'Top_And_Bottom',
 121         'Top_And_Right',
 122         'Top_And_Left',
 123         'Top_And_Left_And_Right',
 124         'Bottom_And_Left',
 125         'Bottom_And_Right',
 126         'Top_And_Bottom_And_Right',
 127         'Overstruck',
 128 ]
 129
 130 try:
 131         basestring
 132 except NameError:
 133         basestring = str
 134
 135 class PropertyValue(object):
 136         def __init__(self, name_):
 137                 self.name = name_
 138         def __str__(self):
 139                 return self.name
 140         def __eq__(self, other):
 141                 return self.name == (other if isinstance(other, basestring) else other.name)
 142         def __ne__(self, other):
 143                 return not (self == other)
 144         def __hash__(self):
 145                 return hash(str(self))
 146
 147 property_values = {}
 148
 149 for name in property_names:
 150         value = PropertyValue(name)
 151         assert value not in property_values
 152         assert value not in globals()
 153         property_values[name] = value
 154 globals().update(property_values)
 155
 156
 157 def is_BASE(U, UISC, UGC):
 158         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 159                         #SPEC-DRAFT Consonant_Placeholder,
 160                         Tone_Letter,
 161                         Vowel_Independent #SPEC-DRAFT
 162                         ] or
 163                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 164                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 165 def is_BASE_IND(U, UISC, UGC):
 166         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 167         return (UISC in [Consonant_Dead, Modifying_Letter] or
 168                 (UGC == Po and not U in [0x104B, 0x104E, 0x2022, 0x11A3F, 0x11A45]) or
 169                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 170                 )
 171 def is_BASE_NUM(U, UISC, UGC):
 172         return UISC == Brahmi_Joining_Number
 173 def is_BASE_OTHER(U, UISC, UGC):
 174         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 175         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 176         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 177 def is_CGJ(U, UISC, UGC):
 178         return U == 0x034F
 179 def is_CONS_FINAL(U, UISC, UGC):
 180         # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
 181         return ((UISC == Consonant_Final and UGC != Lo) or
 182                 UISC == Consonant_Initial_Postfixed or
 183                 UISC == Consonant_Succeeding_Repha)
 184 def is_CONS_FINAL_MOD(U, UISC, UGC):
 185         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 186         return  UISC == Syllable_Modifier
 187 def is_CONS_MED(U, UISC, UGC):
 188         return UISC == Consonant_Medial and UGC != Lo
 189 def is_CONS_MOD(U, UISC, UGC):
 190         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 191 def is_CONS_SUB(U, UISC, UGC):
 192         #SPEC-DRAFT return UISC == Consonant_Subjoined
 193         return UISC == Consonant_Subjoined and UGC != Lo
 194 def is_CONS_WITH_STACKER(U, UISC, UGC):
 195         return UISC == Consonant_With_Stacker
 196 def is_HALANT(U, UISC, UGC):
 197         return UISC in [Virama, Invisible_Stacker]
 198 def is_HALANT_NUM(U, UISC, UGC):
 199         return UISC == Number_Joiner
 200 def is_ZWNJ(U, UISC, UGC):
 201         return UISC == Non_Joiner
 202 def is_ZWJ(U, UISC, UGC):
 203         return UISC == Joiner
 204 def is_Word_Joiner(U, UISC, UGC):
 205         return U == 0x2060
 206 def is_OTHER(U, UISC, UGC):
 207         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 208         return (UISC == Other
 209                 and not is_SYM_MOD(U, UISC, UGC)
 210                 and not is_CGJ(U, UISC, UGC)
 211                 and not is_Word_Joiner(U, UISC, UGC)
 212                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 213         )
 214 def is_Reserved(U, UISC, UGC):
 215         return UGC == 'Cn'
 216 def is_REPHA(U, UISC, UGC):
 217         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 218 def is_SYM(U, UISC, UGC):
 219         if U == 0x25CC: return False #SPEC-DRAFT
 220         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 221         return UGC in [So, Sc]
 222 def is_SYM_MOD(U, UISC, UGC):
 223         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 224 def is_VARIATION_SELECTOR(U, UISC, UGC):
 225         return 0xFE00 <= U <= 0xFE0F
 226 def is_VOWEL(U, UISC, UGC):
 227         # https://github.com/roozbehp/unicode-data/issues/6
 228         return (UISC == Pure_Killer or
 229                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
 230 def is_VOWEL_MOD(U, UISC, UGC):
 231         # https://github.com/roozbehp/unicode-data/issues/6
 232         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 233                 (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 234
 235 use_mapping = {
 236         'B':    is_BASE,
 237         'IND':  is_BASE_IND,
 238         'N':    is_BASE_NUM,
 239         'GB':   is_BASE_OTHER,
 240         'CGJ':  is_CGJ,
 241         'F':    is_CONS_FINAL,
 242         'FM':   is_CONS_FINAL_MOD,
 243         'M':    is_CONS_MED,
 244         'CM':   is_CONS_MOD,
 245         'SUB':  is_CONS_SUB,
 246         'CS':   is_CONS_WITH_STACKER,
 247         'H':    is_HALANT,
 248         'HN':   is_HALANT_NUM,
 249         'ZWNJ': is_ZWNJ,
 250         'ZWJ':  is_ZWJ,
 251         'WJ':   is_Word_Joiner,
 252         'O':    is_OTHER,
 253         'Rsv':  is_Reserved,
 254         'R':    is_REPHA,
 255         'S':    is_SYM,
 256         'SM':   is_SYM_MOD,
 257         'VS':   is_VARIATION_SELECTOR,
 258         'V':    is_VOWEL,
 259         'VM':   is_VOWEL_MOD,
 260 }
 261
 262 use_positions = {
 263         'F': {
 264                 'Abv': [Top],
 265                 'Blw': [Bottom],
 266                 'Pst': [Right],
 267         },
 268         'M': {
 269                 'Abv': [Top],
 270                 'Blw': [Bottom, Bottom_And_Left],
 271                 'Pst': [Right],
 272                 'Pre': [Left],
 273         },
 274         'CM': {
 275                 'Abv': [Top],
 276                 'Blw': [Bottom],
 277         },
 278         'V': {
 279                 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
 280                 'Blw': [Bottom, Overstruck, Bottom_And_Right],
 281                 'Pst': [Right],
 282                 'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
 283         },
 284         'VM': {
 285                 'Abv': [Top],
 286                 'Blw': [Bottom, Overstruck],
 287                 'Pst': [Right],
 288                 'Pre': [Left],
 289         },
 290         'SM': {
 291                 'Abv': [Top],
 292                 'Blw': [Bottom],
 293         },
 294         'H': None,
 295         'B': None,
 296         'FM': None,
 297         'SUB': None,
 298 }
 299
 300 def map_to_use(data):
 301         out = {}
 302         items = use_mapping.items()
 303         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 304
 305                 # Resolve Indic_Syllabic_Category
 306
 307                 # TODO: These don't have UISC assigned in Unicode 8.0, but
 308                 # have UIPC
 309                 if U == 0x17DD: UISC = Vowel_Dependent
 310                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 311
 312                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
 313                 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom
 314
 315                 # TODO: U+1CED should only be allowed after some of
 316                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 317                 if U == 0x1CED: UISC = Tone_Mark
 318
 319                 # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
 320                 if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom
 321
 322                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/609
 323                 if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top
 324
 325                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/626
 326                 if U == 0xA8B4: UISC = Consonant_Medial
 327
 328                 values = [k for k,v in items if v(U,UISC,UGC)]
 329                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 330                 USE = values[0]
 331
 332                 # Resolve Indic_Positional_Category
 333
 334                 # TODO: Not in Unicode 8.0 yet, but in spec.
 335                 if U == 0x1B6C: UIPC = Bottom
 336
 337                 # TODO: These should die, but have UIPC in Unicode 8.0
 338                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 339
 340                 # TODO: In USE's override list but not in Unicode 11.0
 341                 if U == 0x103C: UIPC = Left
 342
 343                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0
 344                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 345                 if U == 0x111CA: UIPC = Bottom
 346                 if U == 0x11300: UIPC = Top
 347                 if U == 0x1171E: UIPC = Left # Correct?!
 348                 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
 349                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 350                 # https://github.com/roozbehp/unicode-data/issues/8
 351                 if U == 0x0A51: UIPC = Bottom
 352
 353                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 354                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 355
 356                 pos_mapping = use_positions.get(USE, None)
 357                 if pos_mapping:
 358                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 359                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 360                         USE = USE + values[0]
 361
 362                 out[U] = (USE, UBlock)
 363         return out
 364
 365 defaults = ('O', 'No_Block')
 366 data = map_to_use(data)
 367
 368 print ("/* == Start of generated table == */")
 369 print ("/*")
 370 print (" * The following table is generated by running:")
 371 print (" *")
 372 print (" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
 373 print (" *")
 374 print (" * on files with these headers:")
 375 print (" *")
 376 for h in headers:
 377         for l in h:
 378                 print (" * %s" % (l.strip()))
 379 print (" */")
 380 print ()
 381 print ('#include "hb-ot-shape-complex-use-private.hh"')
 382 print ()
 383
 384 total = 0
 385 used = 0
 386 last_block = None
 387 def print_block (block, start, end, data):
 388         global total, used, last_block
 389         if block and block != last_block:
 390                 print ()
 391                 print ()
 392                 print ("  /* %s */" % block)
 393                 if start % 16:
 394                         print (' ' * (20 + (start % 16 * 6)), end='')
 395         num = 0
 396         assert start % 8 == 0
 397         assert (end+1) % 8 == 0
 398         for u in range (start, end+1):
 399                 if u % 16 == 0:
 400                         print ()
 401                         print ("  /* %04X */" % u, end='')
 402                 if u in data:
 403                         num += 1
 404                 d = data.get (u, defaults)
 405                 print ("%6s," % d[0], end='')
 406
 407         total += end - start + 1
 408         used += num
 409         if block:
 410                 last_block = block
 411
 412 uu = sorted (data.keys ())
 413
 414 last = -100000
 415 num = 0
 416 offset = 0
 417 starts = []
 418 ends = []
 419 for k,v in sorted(use_mapping.items()):
 420         if k in use_positions and use_positions[k]: continue
 421         print ("#define %s      USE_%s  /* %s */" % (k, k, v.__name__[3:]))
 422 for k,v in sorted(use_positions.items()):
 423         if not v: continue
 424         for suf in v.keys():
 425                 tag = k + suf
 426                 print ("#define %s      USE_%s" % (tag, tag))
 427 print ("")
 428 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
 429 for u in uu:
 430         if u <= last:
 431                 continue
 432         block = data[u][1]
 433
 434         start = u//8*8
 435         end = start+1
 436         while end in uu and block == data[end][1]:
 437                 end += 1
 438         end = (end-1)//8*8 + 7
 439
 440         if start != last + 1:
 441                 if start - last <= 1+16*3:
 442                         print_block (None, last+1, start-1, data)
 443                         last = start-1
 444                 else:
 445                         if last >= 0:
 446                                 ends.append (last + 1)
 447                                 offset += ends[-1] - starts[-1]
 448                         print ()
 449                         print ()
 450                         print ("#define use_offset_0x%04xu %d" % (start, offset))
 451                         starts.append (start)
 452
 453         print_block (block, start, end, data)
 454         last = end
 455 ends.append (last + 1)
 456 offset += ends[-1] - starts[-1]
 457 print ()
 458 print ()
 459 occupancy = used * 100. / total
 460 page_bits = 12
 461 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 462 print ()
 463 print ("USE_TABLE_ELEMENT_TYPE")
 464 print ("hb_use_get_category (hb_codepoint_t u)")
 465 print ("{")
 466 print ("  switch (u >> %d)" % page_bits)
 467 print ("  {")
 468 pages = set([u>>page_bits for u in starts+ends])
 469 for p in sorted(pages):
 470         print ("    case 0x%0Xu:" % p)
 471         for (start,end) in zip (starts, ends):
 472                 if p not in [start>>page_bits, end>>page_bits]: continue
 473                 offset = "use_offset_0x%04xu" % start
 474                 print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
 475         print ("      break;")
 476         print ("")
 477 print ("    default:")
 478 print ("      break;")
 479 print ("  }")
 480 print ("  return USE_O;")
 481 print ("}")
 482 print ()
 483 for k in sorted(use_mapping.keys()):
 484         if k in use_positions and use_positions[k]: continue
 485         print ("#undef %s" % k)
 486 for k,v in sorted(use_positions.items()):
 487         if not v: continue
 488         for suf in v.keys():
 489                 tag = k + suf
 490                 print ("#undef %s" % tag)
 491 print ()
 492 print ("/* == End of generated table == */")
 493
 494 # Maintain at least 50% occupancy in the table */
 495 if occupancy < 50:
 496         raise Exception ("Table too sparse, please investigate: ", occupancy)