src/gen-use-table.py

   1 #!/usr/bin/python
   2
   3 import sys
   4
   5 if len (sys.argv) != 5:
   6         print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
   7         sys.exit (1)
   8
   9 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
  10
  11 files = [file (x) for x in sys.argv[1:]]
  12
  13 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  14 headers.append (["UnicodeData.txt does not have a header."])
  15
  16 data = [{} for f in files]
  17 values = [{} for f in files]
  18 for i, f in enumerate (files):
  19         for line in f:
  20
  21                 j = line.find ('#')
  22                 if j >= 0:
  23                         line = line[:j]
  24
  25                 fields = [x.strip () for x in line.split (';')]
  26                 if len (fields) == 1:
  27                         continue
  28
  29                 uu = fields[0].split ('..')
  30                 start = int (uu[0], 16)
  31                 if len (uu) == 1:
  32                         end = start
  33                 else:
  34                         end = int (uu[1], 16)
  35
  36                 t = fields[1 if i != 2 else 2]
  37
  38                 for u in range (start, end + 1):
  39                         data[i][u] = t
  40                 values[i][t] = values[i].get (t, 0) + end - start + 1
  41
  42 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  43
  44 # TODO Characters that are not in Unicode Indic files, but used in USE
  45 data[0][0x034F] = defaults[0]
  46 data[0][0x2060] = defaults[0]
  47 data[0][0x20F0] = defaults[0]
  48 for u in range (0xFE00, 0xFE0F + 1):
  49         data[0][u] = defaults[0]
  50
  51 # Merge data into one dict:
  52 for i,v in enumerate (defaults):
  53         values[i][v] = values[i].get (v, 0) + 1
  54 combined = {}
  55 for i,d in enumerate (data):
  56         for u,v in d.items ():
  57                 if i >= 2 and not u in combined:
  58                         continue
  59                 if not u in combined:
  60                         combined[u] = list (defaults)
  61                 combined[u][i] = v
  62 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  63 data = combined
  64 del combined
  65 num = len (data)
  66
  67
  68 property_names = [
  69         # General_Category
  70         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  71         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  72         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
  73         # Indic_Syllabic_Category
  74         'Other',
  75         'Bindu',
  76         'Visarga',
  77         'Avagraha',
  78         'Nukta',
  79         'Virama',
  80         'Pure_Killer',
  81         'Invisible_Stacker',
  82         'Vowel_Independent',
  83         'Vowel_Dependent',
  84         'Vowel',
  85         'Consonant_Placeholder',
  86         'Consonant',
  87         'Consonant_Dead',
  88         'Consonant_With_Stacker',
  89         'Consonant_Prefixed',
  90         'Consonant_Preceding_Repha',
  91         'Consonant_Succeeding_Repha',
  92         'Consonant_Subjoined',
  93         'Consonant_Medial',
  94         'Consonant_Final',
  95         'Consonant_Head_Letter',
  96         'Modifying_Letter',
  97         'Tone_Letter',
  98         'Tone_Mark',
  99         'Gemination_Mark',
 100         'Cantillation_Mark',
 101         'Register_Shifter',
 102         'Syllable_Modifier',
 103         'Consonant_Killer',
 104         'Non_Joiner',
 105         'Joiner',
 106         'Number_Joiner',
 107         'Number',
 108         'Brahmi_Joining_Number',
 109         # Indic_Positional_Category
 110         'Not_Applicable',
 111         'Right',
 112         'Left',
 113         'Visual_Order_Left',
 114         'Left_And_Right',
 115         'Top',
 116         'Bottom',
 117         'Top_And_Bottom',
 118         'Top_And_Right',
 119         'Top_And_Left',
 120         'Top_And_Left_And_Right',
 121         'Bottom_And_Left',
 122         'Bottom_And_Right',
 123         'Top_And_Bottom_And_Right',
 124         'Overstruck',
 125 ]
 126
 127 class PropertyValue(object):
 128         def __init__(self, name_):
 129                 self.name = name_
 130         def __str__(self):
 131                 return self.name
 132         def __eq__(self, other):
 133                 return self.name == (other if isinstance(other, basestring) else other.name)
 134         def __ne__(self, other):
 135                 return not (self == other)
 136
 137 property_values = {}
 138
 139 for name in property_names:
 140         value = PropertyValue(name)
 141         assert value not in property_values
 142         assert value not in globals()
 143         property_values[name] = value
 144 globals().update(property_values)
 145
 146
 147 def is_BASE(U, UISC, UGC):
 148         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 149                         #SPEC-DRAFT Consonant_Placeholder,
 150                         Tone_Letter,
 151                         Vowel_Independent #SPEC-DRAFT
 152                         ] or
 153                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 154                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 155 def is_BASE_IND(U, UISC, UGC):
 156         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 157         return (UISC in [Consonant_Dead, Modifying_Letter] or
 158                 (UGC == Po and not U in [0x104E, 0x2022, 0x11A3F, 0x11A45]) or
 159                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 160                 )
 161 def is_BASE_NUM(U, UISC, UGC):
 162         return UISC == Brahmi_Joining_Number
 163 def is_BASE_OTHER(U, UISC, UGC):
 164         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 165         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 166         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 167 def is_CGJ(U, UISC, UGC):
 168         return U == 0x034F
 169 def is_CONS_FINAL(U, UISC, UGC):
 170         return ((UISC == Consonant_Final and UGC != Lo) or
 171                 UISC == Consonant_Succeeding_Repha)
 172 def is_CONS_FINAL_MOD(U, UISC, UGC):
 173         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 174         return  UISC == Syllable_Modifier
 175 def is_CONS_MED(U, UISC, UGC):
 176         return UISC == Consonant_Medial and UGC != Lo
 177 def is_CONS_MOD(U, UISC, UGC):
 178         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 179 def is_CONS_SUB(U, UISC, UGC):
 180         #SPEC-DRAFT return UISC == Consonant_Subjoined
 181         return UISC == Consonant_Subjoined and UGC != Lo
 182 def is_CONS_WITH_STACKER(U, UISC, UGC):
 183         return UISC == Consonant_With_Stacker
 184 def is_HALANT(U, UISC, UGC):
 185         return UISC in [Virama, Invisible_Stacker]
 186 def is_HALANT_NUM(U, UISC, UGC):
 187         return UISC == Number_Joiner
 188 def is_ZWNJ(U, UISC, UGC):
 189         return UISC == Non_Joiner
 190 def is_ZWJ(U, UISC, UGC):
 191         return UISC == Joiner
 192 def is_Word_Joiner(U, UISC, UGC):
 193         return U == 0x2060
 194 def is_OTHER(U, UISC, UGC):
 195         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 196         return (UISC == Other
 197                 and not is_SYM_MOD(U, UISC, UGC)
 198                 and not is_CGJ(U, UISC, UGC)
 199                 and not is_Word_Joiner(U, UISC, UGC)
 200                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 201         )
 202 def is_Reserved(U, UISC, UGC):
 203         return UGC == 'Cn'
 204 def is_REPHA(U, UISC, UGC):
 205         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 206 def is_SYM(U, UISC, UGC):
 207         if U == 0x25CC: return False #SPEC-DRAFT
 208         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 209         return UGC in [So, Sc]
 210 def is_SYM_MOD(U, UISC, UGC):
 211         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 212 def is_VARIATION_SELECTOR(U, UISC, UGC):
 213         return 0xFE00 <= U <= 0xFE0F
 214 def is_VOWEL(U, UISC, UGC):
 215         # https://github.com/roozbehp/unicode-data/issues/6
 216         return (UISC == Pure_Killer or
 217                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
 218 def is_VOWEL_MOD(U, UISC, UGC):
 219         # https://github.com/roozbehp/unicode-data/issues/6
 220         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 221                 (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 222
# USE category tag -> predicate taking (U, UISC, UGC).  map_to_use() asserts
# that exactly one predicate accepts each code point, so the predicates must
# stay mutually exclusive.  The tags also become the macro names written into
# the generated table.
use_mapping = {
        'B':    is_BASE,
        'IND':  is_BASE_IND,
        'N':    is_BASE_NUM,
        'GB':   is_BASE_OTHER,
        'CGJ':  is_CGJ,
        'F':    is_CONS_FINAL,
        'FM':   is_CONS_FINAL_MOD,
        'M':    is_CONS_MED,
        'CM':   is_CONS_MOD,
        'SUB':  is_CONS_SUB,
        'CS':   is_CONS_WITH_STACKER,
        'H':    is_HALANT,
        'HN':   is_HALANT_NUM,
        'ZWNJ': is_ZWNJ,
        'ZWJ':  is_ZWJ,
        'WJ':   is_Word_Joiner,
        'O':    is_OTHER,
        'Rsv':  is_Reserved,
        'R':    is_REPHA,
        'S':    is_SYM,
        'SM':   is_SYM_MOD,
        'VS':   is_VARIATION_SELECTOR,
        'V':    is_VOWEL,
        'VM':   is_VOWEL_MOD,
}
 249
# USE category tag -> positional refinement.
#
# A dict value maps a suffix ('Abv', 'Blw', 'Pst', 'Pre') to the
# Indic_Positional_Category values selecting it; map_to_use() appends the
# single matching suffix to the tag (e.g. 'V' + 'Abv') and asserts that
# exactly one suffix matches.
#
# A None value means the tag is allowed to carry a positional category but is
# emitted without a suffix.  Tags absent from this dict must have a UIPC of
# Not_Applicable or Visual_Order_Left, otherwise map_to_use() asserts.
use_positions = {
        'F': {
                'Abv': [Top],
                'Blw': [Bottom],
                'Pst': [Right],
        },
        'M': {
                'Abv': [Top],
                'Blw': [Bottom, Bottom_And_Left],
                'Pst': [Right],
                'Pre': [Left],
        },
        'CM': {
                'Abv': [Top],
                'Blw': [Bottom],
        },
        'V': {
                'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
                'Blw': [Bottom, Overstruck, Bottom_And_Right],
                'Pst': [Right],
                'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
        },
        'VM': {
                'Abv': [Top],
                'Blw': [Bottom, Overstruck],
                'Pst': [Right],
                'Pre': [Left],
        },
        'SM': {
                'Abv': [Top],
                'Blw': [Bottom],
        },
        'H': None,
        'B': None,
        'FM': None,
        'SUB': None,
}
 287
 288 def map_to_use(data):
 289         out = {}
 290         items = use_mapping.items()
 291         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 292
 293                 # Resolve Indic_Syllabic_Category
 294
 295                 # TODO: These don't have UISC assigned in Unicode 8.0, but
 296                 # have UIPC
 297                 if U == 0x17DD: UISC = Vowel_Dependent
 298                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 299
 300                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
 301                 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom
 302
 303                 # TODO: U+1CED should only be allowed after some of
 304                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 305                 if U == 0x1CED: UISC = Tone_Mark
 306
 307                 # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
 308                 if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom
 309
 310                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/609
 311                 if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top
 312
 313                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/626
 314                 if U == 0xA8B4: UISC = Consonant_Medial
 315
 316                 values = [k for k,v in items if v(U,UISC,UGC)]
 317                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 318                 USE = values[0]
 319
 320                 # Resolve Indic_Positional_Category
 321
 322                 # TODO: Not in Unicode 8.0 yet, but in spec.
 323                 if U == 0x1B6C: UIPC = Bottom
 324
 325                 # TODO: These should die, but have UIPC in Unicode 8.0
 326                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 327
 328                 # TODO: In USE's override list but not in Unicode 8.0
 329                 if U == 0x103C: UIPC = Left
 330
 331                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
 332                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 333                 if U == 0x111CA: UIPC = Bottom
 334                 if U == 0x11300: UIPC = Top
 335                 if U == 0x1133C: UIPC = Bottom
 336                 if U == 0x1171E: UIPC = Left # Correct?!
 337                 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
 338                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 339
 340                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 341                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 342
 343                 pos_mapping = use_positions.get(USE, None)
 344                 if pos_mapping:
 345                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 346                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 347                         USE = USE + values[0]
 348
 349                 out[U] = (USE, UBlock)
 350         return out
 351
 352 defaults = ('O', 'No_Block')
 353 data = map_to_use(data)
 354
 355 print "/* == Start of generated table == */"
 356 print "/*"
 357 print " * The following table is generated by running:"
 358 print " *"
 359 print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 360 print " *"
 361 print " * on files with these headers:"
 362 print " *"
 363 for h in headers:
 364         for l in h:
 365                 print " * %s" % (l.strip())
 366 print " */"
 367 print
 368 print '#include "hb-ot-shape-complex-use-private.hh"'
 369 print
 370
 371 total = 0
 372 used = 0
 373 last_block = None
 374 def print_block (block, start, end, data):
 375         global total, used, last_block
 376         if block and block != last_block:
 377                 print
 378                 print
 379                 print "  /* %s */" % block
 380                 if start % 16:
 381                         print ' ' * (20 + (start % 16 * 6)),
 382         num = 0
 383         assert start % 8 == 0
 384         assert (end+1) % 8 == 0
 385         for u in range (start, end+1):
 386                 if u % 16 == 0:
 387                         print
 388                         print "  /* %04X */" % u,
 389                 if u in data:
 390                         num += 1
 391                 d = data.get (u, defaults)
 392                 sys.stdout.write ("%6s," % d[0])
 393
 394         total += end - start + 1
 395         used += num
 396         if block:
 397                 last_block = block
 398
 399 uu = data.keys ()
 400 uu.sort ()
 401
 402 last = -100000
 403 num = 0
 404 offset = 0
 405 starts = []
 406 ends = []
 407 for k,v in sorted(use_mapping.items()):
 408         if k in use_positions and use_positions[k]: continue
 409         print "#define %s       USE_%s  /* %s */" % (k, k, v.__name__[3:])
 410 for k,v in sorted(use_positions.items()):
 411         if not v: continue
 412         for suf in v.keys():
 413                 tag = k + suf
 414                 print "#define %s       USE_%s" % (tag, tag)
 415 print ""
 416 print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
 417 for u in uu:
 418         if u <= last:
 419                 continue
 420         block = data[u][1]
 421
 422         start = u//8*8
 423         end = start+1
 424         while end in uu and block == data[end][1]:
 425                 end += 1
 426         end = (end-1)//8*8 + 7
 427
 428         if start != last + 1:
 429                 if start - last <= 1+16*3:
 430                         print_block (None, last+1, start-1, data)
 431                         last = start-1
 432                 else:
 433                         if last >= 0:
 434                                 ends.append (last + 1)
 435                                 offset += ends[-1] - starts[-1]
 436                         print
 437                         print
 438                         print "#define use_offset_0x%04xu %d" % (start, offset)
 439                         starts.append (start)
 440
 441         print_block (block, start, end, data)
 442         last = end
 443 ends.append (last + 1)
 444 offset += ends[-1] - starts[-1]
 445 print
 446 print
 447 occupancy = used * 100. / total
 448 page_bits = 12
 449 print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
 450 print
 451 print "USE_TABLE_ELEMENT_TYPE"
 452 print "hb_use_get_category (hb_codepoint_t u)"
 453 print "{"
 454 print "  switch (u >> %d)" % page_bits
 455 print "  {"
 456 pages = set([u>>page_bits for u in starts+ends])
 457 for p in sorted(pages):
 458         print "    case 0x%0Xu:" % p
 459         for (start,end) in zip (starts, ends):
 460                 if p not in [start>>page_bits, end>>page_bits]: continue
 461                 offset = "use_offset_0x%04xu" % start
 462                 print "      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
 463         print "      break;"
 464         print ""
 465 print "    default:"
 466 print "      break;"
 467 print "  }"
 468 print "  return USE_O;"
 469 print "}"
 470 print
 471 for k in sorted(use_mapping.keys()):
 472         if k in use_positions and use_positions[k]: continue
 473         print "#undef %s" % k
 474 for k,v in sorted(use_positions.items()):
 475         if not v: continue
 476         for suf in v.keys():
 477                 tag = k + suf
 478                 print "#undef %s" % tag
 479 print
 480 print "/* == End of generated table == */"
 481
 482 # Maintain at least 50% occupancy in the table */
 483 if occupancy < 50:
 484         raise Exception ("Table too sparse, please investigate: ", occupancy)