src/gen-use-table.py

   1 #!/usr/bin/python
   2
   3 import sys
   4
   5 if len (sys.argv) != 5:
   6         print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
   7         sys.exit (1)
   8
   9 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
  10
  11 files = [file (x) for x in sys.argv[1:]]
  12
  13 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  14 headers.append (["UnicodeData.txt does not have a header."])
  15
  16 data = [{} for f in files]
  17 values = [{} for f in files]
  18 for i, f in enumerate (files):
  19         for line in f:
  20
  21                 j = line.find ('#')
  22                 if j >= 0:
  23                         line = line[:j]
  24
  25                 fields = [x.strip () for x in line.split (';')]
  26                 if len (fields) == 1:
  27                         continue
  28
  29                 uu = fields[0].split ('..')
  30                 start = int (uu[0], 16)
  31                 if len (uu) == 1:
  32                         end = start
  33                 else:
  34                         end = int (uu[1], 16)
  35
  36                 t = fields[1 if i != 2 else 2]
  37
  38                 for u in range (start, end + 1):
  39                         data[i][u] = t
  40                 values[i][t] = values[i].get (t, 0) + end - start + 1
  41
  42 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  43
  44 # TODO Characters that are not in Unicode Indic files, but used in USE
  45 data[0][0x034F] = defaults[0]
  46 data[0][0x2060] = defaults[0]
  47 for u in range (0xFE00, 0xFE0F + 1):
  48         data[0][u] = defaults[0]
  49
  50 # Merge data into one dict:
  51 for i,v in enumerate (defaults):
  52         values[i][v] = values[i].get (v, 0) + 1
  53 combined = {}
  54 for i,d in enumerate (data):
  55         for u,v in d.items ():
  56                 if i >= 2 and not u in combined:
  57                         continue
  58                 if not u in combined:
  59                         combined[u] = list (defaults)
  60                 combined[u][i] = v
  61 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  62 data = combined
  63 del combined
  64 num = len (data)
  65
  66
  67 property_names = [
  68         # General_Category
  69         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  70         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  71         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
  72         # Indic_Syllabic_Category
  73         'Other',
  74         'Bindu',
  75         'Visarga',
  76         'Avagraha',
  77         'Nukta',
  78         'Virama',
  79         'Pure_Killer',
  80         'Invisible_Stacker',
  81         'Vowel_Independent',
  82         'Vowel_Dependent',
  83         'Vowel',
  84         'Consonant_Placeholder',
  85         'Consonant',
  86         'Consonant_Dead',
  87         'Consonant_With_Stacker',
  88         'Consonant_Prefixed',
  89         'Consonant_Preceding_Repha',
  90         'Consonant_Succeeding_Repha',
  91         'Consonant_Subjoined',
  92         'Consonant_Medial',
  93         'Consonant_Final',
  94         'Consonant_Head_Letter',
  95         'Modifying_Letter',
  96         'Tone_Letter',
  97         'Tone_Mark',
  98         'Gemination_Mark',
  99         'Cantillation_Mark',
 100         'Register_Shifter',
 101         'Syllable_Modifier',
 102         'Consonant_Killer',
 103         'Non_Joiner',
 104         'Joiner',
 105         'Number_Joiner',
 106         'Number',
 107         'Brahmi_Joining_Number',
 108         # Indic_Positional_Category
 109         'Not_Applicable',
 110         'Right',
 111         'Left',
 112         'Visual_Order_Left',
 113         'Left_And_Right',
 114         'Top',
 115         'Bottom',
 116         'Top_And_Bottom',
 117         'Top_And_Right',
 118         'Top_And_Left',
 119         'Top_And_Left_And_Right',
 120         'Bottom_And_Right',
 121         'Top_And_Bottom_And_Right',
 122         'Overstruck',
 123 ]
 124
 125 class PropertyValue(object):
 126         def __init__(self, name_):
 127                 self.name = name_
 128         def __str__(self):
 129                 return self.name
 130         def __eq__(self, other):
 131                 return self.name == (other if isinstance(other, basestring) else other.name)
 132         def __ne__(self, other):
 133                 return not (self == other)
 134
 135 property_values = {}
 136
 137 for name in property_names:
 138         value = PropertyValue(name)
 139         assert value not in property_values
 140         assert value not in globals()
 141         property_values[name] = value
 142 globals().update(property_values)
 143
 144
 145 def is_BASE(U, UISC, UGC):
 146         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 147                         #SPEC-DRAFT Consonant_Placeholder,
 148                         Tone_Letter,
 149                         Vowel_Independent #SPEC-DRAFT
 150                         ] or
 151                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 152                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 153 def is_BASE_IND(U, UISC, UGC):
 154         #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 155         return (UISC in [Consonant_Dead, Modifying_Letter] or
 156                 (UGC == Po and not U in [0x104E, 0x2022]) or
 157                 False # SPEC-DRAFT-OUTDATED! U == 0x002D
 158                 )
 159 def is_BASE_NUM(U, UISC, UGC):
 160         return UISC == Brahmi_Joining_Number
 161 def is_BASE_OTHER(U, UISC, UGC):
 162         if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
 163         #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 164         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 165 def is_CGJ(U, UISC, UGC):
 166         return U == 0x034F
 167 def is_CONS_FINAL(U, UISC, UGC):
 168         return ((UISC == Consonant_Final and UGC != Lo) or
 169                 UISC == Consonant_Succeeding_Repha)
 170 def is_CONS_FINAL_MOD(U, UISC, UGC):
 171         #SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 172         return  UISC == Syllable_Modifier
 173 def is_CONS_MED(U, UISC, UGC):
 174         return UISC == Consonant_Medial and UGC != Lo
 175 def is_CONS_MOD(U, UISC, UGC):
 176         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 177 def is_CONS_SUB(U, UISC, UGC):
 178         #SPEC-DRAFT return UISC == Consonant_Subjoined
 179         return UISC == Consonant_Subjoined and UGC != Lo
 180 def is_HALANT(U, UISC, UGC):
 181         return UISC in [Virama, Invisible_Stacker]
 182 def is_HALANT_NUM(U, UISC, UGC):
 183         return UISC == Number_Joiner
 184 def is_ZWNJ(U, UISC, UGC):
 185         return UISC == Non_Joiner
 186 def is_ZWJ(U, UISC, UGC):
 187         return UISC == Joiner
 188 def is_Word_Joiner(U, UISC, UGC):
 189         return U == 0x2060
 190 def is_OTHER(U, UISC, UGC):
 191         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 192         return (UISC == Other
 193                 and not is_SYM_MOD(U, UISC, UGC)
 194                 and not is_CGJ(U, UISC, UGC)
 195                 and not is_Word_Joiner(U, UISC, UGC)
 196                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 197         )
 198 def is_Reserved(U, UISC, UGC):
 199         return UGC == 'Cn'
 200 def is_REPHA(U, UISC, UGC):
 201         #return UISC == Consonant_Preceding_Repha
 202         #SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
 203         return UISC in [Consonant_Preceding_Repha, Consonant_With_Stacker, Consonant_Prefixed]
 204 def is_SYM(U, UISC, UGC):
 205         if U == 0x25CC: return False #SPEC-DRAFT
 206         #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
 207         return UGC in [So, Sc]
 208 def is_SYM_MOD(U, UISC, UGC):
 209         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 210 def is_VARIATION_SELECTOR(U, UISC, UGC):
 211         return 0xFE00 <= U <= 0xFE0F
 212 def is_VOWEL(U, UISC, UGC):
 213         return (UISC == Pure_Killer or
 214                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent]))
 215 def is_VOWEL_MOD(U, UISC, UGC):
 216         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 217                 (UGC != Lo and UISC == Bindu))
 218
 219 use_mapping = {
 220         'B':    is_BASE,
 221         'IND':  is_BASE_IND,
 222         'N':    is_BASE_NUM,
 223         'GB':   is_BASE_OTHER,
 224         'CGJ':  is_CGJ,
 225         'F':    is_CONS_FINAL,
 226         'FM':   is_CONS_FINAL_MOD,
 227         'M':    is_CONS_MED,
 228         'CM':   is_CONS_MOD,
 229         'SUB':  is_CONS_SUB,
 230         'H':    is_HALANT,
 231         'HN':   is_HALANT_NUM,
 232         'ZWNJ': is_ZWNJ,
 233         'ZWJ':  is_ZWJ,
 234         'WJ':   is_Word_Joiner,
 235         'O':    is_OTHER,
 236         'Rsv':  is_Reserved,
 237         'R':    is_REPHA,
 238         'S':    is_SYM,
 239         'SM':   is_SYM_MOD,
 240         'VS':   is_VARIATION_SELECTOR,
 241         'V':    is_VOWEL,
 242         'VM':   is_VOWEL_MOD,
 243 }
 244
 245 use_positions = {
 246         'F': {
 247                 'Abv': [Top],
 248                 'Blw': [Bottom],
 249                 'Pst': [Right],
 250         },
 251         'M': {
 252                 'Abv': [Top],
 253                 'Blw': [Bottom],
 254                 'Pst': [Right],
 255                 'Pre': [Left],
 256         },
 257         'CM': {
 258                 'Abv': [Top],
 259                 'Blw': [Bottom],
 260         },
 261         'V': {
 262                 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
 263                 'Blw': [Bottom, Overstruck, Bottom_And_Right],
 264                 'Pst': [Right],
 265                 'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
 266         },
 267         'VM': {
 268                 'Abv': [Top],
 269                 'Blw': [Bottom, Overstruck],
 270                 'Pst': [Right],
 271                 'Pre': [Left],
 272         },
 273         'SM': {
 274                 'Abv': [Top],
 275                 'Blw': [Bottom],
 276         },
 277         'H': None,
 278         'B': None,
 279         'FM': None,
 280         'SUB': None,
 281 }
 282
 283 def map_to_use(data):
 284         out = {}
 285         items = use_mapping.items()
 286         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 287
 288                 # Resolve Indic_Syllabic_Category
 289
 290                 # TODO: These don't have UISC assigned in Unicode 8.0, but
 291                 # have UIPC
 292                 if U == 0x17DD: UISC = Vowel_Dependent
 293                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 294
 295                 # TODO: U+1CED should only be allowed after some of
 296                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 297                 if U == 0x1CED: UISC = Tone_Mark
 298
 299                 evals = [(k, v(U,UISC,UGC)) for k,v in items]
 300                 values = [k for k,v in evals if v]
 301                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 302                 USE = values[0]
 303
 304                 # Resolve Indic_Positional_Category
 305
 306                 # TODO: Not in Unicode 8.0 yet, but in spec.
 307                 if U == 0x1B6C: UIPC = Bottom
 308
 309                 # TODO: These should die, but have UIPC in Unicode 8.0
 310                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 311
 312                 # TODO: In USE's override list but not in Unicode 8.0
 313                 if U == 0x103C: UIPC = Left
 314
 315                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
 316                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 317                 if U == 0x111CA: UIPC = Bottom
 318                 if U == 0x11300: UIPC = Top
 319                 if U == 0x1133C: UIPC = Bottom
 320                 if U == 0x1171E: UIPC = Left # Correct?!
 321                 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
 322                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 323
 324                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 325                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 326
 327                 pos_mapping = use_positions.get(USE, None)
 328                 if pos_mapping:
 329                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 330                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 331                         USE = USE + values[0]
 332
 333                 out[U] = (USE, UBlock)
 334         return out
 335
 336 defaults = ('O', 'No_Block')
 337 data = map_to_use(data)
 338
 339 # Remove the outliers
 340 singles = {}
 341 for u in [0x034F, 0x25CC, 0x1107F]:
 342         singles[u] = data[u]
 343         del data[u]
 344
 345 print "/* == Start of generated table == */"
 346 print "/*"
 347 print " * The following table is generated by running:"
 348 print " *"
 349 print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 350 print " *"
 351 print " * on files with these headers:"
 352 print " *"
 353 for h in headers:
 354         for l in h:
 355                 print " * %s" % (l.strip())
 356 print " */"
 357 print
 358 print '#include "hb-ot-shape-complex-use-private.hh"'
 359 print
 360
 361 total = 0
 362 used = 0
 363 last_block = None
 364 def print_block (block, start, end, data):
 365         global total, used, last_block
 366         if block and block != last_block:
 367                 print
 368                 print
 369                 print "  /* %s */" % block
 370                 if start % 16:
 371                         print ' ' * (20 + (start % 16 * 6)),
 372         num = 0
 373         assert start % 8 == 0
 374         assert (end+1) % 8 == 0
 375         for u in range (start, end+1):
 376                 if u % 16 == 0:
 377                         print
 378                         print "  /* %04X */" % u,
 379                 if u in data:
 380                         num += 1
 381                 d = data.get (u, defaults)
 382                 sys.stdout.write ("%6s," % d[0])
 383
 384         total += end - start + 1
 385         used += num
 386         if block:
 387                 last_block = block
 388
 389 uu = data.keys ()
 390 uu.sort ()
 391
 392 last = -100000
 393 num = 0
 394 offset = 0
 395 starts = []
 396 ends = []
 397 for k,v in sorted(use_mapping.items()):
 398         if k in use_positions and use_positions[k]: continue
 399         print "#define %s       USE_%s  /* %s */" % (k, k, v.__name__[3:])
 400 for k,v in sorted(use_positions.items()):
 401         if not v: continue
 402         for suf in v.keys():
 403                 tag = k + suf
 404                 print "#define %s       USE_%s" % (tag, tag)
 405 print ""
 406 print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
 407 for u in uu:
 408         if u <= last:
 409                 continue
 410         block = data[u][1]
 411
 412         start = u//8*8
 413         end = start+1
 414         while end in uu and block == data[end][1]:
 415                 end += 1
 416         end = (end-1)//8*8 + 7
 417
 418         if start != last + 1:
 419                 if start - last <= 1+16*3:
 420                         print_block (None, last+1, start-1, data)
 421                         last = start-1
 422                 else:
 423                         if last >= 0:
 424                                 ends.append (last + 1)
 425                                 offset += ends[-1] - starts[-1]
 426                         print
 427                         print
 428                         print "#define use_offset_0x%04xu %d" % (start, offset)
 429                         starts.append (start)
 430
 431         print_block (block, start, end, data)
 432         last = end
 433 ends.append (last + 1)
 434 offset += ends[-1] - starts[-1]
 435 print
 436 print
 437 occupancy = used * 100. / total
 438 page_bits = 12
 439 print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
 440 print
 441 print "USE_TABLE_ELEMENT_TYPE"
 442 print "hb_use_get_categories (hb_codepoint_t u)"
 443 print "{"
 444 print "  switch (u >> %d)" % page_bits
 445 print "  {"
 446 pages = set([u>>page_bits for u in starts+ends+singles.keys()])
 447 for p in sorted(pages):
 448         print "    case 0x%0Xu:" % p
 449         for (start,end) in zip (starts, ends):
 450                 if p not in [start>>page_bits, end>>page_bits]: continue
 451                 offset = "use_offset_0x%04xu" % start
 452                 print "      if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
 453         for u,d in singles.items ():
 454                 if p != u>>page_bits: continue
 455                 print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
 456         print "      break;"
 457         print ""
 458 print "    default:"
 459 print "      break;"
 460 print "  }"
 461 print "  return USE_O;"
 462 print "}"
 463 print
 464 for k in sorted(use_mapping.keys()):
 465         if k in use_positions and use_positions[k]: continue
 466         print "#undef %s" % k
 467 for k,v in sorted(use_positions.items()):
 468         if not v: continue
 469         for suf in v.keys():
 470                 tag = k + suf
 471                 print "#undef %s" % tag
 472 print
 473 print "/* == End of generated table == */"
 474
 475 # Maintain at least 50% occupancy in the table */
 476 if occupancy < 50:
 477         raise Exception ("Table too sparse, please investigate: ", occupancy)