src/gen-use-table.py

   1 #!/usr/bin/python
   2
   3 import sys
   4
   5 if len (sys.argv) != 5:
   6         print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
   7         sys.exit (1)
   8
   9 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
  10
  11 files = [file (x) for x in sys.argv[1:]]
  12
  13 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
  14 headers.append (["UnicodeData.txt does not have a header."])
  15
  16 data = [{} for f in files]
  17 values = [{} for f in files]
  18 for i, f in enumerate (files):
  19         for line in f:
  20
  21                 j = line.find ('#')
  22                 if j >= 0:
  23                         line = line[:j]
  24
  25                 fields = [x.strip () for x in line.split (';')]
  26                 if len (fields) == 1:
  27                         continue
  28
  29                 uu = fields[0].split ('..')
  30                 start = int (uu[0], 16)
  31                 if len (uu) == 1:
  32                         end = start
  33                 else:
  34                         end = int (uu[1], 16)
  35
  36                 t = fields[1 if i != 2 else 2]
  37
  38                 for u in range (start, end + 1):
  39                         data[i][u] = t
  40                 values[i][t] = values[i].get (t, 0) + end - start + 1
  41
  42 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
  43
  44 # TODO Characters that are not in Unicode Indic files, but used in USE
  45 data[0][0x034F] = defaults[0]
  46 data[0][0x2060] = defaults[0]
  47 for u in range (0xFE00, 0xFE0F + 1):
  48         data[0][u] = defaults[0]
  49
  50 # Merge data into one dict:
  51 for i,v in enumerate (defaults):
  52         values[i][v] = values[i].get (v, 0) + 1
  53 combined = {}
  54 for i,d in enumerate (data):
  55         for u,v in d.items ():
  56                 if i >= 2 and not u in combined:
  57                         continue
  58                 if not u in combined:
  59                         combined[u] = list (defaults)
  60                 combined[u][i] = v
  61 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
  62 data = combined
  63 del combined
  64 num = len (data)
  65
  66
  67 property_names = [
  68         # General_Category
  69         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
  70         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
  71         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
  72         # Indic_Syllabic_Category
  73         'Other',
  74         'Bindu',
  75         'Visarga',
  76         'Avagraha',
  77         'Nukta',
  78         'Virama',
  79         'Pure_Killer',
  80         'Invisible_Stacker',
  81         'Vowel_Independent',
  82         'Vowel_Dependent',
  83         'Vowel',
  84         'Consonant_Placeholder',
  85         'Consonant',
  86         'Consonant_Dead',
  87         'Consonant_With_Stacker',
  88         'Consonant_Prefixed',
  89         'Consonant_Preceding_Repha',
  90         'Consonant_Succeeding_Repha',
  91         'Consonant_Subjoined',
  92         'Consonant_Medial',
  93         'Consonant_Final',
  94         'Consonant_Head_Letter',
  95         'Modifying_Letter',
  96         'Tone_Letter',
  97         'Tone_Mark',
  98         'Gemination_Mark',
  99         'Cantillation_Mark',
 100         'Register_Shifter',
 101         'Syllable_Modifier',
 102         'Consonant_Killer',
 103         'Non_Joiner',
 104         'Joiner',
 105         'Number_Joiner',
 106         'Number',
 107         'Brahmi_Joining_Number',
 108         # Indic_Positional_Category
 109         'Not_Applicable',
 110         'Right',
 111         'Left',
 112         'Visual_Order_Left',
 113         'Left_And_Right',
 114         'Top',
 115         'Bottom',
 116         'Top_And_Bottom',
 117         'Top_And_Right',
 118         'Top_And_Left',
 119         'Top_And_Left_And_Right',
 120         'Bottom_And_Right',
 121         'Top_And_Bottom_And_Right',
 122         'Overstruck',
 123 ]
 124
 125 class PropertyValue(object):
 126         def __init__(self, name_):
 127                 self.name = name_
 128         def __str__(self):
 129                 return self.name
 130         def __eq__(self, other):
 131                 return self.name == (other if isinstance(other, basestring) else other.name)
 132         def __ne__(self, other):
 133                 return not (self == other)
 134
 135 property_values = {}
 136
 137 for name in property_names:
 138         value = PropertyValue(name)
 139         assert value not in property_values
 140         assert value not in globals()
 141         property_values[name] = value
 142 globals().update(property_values)
 143
 144
 145 def is_BASE(U, UISC, UGC):
 146         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 147                         #SPEC-OUTDATED Consonant_Placeholder,
 148                         Tone_Letter] or
 149                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 150                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 151 def is_BASE_VOWEL(U, UISC, UGC):
 152         return UISC == Vowel_Independent
 153 def is_BASE_IND(U, UISC, UGC):
 154         #SPEC-BROKEN return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
 155         return (UISC in [Consonant_Dead, Modifying_Letter] or
 156                 (UGC == Po and not is_BASE_OTHER(U, UISC, UGC))) # for 104E
 157 def is_BASE_NUM(U, UISC, UGC):
 158         return UISC == Brahmi_Joining_Number
 159 def is_BASE_OTHER(U, UISC, UGC):
 160         if UISC == Consonant_Placeholder: return True #SPEC-OUTDATED
 161         return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC,
 162                      0x25FB, 0x25FC, 0x25FD, 0x25FE]
 163 def is_CGJ(U, UISC, UGC):
 164         return U == 0x034F
 165 def is_CONS_FINAL(U, UISC, UGC):
 166         return ((UISC == Consonant_Final and UGC != Lo) or
 167                 UISC == Consonant_Succeeding_Repha)
 168 def is_CONS_FINAL_MOD(U, UISC, UGC):
 169         #SPEC-OUTDATED return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
 170         return  UISC == Syllable_Modifier
 171 def is_CONS_MED(U, UISC, UGC):
 172         return UISC == Consonant_Medial and UGC != Lo
 173 def is_CONS_MOD(U, UISC, UGC):
 174         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 175 def is_CONS_SUB(U, UISC, UGC):
 176         #SPEC-OUTDATED return UISC == Consonant_Subjoined
 177         return UISC == Consonant_Subjoined and UGC != Lo
 178 def is_HALANT(U, UISC, UGC):
 179         return UISC in [Virama, Invisible_Stacker]
 180 def is_HALANT_NUM(U, UISC, UGC):
 181         return UISC == Number_Joiner
 182 def is_ZWNJ(U, UISC, UGC):
 183         return UISC == Non_Joiner
 184 def is_ZWJ(U, UISC, UGC):
 185         return UISC == Joiner
 186 def is_Word_Joiner(U, UISC, UGC):
 187         return U == 0x2060
 188 def is_OTHER(U, UISC, UGC):
 189         #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
 190         return (UISC == Other
 191                 and not is_SYM_MOD(U, UISC, UGC)
 192                 and not is_CGJ(U, UISC, UGC)
 193                 and not is_Word_Joiner(U, UISC, UGC)
 194                 and not is_VARIATION_SELECTOR(U, UISC, UGC)
 195         )
 196 def is_Reserved(U, UISC, UGC):
 197         return UGC == 'Cn'
 198 def is_REPHA(U, UISC, UGC):
 199         #return UISC == Consonant_Preceding_Repha
 200         #SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
 201         return UISC in [Consonant_Preceding_Repha, Consonant_With_Stacker, Consonant_Prefixed]
 202 def is_SYM(U, UISC, UGC):
 203         if U == 0x25CC: return False #SPEC-OUTDATED
 204         #SPEC-OUTDATED return UGC in [So, Sc] or UISC == Symbol_Letter
 205         return UGC in [So, Sc]
 206 def is_SYM_MOD(U, UISC, UGC):
 207         return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
 208 def is_VARIATION_SELECTOR(U, UISC, UGC):
 209         return 0xFE00 <= U <= 0xFE0F
 210 def is_VOWEL(U, UISC, UGC):
 211         return (UISC == Pure_Killer or
 212                 (UGC != Lo and UISC in [Vowel, Vowel_Dependent]))
 213 def is_VOWEL_MOD(U, UISC, UGC):
 214         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 215                 (UGC != Lo and UISC == Bindu))
 216
# USE category tag -> predicate deciding membership.  Every predicate takes
# (U, UISC, UGC); map_to_use() requires exactly one of them to accept each
# character.  The tags are also emitted as "#define <tag> USE_<tag>" lines.
use_mapping = {
        'B':    is_BASE,
        'IV':   is_BASE_VOWEL,
        'IND':  is_BASE_IND,
        'N':    is_BASE_NUM,
        'GB':   is_BASE_OTHER,
        'CGJ':  is_CGJ,
        'F':    is_CONS_FINAL,
        'FM':   is_CONS_FINAL_MOD,
        'M':    is_CONS_MED,
        'CM':   is_CONS_MOD,
        'SUB':  is_CONS_SUB,
        'H':    is_HALANT,
        'HN':   is_HALANT_NUM,
        'ZWNJ': is_ZWNJ,
        'ZWJ':  is_ZWJ,
        'WJ':   is_Word_Joiner,
        'O':    is_OTHER,
        'Rsv':  is_Reserved,
        'R':    is_REPHA,
        'S':    is_SYM,
        'SM':   is_SYM_MOD,
        'VS':   is_VARIATION_SELECTOR,
        'V':    is_VOWEL,
        'VM':   is_VOWEL_MOD,
}
 243
# USE category tag -> {position suffix -> Indic_Positional_Category values
# that select it}.  For a tag with a dict here, map_to_use() appends the one
# matching suffix to the tag (e.g. 'V' + 'Abv').  Tags mapped to None get no
# suffix, but are still allowed to carry a positional category other than
# Not_Applicable / Visual_Order_Left.
use_positions = {
        'F': {
                'Abv': [Top],
                'Blw': [Bottom],
                'Pst': [Right],
        },
        'M': {
                'Abv': [Top],
                'Blw': [Bottom],
                'Pst': [Right],
                'Pre': [Left],
        },
        'CM': {
                'Abv': [Top],
                'Blw': [Bottom],
        },
        'V': {
                'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
                'Blw': [Bottom, Overstruck, Bottom_And_Right],
                'Pst': [Right],
                'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
        },
        'VM': {
                'Abv': [Top],
                'Blw': [Bottom, Overstruck],
                'Pst': [Right],
                'Pre': [Left],
        },
        'SM': {
                'Abv': [Top],
                'Blw': [Bottom],
        },
        # No positional sub-categories for these.
        'H': None,
        'B': None,
        'FM': None,
        'SUB': None,
}
 281
 282 def map_to_use(data):
 283         out = {}
 284         items = use_mapping.items()
 285         for U,(UISC,UIPC,UGC,UBlock) in data.items():
 286
 287                 # Resolve Indic_Syllabic_Category
 288
 289                 # TODO: These don't have UISC assigned in Unicode 8.0, but
 290                 # have UIPC
 291                 if U == 0x17DD: UISC = Vowel_Dependent
 292                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 293
 294                 # TODO: U+1CED should only be allowed after some of
 295                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 296                 if U == 0x1CED: UISC = Tone_Mark
 297
 298                 evals = [(k, v(U,UISC,UGC)) for k,v in items]
 299                 values = [k for k,v in evals if v]
 300                 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
 301                 USE = values[0]
 302
 303                 # Resolve Indic_Positional_Category
 304
 305                 # TODO: Not in Unicode 8.0 yet, but in spec.
 306                 if U == 0x1B6C: UIPC = Bottom
 307
 308                 # TODO: These should die, but have UIPC in Unicode 8.0
 309                 if U in [0x953, 0x954]: UIPC = Not_Applicable
 310
 311                 # TODO: In USE's override list but not in Unicode 8.0
 312                 if U == 0x103C: UIPC = Left
 313
 314                 # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
 315                 if 0xA926 <= U <= 0xA92A: UIPC = Top
 316                 if U == 0x111CA: UIPC = Bottom
 317                 if U == 0x11300: UIPC = Top
 318                 if U == 0x1133C: UIPC = Bottom
 319                 if U == 0x1171E: UIPC = Left # Correct?!
 320                 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
 321                 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 322
 323                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or
 324                         USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 325
 326                 pos_mapping = use_positions.get(USE, None)
 327                 if pos_mapping:
 328                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 329                         assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
 330                         USE = USE + values[0]
 331
 332                 out[U] = (USE, UBlock)
 333         return out
 334
 335 defaults = ('O', 'No_Block')
 336 data = map_to_use(data)
 337
 338 # Remove the outliers
 339 singles = {}
 340 for u in [0x034F, 0x25CC, 0x1107F]:
 341         singles[u] = data[u]
 342         del data[u]
 343
 344 print "/* == Start of generated table == */"
 345 print "/*"
 346 print " * The following table is generated by running:"
 347 print " *"
 348 print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 349 print " *"
 350 print " * on files with these headers:"
 351 print " *"
 352 for h in headers:
 353         for l in h:
 354                 print " * %s" % (l.strip())
 355 print " */"
 356 print
 357 print '#include "hb-ot-shape-complex-use-private.hh"'
 358 print
 359
 360 total = 0
 361 used = 0
 362 last_block = None
 363 def print_block (block, start, end, data):
 364         global total, used, last_block
 365         if block and block != last_block:
 366                 print
 367                 print
 368                 print "  /* %s */" % block
 369                 if start % 16:
 370                         print ' ' * (20 + (start % 16 * 6)),
 371         num = 0
 372         assert start % 8 == 0
 373         assert (end+1) % 8 == 0
 374         for u in range (start, end+1):
 375                 if u % 16 == 0:
 376                         print
 377                         print "  /* %04X */" % u,
 378                 if u in data:
 379                         num += 1
 380                 d = data.get (u, defaults)
 381                 sys.stdout.write ("%6s," % d[0])
 382
 383         total += end - start + 1
 384         used += num
 385         if block:
 386                 last_block = block
 387
 388 uu = data.keys ()
 389 uu.sort ()
 390
 391 last = -100000
 392 num = 0
 393 offset = 0
 394 starts = []
 395 ends = []
 396 for k,v in sorted(use_mapping.items()):
 397         if k in use_positions and use_positions[k]: continue
 398         print "#define %s       USE_%s  /* %s */" % (k, k, v.__name__[3:])
 399 for k,v in sorted(use_positions.items()):
 400         if not v: continue
 401         for suf in v.keys():
 402                 tag = k + suf
 403                 print "#define %s       USE_%s" % (tag, tag)
 404 print ""
 405 print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
 406 for u in uu:
 407         if u <= last:
 408                 continue
 409         block = data[u][1]
 410
 411         start = u//8*8
 412         end = start+1
 413         while end in uu and block == data[end][1]:
 414                 end += 1
 415         end = (end-1)//8*8 + 7
 416
 417         if start != last + 1:
 418                 if start - last <= 1+16*3:
 419                         print_block (None, last+1, start-1, data)
 420                         last = start-1
 421                 else:
 422                         if last >= 0:
 423                                 ends.append (last + 1)
 424                                 offset += ends[-1] - starts[-1]
 425                         print
 426                         print
 427                         print "#define use_offset_0x%04xu %d" % (start, offset)
 428                         starts.append (start)
 429
 430         print_block (block, start, end, data)
 431         last = end
 432 ends.append (last + 1)
 433 offset += ends[-1] - starts[-1]
print
print
# Percentage of written table slots that hold real data.
occupancy = used * 100. / total
# The generated switch dispatches on the codepoint shifted right by this
# many bits.
page_bits = 12
print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
print
print "USE_TABLE_ELEMENT_TYPE"
print "hb_use_get_categories (hb_codepoint_t u)"
print "{"
print "  switch (u >> %d)" % page_bits
print "  {"
# One case per page touched by a range boundary or by a single.
pages = set([u>>page_bits for u in starts+ends+singles.keys()])
for p in sorted(pages):
        print "    case 0x%0Xu:" % p
        for (start,end) in zip (starts, ends):
                # A range is listed under the page of its first codepoint and
                # under the page of its (exclusive) end only.
                if p not in [start>>page_bits, end>>page_bits]: continue
                offset = "use_offset_0x%04xu" % start
                print "      if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
        # Codepoints removed from the table earlier get an explicit test.
        for u,d in singles.items ():
                if p != u>>page_bits: continue
                print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
        print "      break;"
        print ""
print "    default:"
print "      break;"
print "  }"
print "  return USE_O;"
print "}"
print
# Undo the short aliases defined before the table.
for k in sorted(use_mapping.keys()):
        if k in use_positions and use_positions[k]: continue
        print "#undef %s" % k
for k,v in sorted(use_positions.items()):
        if not v: continue
        for suf in v.keys():
                tag = k + suf
                print "#undef %s" % tag
print
print "/* == End of generated table == */"

# Maintain at least 50% occupancy in the table; fail loudly (after the
# output has been written) if the table has become too sparse.
if occupancy < 50:
        raise Exception ("Table too sparse, please investigate: ", occupancy)