src/gen-indic-table.py

   1 #!/usr/bin/env python3
   2
   3 """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
   4
   5 Input files:
   6 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
   7 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
   8 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
   9 """
  10
  11 import sys
  12
# Exactly three input paths are required (argv[0] plus three arguments);
# otherwise exit, handing the module docstring (the usage text) to sys.exit.
if len (sys.argv) != 4:
        sys.exit (__doc__)

# Code points that are kept in the merged data regardless of which block they
# belong to.  They are later moved out of the main table into `singles`.
ALLOWED_SINGLES = [0x00A0, 0x25CC]
# Block names (as spelled in Blocks.txt).  A merged code point survives the
# filter on `combined` only if its block is listed here or it is one of
# ALLOWED_SINGLES.
ALLOWED_BLOCKS = [
        'Basic Latin',
        'Latin-1 Supplement',
        'Devanagari',
        'Bengali',
        'Gurmukhi',
        'Gujarati',
        'Oriya',
        'Tamil',
        'Telugu',
        'Kannada',
        'Malayalam',
        'Myanmar',
        'Khmer',
        'Vedic Extensions',
        'General Punctuation',
        'Superscripts and Subscripts',
        'Devanagari Extended',
        'Myanmar Extended-B',
        'Myanmar Extended-A',
]
  38
  39 files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
  40
  41 headers = [[f.readline () for i in range (2)] for f in files]
  42
  43 unicode_data = [{} for _ in files]
  44 for i, f in enumerate (files):
  45         for line in f:
  46
  47                 j = line.find ('#')
  48                 if j >= 0:
  49                         line = line[:j]
  50
  51                 fields = [x.strip () for x in line.split (';')]
  52                 if len (fields) == 1:
  53                         continue
  54
  55                 uu = fields[0].split ('..')
  56                 start = int (uu[0], 16)
  57                 if len (uu) == 1:
  58                         end = start
  59                 else:
  60                         end = int (uu[1], 16)
  61
  62                 t = fields[1]
  63
  64                 for u in range (start, end + 1):
  65                         unicode_data[i][u] = t
  66
  67 # Merge data into one dict:
  68 defaults = ('Other', 'Not_Applicable', 'No_Block')
  69 combined = {}
  70 for i,d in enumerate (unicode_data):
  71         for u,v in d.items ():
  72                 if i == 2 and not u in combined:
  73                         continue
  74                 if not u in combined:
  75                         combined[u] = list (defaults)
  76                 combined[u][i] = v
  77 combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
  78
  79
# Convert categories & positions types

# Category names per shaper.  When the output is generated, each shaper name
# produces an '#include "hb-ot-shaper-<name>-machine.hh"' line and each
# category an 'OT_<cat>' macro defined as '<X>_Cat(<cat>)', where <X> is the
# upper-cased first letter of the shaper name.  A category name that was
# already defined by an earlier shaper gets a static_assert instead of a
# second #define.
categories = {
  'indic' : [
    'X',
    'C',
    'V',
    'N',
    'H',
    'ZWNJ',
    'ZWJ',
    'M',
    'SM',
    'A',
    'VD',
    'PLACEHOLDER',
    'DOTTEDCIRCLE',
    'RS',
    'MPst',
    'Repha',
    'Ra',
    'CM',
    'Symbol',
    'CS',
  ],
  'khmer' : [
    'VAbv',
    'VBlw',
    'VPre',
    'VPst',

    'Robatic',
    'Xgroup',
    'Ygroup',
  ],
  'myanmar' : [
    'VAbv',
    'VBlw',
    'VPre',
    'VPst',

    'IV',
    'As',
    'DB',
    'GB',
    'MH',
    'MR',
    'MW',
    'MY',
    'PT',
    'VS',
    'ML',
  ],
}
 134
# Maps a property value read from the first input file
# (IndicSyllabicCategory.txt), plus the 'Other' default, to the category name
# used in the generated table.  Every value present in the merged data is
# looked up here with a plain subscript, so a value missing from this dict
# makes the script fail with KeyError.
category_map = {
  'Other'                       : 'X',
  'Avagraha'                    : 'Symbol',
  'Bindu'                       : 'SM',
  'Brahmi_Joining_Number'       : 'PLACEHOLDER', # Don't care.
  'Cantillation_Mark'           : 'A',
  'Consonant'                   : 'C',
  'Consonant_Dead'              : 'C',
  'Consonant_Final'             : 'CM',
  'Consonant_Head_Letter'       : 'C',
  'Consonant_Initial_Postfixed' : 'C', # TODO
  'Consonant_Killer'            : 'M', # U+17CD only.
  'Consonant_Medial'            : 'CM',
  'Consonant_Placeholder'       : 'PLACEHOLDER',
  'Consonant_Preceding_Repha'   : 'Repha',
  'Consonant_Prefixed'          : 'X', # Don't care.
  'Consonant_Subjoined'         : 'CM',
  'Consonant_Succeeding_Repha'  : 'CM',
  'Consonant_With_Stacker'      : 'CS',
  'Gemination_Mark'             : 'SM', # https://github.com/harfbuzz/harfbuzz/issues/552
  'Invisible_Stacker'           : 'H',
  'Joiner'                      : 'ZWJ',
  'Modifying_Letter'            : 'X',
  'Non_Joiner'                  : 'ZWNJ',
  'Nukta'                       : 'N',
  'Number'                      : 'PLACEHOLDER',
  'Number_Joiner'               : 'PLACEHOLDER', # Don't care.
  'Pure_Killer'                 : 'M', # Is like a vowel matra.
  'Register_Shifter'            : 'RS',
  'Syllable_Modifier'           : 'SM',
  'Tone_Letter'                 : 'X',
  'Tone_Mark'                   : 'N',
  'Virama'                      : 'H',
  'Visarga'                     : 'SM',
  'Vowel'                       : 'V',
  'Vowel_Dependent'             : 'M',
  'Vowel_Independent'           : 'V',
}
# Maps a property value read from the second input file
# (IndicPositionalCategory.txt), plus the 'Not_Applicable' default, to the
# position name used by this script.  Looked up with a plain subscript, so a
# value missing from this dict makes the script fail with KeyError.
position_map = {
  'Not_Applicable'              : 'END',

  'Left'                        : 'PRE_C',
  'Top'                         : 'ABOVE_C',
  'Bottom'                      : 'BELOW_C',
  'Right'                       : 'POST_C',

  # These should resolve to the position of the last part of the split sequence.
  'Bottom_And_Right'            : 'POST_C',
  'Left_And_Right'              : 'POST_C',
  'Top_And_Bottom'              : 'BELOW_C',
  'Top_And_Bottom_And_Left'     : 'BELOW_C',
  'Top_And_Bottom_And_Right'    : 'POST_C',
  'Top_And_Left'                : 'ABOVE_C',
  'Top_And_Left_And_Right'      : 'POST_C',
  'Top_And_Right'               : 'POST_C',

  'Overstruck'                  : 'AFTER_MAIN',
  'Visual_order_left'           : 'PRE_M',
}
 194
# Per-code-point category overrides, applied after the mapped categories have
# been computed.  Each entry replaces the category of its code point; code
# points that are not yet in the table are added, taking the default position
# and the block name looked up in the Blocks.txt data.
category_overrides = {

  # These are the variation-selectors. They only appear in the Myanmar grammar
  # but are not Myanmar-specific
  0xFE00: 'VS',
  0xFE01: 'VS',
  0xFE02: 'VS',
  0xFE03: 'VS',
  0xFE04: 'VS',
  0xFE05: 'VS',
  0xFE06: 'VS',
  0xFE07: 'VS',
  0xFE08: 'VS',
  0xFE09: 'VS',
  0xFE0A: 'VS',
  0xFE0B: 'VS',
  0xFE0C: 'VS',
  0xFE0D: 'VS',
  0xFE0E: 'VS',
  0xFE0F: 'VS',

  # These appear in the OT Myanmar spec, but are not Myanmar-specific
  0x2015: 'PLACEHOLDER',
  0x2022: 'PLACEHOLDER',
  0x25FB: 'PLACEHOLDER',
  0x25FC: 'PLACEHOLDER',
  0x25FD: 'PLACEHOLDER',
  0x25FE: 'PLACEHOLDER',


  # Indic

  0x0930: 'Ra', # Devanagari
  0x09B0: 'Ra', # Bengali
  0x09F0: 'Ra', # Bengali
  0x0A30: 'Ra', # Gurmukhi      No Reph
  0x0AB0: 'Ra', # Gujarati
  0x0B30: 'Ra', # Oriya
  0x0BB0: 'Ra', # Tamil         No Reph
  0x0C30: 'Ra', # Telugu        Reph formed only with ZWJ
  0x0CB0: 'Ra', # Kannada
  0x0D30: 'Ra', # Malayalam     No Reph, Logical Repha

  # The following act more like the Bindus.
  0x0953: 'SM',
  0x0954: 'SM',

  # U+0A40 GURMUKHI VOWEL SIGN II may be preceded by U+0A02 GURMUKHI SIGN BINDI.
  0x0A40: 'MPst',

  # The following act like consonants.
  0x0A72: 'C',
  0x0A73: 'C',
  0x1CF5: 'C',
  0x1CF6: 'C',

  # TODO: The following should only be allowed after a Visarga.
  # For now, just treat them like regular tone marks.
  0x1CE2: 'A',
  0x1CE3: 'A',
  0x1CE4: 'A',
  0x1CE5: 'A',
  0x1CE6: 'A',
  0x1CE7: 'A',
  0x1CE8: 'A',

  # TODO: The following should only be allowed after some of
  # the nasalization marks, maybe only for U+1CE9..U+1CF1.
  # For now, just treat them like tone marks.
  0x1CED: 'A',

  # The following take marks in standalone clusters, similar to Avagraha.
  0xA8F2: 'Symbol',
  0xA8F3: 'Symbol',
  0xA8F4: 'Symbol',
  0xA8F5: 'Symbol',
  0xA8F6: 'Symbol',
  0xA8F7: 'Symbol',
  0x1CE9: 'Symbol',
  0x1CEA: 'Symbol',
  0x1CEB: 'Symbol',
  0x1CEC: 'Symbol',
  0x1CEE: 'Symbol',
  0x1CEF: 'Symbol',
  0x1CF0: 'Symbol',
  0x1CF1: 'Symbol',

  0x0A51: 'M', # https://github.com/harfbuzz/harfbuzz/issues/524

  # According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
  # so the Indic shaper needs to know their categories.
  0x11301: 'SM',
  0x11302: 'SM',
  0x11303: 'SM',
  0x1133B: 'N',
  0x1133C: 'N',

  0x0AFB: 'N', # https://github.com/harfbuzz/harfbuzz/issues/552
  0x0B55: 'N', # https://github.com/harfbuzz/harfbuzz/issues/2849

  0x09FC: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/1613
  0x0C80: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/623
  0x0D04: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/3511

  0x25CC: 'DOTTEDCIRCLE',


  # Khmer

  0x179A: 'Ra',

  0x17CC: 'Robatic',
  0x17C9: 'Robatic',
  0x17CA: 'Robatic',

  0x17C6: 'Xgroup',
  0x17CB: 'Xgroup',
  0x17CD: 'Xgroup',
  0x17CE: 'Xgroup',
  0x17CF: 'Xgroup',
  0x17D0: 'Xgroup',
  0x17D1: 'Xgroup',

  0x17C7: 'Ygroup',
  0x17C8: 'Ygroup',
  0x17DD: 'Ygroup',
  0x17D3: 'Ygroup', # Just guessing. Uniscribe doesn't categorize it.

  0x17D9: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/issues/2384


  # Myanmar

  # https://docs.microsoft.com/en-us/typography/script-development/myanmar#analyze

  0x104E: 'C', # The spec says C, IndicSyllableCategory says Consonant_Placeholder

  0x1004: 'Ra',
  0x101B: 'Ra',
  0x105A: 'Ra',

  0x1032: 'A',
  0x1036: 'A',

  0x103A: 'As',

  #0x1040: 'D0', # XXX The spec says D0, but Uniscribe doesn't seem to do.

  0x103E: 'MH',
  0x1060: 'ML',
  0x103C: 'MR',
  0x103D: 'MW',
  0x1082: 'MW',
  0x103B: 'MY',
  0x105E: 'MY',
  0x105F: 'MY',

  0x1063: 'PT',
  0x1064: 'PT',
  0x1069: 'PT',
  0x106A: 'PT',
  0x106B: 'PT',
  0x106C: 'PT',
  0x106D: 'PT',
  0xAA7B: 'PT',

  0x1038: 'SM',
  0x1087: 'SM',
  0x1088: 'SM',
  0x1089: 'SM',
  0x108A: 'SM',
  0x108B: 'SM',
  0x108C: 'SM',
  0x108D: 'SM',
  0x108F: 'SM',
  0x109A: 'SM',
  0x109B: 'SM',
  0x109C: 'SM',

  0x104A: 'PLACEHOLDER',
}
# Per-code-point position overrides.  These are applied last, after all the
# category-driven position adjustments, so the value given here is what ends
# up in the generated table.
position_overrides = {

  0x0A51: 'BELOW_C', # https://github.com/harfbuzz/harfbuzz/issues/524

  0x0B01: 'BEFORE_SUB', # Oriya Bindu is BeforeSub in the spec.
}
 382
 383 def matra_pos_left(u, block):
 384   return "PRE_M"
 385 def matra_pos_right(u, block):
 386   if block == 'Devanagari':     return  'AFTER_SUB'
 387   if block == 'Bengali':        return  'AFTER_POST'
 388   if block == 'Gurmukhi':       return  'AFTER_POST'
 389   if block == 'Gujarati':       return  'AFTER_POST'
 390   if block == 'Oriya':          return  'AFTER_POST'
 391   if block == 'Tamil':          return  'AFTER_POST'
 392   if block == 'Telugu':         return  'BEFORE_SUB' if u <= 0x0C42 else 'AFTER_SUB'
 393   if block == 'Kannada':        return  'BEFORE_SUB' if u < 0x0CC3 or u > 0x0CD6 else 'AFTER_SUB'
 394   if block == 'Malayalam':      return  'AFTER_POST'
 395   return 'AFTER_SUB'
 396 def matra_pos_top(u, block):
 397   # BENG and MLYM don't have top matras.
 398   if block == 'Devanagari':     return  'AFTER_SUB'
 399   if block == 'Gurmukhi':       return  'AFTER_POST' # Deviate from spec
 400   if block == 'Gujarati':       return  'AFTER_SUB'
 401   if block == 'Oriya':          return  'AFTER_MAIN'
 402   if block == 'Tamil':          return  'AFTER_SUB'
 403   if block == 'Telugu':         return  'BEFORE_SUB'
 404   if block == 'Kannada':        return  'BEFORE_SUB'
 405   return 'AFTER_SUB'
 406 def matra_pos_bottom(u, block):
 407   if block == 'Devanagari':     return  'AFTER_SUB'
 408   if block == 'Bengali':        return  'AFTER_SUB'
 409   if block == 'Gurmukhi':       return  'AFTER_POST'
 410   if block == 'Gujarati':       return  'AFTER_POST'
 411   if block == 'Oriya':          return  'AFTER_SUB'
 412   if block == 'Tamil':          return  'AFTER_POST'
 413   if block == 'Telugu':         return  'BEFORE_SUB'
 414   if block == 'Kannada':        return  'BEFORE_SUB'
 415   if block == 'Malayalam':      return  'AFTER_POST'
 416   return "AFTER_SUB"
 417 def indic_matra_position(u, pos, block): # Reposition matra
 418   if pos == 'PRE_C':    return matra_pos_left(u, block)
 419   if pos == 'POST_C':   return matra_pos_right(u, block)
 420   if pos == 'ABOVE_C':  return matra_pos_top(u, block)
 421   if pos == 'BELOW_C':  return matra_pos_bottom(u, block)
 422   assert (False)
 423
 424 def position_to_category(pos):
 425   if pos == 'PRE_C':    return 'VPre'
 426   if pos == 'ABOVE_C':  return 'VAbv'
 427   if pos == 'BELOW_C':  return 'VBlw'
 428   if pos == 'POST_C':   return 'VPst'
 429   assert(False)
 430
 431
 432 defaults = (category_map[defaults[0]], position_map[defaults[1]], defaults[2])
 433
 434 indic_data = {}
 435 for k, (cat, pos, block) in combined.items():
 436   cat = category_map[cat]
 437   pos = position_map[pos]
 438   indic_data[k] = (cat, pos, block)
 439
 440 for k,new_cat in category_overrides.items():
 441   (cat, pos, _) = indic_data.get(k, defaults)
 442   indic_data[k] = (new_cat, pos, unicode_data[2][k])
 443
 444 # We only expect position for certain types
 445 positioned_categories = ('CM', 'SM', 'RS', 'H', 'M', 'MPst')
 446 for k, (cat, pos, block) in indic_data.items():
 447   if cat not in positioned_categories:
 448     pos = 'END'
 449     indic_data[k] = (cat, pos, block)
 450
 451 # Position overrides are more complicated
 452
 453 # Keep in sync with CONSONANT_FLAGS in the shaper
 454 consonant_categories = ('C', 'CS', 'Ra','CM', 'V', 'PLACEHOLDER', 'DOTTEDCIRCLE')
 455 matra_categories = ('M', 'MPst')
 456 smvd_categories = ('SM', 'VD', 'A', 'Symbol')
 457 for k, (cat, pos, block) in indic_data.items():
 458   if cat in consonant_categories:
 459     pos = 'BASE_C'
 460   elif cat in matra_categories:
 461     if block.startswith('Khmer') or block.startswith('Myanmar'):
 462       cat = position_to_category(pos)
 463     else:
 464       pos = indic_matra_position(k, pos, block)
 465   elif cat in smvd_categories:
 466     pos = 'SMVD';
 467   indic_data[k] = (cat, pos, block)
 468
 469 for k,new_pos in position_overrides.items():
 470   (cat, pos, _) = indic_data.get(k, defaults)
 471   indic_data[k] = (cat, new_pos, unicode_data[2][k])
 472
 473
 474 values = [{_: 1} for _ in defaults]
 475 for vv in indic_data.values():
 476   for i,v in enumerate(vv):
 477     values[i][v] = values[i].get (v, 0) + 1
 478
 479
 480
 481
 482 # Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
 483 singles = {}
 484 for u in ALLOWED_SINGLES:
 485         singles[u] = indic_data[u]
 486         del indic_data[u]
 487
 488 print ("/* == Start of generated table == */")
 489 print ("/*")
 490 print (" * The following table is generated by running:")
 491 print (" *")
 492 print (" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
 493 print (" *")
 494 print (" * on files with these headers:")
 495 print (" *")
 496 for h in headers:
 497         for l in h:
 498                 print (" * %s" % (l.strip()))
 499 print (" */")
 500 print ()
 501 print ('#include "hb.hh"')
 502 print ()
 503 print ('#ifndef HB_NO_OT_SHAPE')
 504 print ()
 505 print ('#include "hb-ot-shaper-indic.hh"')
 506 print ()
 507 print ('#pragma GCC diagnostic push')
 508 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
 509 print ()
 510
 511 # Print categories
 512 for shaper in categories:
 513   print ('#include "hb-ot-shaper-%s-machine.hh"' % shaper)
 514 print ()
 515 done = {}
 516 for shaper, shaper_cats in categories.items():
 517   print ('/* %s */' % shaper)
 518   for cat in shaper_cats:
 519     v = shaper[0].upper()
 520     if cat not in done:
 521       print ("#define OT_%s %s_Cat(%s)" % (cat, v, cat))
 522       done[cat] = v
 523     else:
 524       print ('static_assert (OT_%s == %s_Cat(%s), "");' % (cat, v, cat))
 525 print ()
 526
# Shorten values
# short[0] maps category names and short[1] maps position names to the short
# aliases used in the generated table.  Only hand-picked aliases are listed
# here; the loop below derives the remaining ones and adds them to these dicts.
short = [{
        "Repha":                'Rf',
        "PLACEHOLDER":          'GB',
        "DOTTEDCIRCLE":         'DC',
        "VPst":                 'VR',
        "VPre":                 'VL',
        "Robatic":              'Rt',
        "Xgroup":               'Xg',
        "Ygroup":               'Yg',
        "As":                   'As',
},{
        "END":                  'X',
        "BASE_C":               'C',
        "ABOVE_C":              'T',
        "BELOW_C":              'B',
        "POST_C":               'R',
        "PRE_C":                'L',
        "PRE_M":                'LM',
        "AFTER_MAIN":           'A',
        "AFTER_SUB":            'AS',
        "BEFORE_SUB":           'BS',
        "AFTER_POST":           'AP',
        "SMVD":                 'SM',
}]
# Reverse maps (alias -> full name), used to detect alias collisions.
all_shorts = [{},{}]

# Add some of the values, to make them more readable, and to avoid duplicates

for i in range (2):
        for v,s in short[i].items ():
                all_shorts[i][s] = v

# Macro-name prefixes per column: index 0 = categories, 1 = positions.
what = ["OT", "POS"]
what_short = ["_OT", "_POS"]
# One tuple per distinct value: (short macro name, long name, count as a
# string, original value).
cat_defs = []
for i in range (2):
        vv = sorted (values[i].keys ())
        for v in vv:
                v_no_and = v.replace ('_And_', '_')
                if v in short[i]:
                        s = short[i][v]
                else:
                        # Derive an alias from the upper-case ASCII letters of
                        # the name, and refuse to reuse one that is taken.
                        s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
                        if s in all_shorts[i]:
                                raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
                        all_shorts[i][s] = v
                        short[i][v] = s
                # Position names are upper-cased in the long name; category
                # names are used as they are.
                cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + (v.upper () if i else v), str (values[i][v]), v))

# Column widths, so the emitted #define lines are aligned.
maxlen_s = max ([len (c[0]) for c in cat_defs])
maxlen_l = max ([len (c[1]) for c in cat_defs])
maxlen_n = max ([len (c[2]) for c in cat_defs])
for s in what_short:
        print ()
        # Select the definitions whose short macro name contains this prefix.
        for c in [c for c in cat_defs if s in c[0]]:
                print ("#define %s %s /* %s chars; %s */" %
                        (c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
print ()
print ('#pragma GCC diagnostic pop')
print ()
print ("#define INDIC_COMBINE_CATEGORIES(S,M) ((S) | ((M) << 8))")
print ()
print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (%s_##S, %s_##M)" % tuple(what_short))
print ()
print ()
 592 print ()
 593
 594 total = 0
 595 used = 0
 596 last_block = None
 597 def print_block (block, start, end, data):
 598         global total, used, last_block
 599         if block and block != last_block:
 600                 print ()
 601                 print ()
 602                 print ("  /* %s */" % block)
 603         num = 0
 604         assert start % 8 == 0
 605         assert (end+1) % 8 == 0
 606         for u in range (start, end+1):
 607                 if u % 8 == 0:
 608                         print ()
 609                         print ("  /* %04X */" % u, end="")
 610                 if u in data:
 611                         num += 1
 612                 d = data.get (u, defaults)
 613                 print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")
 614
 615         total += end - start + 1
 616         used += num
 617         if block:
 618                 last_block = block
 619
 620 uu = sorted (indic_data)
 621
 622 last = -100000
 623 num = 0
 624 offset = 0
 625 starts = []
 626 ends = []
 627 print ("static const uint16_t indic_table[] = {")
 628 for u in uu:
 629         if u <= last:
 630                 continue
 631         block = indic_data[u][2]
 632
 633         start = u//8*8
 634         end = start+1
 635         while end in uu and block == indic_data[end][2]:
 636                 end += 1
 637         end = (end-1)//8*8 + 7
 638
 639         if start != last + 1:
 640                 if start - last <= 1+16*2:
 641                         print_block (None, last+1, start-1, indic_data)
 642                 else:
 643                         if last >= 0:
 644                                 ends.append (last + 1)
 645                                 offset += ends[-1] - starts[-1]
 646                         print ()
 647                         print ()
 648                         print ("#define indic_offset_0x%04xu %d" % (start, offset))
 649                         starts.append (start)
 650
 651         print_block (block, start, end, indic_data)
 652         last = end
 653 ends.append (last + 1)
 654 offset += ends[-1] - starts[-1]
 655 print ()
 656 print ()
 657 occupancy = used * 100. / total
 658 page_bits = 12
 659 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 660 print ()
 661 print ("uint16_t")
 662 print ("hb_indic_get_categories (hb_codepoint_t u)")
 663 print ("{")
 664 print ("  switch (u >> %d)" % page_bits)
 665 print ("  {")
 666 pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())])
 667 for p in sorted(pages):
 668         print ("    case 0x%0Xu:" % p)
 669         for u,d in singles.items ():
 670                 if p != u>>page_bits: continue
 671                 print ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
 672         for (start,end) in zip (starts, ends):
 673                 if p not in [start>>page_bits, end>>page_bits]: continue
 674                 offset = "indic_offset_0x%04xu" % start
 675                 print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
 676         print ("      break;")
 677         print ("")
 678 print ("    default:")
 679 print ("      break;")
 680 print ("  }")
 681 print ("  return _(X,X);")
 682 print ("}")
 683 print ()
 684 print ("#undef _")
 685 print ("#undef INDIC_COMBINE_CATEGORIES")
 686 for i in range (2):
 687         print ()
 688         vv = sorted (values[i].keys ())
 689         for v in vv:
 690                 print ("#undef %s_%s" %
 691                         (what_short[i], short[i][v]))
 692 print ()
 693 print ('#endif')
 694 print ()
 695 print ("/* == End of generated table == */")
 696
 697 # Maintain at least 50% occupancy in the table */
 698 if occupancy < 50:
 699         raise Exception ("Table too sparse, please investigate: ", occupancy)