src/gen-use-table.py

   1 #!/usr/bin/env python3
   2 # flake8: noqa: F821
   3
   4 import logging
   5 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
   6
   7 """usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
   8
   9 Input files:
  10 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
  11 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
  12 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
  13 * https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
  14 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
  15 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
  16 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
  17 * ms-use/IndicSyllabicCategory-Additional.txt
  18 * ms-use/IndicPositionalCategory-Additional.txt
  19 """
  20
  21 import sys
  22
  23 if len (sys.argv) != 10:
  24         sys.exit (__doc__)
  25
  26 DISABLED_SCRIPTS = {
  27         'Arabic',
  28         'Lao',
  29         'Samaritan',
  30         'Syriac',
  31         'Thai',
  32 }
  33
  34 files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
  35
  36 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
  37 for j in range(7, 9):
  38         for line in files[j]:
  39                 line = line.rstrip()
  40                 if not line:
  41                         break
  42                 headers[j - 1].append(line)
  43 headers.append (["UnicodeData.txt does not have a header."])
  44
  45 unicode_data = [{} for _ in files]
  46 values = [{} for _ in files]
  47 for i, f in enumerate (files):
  48         for line in f:
  49
  50                 j = line.find ('#')
  51                 if j >= 0:
  52                         line = line[:j]
  53
  54                 fields = [x.strip () for x in line.split (';')]
  55                 if len (fields) == 1:
  56                         continue
  57
  58                 uu = fields[0].split ('..')
  59                 start = int (uu[0], 16)
  60                 if len (uu) == 1:
  61                         end = start
  62                 else:
  63                         end = int (uu[1], 16)
  64
  65                 t = fields[1 if i not in [2, 4] else 2]
  66
  67                 if i == 2:
  68                         t = 'jt_' + t
  69                 elif i == 3 and t != 'Default_Ignorable_Code_Point':
  70                         continue
  71                 elif i == 7 and t == 'Consonant_Final_Modifier':
  72                         # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
  73                         t = 'Syllable_Modifier'
  74                 elif i == 8 and t == 'NA':
  75                         t = 'Not_Applicable'
  76
  77                 i0 = i if i < 7 else i - 7
  78                 for u in range (start, end + 1):
  79                         unicode_data[i0][u] = t
  80                 values[i0][t] = values[i0].get (t, 0) + end - start + 1
  81
  82 defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
  83
  84 # Merge data into one dict:
  85 for i,v in enumerate (defaults):
  86         values[i][v] = values[i].get (v, 0) + 1
  87 combined = {}
  88 for i,d in enumerate (unicode_data):
  89         for u,v in d.items ():
  90                 if not u in combined:
  91                         if i >= 4:
  92                                 continue
  93                         combined[u] = list (defaults)
  94                 combined[u][i] = v
  95 combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
  96
  97
  98 property_names = [
  99         # General_Category
 100         'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
 101         'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
 102         'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
 103         # Indic_Syllabic_Category
 104         'Other',
 105         'Bindu',
 106         'Visarga',
 107         'Avagraha',
 108         'Nukta',
 109         'Virama',
 110         'Pure_Killer',
 111         'Invisible_Stacker',
 112         'Vowel_Independent',
 113         'Vowel_Dependent',
 114         'Vowel',
 115         'Consonant_Placeholder',
 116         'Consonant',
 117         'Consonant_Dead',
 118         'Consonant_With_Stacker',
 119         'Consonant_Prefixed',
 120         'Consonant_Preceding_Repha',
 121         'Consonant_Succeeding_Repha',
 122         'Consonant_Subjoined',
 123         'Consonant_Medial',
 124         'Consonant_Final',
 125         'Consonant_Head_Letter',
 126         'Consonant_Initial_Postfixed',
 127         'Modifying_Letter',
 128         'Tone_Letter',
 129         'Tone_Mark',
 130         'Gemination_Mark',
 131         'Cantillation_Mark',
 132         'Register_Shifter',
 133         'Syllable_Modifier',
 134         'Consonant_Killer',
 135         'Non_Joiner',
 136         'Joiner',
 137         'Number_Joiner',
 138         'Number',
 139         'Brahmi_Joining_Number',
 140         'Symbol_Modifier',
 141         'Hieroglyph',
 142         'Hieroglyph_Joiner',
 143         'Hieroglyph_Mark_Begin',
 144         'Hieroglyph_Mark_End',
 145         'Hieroglyph_Mirror',
 146         'Hieroglyph_Modifier',
 147         'Hieroglyph_Segment_Begin',
 148         'Hieroglyph_Segment_End',
 149         # Indic_Positional_Category
 150         'Not_Applicable',
 151         'Right',
 152         'Left',
 153         'Visual_Order_Left',
 154         'Left_And_Right',
 155         'Top',
 156         'Bottom',
 157         'Top_And_Bottom',
 158         'Top_And_Bottom_And_Left',
 159         'Top_And_Right',
 160         'Top_And_Left',
 161         'Top_And_Left_And_Right',
 162         'Bottom_And_Left',
 163         'Bottom_And_Right',
 164         'Top_And_Bottom_And_Right',
 165         'Overstruck',
 166         # Joining_Type
 167         'jt_C',
 168         'jt_D',
 169         'jt_L',
 170         'jt_R',
 171         'jt_T',
 172         'jt_U',
 173         'jt_X',
 174 ]
 175
 176 class PropertyValue(object):
 177         def __init__(self, name_):
 178                 self.name = name_
 179         def __str__(self):
 180                 return self.name
 181         def __eq__(self, other):
 182                 return self.name == (other if isinstance(other, str) else other.name)
 183         def __ne__(self, other):
 184                 return not (self == other)
 185         def __hash__(self):
 186                 return hash(str(self))
 187
 188 property_values = {}
 189
 190 for name in property_names:
 191         value = PropertyValue(name)
 192         assert value not in property_values
 193         assert value not in globals()
 194         property_values[name] = value
 195 globals().update(property_values)
 196
 197
 198 def is_BASE(U, UISC, UDI, UGC, AJT):
 199         return (UISC in [Number, Consonant, Consonant_Head_Letter,
 200                         Tone_Letter,
 201                         Vowel_Independent,
 202                         ] or
 203                 # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
 204                 AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
 205                 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
 206                                         Consonant_Subjoined, Vowel, Vowel_Dependent]))
 207 def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
 208         return UISC == Brahmi_Joining_Number
 209 def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
 210         if UISC == Consonant_Placeholder: return True
 211         return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
 212 def is_CGJ(U, UISC, UDI, UGC, AJT):
 213         # Also includes VARIATION_SELECTOR and ZWJ
 214         return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
 215 def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
 216         return ((UISC == Consonant_Final and UGC != Lo) or
 217                 UISC == Consonant_Succeeding_Repha)
 218 def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
 219         return UISC == Syllable_Modifier
 220 def is_CONS_MED(U, UISC, UDI, UGC, AJT):
 221         # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
 222         return (UISC == Consonant_Medial and UGC != Lo or
 223                 UISC == Consonant_Initial_Postfixed)
 224 def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
 225         return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
 226 def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
 227         return UISC == Consonant_Subjoined and UGC != Lo
 228 def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
 229         return UISC == Consonant_With_Stacker
 230 def is_HALANT(U, UISC, UDI, UGC, AJT):
 231         return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
 232 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
 233         # Split off of HALANT
 234         return U == 0x0DCA
 235 def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
 236         return UISC == Number_Joiner
 237 def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
 238         return UISC == Hieroglyph
 239 def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
 240         return UISC == Hieroglyph_Joiner
 241 def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
 242         return UISC == Hieroglyph_Mirror
 243 def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
 244         return UISC == Hieroglyph_Modifier
 245 def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
 246         return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
 247 def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
 248         return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
 249 def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
 250         # Split off of HALANT
 251         return (UISC == Invisible_Stacker
 252                 and not is_SAKOT(U, UISC, UDI, UGC, AJT)
 253         )
 254 def is_ZWNJ(U, UISC, UDI, UGC, AJT):
 255         return UISC == Non_Joiner
 256 def is_OTHER(U, UISC, UDI, UGC, AJT):
 257         # Also includes BASE_IND and SYM
 258         return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
 259                 and not is_BASE(U, UISC, UDI, UGC, AJT)
 260                 and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
 261                 and not is_CGJ(U, UISC, UDI, UGC, AJT)
 262                 and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
 263                 and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
 264         )
 265 def is_REPHA(U, UISC, UDI, UGC, AJT):
 266         return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
 267 def is_SAKOT(U, UISC, UDI, UGC, AJT):
 268         # Split off of HALANT
 269         return U == 0x1A60
 270 def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
 271         return UISC == Symbol_Modifier
 272 def is_VOWEL(U, UISC, UDI, UGC, AJT):
 273         return (UISC == Pure_Killer or
 274                 UGC != Lo and UISC in [Vowel, Vowel_Dependent])
 275 def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
 276         return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 277                 UGC != Lo and UISC == Bindu)
 278 def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
 279         # Also includes Rsv
 280         return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
 281                 and UISC == Other
 282                 and not is_CGJ(U, UISC, UDI, UGC, AJT)
 283         ) or UGC == Cn
 284
 285 use_mapping = {
 286         'B':    is_BASE,
 287         'N':    is_BASE_NUM,
 288         'GB':   is_BASE_OTHER,
 289         'CGJ':  is_CGJ,
 290         'F':    is_CONS_FINAL,
 291         'FM':   is_CONS_FINAL_MOD,
 292         'M':    is_CONS_MED,
 293         'CM':   is_CONS_MOD,
 294         'SUB':  is_CONS_SUB,
 295         'CS':   is_CONS_WITH_STACKER,
 296         'H':    is_HALANT,
 297         'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
 298         'HN':   is_HALANT_NUM,
 299         'IS':   is_INVISIBLE_STACKER,
 300         'G':    is_HIEROGLYPH,
 301         'HM':   is_HIEROGLYPH_MOD,
 302         'HR':   is_HIEROGLYPH_MIRROR,
 303         'J':    is_HIEROGLYPH_JOINER,
 304         'SB':   is_HIEROGLYPH_SEGMENT_BEGIN,
 305         'SE':   is_HIEROGLYPH_SEGMENT_END,
 306         'ZWNJ': is_ZWNJ,
 307         'O':    is_OTHER,
 308         'R':    is_REPHA,
 309         'Sk':   is_SAKOT,
 310         'SM':   is_SYM_MOD,
 311         'V':    is_VOWEL,
 312         'VM':   is_VOWEL_MOD,
 313         'WJ':   is_Word_Joiner,
 314 }
 315
 316 use_positions = {
 317         'F': {
 318                 'Abv': [Top],
 319                 'Blw': [Bottom],
 320                 'Pst': [Right],
 321         },
 322         'M': {
 323                 'Abv': [Top],
 324                 'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
 325                 'Pst': [Right],
 326                 'Pre': [Left, Top_And_Bottom_And_Left],
 327         },
 328         'CM': {
 329                 'Abv': [Top],
 330                 'Blw': [Bottom, Overstruck],
 331         },
 332         'V': {
 333                 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
 334                 'Blw': [Bottom, Overstruck, Bottom_And_Right],
 335                 'Pst': [Right],
 336                 'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
 337         },
 338         'VM': {
 339                 'Abv': [Top],
 340                 'Blw': [Bottom, Overstruck],
 341                 'Pst': [Right],
 342                 'Pre': [Left],
 343         },
 344         'SM': {
 345                 'Abv': [Top],
 346                 'Blw': [Bottom],
 347         },
 348         'H': None,
 349         'HM': None,
 350         'HR': None,
 351         'HVM': None,
 352         'IS': None,
 353         'B': None,
 354         'FM': {
 355                 'Abv': [Top],
 356                 'Blw': [Bottom],
 357                 'Pst': [Not_Applicable],
 358         },
 359         'R': None,
 360         'SUB': None,
 361 }
 362
 363 def map_to_use(data):
 364         out = {}
 365         items = use_mapping.items()
 366         for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():
 367
 368                 # Resolve Indic_Syllabic_Category
 369
 370                 # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
 371                 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
 372
 373                 # Tibetan:
 374                 # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
 375                 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
 376
 377                 # TODO: U+1CED should only be allowed after some of
 378                 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
 379                 if U == 0x1CED: UISC = Tone_Mark
 380
 381                 values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
 382                 assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, values)
 383                 USE = values[0]
 384
 385                 # Resolve Indic_Positional_Category
 386
 387                 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
 388                 #  and https://github.com/harfbuzz/harfbuzz/issues/1631
 389                 if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
 390
 391                 assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
 392                         USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)
 393
 394                 pos_mapping = use_positions.get(USE, None)
 395                 if pos_mapping:
 396                         values = [k for k,v in pos_mapping.items() if v and UIPC in v]
 397                         assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
 398                         USE = USE + values[0]
 399
 400                 out[U] = (USE, UBlock)
 401         return out
 402
 403 use_data = map_to_use(combined)
 404
 405 print ("/* == Start of generated table == */")
 406 print ("/*")
 407 print (" * The following table is generated by running:")
 408 print (" *")
 409 print (" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
 410 print (" *")
 411 print (" * on files with these headers:")
 412 print (" *")
 413 for h in headers:
 414         for l in h:
 415                 print (" * %s" % (l.strip()))
 416 print (" */")
 417 print ()
 418 print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
 419 print ("#define HB_OT_SHAPER_USE_TABLE_HH")
 420 print ()
 421 print ('#include "hb.hh"')
 422 print ()
 423 print ('#include "hb-ot-shaper-use-machine.hh"')
 424 print ()
 425
 426 total = 0
 427 used = 0
 428 last_block = None
 429 def print_block (block, start, end, use_data):
 430         global total, used, last_block
 431         if block and block != last_block:
 432                 print ()
 433                 print ()
 434                 print ("  /* %s */" % block)
 435                 if start % 16:
 436                         print (' ' * (20 + (start % 16 * 6)), end='')
 437         num = 0
 438         assert start % 8 == 0
 439         assert (end+1) % 8 == 0
 440         for u in range (start, end+1):
 441                 if u % 16 == 0:
 442                         print ()
 443                         print ("  /* %04X */" % u, end='')
 444                 if u in use_data:
 445                         num += 1
 446                 d = use_data.get (u)
 447                 if d is not None:
 448                         d = d[0]
 449                 elif u in unicode_data[4]:
 450                         d = 'O'
 451                 else:
 452                         d = 'WJ'
 453                 print ("%6s," % d, end='')
 454
 455         total += end - start + 1
 456         used += num
 457         if block:
 458                 last_block = block
 459
 460 uu = sorted (use_data.keys ())
 461
 462 last = -100000
 463 num = 0
 464 offset = 0
 465 starts = []
 466 ends = []
 467 print ('#pragma GCC diagnostic push')
 468 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
 469 for k,v in sorted(use_mapping.items()):
 470         if k in use_positions and use_positions[k]: continue
 471         print ("#define %s      USE(%s) /* %s */" % (k, k, v.__name__[3:]))
 472 for k,v in sorted(use_positions.items()):
 473         if not v: continue
 474         for suf in v.keys():
 475                 tag = k + suf
 476                 print ("#define %s      USE(%s)" % (tag, tag))
 477 print ('#pragma GCC diagnostic pop')
 478 print ("")
 479
 480
 481 import packTab
 482 data = {u:v[0] for u,v in use_data.items()}
 483
 484 DEFAULT = 5
 485 COMPACT = 9
 486 for compression in (DEFAULT, COMPACT):
 487
 488     logging.info('  Compression=%d:' % compression)
 489     print()
 490     if compression == DEFAULT:
 491         print('#ifndef HB_OPTIMIZE_SIZE')
 492     elif compression == COMPACT:
 493         print('#else')
 494     else:
 495         assert False
 496     print()
 497
 498     code = packTab.Code('hb_use')
 499     sol = packTab.pack_table(data, compression=compression, default='O')
 500     logging.info('      FullCost=%d' % (sol.fullCost))
 501     sol.genCode(code, f'get_category')
 502     code.print_c(linkage='static inline')
 503     print ()
 504
 505 print('#endif')
 506
 507 print ()
 508 for k in sorted(use_mapping.keys()):
 509         if k in use_positions and use_positions[k]: continue
 510         print ("#undef %s" % k)
 511 for k,v in sorted(use_positions.items()):
 512         if not v: continue
 513         for suf in v.keys():
 514                 tag = k + suf
 515                 print ("#undef %s" % tag)
 516 print ()
 517 print ()
 518 print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
 519 print ("/* == End of generated table == */")