5 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
7 """usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
10 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
11 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
12 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
13 * https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
14 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
15 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
16 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
17 * ms-use/IndicSyllabicCategory-Additional.txt
18 * ms-use/IndicPositionalCategory-Additional.txt
23 if len (sys.argv) != 10:
34 files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
36 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
42 headers[j - 1].append(line)
43 headers.append (["UnicodeData.txt does not have a header."])
45 unicode_data = [{} for _ in files]
46 values = [{} for _ in files]
47 for i, f in enumerate (files):
54 fields = [x.strip () for x in line.split (';')]
58 uu = fields[0].split ('..')
59 start = int (uu[0], 16)
65 t = fields[1 if i not in [2, 4] else 2]
69 elif i == 3 and t != 'Default_Ignorable_Code_Point':
71 elif i == 7 and t == 'Consonant_Final_Modifier':
72 # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
73 t = 'Syllable_Modifier'
74 elif i == 8 and t == 'NA':
77 i0 = i if i < 7 else i - 7
78 for u in range (start, end + 1):
79 unicode_data[i0][u] = t
80 values[i0][t] = values[i0].get (t, 0) + end - start + 1
82 defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
84 # Merge data into one dict:
85 for i,v in enumerate (defaults):
86 values[i][v] = values[i].get (v, 0) + 1
88 for i,d in enumerate (unicode_data):
89 for u,v in d.items ():
93 combined[u] = list (defaults)
95 combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
100 'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
101 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
102 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
103 # Indic_Syllabic_Category
115 'Consonant_Placeholder',
118 'Consonant_With_Stacker',
119 'Consonant_Prefixed',
120 'Consonant_Preceding_Repha',
121 'Consonant_Succeeding_Repha',
122 'Consonant_Subjoined',
125 'Consonant_Head_Letter',
126 'Consonant_Initial_Postfixed',
139 'Brahmi_Joining_Number',
143 'Hieroglyph_Mark_Begin',
144 'Hieroglyph_Mark_End',
146 'Hieroglyph_Modifier',
147 'Hieroglyph_Segment_Begin',
148 'Hieroglyph_Segment_End',
149 # Indic_Positional_Category
158 'Top_And_Bottom_And_Left',
161 'Top_And_Left_And_Right',
164 'Top_And_Bottom_And_Right',
176 class PropertyValue(object):
177 def __init__(self, name_):
181 def __eq__(self, other):
182 return self.name == (other if isinstance(other, str) else other.name)
183 def __ne__(self, other):
184 return not (self == other)
186 return hash(str(self))
190 for name in property_names:
191 value = PropertyValue(name)
192 assert value not in property_values
193 assert value not in globals()
194 property_values[name] = value
195 globals().update(property_values)
198 def is_BASE(U, UISC, UDI, UGC, AJT):
199 return (UISC in [Number, Consonant, Consonant_Head_Letter,
203 # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
204 AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
205 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
206 Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Brahmi_Joining_Number.

    Only UISC is inspected; the remaining arguments are accepted so this
    predicate can be called with the same five arguments as its siblings.
    """
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Placeholder or U is a listed code point.

    U is compared against a fixed set of six code points; UDI, UGC and AJT
    are not consulted.
    """
    extra_code_points = (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE)
    return UISC == Consonant_Placeholder or U in extra_code_points
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Return a truthy value when UISC is Joiner, or when UDI is truthy and
    UGC is one of Mc, Me, Mn.

    The result is meant to be used for its truthiness: when UDI is falsy the
    value of UDI itself is returned rather than False.
    """
    # Also includes VARIATION_SELECTOR and ZWJ
    if UISC == Joiner:
        return True
    # `and` binds tighter than `or`, so this is the remaining operand of
    # the original single expression.
    return UDI and UGC in (Mc, Me, Mn)
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Succeeding_Repha, or is
    Consonant_Final with a UGC other than Lo.

    U, UDI and AJT are not consulted.
    """
    if UISC == Consonant_Succeeding_Repha:
        return True
    return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Syllable_Modifier.

    Only UISC is inspected; the other arguments exist for signature parity
    with the sibling predicates.
    """
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Initial_Postfixed, or is
    Consonant_Medial with a UGC other than Lo.

    U, UDI and AJT are not consulted.
    """
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    if UISC == Consonant_Initial_Postfixed:
        return True
    return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Nukta, Gemination_Mark or Consonant_Killer.

    Only UISC is inspected.
    """
    accepted = (Nukta, Gemination_Mark, Consonant_Killer)
    return UISC in accepted
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Subjoined and UGC is not Lo.

    U, UDI and AJT are not consulted.
    """
    is_subjoined = UISC == Consonant_Subjoined
    return is_subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Consonant_With_Stacker.

    Only UISC is inspected.
    """
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Virama and the code point is not claimed by
    is_HALANT_OR_VOWEL_MODIFIER.

    All five arguments are forwarded unchanged to
    is_HALANT_OR_VOWEL_MODIFIER, which is only called when UISC is Virama.
    """
    if UISC == Virama:
        return not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
    return False
232 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
233 # Split off of HALANT
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Number_Joiner.

    Only UISC is inspected.
    """
    return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph.

    Only UISC is inspected.
    """
    return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph_Joiner.

    Only UISC is inspected.
    """
    return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph_Mirror.

    Only UISC is inspected.
    """
    return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph_Modifier.

    Only UISC is inspected.
    """
    return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Hieroglyph_Mark_Begin or
    Hieroglyph_Segment_Begin.

    Only UISC is inspected.
    """
    begin_categories = (Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin)
    return UISC in begin_categories
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Hieroglyph_Mark_End or Hieroglyph_Segment_End.

    Only UISC is inspected.
    """
    end_categories = (Hieroglyph_Mark_End, Hieroglyph_Segment_End)
    return UISC in end_categories
249 def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
250 # Split off of HALANT
251 return (UISC == Invisible_Stacker
252 and not is_SAKOT(U, UISC, UDI, UGC, AJT)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Non_Joiner.

    Only UISC is inspected.
    """
    return UISC == Non_Joiner
256 def is_OTHER(U, UISC, UDI, UGC, AJT):
257 # Also includes BASE_IND and SYM
258 return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
259 and not is_BASE(U, UISC, UDI, UGC, AJT)
260 and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
261 and not is_CGJ(U, UISC, UDI, UGC, AJT)
262 and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
263 and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
def is_REPHA(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Preceding_Repha or Consonant_Prefixed.

    Only UISC is inspected.
    """
    repha_categories = (Consonant_Preceding_Repha, Consonant_Prefixed)
    return UISC in repha_categories
267 def is_SAKOT(U, UISC, UDI, UGC, AJT):
268 # Split off of HALANT
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Symbol_Modifier.

    Only UISC is inspected.
    """
    return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Pure_Killer, or is Vowel / Vowel_Dependent
    with a UGC other than Lo.

    U, UDI and AJT are not consulted.
    """
    if UISC == Pure_Killer:
        return True
    return UGC != Lo and UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Tone_Mark, Cantillation_Mark, Register_Shifter
    or Visarga, or is Bindu with a UGC other than Lo.

    U, UDI and AJT are not consulted.
    """
    if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
        return True
    return UGC != Lo and UISC == Bindu
278 def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
280 return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
282 and not is_CGJ(U, UISC, UDI, UGC, AJT)
291 'FM': is_CONS_FINAL_MOD,
295 'CS': is_CONS_WITH_STACKER,
297 'HVM': is_HALANT_OR_VOWEL_MODIFIER,
299 'IS': is_INVISIBLE_STACKER,
301 'HM': is_HIEROGLYPH_MOD,
302 'HR': is_HIEROGLYPH_MIRROR,
303 'J': is_HIEROGLYPH_JOINER,
304 'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
305 'SE': is_HIEROGLYPH_SEGMENT_END,
313 'WJ': is_Word_Joiner,
324 'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
326 'Pre': [Left, Top_And_Bottom_And_Left],
330 'Blw': [Bottom, Overstruck],
333 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
334 'Blw': [Bottom, Overstruck, Bottom_And_Right],
336 'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
340 'Blw': [Bottom, Overstruck],
357 'Pst': [Not_Applicable],
363 def map_to_use(data):
365 items = use_mapping.items()
366 for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():
368 # Resolve Indic_Syllabic_Category
370 # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
371 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
374 # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
375 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
377 # TODO: U+1CED should only be allowed after some of
378 # the nasalization marks, maybe only for U+1CE9..U+1CF1.
379 if U == 0x1CED: UISC = Tone_Mark
381 values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
382 assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, values)
385 # Resolve Indic_Positional_Category
387 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
388 # and https://github.com/harfbuzz/harfbuzz/issues/1631
389 if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
391 assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
392 USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)
394 pos_mapping = use_positions.get(USE, None)
396 values = [k for k,v in pos_mapping.items() if v and UIPC in v]
397 assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
398 USE = USE + values[0]
400 out[U] = (USE, UBlock)
403 use_data = map_to_use(combined)
405 print ("/* == Start of generated table == */")
407 print (" * The following table is generated by running:")
409 print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
411 print (" * on files with these headers:")
415 print (" * %s" % (l.strip()))
418 print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
419 print ("#define HB_OT_SHAPER_USE_TABLE_HH")
421 print ('#include "hb.hh"')
423 print ('#include "hb-ot-shaper-use-machine.hh"')
429 def print_block (block, start, end, use_data):
430 global total, used, last_block
431 if block and block != last_block:
434 print (" /* %s */" % block)
436 print (' ' * (20 + (start % 16 * 6)), end='')
438 assert start % 8 == 0
439 assert (end+1) % 8 == 0
440 for u in range (start, end+1):
443 print (" /* %04X */" % u, end='')
449 elif u in unicode_data[4]:
453 print ("%6s," % d, end='')
455 total += end - start + 1
460 uu = sorted (use_data.keys ())
467 print ('#pragma GCC diagnostic push')
468 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
469 for k,v in sorted(use_mapping.items()):
470 if k in use_positions and use_positions[k]: continue
471 print ("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
472 for k,v in sorted(use_positions.items()):
476 print ("#define %s USE(%s)" % (tag, tag))
477 print ('#pragma GCC diagnostic pop')
482 data = {u:v[0] for u,v in use_data.items()}
486 for compression in (DEFAULT, COMPACT):
488 logging.info(' Compression=%d:' % compression)
490 if compression == DEFAULT:
491 print('#ifndef HB_OPTIMIZE_SIZE')
492 elif compression == COMPACT:
498 code = packTab.Code('hb_use')
499 sol = packTab.pack_table(data, compression=compression, default='O')
500 logging.info(' FullCost=%d' % (sol.fullCost))
501 sol.genCode(code, f'get_category')
502 code.print_c(linkage='static inline')
508 for k in sorted(use_mapping.keys()):
509 if k in use_positions and use_positions[k]: continue
510 print ("#undef %s" % k)
511 for k,v in sorted(use_positions.items()):
515 print ("#undef %s" % tag)
518 print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
519 print ("/* == End of generated table == */")