3 """usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
6 * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
9 import os.path, sys, re
11 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
13 if len (sys.argv) not in (2, 3):
16 # https://github.com/harfbuzz/packtab
# Parse the UCD XML file named by the first command-line argument, then
# extract the repertoire object that the table-building code iterates over.
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# An optional second argument overrides the path of hb-common.h.
if len(sys.argv) >= 3:
    hb_common_h = sys.argv[2]
else:
    hb_common_h = 'hb-common.h'

logging.info('Preparing data tables...')
# Per-entry property columns; every list is indexed by position in ucd.
gc = [entry['gc'] for entry in ucd]
ccc = [int(entry['ccc']) for entry in ucd]
# 'bmg' is parsed as hex and stored as a delta from the entry's own index
# (the script treats that index as the codepoint); 0 when 'bmg' is empty.
bmg = [
    int(mirror, 16) - cp if mirror else 0
    for cp, mirror in enumerate(entry['bmg'] for entry in ucd)
]
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)

sc = [entry['sc'] for entry in ucd]
# Decomposition mappings keyed by index into ucd, each value a tuple of ints
# parsed from the space-separated hex 'dm' field.  An entry is kept only when
# its 'dm' is not '#', its 'dt' is 'can', and its index lies outside the
# 11172-long range starting at 0xAC00 (the precomposed Hangul syllables).
dm = {
    cp: tuple(int(part, 16) for part in entry['dm'].split())
    for cp, entry in enumerate(ucd)
    if entry['dm'] != '#'
    and entry['dt'] == 'can'
    and not (0xAC00 <= cp < 0xAC00 + 11172)
}

# Indices of entries whose 'Comp_Ex' property is 'Y'.
ce = {cp for cp, entry in enumerate(ucd) if entry['Comp_Ex'] == 'Y'}
# Every decomposition kept in dm must consist of exactly one or two
# codepoints.  The length is checked directly: a truthiness-based
# any(v for ...) would let an empty tuple slip through, because () is falsy.
assert all(len(seq) in (1, 2) for seq in dm.values())

# Distinct single-codepoint decompositions, in sorted order.
dm1 = sorted({seq for seq in dm.values() if len(seq) == 1})

# Only targets in plane 0 or plane 2 are supported: each one is emitted as
# its low 16 bits into the array for its plane.
assert all((seq[0] >> 16) in (0, 2) for seq in dm1)
dm1_p0_array = ['0x%04Xu' % (seq[0] & 0xFFFF) for seq in dm1 if (seq[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (seq[0] & 0xFFFF) for seq in dm1 if (seq[0] >> 16) == 2]

# 1-based position of each single-codepoint decomposition within dm1.
# NOTE(review): index 0 is presumably reserved for "no decomposition" -- confirm.
dm1_order = {seq: pos + 1 for pos, seq in enumerate(dm1)}
# Two-codepoint decompositions as sorted (triple, pair) tuples.  The triple
# is the pair followed by the index of the decomposing entry, or by 0 when
# that index is in ce or has a non-zero ccc.
dm2 = sorted(
    (
        pair + ((0 if cp in ce or ccc[cp] else cp),),
        pair,
    )
    for cp, pair in dm.items()
    if len(pair) == 2
)
def filt(v):
    """Return True when the triple *v* passes the compact-encoding mask test.

    The three elements are checked against fixed bit masks:

    * ``v[0]`` must be below 0x800 (no bits set at position 11 or above);
    * ``v[1]`` must lie in 0x0300..0x037F;
    * ``v[2]`` must have bits 14-15 and bits 20-31 clear.

    Entries that pass are the ones emitted with
    ``HB_CODEPOINT_ENCODE3_11_7_14``; the rest use ``HB_CODEPOINT_ENCODE3``.
    """
    # NOTE(review): the third mask (0xFFF0C000) leaves bits 16-19 unchecked.
    # If a strict 14-bit field is intended it would be 0xFFFFC000 -- confirm
    # before changing, since this decides which array an entry lands in.
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFF0C000) == 0x0000)
# Split dm2 by whether each entry's triple passes filt().
dm2_u32_array = [item for item in dm2 if filt(item[0])]
dm2_u64_array = [item for item in dm2 if not filt(item[0])]
# The sort order of dm2 must already put every filt()-passing entry ahead of
# the others, so that the two arrays back to back reproduce dm2 exactly
# (dm2_order below numbers dm2 as a whole).
assert dm2 == dm2_u32_array + dm2_u64_array

# Replace each entry with the C macro invocation that encodes its triple.
dm2_u32_array = [
    "HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % triple
    for triple, _pair in dm2_u32_array
]
dm2_u64_array = [
    "HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % triple
    for triple, _pair in dm2_u64_array
]

# Pair indices start right after slot 0 and the two single-codepoint arrays.
l = len(dm1_p0_array) + len(dm1_p2_array) + 1
dm2_order = {pair: l + pos for pos, (_triple, pair) in enumerate(dm2)}
# Merge the single-codepoint and two-codepoint index maps into dm_order.
dm_order.update(dm1_order)
dm_order.update(dm2_order)
67 for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
68 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
69 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
75 sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
76 for line in open(hb_common_h):
77 m = sc_re.search (line)
80 tag = ''.join(m.group(i) for i in range(2, 6))
logging.info('Generating output...')
# The generated header is written to stdout with print().  These statements
# emit the start marker, lines of the explanatory banner (including the
# description string taken from the loaded UCDXML), the include guard and the
# hb.hh include.
print("/* == Start of generated table == */")
print(" * The following table is generated by running:")
print(" * ./gen-ucd-table.py ucd.nounihan.grouped.xml")
print(" * on file with this description:", ucdxml.description)
print("#ifndef HB_UCD_TABLE_HH")
print("#define HB_UCD_TABLE_HH")
print('#include "hb.hh"')
# Register the auxiliary lookup arrays (script names, single-codepoint
# decomposition targets per plane, and the two encodings of pair
# decompositions) on a packTab.Code object named '_hb_ucd', then print it
# with 'static inline' linkage.
# Each addArray() result is unpacked as a pair: the first element is rebound
# to the array's variable name and the second is discarded.
# NOTE(review): the first element is presumably the emitted array's
# identifier -- confirm against packTab.
code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
code.print_c(linkage='static inline')
116 ('gc', gc, 'Cn', gc_order),
117 ('ccc', ccc, 0, None),
118 ('bmg', bmg, 0, None),
119 ('sc', sc, 'Zzzz', sc_order),
120 ('dm', dm, None, dm_order),
123 for compression in (DEFAULT, COMPACT, SLOPPY):
124 logging.info(' Compression=%d:' % compression)
126 if compression == DEFAULT:
127 print('#ifndef HB_OPTIMIZE_SIZE')
128 elif compression == COMPACT:
129 print('#elif !defined(HB_NO_UCD_UNASSIGNED)')
134 if compression == SLOPPY:
135 for i in range(len(gc)):
136 if (i % 128) and gc[i] == 'Cn':
138 for i in range(len(gc) - 2, -1, -1):
139 if ((i + 1) % 128) and gc[i] == 'Cn':
141 for i in range(len(sc)):
142 if (i % 128) and sc[i] == 'Zzzz':
144 for i in range(len(sc) - 2, -1, -1):
145 if ((i + 1) % 128) and sc[i] == 'Zzzz':
149 code = packTab.Code('_hb_ucd')
151 for name,data,default,mapping in datasets:
152 sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
153 logging.info(' Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
154 sol.genCode(code, name)
156 code.print_c(linkage='static inline')
# Close the include guard, print the end marker, and log completion.
print("#endif /* HB_UCD_TABLE_HH */")
print("/* == End of generated table == */")
logging.info('Done.')