3 """usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
6 * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
11 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
13 if len (sys.argv) not in (2, 3):
16 # https://github.com/harfbuzz/packtab
# Parse the UCD XML file named by the first command-line argument and
# extract the repertoire that every table below is derived from.
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# The header path is an optional second argument; when it is absent the
# relative name 'hb-common.h' is used.
hb_common_h = sys.argv[2] if len(sys.argv) >= 3 else 'hb-common.h'
26 logging.info('Preparing data tables...')
29 # This is how the data is encoded:
31 # General_Category (gc), Canonical_Combining_Class (ccc),
32 # and Script (sc) are encoded as integers.
34 # Mirroring character (bmg) is encoded as difference from
35 # the original character.
37 # Composition & Decomposition (dm) are encoded elaborately,
# One value per entry of `ucd`, in repertoire order.
gc = [entry['gc'] for entry in ucd]
ccc = [int(entry['ccc']) for entry in ucd]
# Mirroring character, stored as (mirror - own index); entries with an
# empty 'bmg' attribute get 0.
bmg = [
    int(mirror, 16) - cp if mirror else 0
    for cp, mirror in enumerate(entry['bmg'] for entry in ucd)
]
sc = [entry['sc'] for entry in ucd]
46 # Prepare Compose / Decompose data
48 # This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
# dm: index in `ucd` -> tuple of decomposition code points, parsed from the
# space-separated hex 'dm' attribute. Only entries whose 'dm' is not '#'
# and whose 'dt' is 'can' are kept, and the 11172 indices starting at
# 0xAC00 are left out.
# NOTE(review): that range matches the Hangul syllables block, presumably
# handled algorithmically by the C side -- confirm.
dm = {
    cp: tuple(int(part, 16) for part in entry['dm'].split())
    for cp, entry in enumerate(ucd)
    if entry['dm'] != '#'
    and entry['dt'] == 'can'
    and (cp < 0xAC00 or cp >= 0xAC00 + 11172)
}
# ce: indices whose 'Comp_Ex' attribute is 'Y'.
ce = {cp for cp, entry in enumerate(ucd) if entry['Comp_Ex'] == 'Y'}
# Every decomposition must consist of exactly one or two code points.
# The lengths are checked directly: testing the truthiness of the offending
# tuples instead would let an empty mapping () slip through, because an
# empty tuple is falsy.
assert all(len(v) in (1, 2) for v in dm.values())

# Single-code-point decompositions, de-duplicated and sorted.
dm1 = sorted({v for v in dm.values() if len(v) == 1})
# Only targets whose upper bits (value >> 16) are 0 or 2 are supported;
# each group is emitted as its own array of 16-bit literals.
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Map each 1-tuple to its 1-based position in the sorted list.
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}
# Two-code-point decompositions. Each element is (key, pair): `key` is the
# pair extended by a third value, which is the decomposing index itself,
# or 0 when that index is in `ce` or has a non-zero ccc. The list is sorted
# by key (ties fall back to the pair).
dm2 = sorted(
    (pair + (0 if cp in ce or ccc[cp] else cp,), pair)
    for cp, pair in dm.items()
    if len(pair) == 2
)


def filt(v):
    """Return True when the triple *v* passes the masks for the compact form."""
    first, second, third = v
    # NOTE(review): the third mask leaves bits 16-19 unchecked although the
    # macro name below says 14 bits -- confirm against the C macro.
    return ((first & 0xFFFFF800) == 0x0000 and
            (second & 0xFFFFFF80) == 0x0300 and
            (third & 0xFFF0C000) == 0x0000)


dm2_u32_array = [item for item in dm2 if filt(item[0])]
dm2_u64_array = [item for item in dm2 if not filt(item[0])]
# Splitting must not reorder anything: all compact entries have to sort
# ahead of the wide ones.
assert dm2_u32_array + dm2_u64_array == dm2
# From here on the two names hold C macro invocations as strings.
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % key
                 for key, _ in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % key
                 for key, _ in dm2_u64_array]
# Two-code-point entries are numbered after all single-code-point ones:
# the first dm2 index is 1 plus the combined length of both dm1 arrays.
l = len(dm1_p0_array) + len(dm1_p2_array) + 1
dm2_order = {pair: index for index, (_, pair) in enumerate(dm2, start=l)}
# Merge both numberings into the combined mapping.
dm_order.update(dm1_order)
dm_order.update(dm2_order)
81 # Prepare General_Category / Script mapping arrays
84 for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
85 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
86 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
92 sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
93 for line in open(hb_common_h):
94 m = sc_re.search (line)
97 tag = ''.join(m.group(i) for i in range(2, 6))
101 sc_array.append(name)
104 # Write out main data
110 compression_level = {
116 logging.info('Generating output...')
117 print("/* == Start of generated table == */")
119 print(" * The following table is generated by running:")
121 print(" * ./gen-ucd-table.py ucd.nounihan.grouped.xml")
123 print(" * on file with this description:", ucdxml.description)
126 print("#ifndef HB_UCD_TABLE_HH")
127 print("#define HB_UCD_TABLE_HH")
129 print('#include "hb.hh"')
# Emit the flat lookup arrays into a packTab code object for '_hb_ucd'.
# Each addArray() call is given a C element type, an array name and the
# Python list of values; the first element of what it returns replaces
# the list under the same Python name and the second is discarded.
# NOTE(review): presumably the first element is the generated C identifier
# -- confirm against packTab.
code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
# Print the accumulated arrays as C with 'static inline' linkage.
code.print_c(linkage='static inline')
144 ('gc', gc, 'Cn', gc_order),
145 ('ccc', ccc, 0, None),
146 ('bmg', bmg, 0, None),
147 ('sc', sc, 'Zzzz', sc_order),
148 ('dm', dm, None, dm_order),
154 for step in (DEFAULT, COMPACT, SLOPPY):
155 compression = compression_level[step]
156 logging.info(' Compression=%d:' % compression)
159 print('#ifndef HB_OPTIMIZE_SIZE')
160 elif step == COMPACT:
161 print('#elif !defined(HB_NO_UCD_UNASSIGNED)')
169 for i in range(len(gc)):
170 if (i % 128) and gc[i] == 'Cn':
172 for i in range(len(gc) - 2, -1, -1):
173 if ((i + 1) % 128) and gc[i] == 'Cn':
175 for i in range(len(sc)):
176 if (i % 128) and sc[i] == 'Zzzz':
178 for i in range(len(sc) - 2, -1, -1):
179 if ((i + 1) % 128) and sc[i] == 'Zzzz':
183 code = packTab.Code('_hb_ucd')
185 for name,data,default,mapping in datasets:
186 sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
187 logging.info(' Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
188 sol.genCode(code, name)
190 code.print_c(linkage='static inline')
199 print("#endif /* HB_UCD_TABLE_HH */")
201 print("/* == End of generated table == */")
202 logging.info('Done.')