3 from __future__ import print_function, division, absolute_import
5 import io, os.path, sys, re
# Configure the root logger: INFO level, messages formatted as "LEVEL: message".
# NOTE(review): the `import logging` statement is not visible in this excerpt.
7 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# The script accepts one required argument (the UCD XML file) and one optional
# argument (a path to hb-common.h); any other argument count prints the usage
# text to stderr.
# NOTE(review): whatever follows the message (presumably an exit) is not
# visible in this excerpt -- confirm.
9 if len (sys.argv) not in (2, 3):
10 print("usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]", file=sys.stderr)
# Load the Unicode Character Database from the XML file named by the first
# command-line argument, using the packTab helper module referenced below.
# NOTE(review): the `packTab` imports are not visible in this excerpt.
13 # https://github.com/harfbuzz/packtab
17 logging.info('Loading UCDXML...')
18 ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
# `ucd` is iterated and enumerated below as a sequence of per-character
# records supporting `u['field']` lookups; the enumeration index is used as
# the code point.
19 ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)
# Path of the header to scan for script tags: the optional second argument,
# falling back to 'hb-common.h' relative to the current directory.
21 hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2]
# Extract one parallel list per property from the repertoire; list position
# corresponds to the record's position in `ucd`.
23 logging.info('Preparing data tables...')
# 'gc' field of every record, kept as the raw string.
25 gc = [u['gc'] for u in ucd]
# 'ccc' field of every record, converted to int.
26 ccc = [int(u['ccc']) for u in ucd]
# 'bmg' field stored as a delta: (hex value of the field) - (record index),
# or 0 when the field is empty.  Note the name reuse: in the outer loop `u` is
# the enumerate index, while inside the generator `u` is the record itself.
27 bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)]
28 #gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
29 #gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)
# 'sc' field of every record, kept as the raw string.
31 sc = [u['sc'] for u in ucd]
# Decomposition mappings: index -> tuple of ints parsed from the
# whitespace-separated hex values in 'dm'.  Only records whose 'dm' is not '#'
# and whose 'dt' is 'can' are kept, and indices in
# [0xAC00, 0xAC00 + 11172) are excluded (presumably the precomposed Hangul
# syllables, handled elsewhere -- TODO confirm).
33 dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd)
34 if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
# Set of indices whose 'Comp_Ex' field equals 'Y'.
35 ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}
# Split the decomposition mappings into one-element (dm1) and two-element
# (dm2) groups and assign every distinct mapping an index.
# Every mapping must have exactly one or two elements.
37 assert not any(v for v in dm.values() if len(v) not in (1,2))
# Distinct one-element mappings, sorted.
38 dm1 = sorted(set(v for v in dm.values() if len(v) == 1))
# Each one-element target must have (value >> 16) equal to 0 or 2, so the two
# arrays below together cover all of dm1.
39 assert all((v[0] >> 16) in (0,2) for v in dm1)
# Low 16 bits of each target, formatted as C literals, split by (value >> 16).
40 dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
41 dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# 1-based index of each one-element mapping within the sorted dm1 list.
42 dm1_order = {v:i+1 for i,v in enumerate(dm1)}
# Two-element mappings as sorted (triple, pair) entries.  The triple is the
# pair extended with the source index `i`, or with 0 when `i` is in `ce` or
# has a non-zero ccc.
44 dm2 = sorted((v+(i if i not in ce and not ccc[i] else 0,), v)
45 for i,v in dm.items() if len(v) == 2)
# Predicate on a triple: v[0] has no bits set above the low 11 (v[0] < 0x800),
# v[1] lies in 0x0300..0x037F, and v[2] has none of the bits in mask
# 0xFFF0C000 set.  Presumably these are the triples that fit the 11/7/14-bit
# packing named by the macro below -- TODO confirm against the macro.
47 filt = lambda v: ((v[0] & 0xFFFFF800) == 0x0000 and
48 (v[1] & 0xFFFFFF80) == 0x0300 and
49 (v[2] & 0xFFF0C000) == 0x0000)
50 dm2_u32_array = [v for v in dm2 if filt(v[0])]
51 dm2_u64_array = [v for v in dm2 if not filt(v[0])]
# The concatenation must reproduce dm2 exactly, i.e. all entries passing the
# predicate precede all failing ones in sorted order; the index assignment
# below relies on dm2's order.
52 assert dm2_u32_array + dm2_u64_array == dm2
# Replace the entries by the C macro invocations that will be emitted.
53 dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
54 dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]
# Two-element mappings are numbered after the one-element ones: dm1 uses
# 1..len(dm1), so dm2 indices start at 1 + len(dm1).
56 l = 1 + len(dm1_p0_array) + len(dm1_p2_array)
57 dm2_order = {v[1]:i+l for i,v in enumerate(dm2)}
# Merge both index maps into the combined mapping.
# NOTE(review): the creation of `dm_order` is not visible in this excerpt.
60 dm_order.update(dm1_order)
61 dm_order.update(dm2_order)
# Enumerate the thirty two-letter general-category codes, pairing each with
# its position in this fixed tuple.
# NOTE(review): the loop body is not visible in this excerpt; presumably it
# fills `gc_order`, which is used as a mapping further down -- confirm.
64 for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
65 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
66 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
# Matches a line containing an HB_SCRIPT_* identifier followed later by an
# HB_TAG ('a','b','c','d') expression; group 1 is the identifier and groups
# 2-5 are the four tag characters.
72 sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Scan the header line by line.
# NOTE(review): the file object is iterated directly and no explicit close is
# visible; the handling of non-matching lines and the rest of the loop body
# are also not visible in this excerpt.
73 for line in open(hb_common_h):
74 m = sc_re.search (line)
# Join the four captured characters into the four-letter script tag.
77 tag = ''.join(m.group(i) for i in range(2, 6))
# Write the generated C header to stdout: banner comment, include guard,
# include line, then the lookup arrays shared by all compression variants.
# NOTE(review): some print lines of the banner are not visible in this excerpt.
88 logging.info('Generating output...')
89 print("/* == Start of generated table == */")
91 print(" * The following table is generated by running:")
93 print(" * ./gen-ucd-table.py ucd.nounihan.grouped.xml")
# Record the description carried by the loaded UCD XML.
95 print(" * on file with this description:", ucdxml.description)
98 print("#ifndef HB_UCD_TABLE_HH")
99 print("#define HB_UCD_TABLE_HH")
101 print('#include "hb.hh"')
# Collect the arrays in a packTab code object using the '_hb_ucd' name, then
# print them as C with 'static inline' linkage.  Each addArray call is given a
# C element type, an array name and the values; the first element of its
# return value replaces the Python list and the second is discarded.
# NOTE(review): `sc_array` is defined outside this excerpt.
104 code = packTab.Code('_hb_ucd')
105 sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
106 dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
107 dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
108 dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
109 dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
110 code.print_c(linkage='static inline')
# Dataset descriptors, each (name, data, default value, mapping); they are
# unpacked in that order by the packing loop below.
# NOTE(review): the opening of the enclosing container (iterated below as
# `datasets`) is not visible in this excerpt.
113 ('gc', gc, 'Cn', gc_order),
114 ('ccc', ccc, 0, None),
115 ('bmg', bmg, 0, None),
116 ('sc', sc, 'Zzzz', sc_order),
117 ('dm', dm, None, dm_order),
# Emit one set of packed tables per compression level.
# NOTE(review): DEFAULT, COMPACT and SLOPPY are defined outside this excerpt;
# they are formatted with %d below, so they are presumably integers.
120 for compression in (DEFAULT, COMPACT, SLOPPY):
121 logging.info(' Compression=%d:' % compression)
# Open the matching C preprocessor branch for this level.
# NOTE(review): the branch for SLOPPY is not visible in this excerpt.
123 if compression == DEFAULT:
124 print('#ifndef HB_OPTIMIZE_SIZE')
125 elif compression == COMPACT:
126 print('#elif !defined(HB_NO_UCD_UNASSIGNED)')
# SLOPPY only: a forward and a backward pass over `gc` ('Cn' entries) and `sc`
# ('Zzzz' entries).  The forward pass skips indices that are multiples of 128;
# the backward pass starts at the second-to-last index and skips indices whose
# successor is a multiple of 128.
# NOTE(review): the loop bodies are not visible here; presumably each entry is
# overwritten from its neighbour within the same 128-entry block -- confirm.
131 if compression == SLOPPY:
132 for i in range(len(gc)):
133 if (i % 128) and gc[i] == 'Cn':
135 for i in range(len(gc) - 2, -1, -1):
136 if ((i + 1) % 128) and gc[i] == 'Cn':
138 for i in range(len(sc)):
139 if (i % 128) and sc[i] == 'Zzzz':
141 for i in range(len(sc) - 2, -1, -1):
142 if ((i + 1) % 128) and sc[i] == 'Zzzz':
# Fresh code object for this compression level.
146 code = packTab.Code('_hb_ucd')
# Pack every dataset at the current level, log the solution's reported cost,
# and add its generated code under the dataset's name.
148 for name,data,default,mapping in datasets:
149 sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
150 logging.info(' Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
151 sol.genCode(code, name)
# Print the accumulated code as C with 'static inline' linkage.
153 code.print_c(linkage='static inline')
# Close the include guard opened by '#ifndef HB_UCD_TABLE_HH', print the
# closing banner that pairs with the "Start of generated table" line, and log
# completion.
161 print("#endif /* HB_UCD_TABLE_HH */")
163 print("/* == End of generated table == */")
164 logging.info('Done.')