3 # fontconfig/fc-lang/fc-lang.py
5 # Copyright © 2001-2002 Keith Packard
6 # Copyright © 2019 Tim-Philipp Müller
8 # Permission to use, copy, modify, distribute, and sell this software and its
9 # documentation for any purpose is hereby granted without fee, provided that
10 # the above copyright notice appear in all copies and that both that
11 # copyright notice and this permission notice appear in supporting
12 # documentation, and that the name of the author(s) not be used in
13 # advertising or publicity pertaining to distribution of the software without
14 # specific, written prior permission. The authors make no
15 # representations about the suitability of this software for any purpose. It
16 # is provided "as is" without express or implied warranty.
18 # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
19 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
20 # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
21 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
22 # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
23 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
24 # PERFORMANCE OF THIS SOFTWARE.
28 # Read a set of language orthographies and build C declarations for
29 # charsets which can then be used to identify which languages are
30 # supported by a given font.
32 # TODO: this code is not very pythonic, a lot of it is a 1:1 translation
33 # of the C code and we could probably simplify it a bit
39 # we just store the leaves in a dict, we can order the leaves later if needed
42 self.leaves = {} # leaf_number -> leaf data (= 16 uint32)
# Set the bit for Unicode code point `ucs4` in this charset.
# Leaves are keyed by the high bits of the code point; each leaf covers
# 256 code points stored as 8 x 32-bit words.
# NOTE(review): `leaf_num` is read below but never assigned in this view —
# an assignment (presumably `leaf_num = ucs4 >> 8`) appears to be missing
# from this excerpt; confirm against the full file.
44 def add_char(self, ucs4):
45 assert ucs4 < 0x01000000
47 if leaf_num in self.leaves:
48 leaf = self.leaves[leaf_num]
# NOTE(review): the line below allocates a fresh empty leaf — an `else:`
# introducing this branch appears to be missing from this excerpt.
50 leaf = [0, 0, 0, 0, 0, 0, 0, 0] # 256/32 = 8
51 self.leaves[leaf_num] = leaf
# Select the 32-bit word with bits 7..5 and set the bit given by bits 4..0.
52 leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f))
53 #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))
# Clear the bit for Unicode code point `ucs4`, if its leaf exists.
# NOTE(review): `leaf_num` is read below but never assigned in this view —
# an assignment (presumably `leaf_num = ucs4 >> 8`) appears to be missing
# from this excerpt; confirm against the full file.
55 def del_char(self, ucs4):
56 assert ucs4 < 0x01000000
58 if leaf_num in self.leaves:
59 leaf = self.leaves[leaf_num]
# Clear one bit: word index from bits 7..5, bit position from bits 4..0.
60 leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))
61 # We don't bother removing the leaf even if it becomes empty
62 #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))
# Structural equality with another charset: same set of leaf numbers and
# element-wise identical leaf contents.
# NOTE(review): the early `return False` after the length check, the
# `return False` inside the loop, and the final `return True` appear to be
# missing from this excerpt; only the comparisons survive here.
64 def equals(self, other_cs):
65 keys = sorted(self.leaves.keys())
66 other_keys = sorted(other_cs.leaves.keys())
67 if len(keys) != len(other_keys):
# Keys are compared pairwise in sorted order, so any mismatch in either
# the leaf number or the leaf payload means inequality.
69 for k1, k2 in zip(keys, other_keys):
72 if not leaves_equal(self.leaves[k1], other_cs.leaves[k2]):
# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    """Return the part of *file_name* before the first dot.

    E.g. 'aa.orth' -> 'aa'; a name without a dot is returned unchanged.
    (Everything from the first '.' onward is dropped, matching the
    original split('.')[0] behaviour — intentionally not splitext().)
    """
    base, _, _ = file_name.partition('.')
    return base
80 # Convert a C name into a language name
# NOTE(review): the `def get_lang(c_name):` header appears to be missing
# from this excerpt — the line below reads as its body (underscores become
# hyphens, spaces removed, lowercased); confirm against the full file.
82 return c_name.replace('_', '-').replace(' ', '').lower()
# Read an .orth file and return a list of (file_name, line_number, text)
# tuples, recursively expanding 'include <file>' directives.
# NOTE(review): several lines appear missing from this excerpt: the initial
# `lines = []`, a `continue` after handling an include, the skip of empty
# lines, and the final `return lines`; confirm against the full file.
84 def read_orth_file(file_name):
86 with open(file_name, 'r', encoding='utf-8') as orth_file:
87 for num, line in enumerate(orth_file):
# 'include xx.orth' pulls in another orth file's entries recursively.
88 if line.startswith('include '):
89 include_fn = line[8:].strip()
90 lines += read_orth_file(include_fn)
92 # remove comments and strip whitespaces
93 line = line.split('#')[0].strip()
94 line = line.split('\t')[0].strip()
97 lines += [(file_name, num, line)]
# Compare two leaves (8-element lists of 32-bit words) element-wise.
# NOTE(review): the comparison body and return statements (presumably
# `if v1 != v2: return False` and a final `return True`) appear to be
# missing from this excerpt; only the loop header survives here.
101 def leaves_equal(leaf1, leaf2):
102 for v1, v2 in zip(leaf1, leaf2):
107 # Build a single charset from a source file
109 # The file format is quite simple, either
110 # a single hex value or a pair separated with a dash
# NOTE(review): this excerpt is missing several lines, including the
# `charset = CharSet()` initialisation, the branch stripping the leading
# '-', the fallback single-value parse, the handling when no end value is
# given, and the final `return charset`; confirm against the full file.
111 def parse_orth_file(file_name, lines):
113 for fn, num, line in lines:
# A leading '-' marks a range of code points to remove, not add.
114 delete_char = line.startswith('-')
# Ranges may be written as START-END or START..END (hex values).
117 if line.find('-') != -1:
118 parts = line.split('-')
119 elif line.find('..') != -1:
120 parts = line.split('..')
124 start = int(parts.pop(0), 16)
127 end = int(parts.pop(0), 16)
129 print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num))
# Apply the inclusive range to the charset, adding or deleting bits.
131 for ucs4 in range(start, end+1):
133 charset.del_char(ucs4)
135 charset.add_char(ucs4)
137 assert charset.equals(charset) # sanity check for the equals function
# Entry point: parse arguments, read all .orth files, deduplicate leaves
# and charsets, then emit the fcLangData C tables to stdout (or --output).
# NOTE(review): this excerpt of the main script is heavily gapped — many
# intermediate lines (variable initialisations such as `orth_entries`,
# `sets`, `names`, `langs`, `leaves`, `off`, `country`, `LangCountrySets`,
# `total_leaves`, `tn`, plus numerous loop bodies and print calls) are not
# visible here; confirm all surrounding logic against the full file.
141 if __name__=='__main__':
142 parser = argparse.ArgumentParser()
143 parser.add_argument('orth_files', nargs='+', help='List of .orth files')
144 parser.add_argument('--directory', dest='directory', default=None)
145 parser.add_argument('--template', dest='template_file', default=None)
146 parser.add_argument('--output', dest='output_file', default=None)
148 args = parser.parse_args()
# Redirect stdout so all the print() calls below land in the output file.
161 sys.stdout = open(args.output_file, 'w', encoding='utf-8')
163 # Read the template file
164 if args.template_file:
165 tmpl_file = open(args.template_file, 'r', encoding='utf-8')
167 tmpl_file = sys.stdin
169 # Change into source dir if specified (after opening other files)
171 os.chdir(args.directory)
# Read and parse every orth file; collect per-language charsets.
174 for i, fn in enumerate(args.orth_files):
177 for fn in sorted(orth_entries.keys()):
178 lines = read_orth_file(fn)
179 charset = parse_orth_file(fn, lines)
186 lang = get_lang(name)
# Languages with a country suffix (e.g. 'pap-aw') are grouped by family.
188 if lang.find('-') != -1:
189 country.append(orth_entries[fn]) # maps to original index
190 language_family = lang.split('-')[0]
191 if not language_family in LangCountrySets:
192 LangCountrySets[language_family] = []
193 LangCountrySets[language_family] += [orth_entries[fn]]
195 total_leaves += len(charset.leaves)
# Collect the set of unique leaves shared across all charsets.
200 for leaf_num in sorted(s.leaves.keys()):
201 leaf = s.leaves[leaf_num]
203 for existing_leaf in leaves:
204 if leaves_equal(leaf, existing_leaf):
207 #print('unique: ', is_unique)
211 # Find duplicate charsets
213 for i, s in enumerate(sets):
216 for j, s_cmp in enumerate(sets):
223 duplicate.append(dup_num)
227 for i, s in enumerate(sets):
233 # Scan the input until the marker is found
234 # FIXME: this is a bit silly really, might just as well hardcode
235 # the license header in the script and drop the template
236 for line in tmpl_file:
237 if line.strip() == '@@@':
# --- Emit the generated C header: sizes, offset macros, struct layout ---
241 print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))
243 print('#define LEAF0 ({} * sizeof (FcLangCharSet))'.format(len(sets)))
244 print('#define OFF0 (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
245 print('#define NUM0 (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
246 print('#define SET(n) (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
247 print('#define OFF(s,o) (OFF0 + o * sizeof (uintptr_t) - SET(s))')
248 print('#define NUM(s,n) (NUM0 + n * sizeof (FcChar16) - SET(s))')
249 print('#define LEAF(o,l) (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
250 print('#define fcLangCharSets (fcLangData.langCharSets)')
251 print('#define fcLangCharSetIndices (fcLangData.langIndices)')
252 print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')
254 assert len(sets) < 256 # FIXME: need to change index type to 16-bit below then
# The struct literal below is a C declaration; doubled braces escape
# str.format's placeholder syntax.
257 static const struct {{
258 FcLangCharSet langCharSets[{}];
259 FcCharLeaf leaves[{}];
260 uintptr_t leaf_offsets[{}];
261 FcChar16 numbers[{}];
263 {} langIndicesInv[{}];
264 }} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
265 'FcChar8 ', len(sets), 'FcChar8 ', len(sets)))
# --- Emit the langCharSets entries ---
269 for i, s in enumerate(sets):
274 print(' {{ "{}", {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
275 langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))
# --- Emit the unique leaves as 8-word bitmaps ---
281 for l, leaf in enumerate(leaves):
282 print(' {{ {{ /* {} */'.format(l), end='')
283 for i in range(0, 8): # 256/32 = 8
286 print(' 0x{:08x},'.format(leaf[i]), end='')
# --- Emit the per-set leaf offset tables ---
292 for i, s in enumerate(sets):
296 print(' /* {} */'.format(names[i]))
298 for n, leaf_num in enumerate(sorted(s.leaves.keys())):
299 leaf = s.leaves[leaf_num]
# Every leaf must resolve to exactly one entry in the unique-leaf list.
302 found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf,leaf)]
303 assert found, "Couldn't find leaf in unique leaves list!"
304 assert len(found) == 1
305 print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
308 if len(s.leaves) % 4 != 0:
# --- Emit the per-set leaf number tables ---
314 for i, s in enumerate(sets):
318 print(' /* {} */'.format(names[i]))
320 for n, leaf_num in enumerate(sorted(s.leaves.keys())):
321 leaf = s.leaves[leaf_num]
324 print(' 0x{:04x},'.format(leaf_num), end='')
327 if len(s.leaves) % 8 != 0:
# --- Emit index and inverse-index tables mapping sets <-> orth entries ---
334 for i, s in enumerate(sets):
335 fn = '{}.orth'.format(names[i])
336 print(' {}, /* {} */'.format(orth_entries[fn], names[i]))
341 for i, k in enumerate(orth_entries.keys()):
343 idx = names.index(name)
344 print(' {}, /* {} */'.format(idx, name))
349 print('#define NUM_LANG_CHAR_SET {}'.format(len(sets)))
350 num_lang_set_map = (len(sets) + 31) // 32;
351 print('#define NUM_LANG_SET_MAP {}'.format(num_lang_set_map))
353 # Dump indices with country codes
354 assert len(country) > 0
355 assert len(LangCountrySets) > 0
357 print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
358 for k in sorted(LangCountrySets.keys()):
359 langset_map = [0] * num_lang_set_map # initialise all zeros
# Set one bit per member entry id in the 32-bit-word bitmap.
360 for entries_id in LangCountrySets[k]:
361 langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
363 for v in langset_map:
364 print(' 0x{:08x},'.format(v), end='')
365 print(' }}, /* {} */'.format(k))
368 print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))
370 # Find ranges for each letter for faster searching
371 # Dump sets start/finish for the fastpath
372 print('static const FcLangCharSetRange fcLangCharSetRanges[] = {\n')
373 for c in string.ascii_lowercase: # a-z
376 for i, s in enumerate(sets):
377 if names[i].startswith(c):
380 print(' {{ {}, {} }}, /* {} */'.format(start, stop, c))
383 # And flush out the rest of the input file
384 for line in tmpl_file: