fc-lang/fc-lang.py

   1 #!/usr/bin/env python3
   2 #
   3 # fontconfig/fc-lang/fc-lang.py
   4 #
   5 # Copyright © 2001-2002 Keith Packard
   6 # Copyright © 2019 Tim-Philipp Müller
   7 #
   8 # Permission to use, copy, modify, distribute, and sell this software and its
   9 # documentation for any purpose is hereby granted without fee, provided that
  10 # the above copyright notice appear in all copies and that both that
  11 # copyright notice and this permission notice appear in supporting
  12 # documentation, and that the name of the author(s) not be used in
  13 # advertising or publicity pertaining to distribution of the software without
  14 # specific, written prior permission.  The authors make no
  15 # representations about the suitability of this software for any purpose.  It
  16 # is provided "as is" without express or implied warranty.
  17 #
  18 # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  19 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
  20 # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
  21 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
  22 # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
  23 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  24 # PERFORMANCE OF THIS SOFTWARE.
  25
  26 # fc-lang
  27 #
  28 # Read a set of language orthographies and build C declarations for
  29 # charsets which can then be used to identify which languages are
  30 # supported by a given font.
  31 #
  32 # TODO: this code is not very pythonic, a lot of it is a 1:1 translation
  33 # of the C code and we could probably simplify it a bit
  34 import argparse
  35 import string
  36 import sys
  37 import os
  38
  39 # we just store the leaves in a dict, we can order the leaves later if needed
  40 class CharSet:
  41     def __init__(self):
  42         self.leaves = {} # leaf_number -> leaf data (= 16 uint32)
  43
  44     def add_char(self, ucs4):
  45         assert ucs4 < 0x01000000
  46         leaf_num = ucs4 >> 8
  47         if leaf_num in self.leaves:
  48             leaf = self.leaves[leaf_num]
  49         else:
  50             leaf = [0, 0, 0, 0, 0, 0, 0, 0] # 256/32 = 8
  51             self.leaves[leaf_num] = leaf
  52         leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f))
  53         #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))
  54
  55     def del_char(self, ucs4):
  56         assert ucs4 < 0x01000000
  57         leaf_num = ucs4 >> 8
  58         if leaf_num in self.leaves:
  59             leaf = self.leaves[leaf_num]
  60             leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))
  61             # We don't bother removing the leaf if it's empty */
  62             #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))
  63
  64     def equals(self, other_cs):
  65         keys = sorted(self.leaves.keys())
  66         other_keys = sorted(other_cs.leaves.keys())
  67         if len(keys) != len(other_keys):
  68             return False
  69         for k1, k2 in zip(keys, other_keys):
  70             if k1 != k2:
  71                 return False
  72             if not leaves_equal(self.leaves[k1], other_cs.leaves[k2]):
  73                 return False
  74         return True
  75
  76 # Convert a file name into a name suitable for C declarations
  77 def get_name(file_name):
  78     return file_name.split('.')[0]
  79
  80 # Convert a C name into a language name
  81 def get_lang(c_name):
  82     return c_name.replace('_', '-').replace(' ', '').lower()
  83
  84 def read_orth_file(file_name):
  85     lines = []
  86     with open(file_name, 'r', encoding='utf-8') as orth_file:
  87         for num, line in enumerate(orth_file):
  88             if line.startswith('include '):
  89                 include_fn = line[8:].strip()
  90                 lines += read_orth_file(include_fn)
  91             else:
  92                 # remove comments and strip whitespaces
  93                 line = line.split('#')[0].strip()
  94                 line = line.split('\t')[0].strip()
  95                 # skip empty lines
  96                 if line:
  97                     lines += [(file_name, num, line)]
  98
  99     return lines
 100
 101 def leaves_equal(leaf1, leaf2):
 102     for v1, v2 in zip(leaf1, leaf2):
 103         if v1 != v2:
 104             return False
 105     return True
 106
 107 # Build a single charset from a source file
 108 #
 109 # The file format is quite simple, either
 110 # a single hex value or a pair separated with a dash
 111 def parse_orth_file(file_name, lines):
 112     charset = CharSet()
 113     for fn, num, line in lines:
 114         delete_char = line.startswith('-')
 115         if delete_char:
 116             line = line[1:]
 117         if line.find('-') != -1:
 118             parts = line.split('-')
 119         elif line.find('..') != -1:
 120             parts = line.split('..')
 121         else:
 122             parts = [line]
 123
 124         start = int(parts.pop(0), 16)
 125         end = start
 126         if parts:
 127             end = int(parts.pop(0), 16)
 128         if parts:
 129             print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num))
 130
 131         for ucs4 in range(start, end+1):
 132             if delete_char:
 133                 charset.del_char(ucs4)
 134             else:
 135                 charset.add_char(ucs4)
 136
 137     assert charset.equals(charset) # sanity check for the equals function
 138
 139     return charset
 140
 141 if __name__=='__main__':
 142     parser = argparse.ArgumentParser()
 143     parser.add_argument('orth_files', nargs='+', help='List of .orth files')
 144     parser.add_argument('--directory', dest='directory', default=None)
 145     parser.add_argument('--template', dest='template_file', default=None)
 146     parser.add_argument('--output', dest='output_file', default=None)
 147
 148     args = parser.parse_args()
 149
 150     sets = []
 151     names = []
 152     langs = []
 153     country = []
 154
 155     total_leaves = 0
 156
 157     LangCountrySets = {}
 158
 159     # Open output file
 160     if args.output_file:
 161         sys.stdout = open(args.output_file, 'w', encoding='utf-8')
 162
 163     # Read the template file
 164     if args.template_file:
 165         tmpl_file = open(args.template_file, 'r', encoding='utf-8')
 166     else:
 167         tmpl_file = sys.stdin
 168
 169     # Change into source dir if specified (after opening other files)
 170     if args.directory:
 171         os.chdir(args.directory)
 172
 173     orth_entries = {}
 174     for i, fn in enumerate(args.orth_files):
 175         orth_entries[fn] = i
 176
 177     for fn in sorted(orth_entries.keys()):
 178         lines = read_orth_file(fn)
 179         charset = parse_orth_file(fn, lines)
 180
 181         sets.append(charset)
 182
 183         name = get_name(fn)
 184         names.append(name)
 185
 186         lang = get_lang(name)
 187         langs.append(lang)
 188         if lang.find('-') != -1:
 189             country.append(orth_entries[fn]) # maps to original index
 190             language_family = lang.split('-')[0]
 191             if not language_family in LangCountrySets:
 192               LangCountrySets[language_family] = []
 193             LangCountrySets[language_family] += [orth_entries[fn]]
 194
 195         total_leaves += len(charset.leaves)
 196
 197     # Find unique leaves
 198     leaves = []
 199     for s in sets:
 200        for leaf_num in sorted(s.leaves.keys()):
 201            leaf = s.leaves[leaf_num]
 202            is_unique = True
 203            for existing_leaf in leaves:
 204                if leaves_equal(leaf, existing_leaf):
 205                   is_unique = False
 206                   break
 207            #print('unique: ', is_unique)
 208            if is_unique:
 209                leaves.append(leaf)
 210
 211     # Find duplicate charsets
 212     duplicate = []
 213     for i, s in enumerate(sets):
 214         dup_num = None
 215         if i >= 1:
 216             for j, s_cmp in enumerate(sets):
 217                 if j >= i:
 218                     break
 219                 if s_cmp.equals(s):
 220                     dup_num = j
 221                     break
 222
 223         duplicate.append(dup_num)
 224
 225     tn = 0
 226     off = {}
 227     for i, s in enumerate(sets):
 228         if duplicate[i]:
 229             continue
 230         off[i] = tn
 231         tn += len(s.leaves)
 232
 233     # Scan the input until the marker is found
 234     # FIXME: this is a bit silly really, might just as well hardcode
 235     #        the license header in the script and drop the template
 236     for line in tmpl_file:
 237         if line.strip() == '@@@':
 238             break
 239         print(line, end='')
 240
 241     print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))
 242
 243     print('#define LEAF0       ({} * sizeof (FcLangCharSet))'.format(len(sets)))
 244     print('#define OFF0        (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
 245     print('#define NUM0        (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
 246     print('#define SET(n)      (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
 247     print('#define OFF(s,o)    (OFF0 + o * sizeof (uintptr_t) - SET(s))')
 248     print('#define NUM(s,n)    (NUM0 + n * sizeof (FcChar16) - SET(s))')
 249     print('#define LEAF(o,l)   (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
 250     print('#define fcLangCharSets (fcLangData.langCharSets)')
 251     print('#define fcLangCharSetIndices (fcLangData.langIndices)')
 252     print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')
 253
 254     assert len(sets) < 256 # FIXME: need to change index type to 16-bit below then
 255
 256     print('''
 257 static const struct {{
 258     FcLangCharSet  langCharSets[{}];
 259     FcCharLeaf     leaves[{}];
 260     uintptr_t      leaf_offsets[{}];
 261     FcChar16       numbers[{}];
 262     {}       langIndices[{}];
 263     {}       langIndicesInv[{}];
 264 }} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
 265                              'FcChar8 ', len(sets), 'FcChar8 ', len(sets)))
 266
 267     # Dump sets
 268     print('{')
 269     for i, s in enumerate(sets):
 270         if duplicate[i]:
 271             j = duplicate[i]
 272         else:
 273             j = i
 274         print('    {{ "{}",  {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
 275                 langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))
 276
 277     print('},')
 278
 279     # Dump leaves
 280     print('{')
 281     for l, leaf in enumerate(leaves):
 282         print('    {{ {{ /* {} */'.format(l), end='')
 283         for i in range(0, 8): # 256/32 = 8
 284             if i % 4 == 0:
 285                 print('\n   ', end='')
 286             print(' 0x{:08x},'.format(leaf[i]), end='')
 287         print('\n    } },')
 288     print('},')
 289
 290     # Dump leaves
 291     print('{')
 292     for i, s in enumerate(sets):
 293         if duplicate[i]:
 294             continue
 295
 296         print('    /* {} */'.format(names[i]))
 297
 298         for n, leaf_num in enumerate(sorted(s.leaves.keys())):
 299             leaf = s.leaves[leaf_num]
 300             if n % 4 == 0:
 301                 print('   ', end='')
 302             found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf,leaf)]
 303             assert found, "Couldn't find leaf in unique leaves list!"
 304             assert len(found) == 1
 305             print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
 306             if n % 4 == 3:
 307                 print('')
 308         if len(s.leaves) % 4 != 0:
 309             print('')
 310
 311     print('},')
 312
 313     print('{')
 314     for i, s in enumerate(sets):
 315         if duplicate[i]:
 316             continue
 317
 318         print('    /* {} */'.format(names[i]))
 319
 320         for n, leaf_num in enumerate(sorted(s.leaves.keys())):
 321             leaf = s.leaves[leaf_num]
 322             if n % 8 == 0:
 323                 print('   ', end='')
 324             print(' 0x{:04x},'.format(leaf_num), end='')
 325             if n % 8 == 7:
 326                 print('')
 327         if len(s.leaves) % 8 != 0:
 328             print('')
 329
 330     print('},')
 331
 332     # langIndices
 333     print('{')
 334     for i, s in enumerate(sets):
 335         fn = '{}.orth'.format(names[i])
 336         print('    {}, /* {} */'.format(orth_entries[fn], names[i]))
 337     print('},')
 338
 339     # langIndicesInv
 340     print('{')
 341     for i, k in enumerate(orth_entries.keys()):
 342         name = get_name(k)
 343         idx = names.index(name)
 344         print('    {}, /* {} */'.format(idx, name))
 345     print('}')
 346
 347     print('};\n')
 348
 349     print('#define NUM_LANG_CHAR_SET    {}'.format(len(sets)))
 350     num_lang_set_map = (len(sets) + 31) // 32;
 351     print('#define NUM_LANG_SET_MAP     {}'.format(num_lang_set_map))
 352
 353     # Dump indices with country codes
 354     assert len(country) > 0
 355     assert len(LangCountrySets) > 0
 356     print('')
 357     print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
 358     for k in sorted(LangCountrySets.keys()):
 359         langset_map = [0] * num_lang_set_map # initialise all zeros
 360         for entries_id in LangCountrySets[k]:
 361             langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
 362         print('    {', end='')
 363         for v in langset_map:
 364             print(' 0x{:08x},'.format(v), end='')
 365         print(' }}, /* {} */'.format(k))
 366
 367     print('};\n')
 368     print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))
 369
 370     # Find ranges for each letter for faster searching
 371     # Dump sets start/finish for the fastpath
 372     print('static const FcLangCharSetRange  fcLangCharSetRanges[] = {\n')
 373     for c in string.ascii_lowercase: # a-z
 374         start = 9999
 375         stop = -1
 376         for i, s in enumerate(sets):
 377             if names[i].startswith(c):
 378                 start = min(start,i)
 379                 stop = max(stop,i)
 380         print('    {{ {}, {} }}, /* {} */'.format(start, stop, c))
 381     print('};\n')
 382
 383     # And flush out the rest of the input file
 384     for line in tmpl_file:
 385         print(line, end='')
 386
 387     sys.stdout.flush()