- add sources.
[platform/framework/web/crosswalk.git] / src / third_party / gtk+ / gtk / compose-parse.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # compose-parse.py, version 1.3
5 #
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
8 #
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10 #
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13 from re                 import findall, match, split, sub
14 from string             import atoi
15 from unicodedata        import normalize
16 from urllib             import urlretrieve
17 from os.path            import isfile, getsize
18 from copy               import copy
19
20 import sys
21 import getopt
22
# We grab files off the web, left and right.
# download_file() keeps each download in the current directory, named after
# the last '/'-separated component of the URL, and reuses a non-empty copy
# instead of downloading again.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
# NOTE(review): pinned to Unicode 5.2.0; this URL is not referenced in the
# part of the script reviewed here - confirm where it is consumed.
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt'
# Optional local file; when it can be opened, its lines are appended to the
# lines read from the upstream Compose file before parsing.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# keysym name -> keysym value; replaced further down by the result of
# process_gdkkeysymsh().
keysymdatabase = {}
# keysym name -> Unicode code point; replaced further down by the result of
# process_keysymstxt().
keysymunicodedatabase = {}
# NOTE(review): not populated in the part of the script reviewed here.
unicodedatabase = {}
40
# Text of the C header preamble: LGPL notice, provenance notes, the include
# guard and the opening of the gtk_compose_seqs_compact[] array initialiser.
# The text is runtime data and is kept verbatim.
# NOTE(review): the code that writes it out is not in the part of the script
# reviewed here - presumably it is printed before the generated table.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * 
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Closes the array initialiser and the include guard opened by
# headerfile_start.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
115
def stringtohex(str):
        """ Return the integer value of the hexadecimal string `str`.

        An optional '0x' prefix is accepted. Raises ValueError when `str`
        is not a valid base-16 number.
        """
        # The parameter shadows the builtin str(); the name is kept because
        # it is part of the function's signature. int() replaces the
        # deprecated string.atoi(), which was a thin wrapper around it.
        return int(str, 16)
117
def factorial(n):
        """ Return n! (the product of the integers from 2 to n).

        Any n <= 1, including negative values, yields 1.
        """
        # Iterative so that large n does not hit the recursion limit.
        result = 1
        while n > 1:
                result *= n
                n -= 1
        return result
123
def uniq(*args):
        """ Merge the given lists, dropping repeated elements.

        The elements of all arguments are visited in order and each one is
        kept only the first time it is seen. Membership is tested with
        `in` on a list, so unhashable elements (e.g. lists) are fine.
        """
        distinct = []
        for chunk in args:
                for candidate in chunk:
                        if candidate in distinct:
                                continue
                        distinct.append(candidate)
        return distinct
134
135
136
def all_permutations(seq):
        """ Generate every permutation of the items of a list or string.

        Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178

        The permutations of seq[1:] are produced recursively and the first
        item is inserted at each possible position of each of them, left
        to right. A sequence of length 0 or 1 is yielded unchanged.
        """
        if len(seq) > 1:
                # seq[0:1] (rather than seq[0]) keeps the type of seq, so
                # this works for both strings and lists.
                head = seq[0:1]
                for tail_perm in all_permutations(seq[1:]):
                        for position in range(len(tail_perm) + 1):
                                yield tail_perm[:position] + head + tail_perm[position:]
        else:
                yield seq
147
148 def usage():
149         print """compose-parse available parameters:
150         -h, --help              this craft
151         -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
152         -a, --algorithmic       show sequences saved with algorithmic optimisation
153         -g, --gtk               show entries that go to GTK+
154         -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
155         -v, --verbose           show verbose output
156         -p, --plane1            show plane1 compose sequences
157         -n, --numeric           when used with --gtk, create file with numeric values only
158         -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+
159         --all-sequences         when used with --gtk, create file with entries rejected by default
160         Default is to show statistics.
161         """
162
# Parse the command line. Only option-parsing errors lead to the usage
# text; anything else (e.g. KeyboardInterrupt) is left to propagate.
try:
        opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
                "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"])
except getopt.GetoptError:
        usage()
        sys.exit(2)

# One flag per command-line switch; all off until requested.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False
opt_allsequences = False

for o, a in opts:
        if o in ("-h", "--help"):
                usage()
                sys.exit()
        # "--stats" is registered with getopt above, so it is honoured here
        # as a synonym of "--statistics" rather than silently ignored.
        elif o in ("-s", "--statistics", "--stats"):
                opt_statistics = True
        elif o in ("-a", "--algorithmic"):
                opt_algorithmic = True
        elif o in ("-g", "--gtk"):
                opt_gtk = True
        elif o in ("-u", "--unicodedatatxt"):
                opt_unicodedatatxt = True
        elif o in ("-v", "--verbose"):
                opt_verbose = True
        elif o in ("-p", "--plane1"):
                opt_plane1 = True
        elif o in ("-n", "--numeric"):
                opt_numeric = True
        elif o in ("-e", "--gtk-expanded"):
                opt_gtkexpanded = True
        elif o == "--all-sequences":
                opt_allsequences = True

# Statistics are the default when no other output mode was selected.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
        opt_statistics = True
205
206 def download_hook(blocks_transferred, block_size, file_size):
207         """ A download hook to provide some feedback when downloading """
208         if blocks_transferred == 0:
209                 if file_size > 0:
210                         if opt_verbose:
211                                 print "Downloading", file_size, "bytes: ",
212                 else:   
213                         if opt_verbose:
214                                 print "Downloading: ",
215         sys.stdout.write('#')
216         sys.stdout.flush()
217
218
219 def download_file(url):
220         """ Downloads a file provided a URL. Returns the filename. """
221         """ Borks on failure """
222         localfilename = url.split('/')[-1]
223         if not isfile(localfilename) or getsize(localfilename) <= 0:
224                 if opt_verbose:
225                         print "Downloading ", url, "..."
226                 try: 
227                         urlretrieve(url, localfilename, download_hook)
228                 except IOError, (errno, strerror):
229                         print "I/O error(%s): %s" % (errno, strerror)
230                         sys.exit(-1)
231                 except:
232                         print "Unexpected error: ", sys.exc_info()[0]
233                         sys.exit(-1)
234                 print " done."
235         else:
236                 if opt_verbose:
237                         print "Using cached file for ", url
238         return localfilename
239
240 def process_gdkkeysymsh():
241         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
242         """ Fills up keysymdb with contents """
243         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
244         try: 
245                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
246         except IOError, (errno, strerror):
247                 print "I/O error(%s): %s" % (errno, strerror)
248                 sys.exit(-1)
249         except:
250                 print "Unexpected error: ", sys.exc_info()[0]
251                 sys.exit(-1)
252
253         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
254         linenum_gdkkeysymsh = 0
255         keysymdb = {}
256         for line in gdkkeysymsh.readlines():
257                 linenum_gdkkeysymsh += 1
258                 line = line.strip()
259                 if line == "" or not match('^#define GDK_KEY_', line):
260                         continue
261                 components = split('\s+', line)
262                 if len(components) < 3:
263                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
264                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
265                         print "Was expecting 3 items in the line"
266                         sys.exit(-1)
267                 if not match('^GDK_KEY_', components[1]):
268                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
269                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
270                         print "Was expecting a keysym starting with GDK_KEY_"
271                         sys.exit(-1)
272                 if match('^0x[0-9a-fA-F]+$', components[2]):
273                         unival = long(components[2][2:], 16)
274                         if unival == 0:
275                                 continue
276                         keysymdb[components[1][8:]] = unival
277                 else:
278                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
279                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
280                         print "Was expecting a hexadecimal number at the end of the line"
281                         sys.exit(-1)
282         gdkkeysymsh.close()
283
284         """ Patch up the keysymdb with some of our own stuff """
285
286         """ This is for a missing keysym from the currently upstream file """
287         #keysymdb['dead_stroke'] = 0x338
288
289         """ This is for a missing keysym from the currently upstream file """
290         ###keysymdb['dead_belowring'] = 0x323
291         ###keysymdb['dead_belowmacron'] = 0x331
292         ###keysymdb['dead_belowcircumflex'] = 0x32d
293         ###keysymdb['dead_belowtilde'] = 0x330
294         ###keysymdb['dead_belowbreve'] = 0x32e
295         ###keysymdb['dead_belowdiaeresis'] = 0x324
296
297         """ This is^Wwas preferential treatment for Greek """
298         # keysymdb['dead_tilde'] = 0x342                
299         """ This is^was preferential treatment for Greek """
300         #keysymdb['combining_tilde'] = 0x342    
301
302         """ Fixing VoidSymbol """
303         keysymdb['VoidSymbol'] = 0xFFFF
304
305         return keysymdb
306
307 def process_keysymstxt():
308         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
309         """ This file keeps a record between keysyms <-> unicode chars """
310         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
311         try: 
312                 keysymstxt = open(filename_keysymstxt, 'r')
313         except IOError, (errno, strerror):
314                 print "I/O error(%s): %s" % (errno, strerror)
315                 sys.exit(-1)
316         except:
317                 print "Unexpected error: ", sys.exc_info()[0]
318                 sys.exit(-1)
319
320         """ Parse the keysyms.txt file and place content in  keysymdb """
321         linenum_keysymstxt = 0
322         keysymdb = {}
323         for line in keysymstxt.readlines():
324                 linenum_keysymstxt += 1
325                 line = line.strip()
326                 if line == "" or match('^#', line):
327                         continue
328                 components = split('\s+', line)
329                 if len(components) < 5:
330                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
331                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
332                         print "Was expecting 5 items in the line"
333                         sys.exit(-1)
334                 if match('^U[0-9a-fA-F]+$', components[1]):
335                         unival = long(components[1][1:], 16)
336                 if unival == 0:
337                         continue
338                 keysymdb[components[4]] = unival
339         keysymstxt.close()
340
341         """ Patch up the keysymdb with some of our own stuff """
342         """ This is for a missing keysym from the currently upstream file """
343         ###keysymdb['dead_belowring'] = 0x323
344         ###keysymdb['dead_belowmacron'] = 0x331
345         ###keysymdb['dead_belowcircumflex'] = 0x32d
346         ###keysymdb['dead_belowtilde'] = 0x330
347         ###keysymdb['dead_belowbreve'] = 0x32e
348         ###keysymdb['dead_belowdiaeresis'] = 0x324
349
350         """ This is preferential treatment for Greek """
351         """ => we get more savings if used for Greek """
352         # keysymdb['dead_tilde'] = 0x342                
353         """ This is preferential treatment for Greek """
354         # keysymdb['combining_tilde'] = 0x342   
355
356         """ This is for a missing keysym from Markus Kuhn's db """
357         keysymdb['dead_stroke'] = 0x338
358         """ This is for a missing keysym from Markus Kuhn's db """
359         keysymdb['Oslash'] = 0x0d8              
360         """ This is for a missing keysym from Markus Kuhn's db """
361         keysymdb['Ssharp'] = 0x1e9e
362
363         """ This is for a missing (recently added) keysym """
364         keysymdb['dead_psili'] = 0x313          
365         """ This is for a missing (recently added) keysym """
366         keysymdb['dead_dasia'] = 0x314          
367
368         """ Allows to import Multi_key sequences """
369         keysymdb['Multi_key'] = 0xff20
370
371         keysymdb['zerosubscript'] = 0x2080
372         keysymdb['onesubscript'] = 0x2081
373         keysymdb['twosubscript'] = 0x2082
374         keysymdb['threesubscript'] = 0x2083
375         keysymdb['foursubscript'] = 0x2084
376         keysymdb['fivesubscript'] = 0x2085
377         keysymdb['sixsubscript'] = 0x2086
378         keysymdb['sevensubscript'] = 0x2087
379         keysymdb['eightsubscript'] = 0x2088
380         keysymdb['ninesubscript'] = 0x2089
381         keysymdb['dead_doublegrave'] = 0x030F
382         keysymdb['dead_invertedbreve'] = 0x0311
383
384         return keysymdb
385
386 def keysymvalue(keysym, file = "n/a", linenum = 0):
387         """ Extracts a value from the keysym """
388         """ Find the value of keysym, using the data from keysyms """
389         """ Use file and linenum to when reporting errors """
390         if keysym == "":
391                 return 0
392         if keysymdatabase.has_key(keysym):
393                 return keysymdatabase[keysym]
394         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
395                 return atoi(keysym[1:], 16)
396         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
397                 return atoi(keysym[2:], 16)
398         else:
399                 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
400                 #return -1
401                 sys.exit(-1)
402
403 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
404         """ Extracts a value from the keysym """
405         """ Find the value of keysym, using the data from keysyms """
406         """ Use file and linenum to when reporting errors """
407         if keysym == "":
408                 return 0
409         if keysymunicodedatabase.has_key(keysym):
410                 return keysymunicodedatabase[keysym]
411         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
412                 return atoi(keysym[1:], 16)
413         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
414                 return atoi(keysym[2:], 16)
415         else:
416                 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
417                 sys.exit(-1)
418
def rename_combining(seq):
        """ Return a copy of `seq` with keysym names normalised.

        A leading 'combining_' is replaced by 'dead_', after which
        'dead_double_grave' and 'dead_inverted_breve' are respelled without
        their second underscore. All other names are passed through
        unchanged; `seq` itself is not modified.
        """
        respelled = {
                'dead_double_grave': 'dead_doublegrave',
                'dead_inverted_breve': 'dead_invertedbreve',
        }
        renamed = []
        for name in seq:
                # sub() returns the name untouched when the prefix is absent.
                name = sub('^combining_', 'dead_', name)
                renamed.append(respelled.get(name, name))
        return renamed
430
431
432 keysymunicodedatabase = process_keysymstxt()
433 keysymdatabase = process_gdkkeysymsh()
434
435 """ Grab and open the compose file from upstream """
436 filename_compose = download_file(URL_COMPOSE)
437 try: 
438         composefile = open(filename_compose, 'r')
439 except IOError, (errno, strerror):
440         print "I/O error(%s): %s" % (errno, strerror)
441         sys.exit(-1)
442 except:
443         print "Unexpected error: ", sys.exc_info()[0]
444         sys.exit(-1)
445
446 """ Look if there is a lookaside (supplementary) compose file in the current
447     directory, and if so, open, then merge with upstream Compose file.
448 """
449 xorg_compose_sequences_raw = []
450 for seq in composefile.readlines():
451         xorg_compose_sequences_raw.append(seq)
452
453 try:
454         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
455         for seq in composefile_lookaside.readlines():
456                 xorg_compose_sequences_raw.append(seq)
457 except IOError, (errno, strerror):
458         if opt_verbose:
459                 print "I/O error(%s): %s" % (errno, strerror)
460                 print "Did not find lookaside compose file. Continuing..."
461 except:
462         print "Unexpected error: ", sys.exc_info()[0]
463         sys.exit(-1)
464
465 """ Parse the compose file in  xorg_compose_sequences"""
466 xorg_compose_sequences = []
467 xorg_compose_sequences_algorithmic = []
468 linenum_compose = 0
469 comment_nest_depth = 0
470 for line in xorg_compose_sequences_raw:
471         linenum_compose += 1
472         line = line.strip()
473         if match("^XCOMM", line) or match("^#", line):
474                 continue
475
476         line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
477
478         comment_start = line.find("/*")
479
480         if comment_start >= 0:
481                 if comment_nest_depth == 0:
482                         line = line[:comment_start]
483                 else:
484                         line = ""
485
486                 comment_nest_depth += 1
487         else:
488                 comment_end = line.find("*/")
489
490                 if comment_end >= 0:
491                         comment_nest_depth -= 1
492
493                 if comment_nest_depth < 0:
494                         print "Invalid comment %(linenum_compose)d in %(filename)s: \
495                         Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
496                         exit(-1)
497
498                 if comment_nest_depth > 0:
499                         line = ""
500                 else:
501                         line = line[comment_end + 2:]
502
503         if line is "":
504                 continue
505
506         #line = line[:-1]
507         components = split(':', line)
508         if len(components) != 2:
509                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
510                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
511                 exit(-1)
512         (seq, val ) = split(':', line)
513         seq = seq.strip()
514         val = val.strip()
515         raw_sequence = findall('\w+', seq)
516         values = split('\s+', val)
517         unichar_temp = split('"', values[0])
518         unichar = unichar_temp[1]
519         if len(values) == 1:
520                 continue
521         codepointstr = values[1]
522         if values[1] == '#':
523                 # No codepoints that are >1 characters yet.
524                 continue
525         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
526                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
527         if  match('^U[0-9a-fA-F]+$', codepointstr):
528                 codepoint = long(codepointstr[1:], 16)
529         elif keysymunicodedatabase.has_key(codepointstr):
530                 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
531                         #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
532                         #print raw_sequence, codepointstr
533                 codepoint = keysymunicodedatabase[codepointstr]
534         else:
535                 print
536                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
537                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
538                 exit(-1)
539         sequence = rename_combining(raw_sequence)
540         reject_this = False
541         for i in sequence:
542                 if keysymvalue(i) > 0xFFFF:
543                         reject_this = True
544                         if opt_plane1:
545                                 print sequence
546                         break
547                 if keysymvalue(i) < 0:
548                         reject_this = True
549                         break
550         if reject_this:
551                 continue
552         if "U0342" in sequence or \
553                 "U0313" in sequence or \
554                 "U0314" in sequence or \
555                 "0x0313" in sequence or \
556                 "0x0342" in sequence or \
557                 "0x0314" in sequence:
558                 continue
559         if "dead_belowring" in sequence or\
560                 "dead_currency" in sequence or\
561                 "dead_belowcomma" in sequence or\
562                 "dead_belowmacron" in sequence or\
563                 "dead_belowtilde" in sequence or\
564                 "dead_belowbreve" in sequence or\
565                 "dead_belowdiaeresis" in sequence or\
566                 "dead_belowcircumflex" in sequence:
567                 continue
568         #for i in range(len(sequence)):
569         #       if sequence[i] == "0x0342":
570         #               sequence[i] = "dead_tilde"
571         if "Multi_key" not in sequence:
572                 """ Ignore for now >0xFFFF keysyms """
573                 if codepoint < 0xFFFF:
574                         original_sequence = copy(sequence)
575                         stats_sequence = copy(sequence)
576                         base = sequence.pop()
577                         basechar = keysymvalue(base, filename_compose, linenum_compose)
578                         
579                         if basechar < 0xFFFF:
580                                 counter = 1
581                                 unisequence = []
582                                 not_normalised = True
583                                 skipping_this = False
584                                 for i in range(0, len(sequence)):
585                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically 
586                                             because of lack of dead_perispomeni (i.e. conflict)
587                                         """
588                                         bc = basechar
589                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
590                                                 skipping_this = True
591                                                 break
592                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
593                                                 skipping_this = True
594                                                 break
595                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
596                                                 skipping_this = True
597                                                 break
598                                         if sequence[-1] == "dead_psili":
599                                                 sequence[i] = "dead_horn"
600                                         if sequence[-1] == "dead_dasia":
601                                                 sequence[-1] = "dead_ogonek"
602                                         """
603                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
604                                         
605                                 if skipping_this:
606                                         unisequence = []
607                                 for perm in all_permutations(unisequence):
608                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
609                                         # print counter, map(unichr, perm)
610                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
611                                         if len(normalized) == 1:
612                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
613                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
614                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
615                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
616                                                 stats_sequence_data.append(normalized)
617                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
618                                                 not_normalised = False
619                                                 break;
620                                         counter += 1
621                                 if not_normalised or opt_allsequences:
622                                         original_sequence.append(codepoint)
623                                         xorg_compose_sequences.append(original_sequence)
624                                         """ print xorg_compose_sequences[-1] """
625                                         
626                         else:
627                                 print "Error in base char !?!"
628                                 exit(-2)
629                 else:
630                         print "OVER", sequence
631                         exit(-1)
632         else:
633                 sequence.append(codepoint)
634                 xorg_compose_sequences.append(sequence)
635                 """ print xorg_compose_sequences[-1] """
636
def sequence_cmp(x, y):
        """ cmp-style ordering of two compose sequences by keysym value.

        Returns 1, -1 or 0. The sequences are ordered by the keysymvalue()
        of their first element, then by length, then by the keysymvalue()
        of the elements at index 1 to 4. After the element at index i has
        compared equal, the comparison ends with 0 when the length is
        below i + 3; elements past index 4 are never examined.
        """
        left, right = keysymvalue(x[0]), keysymvalue(y[0])
        if left > right:
                return 1
        if left < right:
                return -1

        if len(x) > len(y):
                return 1
        if len(x) < len(y):
                return -1

        # From here on both sequences have the same length.
        for index in range(1, 5):
                left, right = keysymvalue(x[index]), keysymvalue(y[index])
                if left > right:
                        return 1
                if left < right:
                        return -1
                if len(x) < index + 3:
                        return 0
        return 0
670
def sequence_unicode_cmp(x, y):
        """ cmp-style ordering of two compose sequences by Unicode value.

        Returns 1, -1 or 0. The sequences are ordered by the
        keysymunicodevalue() of their first element, then by length, then
        by the keysymunicodevalue() of the elements at index 1 to 4. After
        the element at index i has compared equal, the comparison ends
        with 0 when the length is below i + 3; elements past index 4 are
        never examined.
        """
        ahead = keysymunicodevalue(x[0])
        behind = keysymunicodevalue(y[0])
        if ahead > behind:
                return 1
        if ahead < behind:
                return -1

        size_x = len(x)
        if size_x > len(y):
                return 1
        if size_x < len(y):
                return -1

        # From here on both sequences have the same length.
        position = 1
        while position <= 4:
                ahead = keysymunicodevalue(x[position])
                behind = keysymunicodevalue(y[position])
                if ahead > behind:
                        return 1
                if ahead < behind:
                        return -1
                if size_x < position + 3:
                        return 0
                position += 1
        return 0
704
def sequence_algorithmic_cmp(x, y):
        """Old-style cmp() comparator: shorter sequences first, then elementwise.

        Sequences of different length are ordered by length alone.  Sequences
        of equal length are ordered by their first differing element.

        Returns -1, 1 or 0.
        """
        if len(x) != len(y):
                if len(x) < len(y):
                        return -1
                return 1
        # Equal lengths, so zip() pairs up every element.
        for left, right in zip(x, y):
                if left < right:
                        return -1
                if left > right:
                        return 1
        return 0
717
718
# Sort by keysym value so that sequences sequence_unicode_cmp() regards as
# equal end up next to each other.
xorg_compose_sequences.sort(sequence_cmp)

# Collapse each run of equal sequences into one entry, keeping the last
# sequence of the run.
xorg_compose_sequences_uniqued = []
item = None
for next_item in xorg_compose_sequences:
        if item is not None and sequence_unicode_cmp(item, next_item) != 0:
                xorg_compose_sequences_uniqued.append(item)
        item = next_item
# An entry is only emitted when its successor differs, so the final run has
# no successor to trigger it: flush it here, otherwise the last sequence of
# the sorted list is silently dropped from the table.
if item is not None:
        xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
733
# Number of table sequences in which 'Multi_key' occurs among the elements
# before the trailing one.
counter_multikey = sum(1 for item in xorg_compose_sequences
                       if findall('Multi_key', "".join(item[:-1])))

# Sort the algorithmic sequences (by length, then elementwise) and remove
# duplicates with uniq().
xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
741
# Tallies over the table sequences; the -gtk output and the statistics
# report read them later.
firstitem = ""
num_first_keysyms = 0
zeroes = 0
for sequence in xorg_compose_sequences:
        # A change in the keysym value of the first element starts a new group.
        if keysymvalue(firstitem) != keysymvalue(sequence[0]):
                firstitem = sequence[0]
                num_first_keysyms += 1
        # Same amount as 6 - len(sequence) + 1.
        zeroes += 7 - len(sequence)
num_entries = len(xorg_compose_sequences)

# Algorithmic sequences whose last element is a character in
# U+0370..U+03FF or U+1F00..U+1FFF.
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences_algorithmic_uniqued:
        ch = ord(sequence[-1])
        if 0x370 <= ch <= 0x3ff or 0x1f00 <= ch <= 0x1fff:
                num_algorithmic_greek += 1
758                 
759
760 if opt_algorithmic:
761         for sequence in xorg_compose_sequences_algorithmic_uniqued:
762                 letter = "".join(sequence[-1:])
763                 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
764                 for elem in sequence[:-2]:
765                         print "<0x%(keysym)04X>," % { 'keysym': elem },
766                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
767                 print "], recomposed as", letter.encode('utf-8'), "verified"
768
def num_of_keysyms(seq):
        """Return the number of elements in seq, not counting the last one."""
        total_items = len(seq)
        return total_items - 1
771
def convert_UnotationToHex(arg):
        """Turn a 'UXXXX' string into '0xXXXX'; return anything else untouched.

        Only str values made of 'U' followed by exactly four characters from
        0-9 and A-F are rewritten.  Non-str values and all other strings are
        returned as they are.
        """
        if not isinstance(arg, str):
                return arg
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg) is None:
                return arg
        return sub('^U', '0x', arg)
777
def addprefix_GDK(arg):
        """Format one keysym as a table cell followed by ', '.

        Strings starting with '0x' are emitted as they are; every other
        string gets the 'GDK_KEY_' prefix.
        """
        if match('^0x', arg) is None:
                return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
        return '%(arg)s, ' % { 'arg': arg }
783
784 if opt_gtk:
785         first_keysym = ""
786         sequence = []
787         compose_table = []
788         ct_second_part = []
789         ct_sequence_width = 2
790         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
791         we_finished = False
792         counter = 0
793
794         sequence_iterator = iter(xorg_compose_sequences)
795         sequence = sequence_iterator.next()
796         while True:
797                 first_keysym = sequence[0]                                      # Set the first keysym
798                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
799                 while sequence[0] == first_keysym:
800                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
801                         try:
802                                 sequence = sequence_iterator.next()
803                         except StopIteration:
804                                 we_finished = True
805                                 break
806                 if we_finished:
807                         break
808                 counter += 1
809
810         ct_index = start_offset
811         for line_num in range(len(compose_table)):
812                 for i in range(WIDTHOFCOMPOSETABLE):
813                         occurences = compose_table[line_num][i+1]
814                         compose_table[line_num][i+1] = ct_index
815                         ct_index += occurences * (i+2)
816
817         for sequence in xorg_compose_sequences:
818                 ct_second_part.append(map(convert_UnotationToHex, sequence))
819
820         print headerfile_start
821         for i in compose_table:
822                 if opt_gtkexpanded:
823                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
824                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
825                 elif not match('^0x', i[0]):
826                         print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
827                 else:
828                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
829         for i in ct_second_part:
830                 if opt_numeric:
831                         for ks in i[1:][:-1]:
832                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
833                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
834                         """
835                         for ks in i[:-1]:
836                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
837                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
838                         """
839                 elif opt_gtkexpanded:
840                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
841                 else:
842                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
843         print headerfile_end 
844
def redecompose(codepoint):
        """Recursively expand a codepoint using the entries in unicodedatabase.

        Returns [codepoint] when the first decomposition field of its entry
        is '' or '0'.  Otherwise returns a list holding one redecompose()
        result (itself a list) per decomposition member, so the result can be
        nested.  When the first field looks like a <tag>, it is left out and
        only the members after it are expanded.

        A codepoint missing from unicodedatabase raises KeyError.
        """
        (_name, decomposition, _combiningclass) = unicodedatabase[codepoint]
        marker = decomposition[0]
        if marker in ('', '0'):
                return [codepoint]
        if match('<\w+>', marker):
                members = decomposition[1:]
        else:
                members = decomposition
        numdecomposition = [stringtohex(member) for member in members]
        return [redecompose(number) for number in numdecomposition]
854
855 def process_unicodedata_file(verbose = False):
856         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
857         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
858         try: 
859                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
860         except IOError, (errno, strerror):
861                 print "I/O error(%s): %s" % (errno, strerror)
862                 sys.exit(-1)
863         except:
864                 print "Unexpected error: ", sys.exc_info()[0]
865                 sys.exit(-1)
866         for line in unicodedatatxt.readlines():
867                 if line[0] == "" or line[0] == '#':
868                         continue
869                 line = line[:-1]
870                 uniproperties = split(';', line)
871                 codepoint = stringtohex(uniproperties[0])
872                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
873                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
874                         continue
875                 name = uniproperties[1]
876                 category = uniproperties[2]
877                 combiningclass = uniproperties[3]
878                 decomposition = uniproperties[5]
879                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
880         
881         counter_combinations = 0
882         counter_combinations_greek = 0
883         counter_entries = 0
884         counter_entries_greek = 0
885
886         for item in unicodedatabase.keys():
887                 (name, decomposition, combiningclass) = unicodedatabase[item]
888                 if decomposition[0] == '':
889                         continue
890                         print name, "is empty"
891                 elif match('<\w+>', decomposition[0]):
892                         continue
893                         print name, "has weird", decomposition[0]
894                 else:
895                         sequence = map(stringtohex, decomposition)
896                         chrsequence = map(unichr, sequence)
897                         normalized = normalize('NFC', "".join(chrsequence))
898                         
899                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
900                         decomposedsequence = []
901                         for subseq in map(redecompose, sequence):
902                                 for seqitem in subseq:
903                                         if isinstance(seqitem, list):
904                                                 for i in seqitem:
905                                                         if isinstance(i, list):
906                                                                 for j in i:
907                                                                         decomposedsequence.append(j)
908                                                         else:
909                                                                 decomposedsequence.append(i)
910                                         else:
911                                                 decomposedsequence.append(seqitem)
912                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
913                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
914                                 counter_entries += 1
915                                 counter_combinations += factorial(len(decomposedsequence)-1)
916                                 ch = item
917                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
918                                         counter_entries_greek += 1
919                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
920                                 if verbose:
921                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
922                                         print "[",
923                                         for elem in decomposedsequence:
924                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
925                                         print "], recomposed as", recomposedchar,
926                                         if unichr(item) == recomposedchar:
927                                                 print "verified"
928         
929         if verbose == False:
930                 print "Unicode statistics from UnicodeData.txt"
931                 print "Number of entries that can be algorithmically produced     :", counter_entries
932                 print "  of which are for Greek                                   :", counter_entries_greek
933                 print "Number of compose sequence combinations requiring          :", counter_combinations
934                 print "  of which are for Greek                                   :", counter_combinations_greek
935                 print "Note: We do not include partial compositions, "
936                 print "thus the slight discrepancy in the figures"
937                 print
938
# When opt_unicodedatatxt is set, print the per-character listing rather
# than the summary.
if opt_unicodedatatxt:
        process_unicodedata_file(verbose = True)
941
942 if opt_statistics:
943         print
944         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
945         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
946         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences) 
947         print "    of which have Multi_key                                :", counter_multikey
948         print 
949         print "Algorithmic (stats for Xorg Compose file)"
950         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
951         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
952         print "  of which are for Greek                                   :", num_algorithmic_greek
953         print 
954         process_unicodedata_file()
955         print "Not algorithmic (stats from Xorg Compose file)"
956         print "Number of sequences                                        :", len(xorg_compose_sequences) 
957         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
958         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
959         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
960         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
961         print "Number of different first items                            :", num_first_keysyms
962         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
963         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
964         print 
965         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
966         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
967         print
968         print "Existing (old) implementation in GTK+"
969         print "Number of sequences in old gtkimcontextsimple.c            :", 691
970         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"