Fix RPMLINT error
[platform/core/uifw/ise-engine-sunpinyin.git] / python / pinyin_data.py
1 #!/usr/bin/python 
2
3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
5 # Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
7 # The contents of this file are subject to the terms of either the GNU Lesser
8 # General Public License Version 2.1 only ("LGPL") or the Common Development and
9 # Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 # file except in compliance with the License. You can obtain a copy of the CDDL at
11 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the 
13 # specific language governing permissions and limitations under the License. When
14 # distributing the software, include this License Header Notice in each file and
15 # include the full text of the License in the License file as well as the
16 # following notice:
17
18 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 # (CDDL)
20 # For Covered Software in this distribution, this License shall be governed by the
21 # laws of the State of California (excluding conflict-of-law provisions).
22 # Any litigation relating to this License shall be subject to the jurisdiction of
23 # the Federal Courts of the Northern District of California and the state courts
24 # of the State of California, with venue lying in Santa Clara County, California.
25
26 # Contributor(s):
27
28 # If you wish your version of this file to be governed by only the CDDL or only
29 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 # include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 # license." If you don't indicate a single choice of license, a recipient has the
32 # option to distribute your version of this file under either the CDDL or the LGPL
33 # Version 2.1, or to extend the choice of license to its licensees as provided
34 # above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 # Version 2 license, then the option applies only if the new code is made subject
36 # to such option by the copyright holder. 
37
# Pinyin initials.  A syllable code stores an index into this list in bits 12
# and above (see decode_syllable), so the ORDER of this list is part of the
# encoding.  Index 0, the empty string, means "no initial".
initials = [
    "",
    "b", "p", "m", "f",
    "d", "t", "n", "l",
    "g", "k", "h",
    "j", "q", "x",
    "zh", "ch", "sh", "r",
    "z", "c", "s",
    "y", "w",
]

# Pinyin finals.  A syllable code stores an index into this list in bits 4-11
# (see decode_syllable), so the order is significant here as well.  Index 0,
# the empty string, means "no final" (a bare initial).
finals = [
    "",
    "a", "o", "e", "ai", "ei", "ao", "ou", "an", "en", "ang", "eng", "er",
    "i", "ia", "ie", "iao", "iu", "ian", "in", "iang", "ing",
    "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ong",
    "v", "ue", "iong",
]

# Finals that gen_inner_fuzzy_syllable_tables() maps onto the syllable formed
# by dropping their first letter (e.g. "ian" -> "an").
inner_fuzzy_finals = [
    "ia", "iao", "ian", "iang", "ie",
    "ua", "uai", "uan", "uang", "ue",
]

# Pairs of spellings treated as interchangeable.  init_fuzzy_map() turns this
# list into a symmetric lookup; the order of the pairs determines the order of
# the alternatives in that lookup, so it must not be shuffled.
fuzzy_pairs = [
    ("z", "zh"),
    ("c", "ch"),
    ("s", "sh"),
    ("an", "ang"),
    ("on", "ong"),
    ("en", "eng"),
    ("in", "ing"),
    ("eng", "ong"),
    ("ian", "iang"),
    ("uan", "uang"),
    ("l", "n"),
    ("f", "h"),
    ("r", "l"),
    ("k", "g"),
]

# Letter sequence -> replacement spelling.  Not referenced anywhere else in
# this file; presumably consumed by other tooling -- TODO confirm.
auto_correction_pairs = {
    "ign": "ing",
    "img": "ing",
    "uei": "ui",
    "uen": "un",
    "iou": "iu",
}
68
# Every accepted pinyin spelling (including bare initials such as "zh") mapped
# to its numeric code.  Layout of a code, as read back by decode_syllable():
#
#     bits 12 and up : index into `initials`
#     bits 4 .. 11   : index into `finals`
#     bits 0 .. 3    : zero in every entry of this table
#
# Entries are kept in alphabetical order of the spelling.
valid_syllables = {
    # a
    "a": 0x00010, "ai": 0x00040, "an": 0x00080, "ang": 0x000a0,
    "ao": 0x00060,

    # b
    "b": 0x01000, "ba": 0x01010, "bai": 0x01040, "ban": 0x01080,
    "bang": 0x010a0, "bao": 0x01060, "bei": 0x01050, "ben": 0x01090,
    "beng": 0x010b0, "bi": 0x010d0, "bian": 0x01120, "biao": 0x01100,
    "bie": 0x010f0, "bin": 0x01130, "bing": 0x01150, "bo": 0x01020,
    "bu": 0x01160,

    # c / ch
    "c": 0x14000, "ca": 0x14010, "cai": 0x14040, "can": 0x14080,
    "cang": 0x140a0, "cao": 0x14060, "ce": 0x14030, "cei": 0x14050,
    "cen": 0x14090, "ceng": 0x140b0,
    "ch": 0x10000, "cha": 0x10010, "chai": 0x10040, "chan": 0x10080,
    "chang": 0x100a0, "chao": 0x10060, "che": 0x10030, "chen": 0x10090,
    "cheng": 0x100b0, "chi": 0x100d0, "chong": 0x101e0, "chou": 0x10070,
    "chu": 0x10160, "chua": 0x10170, "chuai": 0x10190, "chuan": 0x101b0,
    "chuang": 0x101d0, "chui": 0x101a0, "chun": 0x101c0, "chuo": 0x10180,
    "ci": 0x140d0, "cong": 0x141e0, "cou": 0x14070, "cu": 0x14160,
    "cuan": 0x141b0, "cui": 0x141a0, "cun": 0x141c0, "cuo": 0x14180,

    # d
    "d": 0x05000, "da": 0x05010, "dai": 0x05040, "dan": 0x05080,
    "dang": 0x050a0, "dao": 0x05060, "de": 0x05030, "dei": 0x05050,
    "den": 0x05090, "deng": 0x050b0, "di": 0x050d0, "dia": 0x050e0,
    "dian": 0x05120, "diao": 0x05100, "die": 0x050f0, "ding": 0x05150,
    "diu": 0x05110, "dong": 0x051e0, "dou": 0x05070, "du": 0x05160,
    "duan": 0x051b0, "dui": 0x051a0, "dun": 0x051c0, "duo": 0x05180,

    # e
    "e": 0x00030, "ei": 0x00050, "en": 0x00090, "eng": 0x000b0,
    "er": 0x000c0,

    # f
    "f": 0x04000, "fa": 0x04010, "fan": 0x04080, "fang": 0x040a0,
    "fei": 0x04050, "fen": 0x04090, "feng": 0x040b0, "fiao": 0x04100,
    "fo": 0x04020, "fou": 0x04070, "fu": 0x04160,

    # g
    "g": 0x09000, "ga": 0x09010, "gai": 0x09040, "gan": 0x09080,
    "gang": 0x090a0, "gao": 0x09060, "ge": 0x09030, "gei": 0x09050,
    "gen": 0x09090, "geng": 0x090b0, "gong": 0x091e0, "gou": 0x09070,
    "gu": 0x09160, "gua": 0x09170, "guai": 0x09190, "guan": 0x091b0,
    "guang": 0x091d0, "gui": 0x091a0, "gun": 0x091c0, "guo": 0x09180,

    # h
    "h": 0x0b000, "ha": 0x0b010, "hai": 0x0b040, "han": 0x0b080,
    "hang": 0x0b0a0, "hao": 0x0b060, "he": 0x0b030, "hei": 0x0b050,
    "hen": 0x0b090, "heng": 0x0b0b0, "hong": 0x0b1e0, "hou": 0x0b070,
    "hu": 0x0b160, "hua": 0x0b170, "huai": 0x0b190, "huan": 0x0b1b0,
    "huang": 0x0b1d0, "hui": 0x0b1a0, "hun": 0x0b1c0, "huo": 0x0b180,

    # j
    "j": 0x0c000, "ji": 0x0c0d0, "jia": 0x0c0e0, "jian": 0x0c120,
    "jiang": 0x0c140, "jiao": 0x0c100, "jie": 0x0c0f0, "jin": 0x0c130,
    "jing": 0x0c150, "jiong": 0x0c210, "jiu": 0x0c110, "ju": 0x0c160,
    "juan": 0x0c1b0, "jue": 0x0c200, "jun": 0x0c1c0,

    # k
    "k": 0x0a000, "ka": 0x0a010, "kai": 0x0a040, "kan": 0x0a080,
    "kang": 0x0a0a0, "kao": 0x0a060, "ke": 0x0a030, "kei": 0x0a050,
    "ken": 0x0a090, "keng": 0x0a0b0, "kong": 0x0a1e0, "kou": 0x0a070,
    "ku": 0x0a160, "kua": 0x0a170, "kuai": 0x0a190, "kuan": 0x0a1b0,
    "kuang": 0x0a1d0, "kui": 0x0a1a0, "kun": 0x0a1c0, "kuo": 0x0a180,

    # l
    "l": 0x08000, "la": 0x08010, "lai": 0x08040, "lan": 0x08080,
    "lang": 0x080a0, "lao": 0x08060, "le": 0x08030, "lei": 0x08050,
    "leng": 0x080b0, "li": 0x080d0, "lia": 0x080e0, "lian": 0x08120,
    "liang": 0x08140, "liao": 0x08100, "lie": 0x080f0, "lin": 0x08130,
    "ling": 0x08150, "liu": 0x08110, "lo": 0x08020, "long": 0x081e0,
    "lou": 0x08070, "lu": 0x08160, "luan": 0x081b0, "lue": 0x08200,
    "lun": 0x081c0, "luo": 0x08180, "lv": 0x081f0,

    # m
    "m": 0x03000, "ma": 0x03010, "mai": 0x03040, "man": 0x03080,
    "mang": 0x030a0, "mao": 0x03060, "me": 0x03030, "mei": 0x03050,
    "men": 0x03090, "meng": 0x030b0, "mi": 0x030d0, "mian": 0x03120,
    "miao": 0x03100, "mie": 0x030f0, "min": 0x03130, "ming": 0x03150,
    "miu": 0x03110, "mo": 0x03020, "mou": 0x03070, "mu": 0x03160,

    # n
    "n": 0x07000, "na": 0x07010, "nai": 0x07040, "nan": 0x07080,
    "nang": 0x070a0, "nao": 0x07060, "ne": 0x07030, "nei": 0x07050,
    "nen": 0x07090, "neng": 0x070b0, "ni": 0x070d0, "nian": 0x07120,
    "niang": 0x07140, "niao": 0x07100, "nie": 0x070f0, "nin": 0x07130,
    "ning": 0x07150, "niu": 0x07110, "nong": 0x071e0, "nou": 0x07070,
    "nu": 0x07160, "nuan": 0x071b0, "nue": 0x07200, "nun": 0x071c0,
    "nuo": 0x07180, "nv": 0x071f0,

    # o
    "o": 0x00020, "ou": 0x00070,

    # p
    "p": 0x02000, "pa": 0x02010, "pai": 0x02040, "pan": 0x02080,
    "pang": 0x020a0, "pao": 0x02060, "pei": 0x02050, "pen": 0x02090,
    "peng": 0x020b0, "pi": 0x020d0, "pian": 0x02120, "piao": 0x02100,
    "pie": 0x020f0, "pin": 0x02130, "ping": 0x02150, "po": 0x02020,
    "pou": 0x02070, "pu": 0x02160,

    # q
    "q": 0x0d000, "qi": 0x0d0d0, "qia": 0x0d0e0, "qian": 0x0d120,
    "qiang": 0x0d140, "qiao": 0x0d100, "qie": 0x0d0f0, "qin": 0x0d130,
    "qing": 0x0d150, "qiong": 0x0d210, "qiu": 0x0d110, "qu": 0x0d160,
    "quan": 0x0d1b0, "que": 0x0d200, "qun": 0x0d1c0,

    # r
    "r": 0x12000, "ran": 0x12080, "rang": 0x120a0, "rao": 0x12060,
    "re": 0x12030, "ren": 0x12090, "reng": 0x120b0, "ri": 0x120d0,
    "rong": 0x121e0, "rou": 0x12070, "ru": 0x12160, "ruan": 0x121b0,
    "rui": 0x121a0, "run": 0x121c0, "ruo": 0x12180,

    # s / sh
    "s": 0x15000, "sa": 0x15010, "sai": 0x15040, "san": 0x15080,
    "sang": 0x150a0, "sao": 0x15060, "se": 0x15030, "sen": 0x15090,
    "seng": 0x150b0,
    "sh": 0x11000, "sha": 0x11010, "shai": 0x11040, "shan": 0x11080,
    "shang": 0x110a0, "shao": 0x11060, "she": 0x11030, "shei": 0x11050,
    "shen": 0x11090, "sheng": 0x110b0, "shi": 0x110d0, "shou": 0x11070,
    "shu": 0x11160, "shua": 0x11170, "shuai": 0x11190, "shuan": 0x111b0,
    "shuang": 0x111d0, "shui": 0x111a0, "shun": 0x111c0, "shuo": 0x11180,
    "si": 0x150d0, "song": 0x151e0, "sou": 0x15070, "su": 0x15160,
    "suan": 0x151b0, "sui": 0x151a0, "sun": 0x151c0, "suo": 0x15180,

    # t
    "t": 0x06000, "ta": 0x06010, "tai": 0x06040, "tan": 0x06080,
    "tang": 0x060a0, "tao": 0x06060, "te": 0x06030, "tei": 0x06050,
    "teng": 0x060b0, "ti": 0x060d0, "tian": 0x06120, "tiao": 0x06100,
    "tie": 0x060f0, "ting": 0x06150, "tong": 0x061e0, "tou": 0x06070,
    "tu": 0x06160, "tuan": 0x061b0, "tui": 0x061a0, "tun": 0x061c0,
    "tuo": 0x06180,

    # w
    "w": 0x17000, "wa": 0x17010, "wai": 0x17040, "wan": 0x17080,
    "wang": 0x170a0, "wei": 0x17050, "wen": 0x17090, "weng": 0x170b0,
    "wo": 0x17020, "wu": 0x17160,

    # x
    "x": 0x0e000, "xi": 0x0e0d0, "xia": 0x0e0e0, "xian": 0x0e120,
    "xiang": 0x0e140, "xiao": 0x0e100, "xie": 0x0e0f0, "xin": 0x0e130,
    "xing": 0x0e150, "xiong": 0x0e210, "xiu": 0x0e110, "xu": 0x0e160,
    "xuan": 0x0e1b0, "xue": 0x0e200, "xun": 0x0e1c0,

    # y
    "y": 0x16000, "ya": 0x16010, "yan": 0x16080, "yang": 0x160a0,
    "yao": 0x16060, "ye": 0x16030, "yi": 0x160d0, "yin": 0x16130,
    "ying": 0x16150, "yo": 0x16020, "yong": 0x161e0, "you": 0x16070,
    "yu": 0x16160, "yuan": 0x161b0, "yue": 0x16200, "yun": 0x161c0,

    # z / zh
    "z": 0x13000, "za": 0x13010, "zai": 0x13040, "zan": 0x13080,
    "zang": 0x130a0, "zao": 0x13060, "ze": 0x13030, "zei": 0x13050,
    "zen": 0x13090, "zeng": 0x130b0,
    "zh": 0x0f000, "zha": 0x0f010, "zhai": 0x0f040, "zhan": 0x0f080,
    "zhang": 0x0f0a0, "zhao": 0x0f060, "zhe": 0x0f030, "zhei": 0x0f050,
    "zhen": 0x0f090, "zheng": 0x0f0b0, "zhi": 0x0f0d0, "zhong": 0x0f1e0,
    "zhou": 0x0f070, "zhu": 0x0f160, "zhua": 0x0f170, "zhuai": 0x0f190,
    "zhuan": 0x0f1b0, "zhuang": 0x0f1d0, "zhui": 0x0f1a0, "zhun": 0x0f1c0,
    "zhuo": 0x0f180,
    "zi": 0x130d0, "zong": 0x131e0, "zou": 0x13070, "zu": 0x13160,
    "zuan": 0x131b0, "zui": 0x131a0, "zun": 0x131c0, "zuo": 0x13180,
}
507
def decode_syllable (s):
    """Split a numeric syllable code into its (initial, final) spellings.

    Bits 12 and above of `s` select an entry of `initials`; bits 4-11 select
    an entry of `finals`.  The low four bits are ignored.

    Returns a 2-tuple of strings.  Raises IndexError when either index falls
    outside its table.
    """
    initial_index = s >> 12
    final_index = (s & 0x00ff0) >> 4
    return initials[initial_index], finals[final_index]
510
def init_fuzzy_map (fuzzy_pairs):
    """Build a symmetric lookup table from a sequence of 2-item pairs.

    For every pair (a, b) the result maps a -> [..., b] and b -> [..., a].
    Alternatives are appended in the order the pairs are given, so a spelling
    that appears in several pairs collects all of its partners.

    Returns a new dict; `fuzzy_pairs` itself is not modified.
    """
    table = {}
    for left, right in fuzzy_pairs:
        # Record the relation in both directions, left-hand key first.
        for key, partner in ((left, right), (right, left)):
            if key not in table:
                table[key] = []
            table[key].append (partner)
    return table
518
# Symmetric lookup built once at import time: each spelling that takes part in
# a fuzzy pair maps to the list of spellings it may be confused with.
fuzzy_map = init_fuzzy_map (fuzzy_pairs)

def get_fuzzy_syllables (syllable):
    """Return the codes of all valid syllables that are fuzzy-equivalent to
    `syllable`.

    The code is split into initial and final, each part is expanded with its
    alternatives from `fuzzy_map`, and every initial+final combination that
    is a key of `valid_syllables` is collected.  `syllable` itself is removed
    from the result.

    syllable -- numeric syllable code, as stored in `valid_syllables`.

    Returns a list of numeric codes (possibly empty).
    Raises ValueError if `syllable` is not among the generated candidates
    (i.e. its own spelling is not a key of `valid_syllables` with that code),
    and IndexError if decode_syllable() cannot decode it.
    """
    initial, final = decode_syllable (syllable)

    # Use .get() rather than .setdefault(): a lookup must not insert empty
    # entries into the shared module-level map as a side effect.
    initial_choices = fuzzy_map.get (initial, []) + [initial]
    final_choices = fuzzy_map.get (final, []) + [final]

    candidates = [valid_syllables[i + f]
                  for i in initial_choices
                  for f in final_choices
                  if i + f in valid_syllables]

    # The unmodified initial+final pair reproduces the input; drop it.
    candidates.remove (syllable)
    return candidates
528
def gen_suffix_trie (fname):
    """Generate the static C arrays for the reversed-syllable trie.

    Every key of `valid_syllables` is inserted, spelled backwards, into a
    Trie with its numeric code as the value.  The Trie is then converted to a
    DATrie, which is asked to write its static C arrays to `fname`.

    fname -- path handed to DATrie.output_static_c_arrays().
    """
    # Imported here so the data tables above stay importable without trie.py.
    from trie import Trie, DATrie

    plain_trie = Trie ()
    da_trie = DATrie ()

    for spelling, code in valid_syllables.items ():
        reversed_spelling = spelling[::-1]
        plain_trie.add (reversed_spelling, code)

    da_trie.construct_from_trie (plain_trie)
    da_trie.output_static_c_arrays (fname)
540
def gen_fuzzy_syllable_pairs_tables ():
    """Print two C arrays describing ambiguous syllable boundaries to stdout.

    * "pro" syllables: the first letter is an initial and the remainder
      (s[1:]) is itself a valid syllable.
    * "pre" syllables: the last letter is an initial and the remainder
      (s[:-1]) is itself a valid syllable.

    Bare initials are excluded from both lists, and only syllables whose
    boundary letter occurs in BOTH lists are kept.

    Each emitted row holds two syllable codes and the boundary letter; both
    arrays are terminated by a 0x0 entry.  Takes no arguments, returns None.
    """
    fuzzy_pro_syllables = [s for s in valid_syllables
                           if s[1:] in valid_syllables and s[0] in initials and s not in initials]
    fuzzy_pre_syllables = [s for s in valid_syllables
                           if s[:-1] in valid_syllables and s[-1] in initials and s not in initials]

    # Boundary letters that can both start a "pro" and end a "pre" syllable.
    initial_sets = set([s[0] for s in fuzzy_pro_syllables]) & set([s[-1] for s in fuzzy_pre_syllables])

    fuzzy_pro_syllables = [s for s in fuzzy_pro_syllables if s[0] in initial_sets]
    fuzzy_pre_syllables = [s for s in fuzzy_pre_syllables if s[-1] in initial_sets]

    row = "    %-12s %-12s %-12s /* %s */"

    # NOTE: print is always called with ONE parenthesised argument so that the
    # statement means the same thing under Python 2 and Python 3.
    print ("static const unsigned fuzzy_pre_syllables [] = {")
    for s in fuzzy_pre_syllables:
        print (row % ("0x%05x," % valid_syllables[s[:-1]], "'%s'," % s[-1], "0x%05x," % valid_syllables[s], s))
    print ("    0x0,")
    print ("};\n")

    print ("static const unsigned fuzzy_pro_syllables [] = {")
    for s in fuzzy_pro_syllables:
        print (row % ("0x%05x," % valid_syllables[s], "'%s'," % s[0], "0x%05x," % valid_syllables[s[1:]], s))
    print ("    0x0,")
    print ("};\n")
561
def gen_inner_fuzzy_syllable_tables ():
    """Print the C array `fuzzy_finals_map` to stdout.

    One row is emitted per entry of `inner_fuzzy_finals`, containing:
      1. the index of that final in `finals`,
      2. the code in `valid_syllables` of the final with its first letter
         removed (s[1:]),
      3. the length of that shortened spelling.

    Takes no arguments, returns None.  Raises ValueError / KeyError if an
    entry of `inner_fuzzy_finals` is missing from `finals`, or its tail is
    missing from `valid_syllables`.
    """
    row = "    %-12s %-12s %-12s /* %-4s -> %-4s len %d */"

    # NOTE: print is always called with ONE parenthesised argument so that the
    # statement means the same thing under Python 2 and Python 3.
    print ("static const unsigned fuzzy_finals_map[] = {")
    for s in inner_fuzzy_finals:
        print (row % ("0x%02x," % finals.index(s), "0x%02x," % valid_syllables[s[1:]], "%d," % (len(s)-1,),  s, s[1:], len(s)-1))
    print ("};\n")

if __name__ == "__main__":
    # Run as a script: write the trie header (path is relative to the current
    # working directory), then print the fuzzy tables to stdout in this order.
    gen_suffix_trie ("../src/pinyin/quanpin_trie.h")
    gen_inner_fuzzy_syllable_tables ()
    gen_fuzzy_syllable_pairs_tables ()