update scripts
[platform/upstream/libpinyin.git] / scripts / specialtable.py
1 # -*- coding: utf-8 -*-
2 # vim:set et sts=4 sw=4:
3 #
4 # libpinyin - Library to deal with pinyin.
5 #
6 # Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2, or (at your option)
11 # any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
21
22
23 import os
24 import sys
25 import math
26 import pinyin
27
28 pinyin_list = sorted(pinyin.PINYIN_LIST)
29 shengmu_list = sorted(pinyin.SHENGMU_LIST)
30 yunmu_list = sorted(pinyin.YUNMU_LIST)
31
32 phrase_dict = {}
33
34
def load_phrase(filename):
    """Load pinyin frequencies from *filename* into ``phrase_dict``.

    Each non-blank line holds a pinyin string and an integer frequency
    separated by whitespace.  Entries whose frequency is 0 are ignored.
    A pinyin string of the form ``first'second`` is stored under the
    tuple key ``(first, second)``; any other string is stored as-is.

    Blank or whitespace-only lines are skipped.

    Raises ValueError for a line without exactly two fields, a
    non-integer frequency, or a pinyin string containing more than one
    apostrophe.
    """
    # "with" guarantees the file is closed even when a malformed line
    # raises part-way through the loop.
    with open(filename, "r") as phrasefile:
        # Iterate lazily instead of materialising every line up front.
        for line in phrasefile:
            fields = line.split(None, 1)
            if not fields:
                # blank line, e.g. a stray empty line at the end of file
                continue
            (pinyin_str, freq) = fields
            # int() tolerates the trailing newline/whitespace left in freq
            freq = int(freq)
            if 0 == freq:
                continue

            # The input is expected to contain no duplicate keys; a
            # repeated key would simply overwrite the earlier frequency.
            if "'" in pinyin_str:
                (first_key, second_key) = pinyin_str.split("'")
                phrase_dict[(first_key, second_key)] = freq
            else:
                phrase_dict[pinyin_str] = freq
52
53
def gen_all_divided():
    """Yield every pinyin that splits into two entries of ``pinyin_list``.

    Produces ``(pinyin_key, first_key, second_key)`` tuples where
    ``first_key`` is a strictly shorter prefix of ``pinyin_key`` and both
    ``first_key`` and the remaining suffix ``second_key`` appear in
    ``pinyin_list``.
    """
    for whole in pinyin_list:
        for prefix in pinyin_list:
            # a strictly shorter prefix always leaves a non-empty remainder
            is_proper_prefix = (len(prefix) < len(whole)
                                and whole.startswith(prefix))
            if is_proper_prefix:
                remainder = whole[len(prefix):]
                if remainder in pinyin_list:
                    yield whole, prefix, remainder
64
65
def filter_divided():
    """Yield the divisible pinyins whose two-key split has a frequency.

    For each ``(pinyin_key, first_key, second_key)`` produced by
    ``gen_all_divided()`` whose ``(first_key, second_key)`` pair is a key
    of ``phrase_dict``, yields ``(pinyin_key, orig_freq, first_key,
    second_key, new_freq)``.  ``orig_freq`` is the frequency stored for
    the undivided ``pinyin_key`` (0 when it has none) and ``new_freq`` is
    the frequency stored for the pair.
    """
    for pinyin_key, first_key, second_key in gen_all_divided():
        split_pair = (first_key, second_key)
        if split_pair in phrase_dict:
            orig_freq = phrase_dict.get(pinyin_key, 0)
            yield (pinyin_key, orig_freq, first_key, second_key,
                   phrase_dict[split_pair])
75
76
def gen_all_resplit():
    """Yield pinyin pairs whose final consonant can move to the next key.

    For every ``pinyin_key`` in ``pinyin_list`` that ends in "n", "g" or
    "r" and whose remainder without that final letter is itself in
    ``pinyin_list``, and for every ``yun`` in ``yunmu_list`` that is also
    in ``pinyin_list``, yields
    ``(pinyin_key, yun, pinyin_key[:-1], final_letter + yun)`` whenever
    the last element is in ``pinyin_list`` as well.
    """
    for pinyin_key in pinyin_list:
        tail = pinyin_key[-1]
        if tail not in ("n", "g", "r"):
            continue
        # first new pinyin key: the original minus its final letter
        head = pinyin_key[:-1]
        if head not in pinyin_list:
            continue
        for yun in yunmu_list:
            # second new pinyin key: the moved letter followed by the yunmu
            shifted = tail + yun
            if yun in pinyin_list and shifted in pinyin_list:
                yield pinyin_key, yun, head, shifted
    # NOTE: a further branch for keys ending in "e" is disabled:
    #     elif pinyin_key[-1] in ["e"]:
    #         #check first new pinyin key
    #         if pinyin_key[:-1] in pinyin_list:
    #             yield pinyin_key, "r", pinyin_key[:-1], "er"
96
97
def filter_resplit():
    """Yield the re-split pinyin pairs that have a frequency.

    Consumes the ``(key, yun, trimmed_key, shifted_key)`` tuples from
    ``gen_all_resplit()`` and, when ``(key, yun)`` is a key of
    ``phrase_dict``, yields ``(trimmed_key, shifted_key, orig_freq, key,
    yun, new_freq)``.  ``new_freq`` is the frequency stored for
    ``(key, yun)`` and ``orig_freq`` the one stored for
    ``(trimmed_key, shifted_key)`` (0 when it has none).
    """
    for merged_key, yun, trimmed_key, shifted_key in gen_all_resplit():
        # The roles are deliberately reversed relative to
        # gen_all_resplit(): libpinyin's pinyin parser differs from
        # ibus-pinyin's, so the trimmed/shifted pair is reported as the
        # original split and the merged/yun pair as the new one.
        new_pair = (merged_key, yun)
        if new_pair not in phrase_dict:
            continue
        orig_freq = phrase_dict.get((trimmed_key, shifted_key), 0)
        yield (trimmed_key, shifted_key, orig_freq,
               merged_key, yun, phrase_dict[new_pair])
113
114
# Populate phrase_dict at import time from the two frequency files,
# which are opened relative to the current working directory.
for _phrase_source in ("pinyins.txt", "specials.txt"):
    load_phrase(_phrase_source)

if __name__ == "__main__":
    # Dump the divided entries first, then the re-split ones.
    for generate in (filter_divided, filter_resplit):
        for p in generate():
            print(p)