support two letter yunmu from full pinyin
[platform/upstream/libpinyin.git] / scripts / genpinyins.py
1 #!/usr/bin/python3
2 import os
3 from operator import itemgetter
4
# Accumulated frequency per pinyin string. Entries are created and summed by
# add_pinyin_dict(); load_phrase() passes it pinyin already run through
# strip_tone(). save_pinyin() writes the contents out.
pinyin_dict = {}
6
7
def strip_tone(old_pinyin_str):
    """Drop the trailing tone digit from each syllable of a pinyin string.

    The input is split on apostrophes; every syllable whose last character
    is a digit loses exactly that one character, and the syllables are then
    joined back together with apostrophes.

    Args:
        old_pinyin_str: apostrophe-separated pinyin, e.g. "ni3'hao3".

    Returns:
        The pinyin string with at most one trailing digit removed per
        syllable, e.g. "ni'hao". Empty syllables are passed through as-is.
    """
    # Test the last character via a slice (pinyin[-1:]) rather than an index:
    # an empty syllable (empty input or a doubled apostrophe) yields "" here,
    # whereas pinyin[-1] would raise IndexError.
    return "'".join(
        pinyin[:-1] if pinyin[-1:].isdigit() else pinyin
        for pinyin in old_pinyin_str.split("'")
    )
19
20
def add_pinyin_dict(pinyin, freq):
    """Add freq to the running total stored for pinyin in pinyin_dict.

    A frequency of zero is ignored entirely, so it never creates an entry.
    """
    if freq == 0:
        return
    try:
        pinyin_dict[pinyin] += freq
    except KeyError:
        # First time this pinyin is seen: start its total at freq.
        pinyin_dict[pinyin] = freq
28
29
def load_phrase(filename):
    """Read a phrase table and accumulate pinyin frequencies from it.

    Every non-blank line is split on whitespace into four fields:
    pinyin, word, token and frequency (the token field is not used).
    For words whose length is 1 or 2, the tone-stripped pinyin and the
    integer frequency are handed to add_pinyin_dict().

    Args:
        filename: path of the table file to read.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line has fewer than four fields or its
            frequency field is not an integer.
    """
    # "with" guarantees the handle is closed even when a malformed line
    # raises part-way through the file.
    with open(filename, "r") as phrasefile:
        # Iterate lazily instead of loading the whole table via readlines().
        for line in phrasefile:
            fields = line.split(None, 3)
            if not fields:
                # Blank / whitespace-only line (e.g. trailing newline at EOF).
                continue
            pinyin, word, _token, freq = fields
            # int() tolerates the line terminator left on the last field.
            freq = int(freq)

            if len(word) in (1, 2):
                add_pinyin_dict(strip_tone(pinyin), freq)
42
# Fill pinyin_dict from both character tables. These calls sit outside the
# __main__ guard, so they also run when the module is imported, and the
# relative paths are resolved against the current working directory.
load_phrase("../data/gb_char.table")
load_phrase("../data/gbk_char.table")
45
46
def save_pinyin(filename):
    """Write pinyin_dict to filename as tab-separated text.

    Each entry becomes one line of the form "<pinyin>\\t<freq>", emitted in
    the dict's iteration order.

    Args:
        filename: path of the output file; it is created or truncated.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    # "with" closes the file even if a write fails.
    with open(filename, "w") as pinyinfile:
        # Terminate lines with "\n": a text-mode file already translates it
        # to the platform separator, so writing os.linesep would produce
        # "\r\r\n" on Windows.
        pinyinfile.writelines(
            "\t".join((pinyin, str(freq))) + "\n"
            for pinyin, freq in pinyin_dict.items()
        )
54
55
# When run as a script, dump the accumulated frequencies to "pinyins.txt"
# (a relative path, so it lands in the current working directory).
if __name__ == "__main__":
    save_pinyin("pinyins.txt")