3 from operator import itemgetter
def strip_tone(old_pinyin_str):
    """Return *old_pinyin_str* with each syllable's trailing tone digit removed.

    The input is a run of pinyin syllables separated by apostrophes, for
    example ``"ni3'hao3"``.  A syllable whose last character is a digit
    loses that single character; any other syllable is kept unchanged.
    Empty syllables (from an empty string or doubled apostrophes) pass
    through as empty strings instead of raising ``IndexError``.

    Returns the processed syllables joined back together with apostrophes.
    """
    newpinyins = []
    for pinyin in old_pinyin_str.split("'"):
        # Slicing (``[-1:]``) yields "" for an empty syllable, and
        # "".isdigit() is False, so no index error can occur here.
        if pinyin[-1:].isdigit():
            pinyin = pinyin[:-1]
        newpinyins.append(pinyin)
    return "'".join(newpinyins)
def add_pinyin_dict(pinyin, freq):
    """Add *freq* to the running total kept for *pinyin* in ``pinyin_dict``.

    The first time a syllable is seen its entry is created with *freq*;
    every later call adds *freq* onto the stored value.  The two cases are
    mutually exclusive, so a new syllable is never counted twice.
    """
    if pinyin not in pinyin_dict:
        pinyin_dict[pinyin] = freq
    else:
        pinyin_dict[pinyin] += freq
def load_phrase(filename):
    """Accumulate syllable frequencies from the phrase table *filename*.

    Every non-blank line is split on whitespace into four fields: pinyin,
    word, token and frequency.  Tone digits are stripped from the pinyin
    with ``strip_tone``, the frequency is parsed as an integer, and entries
    whose word has length 1 or 2 are recorded through ``add_pinyin_dict``.

    Raises ``ValueError`` when a non-blank line has fewer than four fields
    or its frequency field is not an integer.
    """
    # ``with`` guarantees the file is closed even if a bad line raises.
    with open(filename, "r") as phrasefile:
        # Iterate the file lazily instead of loading every line up front.
        for line in phrasefile:
            line = line.rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing one at end of file).
                continue
            (pinyin, word, _token, freq) = line.split(None, 3)
            pinyin = strip_tone(pinyin)
            # Parse the count so repeated syllables are summed numerically
            # rather than having their digit strings concatenated.
            freq = int(freq)

            if len(word) in (1, 2):
                add_pinyin_dict(pinyin, freq)
# Feed both character tables into the frequency dictionary at import time.
for table_path in ("../data/gb_char.table", "../data/gbk_char.table"):
    load_phrase(table_path)
def save_pinyin(filename):
    """Write ``pinyin_dict`` to *filename*, one entry per line.

    Each line holds the pinyin and its frequency separated by a tab.
    Entries are written in the dictionary's own iteration order.
    """
    # ``with`` closes (and therefore flushes) the file on every exit path.
    with open(filename, "w") as pinyinfile:
        for pinyin, freq in pinyin_dict.items():
            # str() lets the join accept numeric frequencies as well.
            line = "\t".join((pinyin, str(freq)))
            # Text mode already maps "\n" to the platform line ending;
            # writing os.linesep here would give "\r\r\n" on Windows.
            pinyinfile.write(line + "\n")
if __name__ == "__main__":
    # When run as a script, dump the collected frequencies to a file whose
    # relative name is resolved against the current working directory.
    output_path = "pinyins.txt"
    save_pinyin(output_path)