3 # Thanks to the reverse-engineering efforts of the following projects and people:
4 # http://code.google.com/p/imewlconverter
5 # http://code.google.com/p/ibus-cloud-pinyin
6 # http://forum.ubuntu.org.cn/viewtopic.php?f=8&t=250136&start=0
8 from importer import import_to_sunpinyin_user_dict
# Return text decoded as little-endian UTF-16 ('UTF-16LE').
#   f      -- file object passed in by the caller
#   offset -- defaults to -1; callers that continue from the current file
#             position pass -1 explicitly
#   len    -- defaults to 2; callers pass a byte span (end offset - start offset)
# NOTE(review): the statements between the signature and the return are not
# visible in this excerpt. Presumably they seek to `offset` when it is
# non-negative and read `len` bytes from `f` into the local `str` -- confirm
# against the full file.
# NOTE(review): the parameter `len` shadows the builtin of the same name, and
# `str` appears to be a local that does the same.
12 def read_utf16_str (f, offset=-1, len=2):
# `str` is expected to hold raw bytes here (a Python 2 byte string).
16 return str.decode('UTF-16LE')
19 return struct.unpack ('<H', f.read(2))[0]
# Generator: parses the file named `fname` and yields (py_str, word_str)
# pairs, where py_str is the pinyin syllables joined by "'" and word_str is
# the word text read from the file.
# NOTE(review): this excerpt omits several statements of the function (the
# use of `mask`, the creation of `py_map` and `py_set`, and the loop header
# around the pinyin-table reads). The comments below describe only the
# visible lines.
21 def get_word_from_sogou_cell_dict (fname):
# Binary mode: all fields are unpacked from raw bytes.
# NOTE(review): no matching close() is visible in this excerpt.
22 f = open (fname, 'rb')
# Total size in bytes; used as the stop condition of the word-entry loop below.
23 file_size = os.path.getsize (fname)
# Read the first 128 bytes and unpack the byte at index 4 as an unsigned
# 8-bit integer. How `mask` is used afterwards is not visible here.
26 mask = struct.unpack ('B', f.read(128)[4])[0]
# Fixed-position text fields: each call passes an absolute offset and the
# byte span up to the start of the next field.
34 title = read_utf16_str (f, 0x130, 0x338 - 0x130)
# NOTE(review): `type` shadows the builtin of the same name.
35 type = read_utf16_str (f, 0x338, 0x540 - 0x338)
36 desc = read_utf16_str (f, 0x540, 0xd40 - 0x540)
37 samples = read_utf16_str (f, 0xd40, 0x1540 - 0xd40)
# One pinyin-table record: a 16-bit code, a 16-bit length, then the string.
# offset=-1 presumably means "continue from the current file position"
# -- TODO confirm in read_utf16_str.
43 py_code = read_uint16 (f)
44 py_len = read_uint16 (f)
45 py_str = read_utf16_str (f, -1, py_len)
# Keep the first string seen for a code; later duplicates are ignored.
47 if py_code not in py_map:
48 py_map[py_code] = py_str
# Word entries: keep reading until the file position reaches the file size.
54 while f.tell() != file_size:
55 word_count = read_uint16 (f)
# The stored value is halved to get the number of 16-bit pinyin ids
# (presumably it is a byte count -- TODO confirm). This relies on Python 2
# integer division; under Python 3 `/` would give a float and range() would fail.
56 pinyin_count = read_uint16 (f) / 2
# Translate each pinyin id to its string through py_map.
59 for i in range(pinyin_count):
60 py_id = read_uint16(f)
61 py_set.append(py_map[py_id])
# Syllables are joined with an apostrophe.
62 py_str = "'".join (py_set)
# word_count words share this pinyin string; each is a 16-bit length followed
# by the string itself and 12 further bytes that are skipped.
64 for i in range(word_count):
65 word_len = read_uint16(f)
66 word_str = read_utf16_str (f, -1, word_len)
67 f.read(12) # simply ignore word frequency info
68 yield py_str, word_str
73 if len (sys.argv) != 2:
74 print "Please specify the Sogou PinYin Cell dict file!"
77 generator = get_word_from_sogou_cell_dict (sys.argv[1])
78 import_to_sunpinyin_user_dict (generator)
80 if __name__ == "__main__":