Add script for creating db from google pinyin data

author Peng Huang <shawn.p.huang@gmail.com>

Mon, 5 Oct 2009 05:18:19 +0000 (13:18 +0800)

committer Peng Huang <shawn.p.huang@gmail.com>

Mon, 5 Oct 2009 05:18:19 +0000 (13:18 +0800)
author Peng Huang <shawn.p.huang@gmail.com>
Mon, 5 Oct 2009 05:18:19 +0000 (13:18 +0800)
committer Peng Huang <shawn.p.huang@gmail.com>
Mon, 5 Oct 2009 05:18:19 +0000 (13:18 +0800)
diff --git a/data/google/createdb.py b/data/google/createdb.py

new file mode 100644 (file)

index 0000000..f685595
--- /dev/null
+++ b/data/google/createdb.py
@@ -0,0 +1,78 @@
+import sqlite3
+from pydict import *
+from id import *
+
+
+def get_sheng_yun(pinyin):
+    if pinyin == None:
+        return None, None
+    if pinyin == "ng":
+        return "", "en"
+    for i in xrange(2, 0, -1):
+        t = pinyin[:i]
+        if t in SHENGMU_DICT:
+            return t, pinyin[len(t):]
+    return "", pinyin
+
+def create_db():
+    # con = sqlite3.connect("main.db")
+    # con.execute ("PRAGMA synchronous = NORMAL;")
+    # con.execute ("PRAGMA temp_store = MEMORY;")
+    # con.execute ("PRAGMA default_cache_size = 5000;")
+    print "PRAGMA synchronous = NORMAL;"
+    print "PRAGMA temp_store = MEMORY;"
+    print "PRAGMA default_cache_size = 5000;"
+
+
+    sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);"
+    for i in range(0, 16):
+        column = []
+        for j in range(0, i + 1):
+            column.append ("s%d INTEGER" % j)
+            column.append ("y%d INTEGER" % j)
+        print sql % (i, ",".join(column))
+        # con.execute(sql % (i, column))
+        # con.commit()
+
+    validate_hanzi = get_validate_hanzi()
+    records = list(read_phrases(validate_hanzi))
+    records.sort(lambda a, b: 1 if a[1] - b[1] > 0 else -1)
+    
+    print "BEGIN;"
+    insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);"
+    for i, (hanzi, freq, pinyin) in enumerate(records):
+        columns = []
+        for py in pinyin:
+            s, y = get_sheng_yun(py)
+            s, y = pinyin_id[s], pinyin_id[y]
+            columns.append(s)
+            columns.append(y)
+        values = "'%s', %d, %s" % (hanzi, i, ",".join(map(str,columns)))
+            
+        sql = insert_sql % (len(hanzi) - 1, values)
+        print sql
+    print "COMMIT;"
+    print "VACUUM;"
+
+
+def get_validate_hanzi():
+    validate_hanzi = file("valid_utf16.txt").read().decode("utf16")
+    return set(validate_hanzi)
+
+def read_phrases(validate_hanzi):
+    buf = file("rawdict_utf16_65105_freq.txt").read()
+    buf = unicode(buf, "utf16")
+    buf = buf.strip()
+    for l in buf.split(u'\n'):
+        hanzi, freq, flag, pinyin = l.split(u' ', 3)
+        freq = float(freq)
+        pinyin = pinyin.split()
+        if any(map(lambda c: c not in validate_hanzi, hanzi)):
+            continue
+        yield hanzi, freq, pinyin
+
+def main():
+    """Script entry point: print the database creation SQL via create_db()."""
+    create_db()
+ 
+# Run main() only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/scripts/id.py b/scripts/id.py

index 44bd80c..201a3cd 100644 (file)
--- a/scripts/id.py
+++ b/scripts/id.py
@@ -1 +1 @@
-pinyin_id = {'ch': 3, 'zh': 23, 'ai': 25, 'uan': 50, 'iu': 43, 'ong': 45, 'ao': 28, 'an': 26, 'uai': 49, 'ang': 27, 'iong': 42, 'in': 40, 'ia': 35, 'ei': 30, 'ing': 41, 'ie': 39, 'er': 33, 'iao': 38, 'ian': 36, 'eng': 32, 'iang': 37, 'uo': 55, 'r': 15, 'en': 31, 'ui': 53, 'un': 54, 'ue': 52, 'uang': 51, 'a': 24, 'c': 2, 'b': 1, 'e': 29, 'd': 4, 'g': 6, 'f': 5, 'i': 34, 'h': 7, 'k': 9, 'j': 8, 'm': 11, 'l': 10, 'o': 44, 'n': 12, 'q': 14, 'p': 13, 's': 16, 'sh': 17, 'u': 47, 't': 18, 'w': 19, 'v': 56, 'y': 21, 'x': 20, 'ou': 46, 'z': 22, 'ua': 48}
+pinyin_id = {'': 0, 'ch': 3, 'zh': 23, 'ai': 25, 'uan': 50, 'iu': 43, 'ong': 45, 'ao': 28, 'an': 26, 'uai': 49, 'ang': 27, 'iong': 42, 'in': 40, 'ia': 35, 'ei': 30, 'ing': 41, 'ie': 39, 'er': 33, 'iao': 38, 'ian': 36, 'eng': 32, 'iang': 37, 'uo': 55, 'r': 15, 'en': 31, 'ui': 53, 'un': 54, 'ue': 52, 'uang': 51, 'a': 24, 'c': 2, 'b': 1, 'e': 29, 'd': 4, 'g': 6, 'f': 5, 'i': 34, 'h': 7, 'k': 9, 'j': 8, 'm': 11, 'l': 10, 'o': 44, 'n': 12, 'q': 14, 'p': 13, 's': 16, 'sh': 17, 'u': 47, 't': 18, 'w': 19, 'v': 56, 'y': 21, 'x': 20, 'ou': 46, 'z': 22, 'ua': 48}
author	Peng Huang <shawn.p.huang@gmail.com>
	Mon, 5 Oct 2009 05:18:19 +0000 (13:18 +0800)
committer	Peng Huang <shawn.p.huang@gmail.com>
	Mon, 5 Oct 2009 05:18:19 +0000 (13:18 +0800)
data/google/createdb.py	[new file with mode: 0644]	patch \| blob
scripts/id.py		patch \| blob \| history