# Source directories of the slm, lexicon and pinyin components, given as
# paths relative to the directory make is run from.
SLM_SRC_DIR     = ../src/slm
LEXICON_SRC_DIR = ../src/lexicon
PINYIN_SRC_DIR  = ../src/pinyin
# Input files under $(CORPUS_DIR).
# CORPUSFILE is the path the tools actually read; the corpus-selection recipes
# symlink it to either TEST_CORPUSFILE or REAL_CORPUSFILE.
DICTFILE        = $(CORPUS_DIR)/dict.utf8
CORPUSFILE      = $(CORPUS_DIR)/corpus.utf8
TEST_CORPUSFILE = $(CORPUS_DIR)/test.utf8
REAL_CORPUSFILE = $(CORPUS_DIR)/BIGCORPUS
# IDS_FILE receives the stdout of the segmenters (mmseg / slmseg) and is the
# input of ids2ngram; SWAP_FILE is handed to ids2ngram through its -s option.
IDS_FILE  = $(SWAP_DIR)/$(LMTARGET).ids
SWAP_FILE = $(SWAP_DIR)/swap
# File names for the BIGRAM model.
# ids2ngram writes IDNGRAM_FILE, slmbuild turns it into RAW_LM_FILE, slmprune
# into SLM_FILE and slmthread into TSLM_FILE; the two *_INFO_FILE names
# receive the slminfo / tslminfo output.
# Only TSLM_FILE is placed in $(RESULT_DIR); the others stay in $(SWAP_DIR).
IDNGRAM_FILE   = $(SWAP_DIR)/$(LMTARGET).id2gram
RAW_LM_FILE    = $(SWAP_DIR)/$(LMTARGET).2gram
SLM_FILE       = $(SWAP_DIR)/$(LMTARGET).2gm
SLM_INFO_FILE  = $(SWAP_DIR)/$(LMTARGET).2gm.arpa
TSLM_FILE      = $(RESULT_DIR)/$(LMTARGET).t2g
TSLM_INFO_FILE = $(SWAP_DIR)/$(LMTARGET).t2g.arpa
# File names for the TRIGRAM model; same layout as the bigram set, with a
# trailing 3 on every variable name.
# TSLM_REPACKED_FILE3 is the output of tslmpack, and TSLM_UNPACKED_FILE3
# receives the tslminfo output for that repacked file.
# Only TSLM_FILE3 is placed in $(RESULT_DIR); the others stay in $(SWAP_DIR).
IDNGRAM_FILE3       = $(SWAP_DIR)/$(LMTARGET).id3gram
RAW_LM_FILE3        = $(SWAP_DIR)/$(LMTARGET).3gram
SLM_FILE3           = $(SWAP_DIR)/$(LMTARGET).3gm
SLM_INFO_FILE3      = $(SWAP_DIR)/$(LMTARGET).3gm.arpa
TSLM_FILE3          = $(RESULT_DIR)/$(LMTARGET).t3g
TSLM_INFO_FILE3     = $(SWAP_DIR)/$(LMTARGET).t3g.arpa
TSLM_REPACKED_FILE3 = $(SWAP_DIR)/$(LMTARGET).t3g.repacked
TSLM_UNPACKED_FILE3 = $(SWAP_DIR)/$(LMTARGET).t3g.arpa.unpacked
# Lexicon file names (raw resource and generated files).
# PINYIN_TEXTFILE is the same path as DICTFILE and is the -i input of genpyt;
# PYTRIE_FILE is genpyt's -o output and PYTRIE_PRINTOUT its -l log.
PINYIN_TEXTFILE     = $(CORPUS_DIR)/dict.utf8
PINYIN_NMP_TEXTFILE = $(SWAP_DIR)/dict_nmp.utf8
PYTRIE_FILE         = $(RESULT_DIR)/pydict_sc.bin
PYTRIE_PRINTOUT     = $(SWAP_DIR)/pydict_sc.log.utf8
# Repoint ${CORPUSFILE} at the test corpus: unlink whatever currently exists
# at that path, then create a symlink to ${TEST_CORPUSFILE}.
# NOTE(review): `[ -e ]` follows symlinks, so a dangling ${CORPUSFILE} link is
# not unlinked and the following `ln -s` would then fail -- consider `ln -sf`.
# NOTE(review): the rule header owning these two lines is not visible here.
43 if [ -e ${CORPUSFILE} ]; then unlink ${CORPUSFILE}; fi
44 ln -s ${TEST_CORPUSFILE} ${CORPUSFILE}
# Repoint ${CORPUSFILE} at the full corpus: unlink whatever currently exists
# at that path, then create a symlink to ${REAL_CORPUSFILE}.
# NOTE(review): `[ -e ]` follows symlinks, so a dangling ${CORPUSFILE} link is
# not unlinked and the following `ln -s` would then fail -- consider `ln -sf`.
# NOTE(review): the rule header owning these two lines is not visible here.
47 if [ -e ${CORPUSFILE} ]; then unlink ${CORPUSFILE}; fi
48 ln -s ${REAL_CORPUSFILE} ${CORPUSFILE}
# Run ./mmseg over ${CORPUSFILE} with the dictionary ${DICTFILE} and capture
# its stdout in ${IDS_FILE}.
# NOTE(review): the rule header is not visible here; presumably this is the
# `ids` step named by the `bigram` and `trigram` targets -- confirm.
51 ./mmseg -d ${DICTFILE} -f bin -s 10 -a 9 ${CORPUSFILE} >${IDS_FILE}
# Run ./slmseg over ${CORPUSFILE}, passing the threaded trigram model
# ${TSLM_FILE3} via -m, and overwrite ${IDS_FILE} with its stdout.
# NOTE(review): the rule header is not visible here; presumably this is the
# `slmids3` step used by `bs_bigram3` and `bs_trigram` -- confirm.
54 ./slmseg -d ${DICTFILE} -f bin -s 10 -m ${TSLM_FILE3} ${CORPUSFILE} >${IDS_FILE}
# Keep a copy of the model that was just used as ${TSLM_FILE3}.normal
# (presumably because a later slmthread run rewrites ${TSLM_FILE3} -- confirm).
55 cp ${TSLM_FILE3} ${TSLM_FILE3}.normal
# Run ./slmseg over ${CORPUSFILE}, passing the threaded bigram model
# ${TSLM_FILE} via -m, and overwrite ${IDS_FILE} with its stdout.
# NOTE(review): the rule header is not visible here; presumably this is the
# `slmids` step used by `bs_bigram` -- confirm.
58 ./slmseg -d ${DICTFILE} -f bin -s 10 -m ${TSLM_FILE} ${CORPUSFILE} >${IDS_FILE}
# Keep a copy of the model that was just used as ${TSLM_FILE}.normal
# (presumably because a later slmthread run rewrites ${TSLM_FILE} -- confirm).
59 cp ${TSLM_FILE} ${TSLM_FILE}.normal
# Second-round bootstrap of the bigram model: runs `slmids`, then the m2_*
# build steps. The steps are ordinary prerequisites listed in pipeline order,
# an order make only guarantees in a serial (non -j) run.
# Declared phony so a stray file named `bs_bigram` cannot mask the target.
.PHONY: bs_bigram
bs_bigram: slmids m2_idngram m2_slm m2_prune m2_thread m2_tslminfo
# Second-round bootstrap of the bigram model starting from a trigram model:
# runs `slmids3`, then the m2_* build steps. The steps are ordinary
# prerequisites listed in pipeline order, an order make only guarantees in a
# serial (non -j) run.
# Declared phony so a stray file named `bs_bigram3` cannot mask the target.
.PHONY: bs_bigram3
bs_bigram3: slmids3 m2_idngram m2_slm m2_prune m2_thread m2_tslminfo
# Entry point for building a bigram model: runs `ids`, then the m2_* build
# steps. The steps are ordinary prerequisites listed in pipeline order, an
# order make only guarantees in a serial (non -j) run.
# Declared phony so a stray file named `bigram` cannot mask the target.
.PHONY: bigram
bigram: ids m2_idngram m2_slm m2_prune m2_thread m2_tslminfo
# Bigram build steps. NOTE(review): none of the rule headers is visible here;
# presumably these are, in order, the m2_idngram, m2_slm, m2_prune, m2_thread
# and m2_tslminfo steps listed by the `bigram` target -- confirm.
#
# Collect 2-grams (-n 2) from ${IDS_FILE} into ${IDNGRAM_FILE}; ${SWAP_FILE}
# is passed via -s.
71 ./ids2ngram -n 2 -s ${SWAP_FILE} -o ${IDNGRAM_FILE} -p 20000000 ${IDS_FILE}
# Build the raw bigram model ${RAW_LM_FILE} from ${IDNGRAM_FILE}.
75 ./slmbuild -n 2 -o ${RAW_LM_FILE} -w 200000 -c 0,2 -d ABS,0.005 -d ABS -b 10 -e 9 ${IDNGRAM_FILE}
# Prune ${RAW_LM_FILE} into ${SLM_FILE}.
78 ./slmprune ${RAW_LM_FILE} ${SLM_FILE} R 100000 200000
# Thread the pruned model: ${SLM_FILE} in, ${TSLM_FILE} out.
81 ./slmthread ${SLM_FILE} ${TSLM_FILE}
# Write tslminfo's output for ${TSLM_FILE} to ${TSLM_INFO_FILE}.
# NOTE(review): the trigram counterpart also passes -p; confirm the
# difference is intended.
84 ./tslminfo -v -l ${DICTFILE} ${TSLM_FILE} >${TSLM_INFO_FILE}
# Optional: use this to generate ARPA information for the non-threaded bigram LM, if needed
# Write slminfo's output for the pruned bigram model ${SLM_FILE} to
# ${SLM_INFO_FILE}.
# NOTE(review): the rule header for this recipe is not visible here, and no
# visible aggregate target lists it.
88 ./slminfo -p -v -l ${DICTFILE} ${SLM_FILE} >${SLM_INFO_FILE}
# Second-round bootstrap of the trigram model: runs `slmids3`, then the m3_*
# build steps. The steps are ordinary prerequisites listed in pipeline order,
# an order make only guarantees in a serial (non -j) run.
# Declared phony so a stray file named `bs_trigram` cannot mask the target.
.PHONY: bs_trigram
bs_trigram: slmids3 m3_idngram m3_slm m3_prune m3_thread m3_tslminfo
# Entry point for building a trigram model: runs `ids`, then the m3_* build
# steps. The steps are ordinary prerequisites listed in pipeline order, an
# order make only guarantees in a serial (non -j) run.
# Declared phony so a stray file named `trigram` cannot mask the target.
.PHONY: trigram
trigram: ids m3_idngram m3_slm m3_prune m3_thread m3_tslminfo
# Trigram build steps. NOTE(review): none of the rule headers is visible here;
# presumably these are, in order, the m3_idngram, m3_slm, m3_prune, m3_thread
# and m3_tslminfo steps listed by the `trigram` target -- confirm.
#
# Collect 3-grams (-n 3) from ${IDS_FILE} into ${IDNGRAM_FILE3}; ${SWAP_FILE}
# is passed via -s.
97 ./ids2ngram -n 3 -s ${SWAP_FILE} -o ${IDNGRAM_FILE3} -p 20000000 ${IDS_FILE}
# Build the raw trigram model ${RAW_LM_FILE3} from ${IDNGRAM_FILE3}.
101 ./slmbuild -n 3 -o ${RAW_LM_FILE3} -w 200000 -c 0,2,2 -d ABS,0.0005 -d ABS -d ABS -b 10 -e 9 ${IDNGRAM_FILE3}
# Prune ${RAW_LM_FILE3} into ${SLM_FILE3}.
104 ./slmprune ${RAW_LM_FILE3} ${SLM_FILE3} R 100000 2500000 1000000
# Thread the pruned model: ${SLM_FILE3} in, ${TSLM_FILE3} out.
107 ./slmthread ${SLM_FILE3} ${TSLM_FILE3}
# Write tslminfo's output for ${TSLM_FILE3} to ${TSLM_INFO_FILE3}.
110 ./tslminfo -p -v -l ${DICTFILE} ${TSLM_FILE3} >${TSLM_INFO_FILE3}
# NOTE(review): the rule headers for the two recipes below are not visible
# here, and no visible aggregate target lists them.
#
# Run tslmpack on the tslminfo dump ${TSLM_INFO_FILE3} together with
# ${DICTFILE}, producing ${TSLM_REPACKED_FILE3}.
113 ./tslmpack ${TSLM_INFO_FILE3} ${DICTFILE} ${TSLM_REPACKED_FILE3}
# Write tslminfo's output for the repacked file to ${TSLM_UNPACKED_FILE3}
# (presumably so it can be compared with ${TSLM_INFO_FILE3} -- confirm).
116 ./tslminfo -p -v -l ${DICTFILE} ${TSLM_REPACKED_FILE3} >${TSLM_UNPACKED_FILE3}
# Optional: use this to generate ARPA information for the non-threaded trigram LM, if needed
# Write slminfo's output for the pruned trigram model ${SLM_FILE3} to
# ${SLM_INFO_FILE3}.
# NOTE(review): the rule header for this recipe is not visible here, and no
# visible aggregate target lists it.
120 ./slminfo -p -v -l ${DICTFILE} ${SLM_FILE3} >${SLM_INFO_FILE3}
# Clean all intermediate files created while building the model
# Remove the n-gram count files and raw (unpruned) models for both the bigram
# and the trigram build; `rm -f` does not fail when a file is already absent.
# The pruned and threaded models and the info dumps are not removed here.
# NOTE(review): the rule header for these lines is not visible here.
126 rm -f ${IDNGRAM_FILE} ${RAW_LM_FILE}
127 rm -f ${IDNGRAM_FILE3} ${RAW_LM_FILE3}
# Run genpyt on ${PINYIN_TEXTFILE}, writing ${PYTRIE_FILE} (-o) and the log
# ${PYTRIE_PRINTOUT} (-l), with the threaded trigram model ${TSLM_FILE3}
# passed via -s.
# NOTE(review): the rule header for this recipe is not visible here. It writes
# the same ${PYTRIE_FILE} as the bigram-model variant of this command.
130 ./genpyt -i ${PINYIN_TEXTFILE} -o ${PYTRIE_FILE} -l ${PYTRIE_PRINTOUT} -s ${TSLM_FILE3}
# Run genpyt on ${PINYIN_TEXTFILE}, writing ${PYTRIE_FILE} (-o) and the log
# ${PYTRIE_PRINTOUT} (-l), with the threaded bigram model ${TSLM_FILE}
# passed via -s.
# NOTE(review): the rule header for this recipe is not visible here. It writes
# the same ${PYTRIE_FILE} as the trigram-model variant of this command.
133 ./genpyt -i ${PINYIN_TEXTFILE} -o ${PYTRIE_FILE} -l ${PYTRIE_PRINTOUT} -s ${TSLM_FILE}