# Build pipeline for the language model (bigram / trigram) and the pinyin
# lexicon.  Every target in this section names a command step, not a file;
# the steps communicate through the files named by the variables below.
#
# NOTE: the aggregate targets (bigram, trigram, bs_*) list their steps as
# prerequisites and each step reads the file written by the one before it,
# so they must be processed left to right.  Do not run them with `make -j`.

# ---------------------------------------------------------------------------
# Directories
# ---------------------------------------------------------------------------
SLM_SRC_DIR = ../src/slm
LEXICON_SRC_DIR = ../src/lexicon
PINYIN_SRC_DIR = ../src/pinyin

CORPUS_DIR = ../raw
SWAP_DIR = ../swap
RESULT_DIR = ../data

# ---------------------------------------------------------------------------
# Input files
# ---------------------------------------------------------------------------
DICTFILE = ${CORPUS_DIR}/dict.utf8
# CORPUSFILE is (re)created as a symlink by test_corpus / real_corpus.
CORPUSFILE = ${CORPUS_DIR}/corpus.utf8
TEST_CORPUSFILE = ${CORPUS_DIR}/test.utf8
REAL_CORPUSFILE = ${CORPUS_DIR}/BIGCORPUS

# Base name shared by all generated model files.
LMTARGET = lm_sc

IDS_FILE = ${SWAP_DIR}/${LMTARGET}.ids
SWAP_FILE = ${SWAP_DIR}/swap

# FILE NAMES for BIGRAM model
IDNGRAM_FILE = ${SWAP_DIR}/${LMTARGET}.id2gram
RAW_LM_FILE = ${SWAP_DIR}/${LMTARGET}.2gram
SLM_FILE = ${SWAP_DIR}/${LMTARGET}.2gm
SLM_INFO_FILE = ${SWAP_DIR}/${LMTARGET}.2gm.arpa
TSLM_FILE = ${RESULT_DIR}/${LMTARGET}.t2g
TSLM_INFO_FILE = ${SWAP_DIR}/${LMTARGET}.t2g.arpa

# FILE NAMES for TRIGRAM model
IDNGRAM_FILE3 = ${SWAP_DIR}/${LMTARGET}.id3gram
RAW_LM_FILE3 = ${SWAP_DIR}/${LMTARGET}.3gram
SLM_FILE3 = ${SWAP_DIR}/${LMTARGET}.3gm
SLM_INFO_FILE3 = ${SWAP_DIR}/${LMTARGET}.3gm.arpa
TSLM_FILE3 = ${RESULT_DIR}/${LMTARGET}.t3g
TSLM_INFO_FILE3 = ${SWAP_DIR}/${LMTARGET}.t3g.arpa
TSLM_REPACKED_FILE3 = ${SWAP_DIR}/${LMTARGET}.t3g.repacked
TSLM_UNPACKED_FILE3 = ${SWAP_DIR}/${LMTARGET}.t3g.arpa.unpacked

# Lexicon FILE names (raw resource and others)
PINYIN_TEXTFILE = ${CORPUS_DIR}/dict.utf8
PINYIN_NMP_TEXTFILE = ${SWAP_DIR}/dict_nmp.utf8
PYTRIE_FILE = ${RESULT_DIR}/pydict_sc.bin
PYTRIE_PRINTOUT = ${SWAP_DIR}/pydict_sc.log.utf8

# None of the targets below produces a file of its own name; declaring them
# phony keeps a stray file or directory called e.g. `ids` or `lexicon` from
# silently disabling the step.
.PHONY: test_corpus real_corpus ids slmids3 slmids \
        bs_bigram bs_bigram3 bigram \
        m2_idngram m2_slm m2_prune m2_thread m2_tslminfo m2_info \
        bs_trigram trigram \
        m3_idngram m3_slm m3_prune m3_thread m3_tslminfo \
        m3_tslmpack m3_tslmunpack m3_info \
        model_clean lexicon lexicon2

# ---------------------------------------------------------------------------
# Corpus selection
# ---------------------------------------------------------------------------

# Point ${CORPUSFILE} at the test corpus.
# `rm -f` is used instead of an `[ -e ... ]` guard: `-e` follows symlinks, so
# a dangling link would be left in place and the following `ln -s` would fail.
test_corpus :
	rm -f ${CORPUSFILE}
	ln -s ${TEST_CORPUSFILE} ${CORPUSFILE}

# Point ${CORPUSFILE} at the real (big) corpus.  Same removal logic as above.
real_corpus :
	rm -f ${CORPUSFILE}
	ln -s ${REAL_CORPUSFILE} ${CORPUSFILE}

# ---------------------------------------------------------------------------
# Corpus -> id stream
# ---------------------------------------------------------------------------

# Run mmseg on the corpus with the dictionary; output goes to ${IDS_FILE}.
ids :
	./mmseg -d ${DICTFILE} -f bin -s 10 -a 9 ${CORPUSFILE} >${IDS_FILE}

# Run slmseg on the corpus using the existing trigram model ${TSLM_FILE3},
# then keep a copy of that model as ${TSLM_FILE3}.normal.
slmids3:
	./slmseg -d ${DICTFILE} -f bin -s 10 -m ${TSLM_FILE3} ${CORPUSFILE} >${IDS_FILE}
	cp ${TSLM_FILE3} ${TSLM_FILE3}.normal

# Run slmseg on the corpus using the existing bigram model ${TSLM_FILE},
# then keep a copy of that model as ${TSLM_FILE}.normal.
slmids:
	./slmseg -d ${DICTFILE} -f bin -s 10 -m ${TSLM_FILE} ${CORPUSFILE} >${IDS_FILE}
	cp ${TSLM_FILE} ${TSLM_FILE}.normal

# ---------------------------------------------------------------------------
# Bigram model
# ---------------------------------------------------------------------------

# second round bootstrap bigram
bs_bigram : slmids m2_idngram m2_slm m2_prune m2_thread m2_tslminfo

# second round bootstrap bigram from a trigram model
bs_bigram3 : slmids3 m2_idngram m2_slm m2_prune m2_thread m2_tslminfo

# This is the command to make a bigram model
bigram : ids m2_idngram m2_slm m2_prune m2_thread m2_tslminfo

# ${IDS_FILE} -> ${IDNGRAM_FILE}; ${SWAP_FILE} is scratch space and is removed
# afterwards.
m2_idngram :
	./ids2ngram -n 2 -s ${SWAP_FILE} -o ${IDNGRAM_FILE} -p 20000000 ${IDS_FILE}
	rm -f ${SWAP_FILE}

# ${IDNGRAM_FILE} -> ${RAW_LM_FILE}
m2_slm:
	./slmbuild -n 2 -o ${RAW_LM_FILE} -w 200000 -c 0,2 -d ABS,0.005 -d ABS -b 10 -e 9 ${IDNGRAM_FILE}

# ${RAW_LM_FILE} -> ${SLM_FILE}
m2_prune:
	./slmprune ${RAW_LM_FILE} ${SLM_FILE} R 100000 200000

# ${SLM_FILE} -> ${TSLM_FILE}
m2_thread :
	./slmthread ${SLM_FILE} ${TSLM_FILE}

# Dump information about ${TSLM_FILE} into ${TSLM_INFO_FILE}.
m2_tslminfo :
	./tslminfo -v -l ${DICTFILE} ${TSLM_FILE} >${TSLM_INFO_FILE}

# Use this to generate bigram non-threaded lm arpa information if needed
m2_info :
	./slminfo -p -v -l ${DICTFILE} ${SLM_FILE} >${SLM_INFO_FILE}

# ---------------------------------------------------------------------------
# Trigram model
# ---------------------------------------------------------------------------

# second round bootstrap to make trigram model
bs_trigram : slmids3 m3_idngram m3_slm m3_prune m3_thread m3_tslminfo

# This is the command to make a trigram model
trigram : ids m3_idngram m3_slm m3_prune m3_thread m3_tslminfo

# ${IDS_FILE} -> ${IDNGRAM_FILE3}; ${SWAP_FILE} is scratch space and is removed
# afterwards.
m3_idngram :
	./ids2ngram -n 3 -s ${SWAP_FILE} -o ${IDNGRAM_FILE3} -p 20000000 ${IDS_FILE}
	rm -f ${SWAP_FILE}

# ${IDNGRAM_FILE3} -> ${RAW_LM_FILE3}
m3_slm:
	./slmbuild -n 3 -o ${RAW_LM_FILE3} -w 200000 -c 0,2,2 -d ABS,0.0005 -d ABS -d ABS -b 10 -e 9 ${IDNGRAM_FILE3}

# ${RAW_LM_FILE3} -> ${SLM_FILE3}
m3_prune:
	./slmprune ${RAW_LM_FILE3} ${SLM_FILE3} R 100000 2500000 1000000

# ${SLM_FILE3} -> ${TSLM_FILE3}
m3_thread :
	./slmthread ${SLM_FILE3} ${TSLM_FILE3}

# Dump information about ${TSLM_FILE3} into ${TSLM_INFO_FILE3}.
m3_tslminfo :
	./tslminfo -p -v -l ${DICTFILE} ${TSLM_FILE3} >${TSLM_INFO_FILE3}

# ${TSLM_INFO_FILE3} + dictionary -> ${TSLM_REPACKED_FILE3}
m3_tslmpack :
	./tslmpack ${TSLM_INFO_FILE3} ${DICTFILE} ${TSLM_REPACKED_FILE3}

# Dump the repacked model back out into ${TSLM_UNPACKED_FILE3}
# (presumably for comparison with ${TSLM_INFO_FILE3} -- confirm).
m3_tslmunpack :
	./tslminfo -p -v -l ${DICTFILE} ${TSLM_REPACKED_FILE3} >${TSLM_UNPACKED_FILE3}

# Use this to generate trigram non-threaded lm arpa information if needed
m3_info :
	./slminfo -p -v -l ${DICTFILE} ${SLM_FILE3} >${SLM_INFO_FILE3}

# ---------------------------------------------------------------------------
# Housekeeping
# ---------------------------------------------------------------------------

# clean all intermediate files for building the model
model_clean :
	rm -f ${IDS_FILE}
	rm -f ${SWAP_FILE}
	rm -f ${IDNGRAM_FILE} ${RAW_LM_FILE}
	rm -f ${IDNGRAM_FILE3} ${RAW_LM_FILE3}

# ---------------------------------------------------------------------------
# Lexicon
# ---------------------------------------------------------------------------

# Run genpyt on ${PINYIN_TEXTFILE}, writing ${PYTRIE_FILE} and the log
# ${PYTRIE_PRINTOUT}; the trigram model ${TSLM_FILE3} is passed via -s.
lexicon :
	./genpyt -i ${PINYIN_TEXTFILE} -o ${PYTRIE_FILE} -l ${PYTRIE_PRINTOUT} -s ${TSLM_FILE3}

# Same as `lexicon`, but passes the bigram model ${TSLM_FILE} via -s.
lexicon2 :
	./genpyt -i ${PINYIN_TEXTFILE} -o ${PYTRIE_FILE} -l ${PYTRIE_PRINTOUT} -s ${TSLM_FILE}