15 #include "pytrie_gen.h"
16 #include "pinyin_data.h"
17 #include "trie_writer.h"
// Scans forward from p while the current character is a space, tab, line
// feed or carriage return. The terminating NUL is not in that set, so the
// scan stops at the end of the string.
// NOTE(review): the return type and the loop body are not visible in this
// excerpt; callers cast the result to char* and keep parsing from it, so it
// presumably returns the first non-blank position -- confirm.
20 skipSpace(const char* p)
22 while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')
// Counterpart of skipSpace: scans forward from p while the current character
// is neither the terminating NUL nor a space, tab, line feed or carriage
// return.
// NOTE(review): the return type and the loop body are not visible in this
// excerpt; presumably returns the position just past the current field.
28 skipNonSpace(const char* p)
30 while (*p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
// Adds id to idset, or reconciles it with an entry that the set already
// treats as equivalent to id.
36 insertWordId(CPinyinTrieMaker::CWordSet& idset, CPinyinTrieMaker::TWordId id)
38 CPinyinTrieMaker::CWordSet::const_iterator it = idset.find(id);
// No equivalent entry yet.
// NOTE(review): the statement guarded by this test is not visible here
// (presumably a plain insert).
39 if (it == idset.end())
// An equivalent entry exists: the decision below looks at the hide flag and
// the cost of the stored entry versus the new id.
// NOTE(review): the remainder of the condition and the action it guards are
// not visible in this excerpt.
42 const CPinyinTrieMaker::TWordId& a = *it;
43 if ((a.anony.m_bHide &&
45 (a.anony.m_bHide == id.anony.m_bHide && a.anony.m_cost >
// A pinyin string paired with an integer cost, as collected per lexicon line.
53 struct TSyllableInfo {
// Stores py in m_py and cost in m_cost; both arguments are optional.
57 TSyllableInfo(const char* py = NULL, int cost = 0) : m_py(py), m_cost(cost)
// Ordering used when TSyllableInfo values are kept in a std::set (see the
// pyset parameter used elsewhere in this file).
// NOTE(review): the comparison criterion is not visible in this excerpt.
61 operator<(const TSyllableInfo& b) const
// Round-trip check: converts utf8 with descriptor ic, converts the result
// back with descriptor ric, and reports whether the text came back unchanged.
// Both conversions go through function-local static 256-byte buffers, so
// every call shares the same storage.
// NOTE(review): the initialisation of dstlen (and the reset of dst before the
// second conversion) and the value returned on failure are not visible in
// this excerpt.
69 isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric)
71 static char gbstr[256];
72 static char utstr[256];
74 TIConvSrcPtr src = (TIConvSrcPtr)utf8;
// + 1 so that the terminating NUL is converted as well; the output can then
// be handled as a C string.
75 size_t srclen = strlen((char*)src) + 1;
76 char* dst = (char*)gbstr;
78 size_t res = iconv(ic, &src, &srclen, &dst, &dstlen);
// Success requires that iconv did not fail and consumed all of the input.
80 if (res != size_t(-1) && srclen == 0) {
81 // do the reverse conversion and compare the result with the input
82 src = (TIConvSrcPtr)gbstr;
83 srclen = strlen((char*)src) + 1;
86 res = iconv(ric, &src, &srclen, &dst, &dstlen);
87 if (res != size_t(-1) && srclen == 0)
88 return(strcmp(utf8, utstr) == 0);
93 // Returns a bit mask: bit 0x1 is set when the text contains characters that are in GBK but outside GB2312, bit 0x2 when it contains characters that are in GB18030 but outside GBK.
95 getPureGBEncoding(const char* utf8str)
// The four conversion descriptors are function-local statics, so each is
// opened once on first use and then reused.
// NOTE(review): the iconv_open results are not checked in the visible lines.
97 static iconv_t ic_gb = iconv_open("GB2312", "UTF-8");
98 static iconv_t ic_gbk = iconv_open("GBK", "UTF-8");
99 static iconv_t ric_gb = iconv_open("UTF-8", "GB2312");
100 static iconv_t ric_gbk = iconv_open("UTF-8", "GBK");
// The text is first round-tripped through GB2312; only if that fails is the
// GBK round trip tried.
104 if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) {
105 ret = 1; // contains at least one character outside GB2312
106 if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk))
107 ret = 3; // also contains at least one character outside GBK
// Diagnostic trace of the computed category on stderr.
// NOTE(review): whether this line is guarded by a debug switch is not visible
// in this excerpt.
110 fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret);
116 #else // !HAVE_ICONV_H
// Variant of getPureGBEncoding compiled in the !HAVE_ICONV_H branch.
// NOTE(review): its body is not visible in this excerpt.
118 getPureGBEncoding(const char* utf8str)
123 #endif // HAVE_ICONV_H
// Tail of the parameter list of the line parser that constructFromLexicon
// calls as parseLine(buf, word_buf, id, pyset); the function name and its
// leading parameters are on lines not visible in this excerpt.
// Parses one lexicon line: the first field is copied into word_buf and each
// accepted pinyin field is added to pyset together with a cost.
129 std::set<TSyllableInfo>& pyset)
133 /* ignore the empty lines and comment lines */
134 if (*buf == '\n' || *buf == '#')
// First field: the word itself, copied character by character into word_buf.
137 char* p = (char*)skipSpace(buf);
138 char* t = (char*)skipNonSpace(p);
139 while (p < t) *word_buf++ = *p++;
// Second field.
// NOTE(review): how it is converted into the word id is not visible here.
142 p = (char*)skipSpace(p);
143 t = (char*)skipNonSpace(p);
// Remaining fields are examined one at a time as pinyin candidates.
147 p = (char*)skipSpace(t);
150 t = (char*)skipNonSpace(p);
// A candidate is a run of lowercase letters and apostrophes.
153 while ((*p >= 'a' && *p <= 'z') || (*p == '\''))
// It is accepted when the run is non-empty and is followed directly by the
// end of the field or by a colon.
155 if ((p > s) && ((*p == 0) || (*p == ':'))) {
// NOTE(review): presumably the number after the colon is a percentage and the
// cost is its negative base-2 logarithm as a fraction of 100 -- confirm. A
// value of 0 would make log2 return minus infinity.
159 cost = -log2(atof(p)/100);
161 pyset.insert(TSyllableInfo(s, cost));
163 p = (char*)skipSpace(t);
// The line counts as parsed only if at least one pinyin was collected.
165 return pyset.size() > 0;
// Constructor: flags the root node as expanded, which is the flag that
// threadNonCompletePinyin tests before calling combineInitialTrans on a node.
169 CPinyinTrieMaker::CPinyinTrieMaker()
171 m_RootNode.m_bExpanded = true;
173 /**********************************************************
175 The lexicon is a line-oriented text file. Each line holds fields
176 separated by one or more spaces or TABs. The first field is the
177 word and the second field is the word id. Any later field that
178 consists only of lowercase letters and apostrophes is taken to be one pinyin of the word. Maximum line length is 4095.
179 **********************************************************/
// Reads the lexicon file line by line, records every word in m_Lexicon under
// its id, and inserts each (pinyin, word id) pair into the trie. After the
// file is consumed, threadNonCompletePinyin is run over the trie.
// Returns false when the file cannot be opened.
// NOTE(review): the final return value and the fclose call are on lines not
// visible in this excerpt.
182 CPinyinTrieMaker::constructFromLexicon(const char* fileName)
// Static line and word buffers, shared by every call of this function.
184 static char buf[4096];
185 static char word_buf[2048];
189 std::set<TSyllableInfo> pyset;
190 FILE *fp = fopen(fileName, "r");
191 if (!fp) return false;
192 printf("Adding pinyin and corresponding words..."); fflush(stdout);
193 while (fgets(buf, sizeof(buf), fp) != NULL) {
// parseLine reported failure: the word is still stored in the lexicon, unless
// word_buf is empty or starts with a less-than sign.
// NOTE(review): word_buf holds plain char but is compared with a wide
// character literal here.
194 if (!parseLine(buf, word_buf, id, pyset)) {
195 if (word_buf[0] != L'<' && word_buf[0] != 0) {
196 if (m_Lexicon.size() < id + 1) m_Lexicon.resize(id + 1);
197 m_Lexicon[id] = std::string(word_buf);
201 unsigned gbcategory = getPureGBEncoding(word_buf);
// One trie insertion per pinyin collected for this word.
203 std::set<TSyllableInfo>::const_iterator its = pyset.begin();
204 std::set<TSyllableInfo>::const_iterator ite = pyset.end();
205 for (; its != ite; ++its) {
206 const char *pystr = its->m_py.c_str();
// Grow the lexicon on demand so that id is a valid index.
207 if (m_Lexicon.size() < id + 1) m_Lexicon.resize(id + 1);
208 m_Lexicon[id] = std::string(word_buf);
210 CPinyinTrieMaker::TWordId wid(id, its->m_cost, false, gbcategory);
// The insertion is always attempted; a single failure keeps suc false.
211 suc = insertFullPinyinPair(pystr, wid) && suc;
// Node counts are reported before and after the threading pass.
216 printf("\n %zd primitive nodes", TNode::m_AllNodes.size()); fflush(stdout);
218 threadNonCompletePinyin();
219 printf("\n %zd total nodes", TNode::m_AllNodes.size()); fflush(stdout);
// NOTE(review): pyPrefix is not used in any line visible in this excerpt.
221 std::string pyPrefix = "";
222 printf("\n"); fflush(stdout);
// Definition of the static list that holds a pointer to every TNode created.
227 CPinyinTrieMaker::CNodeList CPinyinTrieMaker::TNode::m_AllNodes;
// Constructor: clears both flags and appends the new node to m_AllNodes, so
// the list order is the construction order.
228 CPinyinTrieMaker::TNode::TNode()
229 : m_bExpanded(false), m_bFullSyllableTransfer(false)
231 m_AllNodes.push_back(this);
// Ordering of two node sets: walks both sets in iteration order and lets the
// first pair of differing elements decide.
// NOTE(review): the result for the case where one set is a prefix of the
// other (or both are identical) is on a line not visible in this excerpt.
235 CPinyinTrieMaker::PNodeSet::operator<(const PNodeSet& another) const
237 CNodeSet::const_iterator t1 = m_pns->begin();
238 CNodeSet::const_iterator t2 = m_pns->end();
239 CNodeSet::const_iterator a1 = another.m_pns->begin();
240 CNodeSet::const_iterator a2 = another.m_pns->end();
241 for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
242 if (*t1 < *a1) return true;
243 if (*t1 > *a1) return false;
249 CPinyinTrieMaker::PNodeSet::operator==(const PNodeSet& another) const
251 CNodeSet::const_iterator t1 = m_pns->begin();
252 CNodeSet::const_iterator t2 = m_pns->end();
253 CNodeSet::const_iterator a1 = another.m_pns->begin();
254 CNodeSet::const_iterator a2 = another.m_pns->end();
255 for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
256 if (*t1 != *a1) return false;
258 return(a1 == a2 && t1 != t2);
// Breaks a full pinyin string into syllables, encodes each one with
// CPinyinData::encodeSyllable and appends the recognised ones to ret. A
// syllable that cannot be encoded produces a warning on stdout.
262 parseFullPinyin(const char *pinyin, std::vector<TSyllable> &ret)
// Work on a private copy so that the caller string is left untouched.
// NOTE(review): the copy is heap-allocated by strdup; the matching free is
// not visible in this excerpt -- confirm that it is released.
264 char *buf = strdup(pinyin);
265 char *p = buf, *q = buf;
// NOTE(review): the same encode / append / warn sequence appears twice;
// presumably once for each syllable found while scanning and once for the
// trailing syllable. The scanning logic itself is not visible here.
271 unsigned s = CPinyinData::encodeSyllable(q);
273 ret.push_back(TSyllable(s));
275 printf("\nWarning! unrecognized syllable %s", q);
282 unsigned s = CPinyinData::encodeSyllable(q);
284 ret.push_back(TSyllable(s));
286 printf("\nWarning! unrecognized syllable %s", q);
// Looks up the transition of pnode on syllable s and creates the target node
// when needed. A newly created node is flagged both as a full-syllable
// transfer and as expanded, and is registered in pnode->m_Trans under s.
// NOTE(review): the test that decides between reuse and creation, and the
// return statements, are on lines not visible in this excerpt.
292 CPinyinTrieMaker::TNode*
293 CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned s)
295 CTrans::const_iterator itt = pnode->m_Trans.find(s);
296 CTrans::const_iterator ite = pnode->m_Trans.end();
298 TNode *p = new TNode();
299 p->m_bFullSyllableTransfer = true;
300 p->m_bExpanded = true;
301 pnode->m_Trans[s] = p;
// Inserts one (pinyin, word id) pair: parses the pinyin into syllables, walks
// from the root node along one transition per syllable (insertTransfer
// supplies each next node), and registers wid at the node reached.
308 CPinyinTrieMaker::insertFullPinyinPair(const char* pinyin, TWordId wid)
310 TNode *pnode = &m_RootNode;
311 std::vector<TSyllable> syllables;
312 parseFullPinyin(pinyin, syllables);
// NOTE(review): the action taken for an empty syllable list is not visible in
// this excerpt (presumably an early failure return).
314 if (syllables.empty())
317 std::vector<TSyllable>::const_iterator it = syllables.begin();
318 std::vector<TSyllable>::const_iterator ite = syllables.end();
320 for (; it != ite; ++it)
321 pnode = insertTransfer(pnode, *it);
323 insertWordId(pnode->m_WordIdSet, wid);
// Makes pnode transition on s to a node that stands for the whole set nodes.
// NOTE(review): the second parameter (used below as s) is declared on a line
// not visible in this excerpt.
327 CPinyinTrieMaker::TNode*
328 CPinyinTrieMaker::addCombinedTransfers(TNode *pnode,
330 const CNodeSet& nodes)
332 assert(!nodes.empty());
// A set with a single member needs no combined node: that member is used
// directly.
335 if (nodes.size() == 1) {
336 p = *(nodes.begin());
// Otherwise the target node remembers the set it stands for and is
// registered in m_StateMap, keyed by the address of its own m_cmbNodes.
// NOTE(review): the creation of p for this case is not visible here.
339 p->m_cmbNodes = nodes;
340 m_StateMap[&p->m_cmbNodes] = p;
// Merge the word ids of every member node into the target node.
342 CNodeSet::const_iterator it = nodes.begin();
343 CNodeSet::const_iterator ite = nodes.end();
344 for (; it != ite; ++it) {
345 CWordSet::const_iterator wit = (*it)->m_WordIdSet.begin();
346 CWordSet::const_iterator wite = (*it)->m_WordIdSet.end();
348 for (; wit != wite; ++wit) {
349 CWordSet::iterator tmp = p->m_WordIdSet.find (*wit);
// When an equivalent id is already present, keep the one with the lower cost.
351 if (tmp == p->m_WordIdSet.end()) {
352 p->m_WordIdSet.insert (*wit);
353 } else if (tmp->anony.m_cost > wit->anony.m_cost) {
354 p->m_WordIdSet.erase (tmp);
355 p->m_WordIdSet.insert (*wit);
361 pnode->m_Trans[s] = p;
// Groups the existing transitions of pnode by their syllable with the final
// and tone parts cleared, then adds one combined transition per group via
// addCombinedTransfers.
366 CPinyinTrieMaker::combineInitialTrans(TNode *pnode)
368 std::map<unsigned, CNodeSet> combTrans;
// First pass: only reads pnode->m_Trans and fills combTrans.
370 CTrans::const_iterator itTrans = pnode->m_Trans.begin();
371 CTrans::const_iterator itTransLast = pnode->m_Trans.end();
372 for (; itTrans != itTransLast; ++itTrans) {
373 TSyllable s = (TSyllable)itTrans->first;
// NOTE(review): a line between the two statements around this comment is not
// visible in this excerpt; the grouping may be conditional -- confirm.
375 s.final = s.tone = 0;
376 combTrans[s].insert(itTrans->second);
// Second pass: new transitions are added to pnode only after the iteration
// over pnode->m_Trans has finished.
380 std::map<unsigned, CNodeSet>::const_iterator itCombTrans = combTrans.begin();
381 for (; itCombTrans != combTrans.end(); ++itCombTrans)
382 addCombinedTransfers(pnode, itCombTrans->first, itCombTrans->second);
// Expands a combined node: gathers, per syllable, the targets of all
// transitions of the member nodes in pnode->m_cmbNodes, and gives pnode one
// transition per syllable to a node standing for that target set. Marks
// pnode as expanded at the end.
386 CPinyinTrieMaker::expandCombinedNode(TNode *pnode)
388 assert(pnode->m_cmbNodes.size() >= 1);
390 std::map<unsigned, CNodeSet> combTrans;
391 CNodeSet::const_iterator itNode = pnode->m_cmbNodes.begin();
392 CNodeSet::const_iterator itNodeLast = pnode->m_cmbNodes.end();
393 for (; itNode != itNodeLast; ++itNode) {
394 CTrans::const_iterator itTrans = (*itNode)->m_Trans.begin();
395 CTrans::const_iterator itTransLast = (*itNode)->m_Trans.end();
396 for (; itTrans != itTransLast; ++itTrans)
397 combTrans[itTrans->first].insert(itTrans->second);
400 std::map<unsigned, CNodeSet>::const_iterator itCombTrans = combTrans.begin();
401 for (; itCombTrans != combTrans.end(); ++itCombTrans) {
403 unsigned s = itCombTrans->first;
404 CNodeSet nodes = itCombTrans->second;
// The lookup key is the address of a local copy, so m_StateMap presumably
// compares keys by the contents of the pointed-to sets rather than by
// address -- confirm against the CStateMap declaration.
406 CStateMap::const_iterator itStateMap = m_StateMap.find(&nodes);
// Reuse the node already registered for this set, if there is one.
// NOTE(review): a line between the reuse branch and the addCombinedTransfers
// call is not visible in this excerpt (presumably else).
407 if (itStateMap != m_StateMap.end())
408 p = itStateMap->second;
410 p = addCombinedTransfers(pnode, s, nodes);
412 pnode->m_Trans[s] = p;
415 pnode->m_bExpanded = true;
// Visits every node in TNode::m_AllNodes and processes it with
// combineInitialTrans or expandCombinedNode, depending on m_bExpanded.
// The end of the list is re-read on every iteration, and the TNode
// constructor appends to m_AllNodes, so nodes created during this pass are
// presumably visited too -- this relies on CNodeList iterators staying valid
// across push_back; confirm against its declaration.
419 CPinyinTrieMaker::threadNonCompletePinyin(void)
421 CNodeList::const_iterator itNode = TNode::m_AllNodes.begin();
422 for (; itNode != TNode::m_AllNodes.end(); ++itNode) {
423 TNode* pnode = *itNode;
424 if (pnode->m_bExpanded)
425 combineInitialTrans(pnode);
// NOTE(review): a line between the two calls is not visible in this excerpt
// (presumably else, making the calls alternatives).
427 expandCombinedNode(pnode);
// File-name overload: opens fileName for binary writing and forwards to the
// FILE* overload with the same evaluator and endianness flag.
// NOTE(review): the rest of the parameter list, the check of the fopen
// result, the fclose call and the return statement are on lines not visible
// in this excerpt.
433 CPinyinTrieMaker::write(const char* fileName, CWordEvaluator* psrt,
437 FILE* fp = fopen(fileName, "wb");
439 suc = write(fp, psrt, revert_endian);
// Serialises the trie to fp: a three-word header (word count, node count,
// lexicon offset), then every node with its transitions and word ids, then
// the lexicon strings as wide characters.
// NOTE(review): the end of this function lies beyond this excerpt, and
// several interior lines are not visible either.
446 CPinyinTrieMaker::write(FILE *fp, CWordEvaluator* psrt, bool revert_endian)
// Static conversion buffer for one lexicon entry (1024 wide characters).
449 static TWCHAR wbuf[1024];
451 std::map<TNode*, unsigned int> nodeOffsetMap;
453 unsigned int nWord = m_Lexicon.size();
454 unsigned int nNode = TNode::m_AllNodes.size();
455 unsigned int lexiconOffset;
// Offsets start right after the three unsigned header words written below.
456 unsigned int offset = sizeof(unsigned int) * 3;
// Pass 1: assign every node its file offset, in m_AllNodes order.
458 CNodeList::const_iterator itNode = TNode::m_AllNodes.begin();
459 CNodeList::const_iterator itNodeLast = TNode::m_AllNodes.end();
460 for (; itNode != itNodeLast; ++itNode) {
461 nodeOffsetMap[*itNode] = offset;
462 offset += CPinyinTrie::TNode::size_for((*itNode)->m_Trans.size(),
463 (*itNode)->m_WordIdSet.size());
// The lexicon starts where the node area ends; each entry then adds its
// length plus one terminator, in wide characters, to the running offset.
465 lexiconOffset = offset;
466 CLexicon::const_iterator itWordStr = m_Lexicon.begin();
467 CLexicon::const_iterator itWordStrLast = m_Lexicon.end();
468 for (; itWordStr != itWordStrLast; ++itWordStr) {
469 MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
470 int sz = WCSLEN(wbuf);
471 offset += (sz + 1) * sizeof(TWCHAR);
474 Writer f(fp, revert_endian);
// Header.
// NOTE(review): suc is assigned three times in a row here, so in the visible
// lines only the result of the last write survives -- confirm whether the
// earlier results should be combined.
476 suc = f.write(nWord);
477 suc = f.write(nNode);
478 suc = f.write(lexiconOffset);
// Pass 2: emit the nodes in the same order used for the offset assignment.
480 itNode = TNode::m_AllNodes.begin();
481 itNodeLast = TNode::m_AllNodes.end();
483 for (; itNode != itNodeLast && suc; ++itNode) {
484 CPinyinTrie::TNode outNode;
485 TNode *pnode = *itNode;
487 outNode.m_nTransfer = pnode->m_Trans.size();
488 outNode.m_nWordId = pnode->m_WordIdSet.size();
489 outNode.m_bFullSyllableTransfer = pnode->m_bFullSyllableTransfer;
490 outNode.m_csLevel = 0;
// The node level is the highest m_csLevel among its word ids; the scan stops
// early once the level reaches 3.
492 CWordSet::const_iterator itId = pnode->m_WordIdSet.begin();
493 CWordSet::const_iterator itIdLast = pnode->m_WordIdSet.end();
494 for (; itId != itIdLast && outNode.m_csLevel < 3; ++itId) {
495 if (outNode.m_csLevel < itId->anony.m_csLevel)
496 outNode.m_csLevel = itId->anony.m_csLevel;
499 suc = f.write(outNode);
// Transitions: syllable plus the file offset of the target node.
501 CTrans::const_iterator itTrans = pnode->m_Trans.begin();
502 CTrans::const_iterator itTransLast = pnode->m_Trans.end();
503 for (; itTrans != itTransLast && suc; ++itTrans) {
504 CPinyinTrie::TTransUnit tru;
505 tru.m_Syllable = itTrans->first;
506 tru.m_Offset = nodeOffsetMap[itTrans->second];
// An offset of 0 would mean the target was never given an offset in pass 1
// (operator[] default-inserts 0); real offsets start after the header and
// end before the lexicon.
507 assert(tru.m_Offset != 0 && tru.m_Offset < lexiconOffset);
// Word ids: each is paired with the evaluator cost plus its own cost and with
// the evaluator seen flag, then sorted via make_heap followed by sort_heap.
// NOTE(review): the declaration of vec and the TWordInfo ordering are not
// visible in this excerpt.
512 itId = pnode->m_WordIdSet.begin();
513 itIdLast = pnode->m_WordIdSet.end();
514 for (; itId != itIdLast; ++itId)
515 vec.push_back(TWordInfo(*itId, psrt->getCost(*itId) + itId->anony.m_cost,
516 psrt->isSeen(*itId)));
517 std::make_heap(vec.begin(), vec.end());
518 std::sort_heap(vec.begin(), vec.end());
520 CWordVec::const_iterator itv = vec.begin();
521 CWordVec::const_iterator itve = vec.end();
522 for (; itv != itve && suc; ++itv) {
523 CPinyinTrie::TWordIdInfo wi;
524 wi.m_id = itv->m_id.anony.m_id;
// Every id written must index into the lexicon.
525 assert(wi.m_id < nWord);
526 wi.m_csLevel = itv->m_id.anony.m_csLevel;
527 wi.m_bSeen = ((itv->m_bSeen) ? (1) : (0));
// The stored cost is the word id cost itself, not the combined value passed
// to TWordInfo above.
528 wi.m_cost = itv->m_id.anony.m_cost;
// Pass 3: emit each lexicon entry as wide characters including its
// terminator, matching the size accounted for in the offset computation.
533 itWordStr = m_Lexicon.begin();
534 itWordStrLast = m_Lexicon.end();
535 for (; itWordStr != itWordStrLast && suc; ++itWordStr) {
536 MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
537 int sz = WCSLEN(wbuf);
538 suc = f.write(wbuf, (sz + 1));