2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
44 #include "imi_defines.h"
45 #include "imi_context.h"
47 TCandiRank::TCandiRank(bool user, bool best, unsigned len,
48 bool fromLattice, TSentenceScore score)
50 anony.m_user = (user) ? 0 : 1;
51 anony.m_best = (best) ? 0 : 1;
52 anony.m_len = (len > 31) ? (0) : (31 - len);
53 anony.m_lattice = (fromLattice) ? 0 : 1;
55 double ds = -score.log2();
60 else if (ds < -32768.0)
62 unsigned cost = unsigned((ds + 32768.0) * 256.0);
66 TCandiRank::TCandiRank(bool user, bool best, unsigned len,
67 bool fromLattice, unsigned rank)
69 anony.m_user = (user) ? 0 : 1;
70 anony.m_best = (best) ? 0 : 1;
71 anony.m_len = (len > 31) ? (0) : (31 - len);
72 anony.m_lattice = (fromLattice) ? 0 : 1;
77 CLatticeFrame::print(std::string prefix)
79 if (m_bwType & BESTWORD) printf("B");
80 if (m_bwType & USER_SELECTED) printf("U");
84 printf(" Lexicon States:\n");
85 for_each(m_lexiconStates.begin(), m_lexiconStates.end(),
86 bind2nd(mem_fun_ref(&TLexiconState::print), prefix));
88 printf(" Lattice States:\n");
89 for_each(m_latticeStates.begin(), m_latticeStates.end(),
90 bind2nd(mem_fun_ref(&TLatticeState::print), prefix));
95 CIMIContext::printLattice()
99 for (size_t i = 0; i <= m_tailIdx; ++i) {
100 if (m_lattice[i].m_type == CLatticeFrame::UNUSED)
103 printf("Lattice Frame [%lu]:", i);
104 m_lattice[i].print(prefix);
108 CIMIContext::CIMIContext()
109 : m_tailIdx(1), m_nBest(0), m_maxBest(1), m_maxTailCandidateNum(0),
110 m_pModel(NULL), m_pPinyinTrie(NULL), m_pUserDict(NULL), m_pHistory(NULL),
111 m_historyPower(5), m_csLevel(0), m_bFullSymbolForwarding(false),
112 m_bOmitPunct(false), m_pGetFullSymbolOp(NULL),
113 m_bFullPunctForwarding(true), m_pGetFullPunctOp(NULL),
114 m_pPySegmentor(NULL), m_bNonCompleteSyllable(true),
115 m_bDynaCandiOrder(true), m_candiStarts(0), m_candiEnds(0)
117 m_lattice.resize(MAX_LATTICE_LENGTH);
118 m_lattice[0].m_latticeStates.add(TLatticeState(-1.0, 0));
119 setMaxBest(m_maxBest);
123 CIMIContext::setCoreData(CIMIData *pCoreData)
125 m_pModel = pCoreData->getSlm();
126 m_pPinyinTrie = pCoreData->getPinyinTrie();
135 m_candiStarts = m_candiEnds = 0;
139 CIMIContext::_clearFrom(unsigned idx)
141 for (size_t i = idx; i < m_tailIdx + 1; i++)
142 m_lattice[i].clear();
146 CIMIContext::buildLattice(IPySegmentor *segmentor, bool doSearch)
148 m_pPySegmentor = segmentor;
149 return _buildLattice(segmentor->getSegments(),
150 segmentor->updatedFrom() + 1, doSearch);
154 CIMIContext::_buildLattice(IPySegmentor::TSegmentVec &segments,
155 unsigned rebuildFrom,
158 _clearFrom(rebuildFrom);
160 IPySegmentor::TSegmentVec::const_iterator it = segments.begin();
161 IPySegmentor::TSegmentVec::const_iterator ite = segments.end();
164 for (; it != ite; ++it) {
168 if (i < rebuildFrom - 1)
171 if (j >= m_lattice.capacity() - 1)
174 if (it->m_type == IPySegmentor::SYLLABLE)
175 _forwardSyllables(i, j, *it);
176 else if (it->m_type == IPySegmentor::SYLLABLE_SEP)
177 _forwardSyllableSep(i, j);
179 _forwardString(i, j, it->m_syllables);
180 m_bOmitPunct = false;
183 _forwardTail(j, j + 1);
186 return doSearch && searchFrom(rebuildFrom);
190 CIMIContext::_forwardSyllables(unsigned i,
192 const IPySegmentor::TSegment& seg)
194 std::vector<unsigned>::const_iterator it = seg.m_syllables.begin();
195 std::vector<unsigned>::const_iterator ite = seg.m_syllables.end();
197 for (; it != ite; ++it)
198 _forwardSingleSyllable(i, j, *it, seg);
200 it = seg.m_fuzzy_syllables.begin();
201 ite = seg.m_fuzzy_syllables.end();
203 for (; it != ite; ++it)
204 _forwardSingleSyllable(i, j, *it, seg, true);
209 CIMIContext::_forwardString(unsigned i,
211 const std::vector<unsigned>& strbuf)
213 if (strbuf.size() == 1) {
214 unsigned ch = strbuf[0];
216 _forwardPunctChar(i, j, ch);
218 _forwardOrdinaryChar(i, j, ch);
221 CLatticeFrame &fr = m_lattice[j];
222 fr.m_wstr.assign(strbuf.begin(), strbuf.end());
223 fr.m_lexiconStates.push_back(TLexiconState(i, 0));
228 CIMIContext::_forwardSingleSyllable(unsigned i,
231 const IPySegmentor::TSegment& seg,
234 const CPinyinTrie::TNode * pn = NULL;
236 CLatticeFrame &fr = m_lattice[j];
237 fr.m_type = CLatticeFrame::SYLLABLE;
239 CLexiconStates::iterator it = m_lattice[i].m_lexiconStates.begin();
240 CLexiconStates::iterator ite = m_lattice[i].m_lexiconStates.end();
241 for (; it != ite; ++it) {
242 TLexiconState &lxst = *it;
243 bool added_from_sysdict = false;
245 if (lxst.m_pPYNode) {
246 // try to match a word from lattice i to lattice j
247 // and if match, we'll count it as a new lexicon on lattice j
248 pn = m_pPinyinTrie->transfer(lxst.m_pPYNode, syllable);
250 added_from_sysdict = true;
251 TLexiconState new_lxst = TLexiconState(lxst.m_start,
256 new_lxst.m_syls.push_back(syllable);
257 new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies +
258 (seg.m_inner_fuzzy ? 1 : 0);
259 new_lxst.m_seg_path.push_back(seg.m_start + seg.m_len);
260 fr.m_lexiconStates.push_back(new_lxst);
264 if (m_pUserDict && lxst.m_syls.size() < MAX_USRDEF_WORD_LEN) {
265 // try to match a word from user dict
266 CSyllables syls = lxst.m_syls;
267 syls.push_back(syllable);
268 std::vector<CPinyinTrie::TWordIdInfo> words;
269 m_pUserDict->getWords(syls, words);
270 if (!words.empty() || !added_from_sysdict) {
271 // even if the words is empty we'll add a fake lexicon
272 // here. This helps _saveUserDict detect new words.
273 TLexiconState new_lxst = TLexiconState(lxst.m_start,
278 new_lxst.m_syls.push_back(syllable);
279 new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies +
280 (seg.m_inner_fuzzy ? 1 : 0);
281 new_lxst.m_seg_path.push_back(seg.m_start + seg.m_len);
282 fr.m_lexiconStates.push_back(new_lxst);
287 // last, create a lexicon for single character with only one syllable
288 pn = m_pPinyinTrie->transfer(syllable);
291 syls.push_back(syllable);
292 std::vector<unsigned> seg_path;
293 seg_path.push_back(seg.m_start);
294 seg_path.push_back(seg.m_start + seg.m_len);
295 TLexiconState new_lxst = TLexiconState(i, pn, syls, seg_path, fuzzy);
296 new_lxst.m_num_of_inner_fuzzies = seg.m_inner_fuzzy ? 1 : 0;
297 fr.m_lexiconStates.push_back(new_lxst);
302 CIMIContext::_forwardSyllableSep(unsigned i, unsigned j)
304 CLatticeFrame &fr = m_lattice[j];
305 fr.m_type = CLatticeFrame::SYLLABLE | CLatticeFrame::SYLLABLE_SEP;
306 fr.m_lexiconStates = m_lattice[i].m_lexiconStates;
308 CLexiconStates::iterator it = fr.m_lexiconStates.begin();
309 CLexiconStates::iterator ite = fr.m_lexiconStates.end();
310 for (; it != ite; ++it) {
311 it->m_seg_path.back() = j;
316 CIMIContext::_forwardPunctChar(unsigned i, unsigned j, unsigned ch)
318 CLatticeFrame &fr = m_lattice[j];
323 if (m_pGetFullPunctOp) {
324 if (m_bFullPunctForwarding && !m_bOmitPunct) {
325 wstr = (*m_pGetFullPunctOp)(ch);
326 wid = m_pPinyinTrie->getSymbolId(wstr);
330 fr.m_type = CLatticeFrame::PUNC;
335 fr.m_wstr.push_back(ch);
337 fr.m_lexiconStates.push_back(TLexiconState(i, wid));
341 CIMIContext::_forwardOrdinaryChar(unsigned i, unsigned j, unsigned ch)
343 CLatticeFrame &fr = m_lattice[j];
348 if (m_pGetFullSymbolOp) {
349 wstr = (*m_pGetFullSymbolOp)(ch);
350 wid = m_pPinyinTrie->getSymbolId(wstr);
352 if (!m_bFullSymbolForwarding)
356 fr.m_type = wid ? CLatticeFrame::SYMBOL : CLatticeFrame::ASCII;
361 fr.m_wstr.push_back(ch);
363 fr.m_lexiconStates.push_back(TLexiconState(i, wid));
367 CIMIContext::_forwardTail(unsigned i, unsigned j)
369 CLatticeFrame &fr = m_lattice[j];
370 fr.m_type = CLatticeFrame::TAIL;
372 fr.m_lexiconStates.push_back(TLexiconState(i, ENDING_WORD_ID));
375 static double exp2_tbl[32] = {
376 exp2(-0), exp2(-1), exp2(-2), exp2(-3), exp2(-4), exp2(-5), exp2(-6), exp2(-7),
377 exp2(-8), exp2(-9), exp2(-10), exp2(-11), exp2(-12), exp2(-13), exp2(-14),
378 exp2(-15), exp2(-16), exp2(-17), exp2(-18), exp2(-19), exp2(-20), exp2(-21),
379 exp2(-22), exp2(-23), exp2(-24), exp2(-25), exp2(-26), exp2(-27), exp2(-28),
380 exp2(-29), exp2(-30), exp2(-31)
384 CIMIContext::searchFrom(unsigned idx)
386 bool affectCandidates = (idx <= m_candiEnds);
388 for (; idx <= m_tailIdx; ++idx) {
389 CLatticeFrame &fr = m_lattice[idx];
391 if (fr.m_type == CLatticeFrame::UNUSED)
394 fr.m_latticeStates.clear();
396 /* user selected word might be cut in next step */
397 if (fr.m_bwType & CLatticeFrame::USER_SELECTED) {
398 _transferBetween(fr.m_selWord.m_start, idx,
399 fr.m_selWord.m_pLexiconState,
400 fr.m_selWord.m_wordId);
403 CLexiconStates::iterator it = fr.m_lexiconStates.begin();
404 CLexiconStates::iterator ite = fr.m_lexiconStates.end();
405 for (; it != ite; ++it) {
406 unsigned word_num = 0;
407 TLexiconState &lxst = *it;
408 const CPinyinTrie::TWordIdInfo *words = lxst.getWords(word_num);
413 if (lxst.m_start == m_candiStarts && idx > m_candiEnds)
414 affectCandidates = true;
416 // only selected the word with higher unigram probablities, and
417 // narrow the search deepth and lower the initial score for fuzzy
419 int maxsz = it->m_bFuzzy ? MAX_LEXICON_TRIES /
420 2 : MAX_LEXICON_TRIES;
422 double ic = it->m_bFuzzy ? 0.5 : 1.0;
424 int sz = (int) word_num < maxsz ? (int) word_num : maxsz;
425 int i = 0, count = 0;
427 while (count < sz && i < sz && (words[i].m_bSeen || count < 2)) {
428 if (m_csLevel >= words[i].m_csLevel) {
429 // printf("cost %d\n", words[i].m_cost);
430 _transferBetween(lxst.m_start, idx, &lxst, words[i].m_id,
431 ic * exp2_tbl[words[i].m_cost]);
437 /* try extra words in history cache */
439 while (i < (int) word_num) {
440 if (m_csLevel >= words[i].m_csLevel
441 && m_pHistory->seenBefore(words[i].m_id)) {
442 // printf("history cost %d\n", words[i].m_cost);
443 _transferBetween(lxst.m_start, idx, &lxst,
445 ic * exp2_tbl[words[i].m_cost]);
458 std::vector<TLatticeState> tail_states =
459 m_lattice[m_tailIdx].m_latticeStates.getFilteredResult();
462 for (int i = 0; i < tail_states.size(); i++) {
464 tail_states[i].m_score.toString(score);
465 printf("score[%d]: %s\n", i, score.c_str());
469 for (size_t i = 0; i < m_maxBest; i++) {
471 if (_backTracePaths(tail_states, m_nBest, path, segpath)) {
472 m_path.push_back(path);
473 m_segPath.push_back(segpath);
478 if (m_pPySegmentor && m_nBest > 0 && !m_segPath[0].empty())
479 m_pPySegmentor->notify_best_segpath(m_segPath[0]);
481 return affectCandidates;
485 CIMIContext::_transferBetween(unsigned start, unsigned end,
486 TLexiconState* plxst, unsigned wid,
489 CLatticeFrame &start_fr = m_lattice[start];
490 CLatticeFrame &end_fr = m_lattice[end];
492 TLatticeState node(-1.0, end, plxst);
493 TSentenceScore efic(ic);
495 if ((end_fr.m_bwType & CLatticeFrame::USER_SELECTED)
496 && end_fr.m_selWord.m_wordId == wid)
497 efic = TSentenceScore(30000, 1.0);
499 static double s_history_distribution[] = {
500 0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
503 double weight_h = s_history_distribution[m_historyPower];
504 double weight_s = 1.0 - weight_h;
506 CLatticeStates::iterator it = start_fr.m_latticeStates.begin();
507 CLatticeStates::iterator ite = start_fr.m_latticeStates.end();
509 for (; it != ite; ++it) {
510 // for 1-length lattice states, replace ending_word_id (comma)
511 // with none_word_id (recognized by CThreadSlm)
513 if (wid == ENDING_WORD_ID && it->m_pBackTraceNode && it->m_pBackTraceNode->m_frIdx == 0)
516 node.m_pBackTraceNode = &(*it);
517 node.m_backTraceWordId = wid;
519 double ts = m_pModel->transfer(it->m_slmState, _wid, node.m_slmState);
520 m_pModel->historify(node.m_slmState);
522 // backward to psuedo root, so wid is probably a user word,
523 // save the wid in idx field, so that later we could get it via
524 // CThreadSlm::lastWordId, to calculate p_{cache} correctly.
525 if (node.m_slmState.getLevel() == 0
526 && m_pHistory && m_pHistory->seenBefore(wid))
527 node.m_slmState.setIdx(wid); // an psuedo unigram node state
530 unsigned history[2] = { m_pModel->lastWordId(it->m_slmState), _wid };
531 double hpr = m_pHistory->pr(history, history + 2);
532 ts = weight_s * ts + weight_h * hpr;
535 node.m_score = it->m_score * efic * TSentenceScore(ts);
537 // node.m_score.toString(buf);
538 // printf("node score %s ts=%lf ", buf.c_str(), ts);
539 // it->m_score.toString(buf);
540 // printf("%s ic=%lf\n", buf.c_str(), ic);
541 end_fr.m_latticeStates.add(node);
546 CIMIContext::_backTracePaths(const std::vector<TLatticeState>& tail_states,
547 int rank, TPath& path, TPath& segmentPath)
552 if (rank >= (int) tail_states.size()) {
553 // rank out of bounds, only return the segment path
557 const TLatticeState *bs = &(tail_states[rank]);
559 while (bs->m_pBackTraceNode) {
560 unsigned start = bs->m_pBackTraceNode->m_frIdx;
561 unsigned end = bs->m_frIdx;
562 CLatticeFrame & end_fr = m_lattice[end];
564 if (!(end_fr.m_bwType & CLatticeFrame::USER_SELECTED)) {
565 const TWCHAR* cwstr = NULL;
566 if (end_fr.m_wstr.empty()) {
567 cwstr = _getWstr(bs->m_backTraceWordId);
569 cwstr = end_fr.m_wstr.c_str();
572 CCandidate candi(start, end, bs->m_pLexiconState, cwstr,
573 bs->m_backTraceWordId);
575 end_fr.m_bwType |= CLatticeFrame::BESTWORD;
576 end_fr.m_bestWords[rank] = candi;
578 end_fr.m_selWord = candi; // select the first by default.
582 if (bs->m_pBackTraceNode->m_pLexiconState) {
583 std::vector<unsigned> seg_path =
584 bs->m_pBackTraceNode->m_pLexiconState->m_seg_path;
585 std::vector<unsigned>::reverse_iterator it = seg_path.rbegin();
587 for (; it != seg_path.rend(); ++it) {
588 if (segmentPath.empty() || segmentPath.back() != *it)
589 segmentPath.push_back(*it);
594 bs = bs->m_pBackTraceNode;
597 std::reverse(path.begin(), path.end());
598 std::reverse(segmentPath.begin(), segmentPath.end());
601 std::vector<unsigned>::iterator it;
603 printf("trace lattice path[%d]: ", rank);
604 for (it = path.begin(); it != path.end(); ++it)
608 printf("trace segments path[%d]: ", rank);
609 for (it = segmentPath.begin(); it != segmentPath.end(); ++it)
618 CIMIContext::_clearPaths()
624 std::vector<CCandidates>
625 CIMIContext::getBestSentenceTails(int rank, unsigned start, unsigned end)
627 std::vector<CCandidates> result;
632 CCandidates sentence;
633 unsigned word_num = getBestSentence(sentence, rank, start, end);
634 unsigned tail_word_num = word_num;
636 while (tail_word_num > 1) {
637 unsigned dec = tail_word_num / (m_maxTailCandidateNum + 1) + 1;
638 tail_word_num -= std::min(dec, tail_word_num);
639 if (tail_word_num <= 1) {
642 CCandidates tail(sentence.begin(), sentence.begin() + tail_word_num);
643 result.push_back(tail);
649 CIMIContext::getBestSentence(CCandidates& result, int rank,
650 unsigned start, unsigned end)
652 // -1 means selected sentence
653 if (rank < -1 || rank >= (int) m_nBest)
661 while (end > start && m_lattice[end].m_bwType == CLatticeFrame::NO_BESTWORD)
664 unsigned i = end, nWordConverted = 0;
666 CLatticeFrame& fr = m_lattice[i];
668 result.insert(result.begin(), fr.m_selWord);
669 i = fr.m_selWord.m_start;
671 result.insert(result.begin(), fr.m_bestWords[rank]);
672 i = fr.m_bestWords[rank].m_start;
676 return nWordConverted;
680 CIMIContext::getBestSentence(wstring& result, int rank,
681 unsigned start, unsigned end)
683 CCandidates sentence;
684 unsigned nWordConverted = getBestSentence(sentence, rank, start, end);
686 for (size_t i = 0; i < sentence.size(); i++) {
687 result += sentence[i].m_cwstr;
689 return nWordConverted;
693 CIMIContext::getBestSentence(std::vector<unsigned>& result, int rank,
694 unsigned start, unsigned end)
696 CCandidates sentence;
697 unsigned nWordConverted = getBestSentence(sentence, rank, start, end);
699 for (size_t i = 0; i < sentence.size(); i++) {
700 result.push_back(sentence[i].m_wordId);
702 return nWordConverted;
707 CIMIContext::getSelectedSentence(wstring& result,
708 unsigned start, unsigned end)
710 return getBestSentence(result, -1, start, end);
715 CIMIContext::getSelectedSentence(std::vector<unsigned>& result,
716 unsigned start, unsigned end)
718 return getBestSentence(result, -1, start, end);
725 TCandiPair() : m_candi(), m_Rank()
730 struct TCandiPairPtr {
733 TCandiPairPtr(TCandiPair* p = NULL) : m_Ptr(p)
738 operator<(const TCandiPairPtr& b) const
740 return m_Ptr->m_Rank < b.m_Ptr->m_Rank;
745 CIMIContext::_getWstr(unsigned wid)
747 if (wid < m_pPinyinTrie->getWordCount())
748 return (*m_pPinyinTrie)[wid];
749 else if (m_pUserDict)
750 return (*m_pUserDict)[wid];
756 CIMIContext::getCandidates(unsigned frIdx, CCandidates& result)
759 static std::map<wstring, TCandiPair> candidates_map;
760 std::map<wstring, TCandiPair>::iterator candidates_it;
762 candidates_map.clear();
765 std::vector<unsigned> st;
766 getSelectedSentence(st, frIdx);
768 cp.m_candi.m_start = m_candiStarts = frIdx++;
770 for (; frIdx < m_tailIdx; ++frIdx) {
771 if (m_lattice[frIdx + 1].isSyllableSepFrame())
774 CLatticeFrame &fr = m_lattice[frIdx];
775 if (!fr.isSyllableFrame())
778 cp.m_candi.m_end = frIdx;
779 if (fr.m_bwType != CLatticeFrame::NO_BESTWORD) {
780 for (size_t i = 0; i < m_nBest; i++) {
781 if (fr.m_bestWords.find(i) == fr.m_bestWords.end())
783 CCandidate candi = fr.m_bestWords[i];
784 if (candi.m_start != m_candiStarts)
786 if (candi.m_pLexiconState == NULL)
789 TLexiconState & lxst = *(candi.m_pLexiconState);
790 int len = lxst.m_syls.size() - lxst.m_num_of_inner_fuzzies;
791 if (len == 0) len = 1;
795 TCandiRank(fr.m_bwType & CLatticeFrame::USER_SELECTED,
796 fr.m_bwType & CLatticeFrame::BESTWORD,
798 candidates_map[candi.m_cwstr] = cp;
803 CLexiconStates::iterator it = fr.m_lexiconStates.begin();
804 CLexiconStates::iterator ite = fr.m_lexiconStates.end();
805 for (; it != ite; ++it) {
806 TLexiconState & lxst = *it;
808 if (lxst.m_start != m_candiStarts)
811 int len = lxst.m_syls.size() - lxst.m_num_of_inner_fuzzies;
812 if (0 == len) len = 1;
816 const CPinyinTrie::TWordIdInfo *words = lxst.getWords(word_num);
818 for (unsigned i = 0; i < word_num; ++i) {
819 if (m_csLevel < words[i].m_csLevel)
822 cp.m_candi.m_wordId = words[i].m_id;
823 cp.m_candi.m_cwstr = _getWstr(cp.m_candi.m_wordId);
824 cp.m_candi.m_pLexiconState = &lxst;
825 if (!cp.m_candi.m_cwstr)
828 //sorting according to the order in PinYinTire
831 !st.empty() && st.front() == cp.m_candi.m_wordId,
833 candidates_it = candidates_map.find(cp.m_candi.m_cwstr);
834 if (candidates_it == candidates_map.end()
835 || cp.m_Rank < candidates_it->second.m_Rank
836 || cp.m_candi.m_wordId > INI_USRDEF_WID) {
837 candidates_map[cp.m_candi.m_cwstr] = cp;
838 // print_wide(cp.m_candi.m_cwstr);
845 if (!found) continue; // FIXME: need better solution later
847 if (m_bDynaCandiOrder) {
848 CLatticeStates::iterator it = fr.m_latticeStates.begin();
849 CLatticeStates::iterator ite = fr.m_latticeStates.end();
850 // printf("adjusting ");
851 for (; it != ite; ++it) {
852 TLatticeState & ltst = *it;
854 if (ltst.m_pBackTraceNode->m_frIdx != m_candiStarts)
857 cp.m_candi.m_wordId = ltst.m_backTraceWordId;
858 cp.m_candi.m_cwstr = _getWstr(cp.m_candi.m_wordId);
859 cp.m_candi.m_pLexiconState = ltst.m_pLexiconState;
860 if (!cp.m_candi.m_cwstr)
863 int len = cp.m_candi.m_pLexiconState->m_syls.size() -
864 cp.m_candi.m_pLexiconState->m_num_of_inner_fuzzies;
865 if (0 == len) len = 1;
866 cp.m_Rank = TCandiRank(false,
867 !st.empty() && st.front() ==
869 len, true, ltst.m_score /
870 ltst.m_pBackTraceNode->m_score);
871 candidates_it = candidates_map.find(cp.m_candi.m_cwstr);
872 if (candidates_it == candidates_map.end()
873 || cp.m_Rank < candidates_it->second.m_Rank
874 || cp.m_candi.m_wordId > INI_USRDEF_WID) {
875 // print_wide(cp.m_candi.m_cwstr);
877 // ltst.m_score.toString(buf);
878 // printf("len:%d %s", len, buf.c_str());
879 // ltst.m_pBackTraceNode->m_score.toString(buf);
880 // printf("%s ", buf.c_str());
881 candidates_map[cp.m_candi.m_cwstr] = cp;
890 std::vector<TCandiPairPtr> vec;
892 vec.reserve(candidates_map.size());
893 for (candidates_it = candidates_map.begin();
894 candidates_it != candidates_map.end(); ++candidates_it) {
895 vec.push_back(TCandiPairPtr(&(candidates_it->second)));
898 std::sort(vec.begin(), vec.end());
899 for (size_t i = 0; i < vec.size(); i++) {
900 // print_wide(vec[i].m_Ptr->m_candi.m_cwstr);
902 result.push_back(vec[i].m_Ptr->m_candi);
908 CIMIContext::cancelSelection(unsigned frIdx, bool doSearch)
910 unsigned ret = frIdx;
912 CLatticeFrame &fr = m_lattice[frIdx];
913 while (fr.m_bwType & CLatticeFrame::IGNORED) {
915 fr = m_lattice[frIdx];
919 (CLatticeFrame::USER_SELECTED | CLatticeFrame::BESTWORD)) {
920 ret = fr.m_selWord.m_start;
921 fr.m_bwType = CLatticeFrame::NO_BESTWORD;
922 if (doSearch) searchFrom(frIdx);
929 CIMIContext::makeSelection(CCandidate &candi, bool doSearch)
931 CLatticeFrame &fr = m_lattice[candi.m_end];
932 fr.m_bwType = fr.m_bwType | CLatticeFrame::USER_SELECTED;
933 fr.m_selWord = candi;
934 // make best sentence word consistent as well
935 for (size_t i = 0; i < m_nBest; i++) {
936 fr.m_bestWords[i] = candi;
939 if (doSearch) searchFrom(candi.m_end);
943 CIMIContext::selectSentence(int idx)
945 unsigned i = m_tailIdx - 1;
946 while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
950 CLatticeFrame &fr = m_lattice[i];
951 fr.m_selWord = fr.m_bestWords[idx];
952 i = fr.m_selWord.m_start;
957 CIMIContext::memorize()
964 CIMIContext::_saveUserDict()
970 bool has_user_selected = false;
971 unsigned i = m_tailIdx - 1;
974 while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
978 CLatticeFrame &fr = m_lattice[i];
979 if (!fr.isSyllableFrame()) {
980 i = fr.m_selWord.m_start;
984 TLexiconState* state = fr.m_selWord.m_pLexiconState;
986 i = fr.m_selWord.m_start;
990 if (syls.size() + state->m_syls.size() > MAX_USRDEF_WORD_LEN) {
991 i = fr.m_selWord.m_start;
995 if (!e_pos) e_pos = i;
997 has_user_selected |= (fr.m_bwType & CLatticeFrame::USER_SELECTED);
998 std::copy(state->m_syls.begin(), state->m_syls.end(), inserter(syls, syls.begin()));
999 i = fr.m_selWord.m_start;
1002 if (has_user_selected && syls.size() > 1) {
1004 getSelectedSentence (phrase, 0, e_pos);
1005 m_pUserDict->addWord (syls, phrase);
1010 CIMIContext::_saveHistoryCache()
1015 std::vector<unsigned> result;
1016 unsigned i = m_tailIdx - 1;
1017 while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
1021 CLatticeFrame &fr = m_lattice[i];
1022 if (fr.isSyllableFrame()) {
1023 result.insert(result.begin(), fr.m_selWord.m_wordId);
1025 result.insert(result.begin(), 0);
1027 i = fr.m_selWord.m_start;
1030 if (!result.empty()) {
1031 m_pHistory->memorize(&(result[0]), &(result[0]) + result.size());
1032 m_pHistory->saveToFile();
1037 CIMIContext::deleteCandidate(CCandidate &candi)
1039 unsigned wid = candi.m_wordId;
1040 deleteCandidateByWID(wid);
1044 CIMIContext::deleteCandidateByWID(unsigned wid)
1046 if (wid > INI_USRDEF_WID) {
1047 m_pHistory->forget(wid);
1048 m_pUserDict->removeWord(wid);
1049 _buildLattice(m_pPySegmentor->getSegments());
1054 CIMIContext::removeFromHistoryCache(std::vector<unsigned>& wids)
1059 m_pHistory->forget(&(wids[0]), &(wids[0]) + wids.size());
1060 buildLattice(m_pPySegmentor);