Tizen 2.1 base
[platform/core/uifw/ise-engine-sunpinyin.git] / src / ime-core / imi_context.cpp
1 /*
2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3  *
4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
5  *
6  * The contents of this file are subject to the terms of either the GNU Lesser
7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9  * file except in compliance with the License. You can obtain a copy of the CDDL at
10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12  * specific language governing permissions and limitations under the License. When
13  * distributing the software, include this License Header Notice in each file and
14  * include the full text of the License in the License file as well as the
15  * following notice:
16  *
17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
18  * (CDDL)
19  * For Covered Software in this distribution, this License shall be governed by the
20  * laws of the State of California (excluding conflict-of-law provisions).
21  * Any litigation relating to this License shall be subject to the jurisdiction of
22  * the Federal Courts of the Northern District of California and the state courts
23  * of the State of California, with venue lying in Santa Clara County, California.
24  *
25  * Contributor(s):
26  *
27  * If you wish your version of this file to be governed by only the CDDL or only
28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30  * license." If you don't indicate a single choice of license, a recipient has the
31  * option to distribute your version of this file under either the CDDL or the LGPL
32  * Version 2.1, or to extend the choice of license to its licensees as provided
33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34  * Version 2 license, then the option applies only if the new code is made subject
35  * to such option by the copyright holder.
36  */
37
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <assert.h>
43 #include <algorithm>
44 #include "imi_defines.h"
45 #include "imi_context.h"
46
47 TCandiRank::TCandiRank(bool user, bool best, unsigned len,
48                        bool fromLattice, TSentenceScore score)
49 {
50     anony.m_user = (user) ? 0 : 1;
51     anony.m_best = (best) ? 0 : 1;
52     anony.m_len = (len > 31) ? (0) : (31 - len);
53     anony.m_lattice = (fromLattice) ? 0 : 1;
54
55     double ds = -score.log2();
56
57     //make it 24-bit
58     if (ds > 32767.0)
59         ds = 32767.0;
60     else if (ds < -32768.0)
61         ds = -32768.0;
62     unsigned cost = unsigned((ds + 32768.0) * 256.0);
63     anony.m_cost = cost;
64 }
65
66 TCandiRank::TCandiRank(bool user, bool best, unsigned len,
67                        bool fromLattice, unsigned rank)
68 {
69     anony.m_user = (user) ? 0 : 1;
70     anony.m_best = (best) ? 0 : 1;
71     anony.m_len = (len > 31) ? (0) : (31 - len);
72     anony.m_lattice = (fromLattice) ? 0 : 1;
73     anony.m_cost = rank;
74 }
75
76 void
77 CLatticeFrame::print(std::string prefix)
78 {
79     if (m_bwType & BESTWORD) printf("B");
80     if (m_bwType & USER_SELECTED) printf("U");
81     printf("\n");
82
83     prefix += "    ";
84     printf("  Lexicon States:\n");
85     for_each(m_lexiconStates.begin(), m_lexiconStates.end(),
86              bind2nd(mem_fun_ref(&TLexiconState::print), prefix));
87
88     printf("  Lattice States:\n");
89     for_each(m_latticeStates.begin(), m_latticeStates.end(),
90              bind2nd(mem_fun_ref(&TLatticeState::print), prefix));
91     printf("\n");
92 }
93
94 void
95 CIMIContext::printLattice()
96 {
97     std::string prefix;
98
99     for (size_t i = 0; i <= m_tailIdx; ++i) {
100         if (m_lattice[i].m_type == CLatticeFrame::UNUSED)
101             continue;
102
103         printf("Lattice Frame [%lu]:", i);
104         m_lattice[i].print(prefix);
105     }
106 }
107
108 CIMIContext::CIMIContext()
109     : m_tailIdx(1), m_nBest(0), m_maxBest(1), m_maxTailCandidateNum(0),
110       m_pModel(NULL), m_pPinyinTrie(NULL), m_pUserDict(NULL), m_pHistory(NULL),
111       m_historyPower(5), m_csLevel(0), m_bFullSymbolForwarding(false),
112       m_bOmitPunct(false), m_pGetFullSymbolOp(NULL),
113       m_bFullPunctForwarding(true), m_pGetFullPunctOp(NULL),
114       m_pPySegmentor(NULL), m_bNonCompleteSyllable(true),
115       m_bDynaCandiOrder(true), m_candiStarts(0), m_candiEnds(0)
116 {
117     m_lattice.resize(MAX_LATTICE_LENGTH);
118     m_lattice[0].m_latticeStates.add(TLatticeState(-1.0, 0));
119     setMaxBest(m_maxBest);
120 }
121
122 void
123 CIMIContext::setCoreData(CIMIData *pCoreData)
124 {
125     m_pModel = pCoreData->getSlm();
126     m_pPinyinTrie = pCoreData->getPinyinTrie();
127 }
128
129 void
130 CIMIContext::clear()
131 {
132     _clearFrom(1);
133     _clearPaths();
134     m_tailIdx = 1;
135     m_candiStarts = m_candiEnds = 0;
136 }
137
138 void
139 CIMIContext::_clearFrom(unsigned idx)
140 {
141     for (size_t i = idx; i < m_tailIdx + 1; i++)
142         m_lattice[i].clear();
143 }
144
145 bool
146 CIMIContext::buildLattice(IPySegmentor *segmentor, bool doSearch)
147 {
148     m_pPySegmentor = segmentor;
149     return _buildLattice(segmentor->getSegments(),
150                          segmentor->updatedFrom() + 1, doSearch);
151 }
152
153 bool
154 CIMIContext::_buildLattice(IPySegmentor::TSegmentVec &segments,
155                            unsigned rebuildFrom,
156                            bool doSearch)
157 {
158     _clearFrom(rebuildFrom);
159
160     IPySegmentor::TSegmentVec::const_iterator it = segments.begin();
161     IPySegmentor::TSegmentVec::const_iterator ite = segments.end();
162
163     unsigned i, j = 0;
164     for (; it != ite; ++it) {
165         i = it->m_start;
166         j = i + it->m_len;
167
168         if (i < rebuildFrom - 1)
169             continue;
170
171         if (j >= m_lattice.capacity() - 1)
172             break;
173
174         if (it->m_type == IPySegmentor::SYLLABLE)
175             _forwardSyllables(i, j, *it);
176         else if (it->m_type == IPySegmentor::SYLLABLE_SEP)
177             _forwardSyllableSep(i, j);
178         else
179             _forwardString(i, j, it->m_syllables);
180         m_bOmitPunct = false;
181     }
182
183     _forwardTail(j, j + 1);
184     m_tailIdx = j + 1;
185
186     return doSearch && searchFrom(rebuildFrom);
187 }
188
189 void
190 CIMIContext::_forwardSyllables(unsigned i,
191                                unsigned j,
192                                const IPySegmentor::TSegment& seg)
193 {
194     std::vector<unsigned>::const_iterator it = seg.m_syllables.begin();
195     std::vector<unsigned>::const_iterator ite = seg.m_syllables.end();
196
197     for (; it != ite; ++it)
198         _forwardSingleSyllable(i, j, *it, seg);
199
200     it = seg.m_fuzzy_syllables.begin();
201     ite = seg.m_fuzzy_syllables.end();
202
203     for (; it != ite; ++it)
204         _forwardSingleSyllable(i, j, *it, seg, true);
205 }
206
207
208 void
209 CIMIContext::_forwardString(unsigned i,
210                             unsigned j,
211                             const std::vector<unsigned>& strbuf)
212 {
213     if (strbuf.size() == 1) {
214         unsigned ch = strbuf[0];
215         if (ispunct(ch)) {
216             _forwardPunctChar(i, j, ch);
217         } else {
218             _forwardOrdinaryChar(i, j, ch);
219         }
220     } else{
221         CLatticeFrame &fr = m_lattice[j];
222         fr.m_wstr.assign(strbuf.begin(), strbuf.end());
223         fr.m_lexiconStates.push_back(TLexiconState(i, 0));
224     }
225 }
226
227 void
228 CIMIContext::_forwardSingleSyllable(unsigned i,
229                                     unsigned j,
230                                     TSyllable syllable,
231                                     const IPySegmentor::TSegment& seg,
232                                     bool fuzzy)
233 {
234     const CPinyinTrie::TNode * pn = NULL;
235
236     CLatticeFrame &fr = m_lattice[j];
237     fr.m_type = CLatticeFrame::SYLLABLE;
238
239     CLexiconStates::iterator it = m_lattice[i].m_lexiconStates.begin();
240     CLexiconStates::iterator ite = m_lattice[i].m_lexiconStates.end();
241     for (; it != ite; ++it) {
242         TLexiconState &lxst = *it;
243         bool added_from_sysdict = false;
244
245         if (lxst.m_pPYNode) {
246             // try to match a word from lattice i to lattice j
247             // and if match, we'll count it as a new lexicon on lattice j
248             pn = m_pPinyinTrie->transfer(lxst.m_pPYNode, syllable);
249             if (pn) {
250                 added_from_sysdict = true;
251                 TLexiconState new_lxst = TLexiconState(lxst.m_start,
252                                                        pn,
253                                                        lxst.m_syls,
254                                                        lxst.m_seg_path,
255                                                        fuzzy);
256                 new_lxst.m_syls.push_back(syllable);
257                 new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies +
258                                                   (seg.m_inner_fuzzy ? 1 : 0);
259                 new_lxst.m_seg_path.push_back(seg.m_start + seg.m_len);
260                 fr.m_lexiconStates.push_back(new_lxst);
261             }
262         }
263
264         if (m_pUserDict && lxst.m_syls.size() < MAX_USRDEF_WORD_LEN) {
265             // try to match a word from user dict
266             CSyllables syls = lxst.m_syls;
267             syls.push_back(syllable);
268             std::vector<CPinyinTrie::TWordIdInfo> words;
269             m_pUserDict->getWords(syls, words);
270             if (!words.empty() || !added_from_sysdict) {
271                 // even if the words is empty we'll add a fake lexicon
272                 // here. This helps _saveUserDict detect new words.
273                 TLexiconState new_lxst = TLexiconState(lxst.m_start,
274                                                        words,
275                                                        lxst.m_syls,
276                                                        lxst.m_seg_path,
277                                                        fuzzy);
278                 new_lxst.m_syls.push_back(syllable);
279                 new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies +
280                                                   (seg.m_inner_fuzzy ? 1 : 0);
281                 new_lxst.m_seg_path.push_back(seg.m_start + seg.m_len);
282                 fr.m_lexiconStates.push_back(new_lxst);
283             }
284         }
285     }
286
287     // last, create a lexicon for single character with only one syllable
288     pn = m_pPinyinTrie->transfer(syllable);
289     if (pn) {
290         CSyllables syls;
291         syls.push_back(syllable);
292         std::vector<unsigned> seg_path;
293         seg_path.push_back(seg.m_start);
294         seg_path.push_back(seg.m_start + seg.m_len);
295         TLexiconState new_lxst = TLexiconState(i, pn, syls, seg_path, fuzzy);
296         new_lxst.m_num_of_inner_fuzzies = seg.m_inner_fuzzy ? 1 : 0;
297         fr.m_lexiconStates.push_back(new_lxst);
298     }
299 }
300
301 void
302 CIMIContext::_forwardSyllableSep(unsigned i, unsigned j)
303 {
304     CLatticeFrame &fr = m_lattice[j];
305     fr.m_type = CLatticeFrame::SYLLABLE | CLatticeFrame::SYLLABLE_SEP;
306     fr.m_lexiconStates = m_lattice[i].m_lexiconStates;
307
308     CLexiconStates::iterator it = fr.m_lexiconStates.begin();
309     CLexiconStates::iterator ite = fr.m_lexiconStates.end();
310     for (; it != ite; ++it) {
311         it->m_seg_path.back() = j;
312     }
313 }
314
315 void
316 CIMIContext::_forwardPunctChar(unsigned i, unsigned j, unsigned ch)
317 {
318     CLatticeFrame &fr = m_lattice[j];
319
320     wstring wstr;
321     unsigned wid = 0;
322
323     if (m_pGetFullPunctOp) {
324         if (m_bFullPunctForwarding && !m_bOmitPunct) {
325             wstr = (*m_pGetFullPunctOp)(ch);
326             wid = m_pPinyinTrie->getSymbolId(wstr);
327         }
328     }
329
330     fr.m_type = CLatticeFrame::PUNC;
331
332     if (!wstr.empty())
333         fr.m_wstr = wstr;
334     else
335         fr.m_wstr.push_back(ch);
336
337     fr.m_lexiconStates.push_back(TLexiconState(i, wid));
338 }
339
340 void
341 CIMIContext::_forwardOrdinaryChar(unsigned i, unsigned j, unsigned ch)
342 {
343     CLatticeFrame &fr = m_lattice[j];
344
345     wstring wstr;
346     unsigned wid = 0;
347
348     if (m_pGetFullSymbolOp) {
349         wstr = (*m_pGetFullSymbolOp)(ch);
350         wid = m_pPinyinTrie->getSymbolId(wstr);
351
352         if (!m_bFullSymbolForwarding)
353             wstr.clear();
354     }
355
356     fr.m_type = wid ? CLatticeFrame::SYMBOL : CLatticeFrame::ASCII;
357
358     if (!wstr.empty())
359         fr.m_wstr = wstr;
360     else
361         fr.m_wstr.push_back(ch);
362
363     fr.m_lexiconStates.push_back(TLexiconState(i, wid));
364 }
365
366 void
367 CIMIContext::_forwardTail(unsigned i, unsigned j)
368 {
369     CLatticeFrame &fr = m_lattice[j];
370     fr.m_type = CLatticeFrame::TAIL;
371
372     fr.m_lexiconStates.push_back(TLexiconState(i, ENDING_WORD_ID));
373 }
374
375 static double exp2_tbl[32] = {
376     exp2(-0), exp2(-1), exp2(-2), exp2(-3), exp2(-4), exp2(-5), exp2(-6), exp2(-7),
377     exp2(-8), exp2(-9), exp2(-10), exp2(-11), exp2(-12), exp2(-13), exp2(-14),
378     exp2(-15), exp2(-16), exp2(-17), exp2(-18), exp2(-19), exp2(-20), exp2(-21),
379     exp2(-22), exp2(-23), exp2(-24), exp2(-25), exp2(-26), exp2(-27), exp2(-28),
380     exp2(-29), exp2(-30), exp2(-31)
381 };
382
383 bool
384 CIMIContext::searchFrom(unsigned idx)
385 {
386     bool affectCandidates = (idx <= m_candiEnds);
387
388     for (; idx <= m_tailIdx; ++idx) {
389         CLatticeFrame &fr = m_lattice[idx];
390
391         if (fr.m_type == CLatticeFrame::UNUSED)
392             continue;
393
394         fr.m_latticeStates.clear();
395
396         /* user selected word might be cut in next step */
397         if (fr.m_bwType & CLatticeFrame::USER_SELECTED) {
398             _transferBetween(fr.m_selWord.m_start, idx,
399                              fr.m_selWord.m_pLexiconState,
400                              fr.m_selWord.m_wordId);
401         }
402
403         CLexiconStates::iterator it = fr.m_lexiconStates.begin();
404         CLexiconStates::iterator ite = fr.m_lexiconStates.end();
405         for (; it != ite; ++it) {
406             unsigned word_num = 0;
407             TLexiconState &lxst = *it;
408             const CPinyinTrie::TWordIdInfo *words = lxst.getWords(word_num);
409
410             if (!word_num)
411                 continue;
412
413             if (lxst.m_start == m_candiStarts && idx > m_candiEnds)
414                 affectCandidates = true;
415
416             // only selected the word with higher unigram probablities, and
417             // narrow the search deepth and lower the initial score for fuzzy
418             // syllables
419             int maxsz = it->m_bFuzzy ? MAX_LEXICON_TRIES /
420                         2 : MAX_LEXICON_TRIES;
421
422             double ic = it->m_bFuzzy ? 0.5 : 1.0;
423
424             int sz = (int) word_num < maxsz ? (int) word_num : maxsz;
425             int i = 0, count = 0;
426
427             while (count < sz && i < sz && (words[i].m_bSeen || count < 2)) {
428                 if (m_csLevel >= words[i].m_csLevel) {
429                     // printf("cost %d\n", words[i].m_cost);
430                     _transferBetween(lxst.m_start, idx, &lxst, words[i].m_id,
431                                      ic * exp2_tbl[words[i].m_cost]);
432                     ++count;
433                 }
434                 i++;
435             }
436
437             /* try extra words in history cache */
438             if (m_pHistory) {
439                 while (i < (int) word_num) {
440                     if (m_csLevel >= words[i].m_csLevel
441                         && m_pHistory->seenBefore(words[i].m_id)) {
442                         // printf("history cost %d\n", words[i].m_cost);
443                         _transferBetween(lxst.m_start, idx, &lxst,
444                                          words[i].m_id,
445                                          ic * exp2_tbl[words[i].m_cost]);
446                     }
447                     i++;
448                 }
449             }
450         }
451     }
452
453     _clearPaths();
454     m_path.clear();
455     m_segPath.clear();
456     m_nBest = 0;
457
458     std::vector<TLatticeState> tail_states =
459         m_lattice[m_tailIdx].m_latticeStates.getFilteredResult();
460
461 #ifdef DEBUG
462     for (int i = 0; i < tail_states.size(); i++) {
463         std::string score;
464         tail_states[i].m_score.toString(score);
465         printf("score[%d]: %s\n", i, score.c_str());
466     }
467 #endif
468
469     for (size_t i = 0; i < m_maxBest; i++) {
470         TPath path, segpath;
471         if (_backTracePaths(tail_states, m_nBest, path, segpath)) {
472             m_path.push_back(path);
473             m_segPath.push_back(segpath);
474             m_nBest++;
475         }
476     }
477
478     if (m_pPySegmentor && m_nBest > 0 && !m_segPath[0].empty())
479         m_pPySegmentor->notify_best_segpath(m_segPath[0]);
480
481     return affectCandidates;
482 }
483
484 void
485 CIMIContext::_transferBetween(unsigned start, unsigned end,
486                               TLexiconState* plxst, unsigned wid,
487                               double ic)
488 {
489     CLatticeFrame &start_fr = m_lattice[start];
490     CLatticeFrame &end_fr = m_lattice[end];
491
492     TLatticeState node(-1.0, end, plxst);
493     TSentenceScore efic(ic);
494
495     if ((end_fr.m_bwType & CLatticeFrame::USER_SELECTED)
496         && end_fr.m_selWord.m_wordId == wid)
497         efic = TSentenceScore(30000, 1.0);
498
499     static double s_history_distribution[] = {
500         0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
501     };
502
503     double weight_h = s_history_distribution[m_historyPower];
504     double weight_s = 1.0 - weight_h;
505
506     CLatticeStates::iterator it = start_fr.m_latticeStates.begin();
507     CLatticeStates::iterator ite = start_fr.m_latticeStates.end();
508
509     for (; it != ite; ++it) {
510         // for 1-length lattice states, replace ending_word_id (comma)
511         // with none_word_id (recognized by CThreadSlm)
512         unsigned _wid = wid;
513         if (wid == ENDING_WORD_ID && it->m_pBackTraceNode && it->m_pBackTraceNode->m_frIdx == 0)
514             _wid = NONE_WORD_ID;
515
516         node.m_pBackTraceNode = &(*it);
517         node.m_backTraceWordId = wid;
518
519         double ts = m_pModel->transfer(it->m_slmState, _wid, node.m_slmState);
520         m_pModel->historify(node.m_slmState);
521
522         // backward to psuedo root, so wid is probably a user word,
523         // save the wid in idx field, so that later we could get it via
524         // CThreadSlm::lastWordId, to calculate p_{cache} correctly.
525         if (node.m_slmState.getLevel() == 0
526             && m_pHistory && m_pHistory->seenBefore(wid))
527             node.m_slmState.setIdx(wid);  // an psuedo unigram node state
528
529         if (m_pHistory) {
530             unsigned history[2] = { m_pModel->lastWordId(it->m_slmState), _wid };
531             double hpr = m_pHistory->pr(history, history + 2);
532             ts = weight_s * ts + weight_h * hpr;
533         }
534
535         node.m_score = it->m_score * efic * TSentenceScore(ts);
536         // std::string buf;
537         // node.m_score.toString(buf);
538         // printf("node score %s ts=%lf ", buf.c_str(), ts);
539         // it->m_score.toString(buf);
540         // printf("%s ic=%lf\n", buf.c_str(), ic);
541         end_fr.m_latticeStates.add(node);
542     }
543 }
544
545 bool
546 CIMIContext::_backTracePaths(const std::vector<TLatticeState>& tail_states,
547                              int rank, TPath& path, TPath& segmentPath)
548 {
549     path.clear();
550     segmentPath.clear();
551
552     if (rank >= (int) tail_states.size()) {
553         // rank out of bounds, only return the segment path
554         return false;
555     }
556
557     const TLatticeState *bs = &(tail_states[rank]);
558
559     while (bs->m_pBackTraceNode) {
560         unsigned start = bs->m_pBackTraceNode->m_frIdx;
561         unsigned end = bs->m_frIdx;
562         CLatticeFrame & end_fr = m_lattice[end];
563
564         if (!(end_fr.m_bwType & CLatticeFrame::USER_SELECTED)) {
565             const TWCHAR* cwstr = NULL;
566             if (end_fr.m_wstr.empty()) {
567                 cwstr = _getWstr(bs->m_backTraceWordId);
568             } else {
569                 cwstr = end_fr.m_wstr.c_str();
570             }
571
572             CCandidate candi(start, end, bs->m_pLexiconState, cwstr,
573                              bs->m_backTraceWordId);
574
575             end_fr.m_bwType |= CLatticeFrame::BESTWORD;
576             end_fr.m_bestWords[rank] = candi;
577             if (rank == 0) {
578                 end_fr.m_selWord = candi; // select the first by default.
579             }
580         }
581
582         if (bs->m_pBackTraceNode->m_pLexiconState) {
583             std::vector<unsigned> seg_path =
584                 bs->m_pBackTraceNode->m_pLexiconState->m_seg_path;
585             std::vector<unsigned>::reverse_iterator it = seg_path.rbegin();
586
587             for (; it != seg_path.rend(); ++it) {
588                 if (segmentPath.empty() || segmentPath.back() != *it)
589                     segmentPath.push_back(*it);
590             }
591         }
592
593         path.push_back(end);
594         bs = bs->m_pBackTraceNode;
595     }
596
597     std::reverse(path.begin(), path.end());
598     std::reverse(segmentPath.begin(), segmentPath.end());
599
600 #ifdef DEBUG
601     std::vector<unsigned>::iterator it;
602
603     printf("trace lattice path[%d]: ", rank);
604     for (it = path.begin(); it != path.end(); ++it)
605         printf("%d ", *it);
606     printf("\n");
607
608     printf("trace segments path[%d]: ", rank);
609     for (it = segmentPath.begin(); it != segmentPath.end(); ++it)
610         printf("%d ", *it);
611     printf("\n");
612 #endif
613
614     return true;
615 }
616
617 void
618 CIMIContext::_clearPaths()
619 {
620     m_path.clear();
621     m_segPath.clear();
622 }
623
624 std::vector<CCandidates>
625 CIMIContext::getBestSentenceTails(int rank, unsigned start, unsigned end)
626 {
627     std::vector<CCandidates> result;
628     if (rank < 0) {
629         return result;
630     }
631
632     CCandidates sentence;
633     unsigned word_num = getBestSentence(sentence, rank, start, end);
634     unsigned tail_word_num = word_num;
635
636     while (tail_word_num > 1) {
637         unsigned dec = tail_word_num / (m_maxTailCandidateNum + 1) + 1;
638         tail_word_num -= std::min(dec, tail_word_num);
639         if (tail_word_num <= 1) {
640             break;
641         }
642         CCandidates tail(sentence.begin(), sentence.begin() + tail_word_num);
643         result.push_back(tail);
644     }
645     return result;
646 }
647
648 unsigned
649 CIMIContext::getBestSentence(CCandidates& result, int rank,
650                              unsigned start, unsigned end)
651 {
652     // -1 means selected sentence
653     if (rank < -1 || rank >= (int) m_nBest)
654         return 0;
655
656     result.clear();
657
658     if (end == UINT_MAX)
659         end = m_tailIdx - 1;
660
661     while (end > start && m_lattice[end].m_bwType == CLatticeFrame::NO_BESTWORD)
662         end--;
663
664     unsigned i = end, nWordConverted = 0;
665     while (i > start) {
666         CLatticeFrame& fr = m_lattice[i];
667         if (rank < 0) {
668             result.insert(result.begin(), fr.m_selWord);
669             i = fr.m_selWord.m_start;
670         } else {
671             result.insert(result.begin(), fr.m_bestWords[rank]);
672             i = fr.m_bestWords[rank].m_start;
673         }
674         nWordConverted++;
675     }
676     return nWordConverted;
677 }
678
679 unsigned
680 CIMIContext::getBestSentence(wstring& result, int rank,
681                              unsigned start, unsigned end)
682 {
683     CCandidates sentence;
684     unsigned nWordConverted = getBestSentence(sentence, rank, start, end);
685     result.clear();
686     for (size_t i = 0; i < sentence.size(); i++) {
687         result += sentence[i].m_cwstr;
688     }
689     return nWordConverted;
690 }
691
692 unsigned
693 CIMIContext::getBestSentence(std::vector<unsigned>& result, int rank,
694                              unsigned start, unsigned end)
695 {
696     CCandidates sentence;
697     unsigned nWordConverted = getBestSentence(sentence, rank, start, end);
698     result.clear();
699     for (size_t i = 0; i < sentence.size(); i++) {
700         result.push_back(sentence[i].m_wordId);
701     }
702     return nWordConverted;
703 }
704
705
706 unsigned
707 CIMIContext::getSelectedSentence(wstring& result,
708                                  unsigned start, unsigned end)
709 {
710     return getBestSentence(result, -1, start, end);
711 }
712
713
714 unsigned
715 CIMIContext::getSelectedSentence(std::vector<unsigned>& result,
716                                  unsigned start, unsigned end)
717 {
718     return getBestSentence(result, -1, start, end);
719 }
720
721 struct TCandiPair {
722     CCandidate m_candi;
723     TCandiRank m_Rank;
724
725     TCandiPair() : m_candi(), m_Rank()
726     {
727     }
728 };
729
730 struct TCandiPairPtr {
731     TCandiPair*                     m_Ptr;
732
733     TCandiPairPtr(TCandiPair* p = NULL) : m_Ptr(p)
734     {
735     }
736
737     bool
738     operator<(const TCandiPairPtr& b) const
739     {
740         return m_Ptr->m_Rank < b.m_Ptr->m_Rank;
741     }
742 };
743
744 const TWCHAR *
745 CIMIContext::_getWstr(unsigned wid)
746 {
747     if (wid < m_pPinyinTrie->getWordCount())
748         return (*m_pPinyinTrie)[wid];
749     else if (m_pUserDict)
750         return (*m_pUserDict)[wid];
751     else
752         return NULL;
753 }
754
755 void
756 CIMIContext::getCandidates(unsigned frIdx, CCandidates& result)
757 {
758     TCandiPair cp;
759     static std::map<wstring, TCandiPair> candidates_map;
760     std::map<wstring, TCandiPair>::iterator candidates_it;
761
762     candidates_map.clear();
763     result.clear();
764
765     std::vector<unsigned> st;
766     getSelectedSentence(st, frIdx);
767
768     cp.m_candi.m_start = m_candiStarts = frIdx++;
769
770     for (; frIdx < m_tailIdx; ++frIdx) {
771         if (m_lattice[frIdx + 1].isSyllableSepFrame())
772             continue;
773
774         CLatticeFrame &fr = m_lattice[frIdx];
775         if (!fr.isSyllableFrame())
776             continue;
777
778         cp.m_candi.m_end = frIdx;
779         if (fr.m_bwType != CLatticeFrame::NO_BESTWORD) {
780             for (size_t i = 0; i < m_nBest; i++) {
781                 if (fr.m_bestWords.find(i) == fr.m_bestWords.end())
782                     continue;
783                 CCandidate candi = fr.m_bestWords[i];
784                 if (candi.m_start != m_candiStarts)
785                     continue;
786                 if (candi.m_pLexiconState == NULL)
787                     continue;
788
789                 TLexiconState & lxst = *(candi.m_pLexiconState);
790                 int len = lxst.m_syls.size() - lxst.m_num_of_inner_fuzzies;
791                 if (len == 0) len = 1;
792
793                 cp.m_candi = candi;
794                 cp.m_Rank =
795                     TCandiRank(fr.m_bwType & CLatticeFrame::USER_SELECTED,
796                                fr.m_bwType & CLatticeFrame::BESTWORD,
797                                len, false, 0);
798                 candidates_map[candi.m_cwstr] = cp;
799             }
800         }
801
802         bool found = false;
803         CLexiconStates::iterator it = fr.m_lexiconStates.begin();
804         CLexiconStates::iterator ite = fr.m_lexiconStates.end();
805         for (; it != ite; ++it) {
806             TLexiconState & lxst = *it;
807
808             if (lxst.m_start != m_candiStarts)
809                 continue;
810
811             int len = lxst.m_syls.size() - lxst.m_num_of_inner_fuzzies;
812             if (0 == len) len = 1;
813
814             found = true;
815             unsigned word_num;
816             const CPinyinTrie::TWordIdInfo *words = lxst.getWords(word_num);
817
818             for (unsigned i = 0; i < word_num; ++i) {
819                 if (m_csLevel < words[i].m_csLevel)
820                     continue;
821
822                 cp.m_candi.m_wordId = words[i].m_id;
823                 cp.m_candi.m_cwstr = _getWstr(cp.m_candi.m_wordId);
824                 cp.m_candi.m_pLexiconState = &lxst;
825                 if (!cp.m_candi.m_cwstr)
826                     continue;
827
828                 //sorting according to the order in PinYinTire
829                 cp.m_Rank =
830                     TCandiRank(false,
831                                !st.empty() && st.front() == cp.m_candi.m_wordId,
832                                len, false, i);
833                 candidates_it = candidates_map.find(cp.m_candi.m_cwstr);
834                 if (candidates_it == candidates_map.end()
835                     || cp.m_Rank < candidates_it->second.m_Rank
836                     || cp.m_candi.m_wordId > INI_USRDEF_WID) {
837                     candidates_map[cp.m_candi.m_cwstr] = cp;
838                     // print_wide(cp.m_candi.m_cwstr);
839                     // printf(" ");
840                 }
841             }
842             // puts("");
843         }
844
845         if (!found) continue;  // FIXME: need better solution later
846
847         if (m_bDynaCandiOrder) {
848             CLatticeStates::iterator it = fr.m_latticeStates.begin();
849             CLatticeStates::iterator ite = fr.m_latticeStates.end();
850             // printf("adjusting ");
851             for (; it != ite; ++it) {
852                 TLatticeState & ltst = *it;
853
854                 if (ltst.m_pBackTraceNode->m_frIdx != m_candiStarts)
855                     continue;
856
857                 cp.m_candi.m_wordId = ltst.m_backTraceWordId;
858                 cp.m_candi.m_cwstr = _getWstr(cp.m_candi.m_wordId);
859                 cp.m_candi.m_pLexiconState = ltst.m_pLexiconState;
860                 if (!cp.m_candi.m_cwstr)
861                     continue;
862
863                 int len = cp.m_candi.m_pLexiconState->m_syls.size() -
864                           cp.m_candi.m_pLexiconState->m_num_of_inner_fuzzies;
865                 if (0 == len) len = 1;
866                 cp.m_Rank = TCandiRank(false,
867                                        !st.empty() && st.front() ==
868                                        cp.m_candi.m_wordId,
869                                        len, true, ltst.m_score /
870                                        ltst.m_pBackTraceNode->m_score);
871                 candidates_it = candidates_map.find(cp.m_candi.m_cwstr);
872                 if (candidates_it == candidates_map.end()
873                     || cp.m_Rank < candidates_it->second.m_Rank
874                     || cp.m_candi.m_wordId > INI_USRDEF_WID) {
875                     // print_wide(cp.m_candi.m_cwstr);
876                     // std::string buf;
877                     // ltst.m_score.toString(buf);
878                     // printf("len:%d %s", len, buf.c_str());
879                     // ltst.m_pBackTraceNode->m_score.toString(buf);
880                     // printf("%s ", buf.c_str());
881                     candidates_map[cp.m_candi.m_cwstr] = cp;
882                 }
883             }
884             // puts("");
885         }
886
887         m_candiEnds = frIdx;
888     }
889
890     std::vector<TCandiPairPtr> vec;
891
892     vec.reserve(candidates_map.size());
893     for (candidates_it = candidates_map.begin();
894          candidates_it != candidates_map.end(); ++candidates_it) {
895         vec.push_back(TCandiPairPtr(&(candidates_it->second)));
896     }
897
898     std::sort(vec.begin(), vec.end());
899     for (size_t i = 0; i < vec.size(); i++) {
900         // print_wide(vec[i].m_Ptr->m_candi.m_cwstr);
901         // printf(" ");
902         result.push_back(vec[i].m_Ptr->m_candi);
903     }
904     // puts("");
905 }
906
907 unsigned
908 CIMIContext::cancelSelection(unsigned frIdx, bool doSearch)
909 {
910     unsigned ret = frIdx;
911
912     CLatticeFrame &fr = m_lattice[frIdx];
913     while (fr.m_bwType & CLatticeFrame::IGNORED) {
914         --frIdx;
915         fr = m_lattice[frIdx];
916     }
917
918     if (fr.m_bwType &
919         (CLatticeFrame::USER_SELECTED | CLatticeFrame::BESTWORD)) {
920         ret = fr.m_selWord.m_start;
921         fr.m_bwType = CLatticeFrame::NO_BESTWORD;
922         if (doSearch) searchFrom(frIdx);
923     }
924
925     return ret;
926 }
927
928 void
929 CIMIContext::makeSelection(CCandidate &candi, bool doSearch)
930 {
931     CLatticeFrame &fr = m_lattice[candi.m_end];
932     fr.m_bwType = fr.m_bwType | CLatticeFrame::USER_SELECTED;
933     fr.m_selWord = candi;
934     // make best sentence word consistent as well
935     for (size_t i = 0; i < m_nBest; i++) {
936         fr.m_bestWords[i] = candi;
937     }
938
939     if (doSearch) searchFrom(candi.m_end);
940 }
941
942 void
943 CIMIContext::selectSentence(int idx)
944 {
945     unsigned i = m_tailIdx - 1;
946     while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
947         i--;
948
949     while (i > 0) {
950         CLatticeFrame &fr = m_lattice[i];
951         fr.m_selWord = fr.m_bestWords[idx];
952         i = fr.m_selWord.m_start;
953     }
954 }
955
956 void
957 CIMIContext::memorize()
958 {
959     _saveUserDict();
960     _saveHistoryCache();
961 }
962
963 void
964 CIMIContext::_saveUserDict()
965 {
966     if (!m_pUserDict)
967         return;
968
969     CSyllables syls;
970     bool has_user_selected = false;
971     unsigned i = m_tailIdx - 1;
972     unsigned e_pos = 0;
973
974     while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
975         i--;
976
977     while (i > 0) {
978         CLatticeFrame &fr = m_lattice[i];
979         if (!fr.isSyllableFrame()) {
980             i = fr.m_selWord.m_start;
981             break;
982         }
983
984         TLexiconState* state = fr.m_selWord.m_pLexiconState;
985         if (!state) {
986             i = fr.m_selWord.m_start;
987             continue;
988         }
989
990         if (syls.size() + state->m_syls.size() > MAX_USRDEF_WORD_LEN) {
991             i = fr.m_selWord.m_start;
992             break;
993         }
994
995         if (!e_pos) e_pos = i;
996
997         has_user_selected |= (fr.m_bwType & CLatticeFrame::USER_SELECTED);
998         std::copy(state->m_syls.begin(), state->m_syls.end(), inserter(syls, syls.begin()));
999         i = fr.m_selWord.m_start;
1000     }
1001
1002     if (has_user_selected && syls.size() > 1) {
1003         wstring phrase;
1004         getSelectedSentence (phrase, 0, e_pos);
1005         m_pUserDict->addWord (syls, phrase);
1006     }
1007 }
1008
1009 void
1010 CIMIContext::_saveHistoryCache()
1011 {
1012     if (!m_pHistory)
1013         return;
1014
1015     std::vector<unsigned> result;
1016     unsigned i = m_tailIdx - 1;
1017     while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
1018         i--;
1019
1020     while (i > 0) {
1021         CLatticeFrame &fr = m_lattice[i];
1022         if (fr.isSyllableFrame()) {
1023             result.insert(result.begin(), fr.m_selWord.m_wordId);
1024         } else {
1025             result.insert(result.begin(), 0);
1026         }
1027         i = fr.m_selWord.m_start;
1028     }
1029
1030     if (!result.empty()) {
1031         m_pHistory->memorize(&(result[0]), &(result[0]) + result.size());
1032         m_pHistory->saveToFile();
1033     }
1034 }
1035
1036 void
1037 CIMIContext::deleteCandidate(CCandidate &candi)
1038 {
1039     unsigned wid = candi.m_wordId;
1040     deleteCandidateByWID(wid);
1041 }
1042
1043 void
1044 CIMIContext::deleteCandidateByWID(unsigned wid)
1045 {
1046     if (wid > INI_USRDEF_WID) {
1047         m_pHistory->forget(wid);
1048         m_pUserDict->removeWord(wid);
1049         _buildLattice(m_pPySegmentor->getSegments());
1050     }
1051 }
1052
1053 void
1054 CIMIContext::removeFromHistoryCache(std::vector<unsigned>& wids)
1055 {
1056     if (!m_pHistory)
1057         return;
1058
1059     m_pHistory->forget(&(wids[0]), &(wids[0]) + wids.size());
1060     buildLattice(m_pPySegmentor);
1061 }