3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 * The contents of this file are subject to the terms of either the GNU Lesser
8 * General Public License Version 2.1 only ("LGPL") or the Common Development and
9 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 * file except in compliance with the License. You can obtain a copy of the CDDL at
11 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 * specific language governing permissions and limitations under the License. When
14 * distributing the software, include this License Header Notice in each file and
15 * include the full text of the License in the License file as well as the
18 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 * For Covered Software in this distribution, this License shall be governed by the
21 * laws of the State of California (excluding conflict-of-law provisions).
22 * Any litigation relating to this License shall be subject to the jurisdiction of
23 * the Federal Courts of the Northern District of California and the state courts
24 * of the State of California, with venue lying in Santa Clara County, California.
28 * If you wish your version of this file to be governed by only the CDDL or only
29 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 * license." If you don't indicate a single choice of license, a recipient has the
32 * option to distribute your version of this file under either the CDDL or the LGPL
33 * Version 2.1, or to extend the choice of license to its licensees as provided
34 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 * Version 2 license, then the option applies only if the new code is made subject
36 * to such option by the copyright holder.
39 #ifndef SUNPY_IMI_CONTEXT_H
40 #define SUNPY_IMI_CONTEXT_H
42 #include "portability.h"
48 #if defined(DEBUG) && defined (HAVE_ASSET_H)
56 #include "pinyin/pinyin_seg.h"
58 #include "ic_history.h"
60 #include "lattice_states.h"
61 #include "imi_funcobjs.h"
64 * TSentenceScore is used only for the whole-sentence score;
65 * the score from the language model still uses double.
67 typedef TLongExpFloat TSentenceScore;
// Container aliases: a lattice is a vector of frames, a candidate list is a
// vector of CCandidate, and CCandidatesIter iterates over such a list.
73 typedef std::vector<CLatticeFrame> CLattice;
74 typedef std::vector<CCandidate> CCandidates;
75 typedef CCandidates::iterator CCandidatesIter;
79 bool operator<(const TCandiRank& b) const
80 { return m_all < b.m_all; };
82 TCandiRank() : m_all(0) {
// Overload taking the score as a TSentenceScore.
// Declaration only; the body is not visible in this part of the file.
85 TCandiRank(bool user, bool best, unsigned int len,
86 bool fromLattice, TSentenceScore score);
// Overload taking the score as a plain unsigned value.
// Declaration only; the body is not visible in this part of the file.
88 TCandiRank(bool user, bool best, unsigned int len,
89 bool fromLattice, unsigned score);
93 #if !defined(WORDS_BIGENDIAN)
96 unsigned m_lattice : 1;
106 unsigned m_lattice : 1;
107 unsigned m_cost : 24;
113 * CCandidate represents basic information about a single candidate:
114 * its start bone and finishing bone, its content string, and its
118 friend class CIMIContext;
122 const TWCHAR *m_cwstr;
125 /** Provide the constructor for convenience */
126 CCandidate(unsigned start = 0,
128 TLexiconState* pLxst = NULL,
129 const TWCHAR* s = NULL,
130 unsigned int wid = 0)
131 : m_start(start), m_end(end), m_cwstr(s), m_wordId(wid),
132 m_pLexiconState(pLxst) {}
135 unsigned int m_wordId;
136 TLexiconState* m_pLexiconState;
139 class CLatticeFrame {
140 friend class CIMIContext;
143 UNUSED = 0x0000, // unused frame
144 TAIL = 0x0001, // tail frame
146 CATE_SYLLABLE = 0x0100,
147 SYLLABLE = 0x0101, // pinyin
148 SYLLABLE_SEP = 0x0102, // pinyin
149 INCOMPLETE_SYLLABLE = 0x0104, // incomplete syllable string
152 ASCII = 0x0201, // English string
153 PUNC = 0x0202, // punctuation
154 SYMBOL = 0x0204, // other symbol
155 DIGITAL = 0x0208, // not implemented here
159 NO_BESTWORD = 1 << 0,
161 USER_SELECTED = 1 << 2,
169 CLatticeFrame () : m_type(UNUSED), m_bwType(NO_BESTWORD) {}
171 bool isUnusedFrame() const
172 { return m_type == 0; }
174 bool isSyllableFrame() const
175 { return(m_type & CATE_SYLLABLE); }
177 bool isSyllableSepFrame() const
178 { return((m_type & SYLLABLE_SEP) > CATE_SYLLABLE); }
180 bool isTailFrame() const
181 { return(m_type == TAIL); }
185 m_bwType = NO_BESTWORD;
186 m_lexiconStates.clear();
187 m_latticeStates.clear();
192 void print(std::string prefix);
195 std::map<int, CCandidate> m_bestWords;
196 CCandidate m_selWord;
197 CLexiconStates m_lexiconStates;
198 CLatticeStates m_latticeStates;
201 typedef std::vector<unsigned> TPath;
/** Destructor: calls clear() before the context is destroyed. */
207 ~CIMIContext () { clear(); }
// Declaration only; the body is not visible in this part of the file.
// NOTE(review): which members are populated from pCoreData cannot be told
// from here - confirm against the definition.
211 void setCoreData(CIMIData *pCoreData);
212 void setUserDict(CUserDict *pUserDict) { m_pUserDict = pUserDict; }
214 void setHistoryMemory(CICHistory *phm) { m_pHistory = phm; }
215 CICHistory * getHistoryMemory() { return m_pHistory; }
217 void setHistoryPower(unsigned power)
218 { m_historyPower = power <= 10 ? power : 3; }
220 int getHistoryPower()
221 { return m_historyPower; }
223 void setFullSymbolForwarding(bool value = true) {
224 m_bFullSymbolForwarding = value;
226 bool getFullSymbolForwarding() { return m_bFullSymbolForwarding; }
227 void setGetFullSymbolOp(CGetFullSymbolOp *op) { m_pGetFullSymbolOp = op; }
228 CGetFullSymbolOp& fullSymbolOp() const { return *m_pGetFullSymbolOp; }
230 void setFullPunctForwarding(bool value = true) {
231 m_bFullPunctForwarding = value;
233 bool getFullPunctForwarding() { return m_bFullPunctForwarding; }
234 void setGetFullPunctOp(CGetFullPunctOp *op) { m_pGetFullPunctOp = op; }
235 CGetFullPunctOp& fullPuncOp() const { return *m_pGetFullPunctOp; }
237 void setNonCompleteSyllable(bool value = true) {
238 m_bNonCompleteSyllable = value;
240 bool getNonCompleteSyllable() { return m_bNonCompleteSyllable; }
242 void setCharsetLevel(unsigned l) { m_csLevel = l; }
243 unsigned getCharsetLevel() { return m_csLevel; }
245 void setDynamicCandidateOrder(bool value = true) {
246 m_bDynaCandiOrder = value;
248 bool getDynaCandiOrder() { return m_bDynaCandiOrder; }
250 CLattice& getLattice() { return m_lattice; }
// Declaration only; the body is not visible in this part of the file.
// doSearch defaults to true.
// NOTE(review): meaning of the bool result is not visible here - confirm
// against the definition.
251 bool buildLattice(IPySegmentor *segmentor, bool doSearch = true);
252 bool isEmpty() { return m_tailIdx <= 1; }
253 unsigned getLastFrIdx() { return m_tailIdx - 1; }
255 // omit next punctuation if the very next symbol is an punctuation
256 void omitNextPunct() { m_bOmitPunct = true; }
// Declaration only; the body is not visible in this part of the file.
// `from` defaults to 1.
258 bool searchFrom(unsigned from = 1);
260 size_t getMaxBest() const { return m_maxBest; }
261 void setMaxBest(size_t maxBest) {
263 for (int i = 0; i < MAX_LATTICE_LENGTH; i++) {
264 m_lattice[i].m_latticeStates.setMaxBest(m_maxBest);
268 size_t getMaxTailCandidateNum() const { return m_maxTailCandidateNum; }
269 void setMaxTailCandidateNum(size_t maxTailCandidateNum) {
270 m_maxTailCandidateNum = maxTailCandidateNum;
273 size_t getNBest() { return m_nBest; }
274 std::vector<TPath>& getPath(int rank) { return m_path; }
275 std::vector<TPath>& getSegPath(int rank) { return m_segPath; }
277 TPath& getBestPath() { return m_path[0]; }
278 TPath& getBestSegPath() {
279 if (m_segPath.empty()) {
280 static TPath emptyPath;
283 // CIMIContext fails to back-trace the best paths when there are
284 // no latticeStates on a frame, e.g., 'yiden' in Quanpin mode; in this
285 // case, return the original segments
286 if (m_segPath[0].empty() && m_pPySegmentor) {
287 // only require the primary segments without the auxiliary ones
288 IPySegmentor::TSegmentVec& segments =
289 m_pPySegmentor->getSegments(false);
290 IPySegmentor::TSegmentVec::const_iterator it = segments.begin();
291 IPySegmentor::TSegmentVec::const_iterator ite = segments.end();
292 m_segPath[0].push_back(0);
293 for (; it != ite; ++it)
294 m_segPath[0].push_back(it->m_start + it->m_len);
// All members in this run are declarations only; their bodies are not
// visible in this part of the file.

// Returns a vector of candidate lists by value. `end` defaults to UINT_MAX.
// NOTE(review): presumably UINT_MAX means "no explicit end" - confirm.
299 std::vector<CCandidates> getBestSentenceTails(int rank, unsigned start,
300 unsigned end = UINT_MAX);
// Three overloads that differ only in the type of the `result` argument
// (CCandidates, wstring, or a vector of unsigned), which is taken by
// non-const reference. `start` defaults to 0 and `end` to UINT_MAX.
// NOTE(review): meaning of the unsigned return value is not visible here.
302 unsigned getBestSentence(CCandidates& result, int rank,
303 unsigned start = 0, unsigned end = UINT_MAX);
304 unsigned getBestSentence(wstring& result, int rank,
305 unsigned start = 0, unsigned end = UINT_MAX);
306 unsigned getBestSentence(std::vector<unsigned>& result, int rank,
307 unsigned start = 0, unsigned end = UINT_MAX);
// Two overloads with the same defaults as above but without a rank
// argument; `result` is a wstring or a vector of unsigned.
309 unsigned getSelectedSentence(wstring& result,
310 unsigned start = 0, unsigned end = UINT_MAX);
311 unsigned getSelectedSentence(std::vector<unsigned>& result,
312 unsigned start = 0, unsigned end = UINT_MAX);
// Candidate and selection operations. `result` and `candi` are taken by
// non-const reference; `doSearch` defaults to true where present.
314 void getCandidates(unsigned frIdx, CCandidates& result);
315 unsigned cancelSelection(unsigned frIdx, bool doSearch = true);
316 void makeSelection(CCandidate &candi, bool doSearch = true);
317 void deleteCandidate(CCandidate &candi);
318 void deleteCandidateByWID(unsigned wid);
319 void selectSentence(int idx);
// Takes the word-id list by non-const reference.
322 void removeFromHistoryCache(std::vector<unsigned>& wids);
325 CUserDict* getUserDict() { return m_pUserDict; }
// Internal helpers, declarations only; their bodies are not visible in this
// part of the file.
328 void _clearFrom(unsigned from);
// rebuildFrom defaults to 1 and doSearch to true.
330 bool _buildLattice(IPySegmentor::TSegmentVec &segments,
331 unsigned rebuildFrom = 1, bool doSearch = true);
// NOTE(review): i and j look like frame indices for the forwarding step -
// confirm against the definition.
332 void _forwardSyllables(unsigned i, unsigned j,
333 const IPySegmentor::TSegment& seg);
334 void _forwardSingleSyllable(unsigned i, unsigned j, TSyllable syllable,
335 const IPySegmentor::TSegment& seg,
// Internal helpers, declarations only; their bodies are not visible in this
// part of the file.
// NOTE(review): i and j in the _forward* family look like frame indices -
// confirm against the definitions.
337 void _forwardSyllableSep(unsigned i, unsigned j);
338 void _forwardString(unsigned i, unsigned j,
339 const std::vector<unsigned>& strbuf);
340 void _forwardPunctChar(unsigned i, unsigned j, unsigned ch);
341 void _forwardOrdinaryChar(unsigned i, unsigned j, unsigned ch);
342 void _forwardTail(unsigned i, unsigned j);
// `ic` defaults to 1.0.
344 void _transferBetween(unsigned start, unsigned end, TLexiconState* plxst,
345 unsigned wid, double ic = 1.0);
// tail_states is read-only; path and segPath are taken by non-const reference.
346 bool _backTracePaths(const std::vector<TLatticeState>& tail_states,
347 int rank, TPath& path, TPath& segPath);
// Returns a pointer to const TWCHAR for the given word id.
// NOTE(review): lifetime/ownership of the returned string is not visible here.
350 const TWCHAR *_getWstr(unsigned wid);
352 void _saveUserDict();
353 void _saveHistoryCache();
// Read by getMaxTailCandidateNum() and written by setMaxTailCandidateNum().
361 size_t m_maxTailCandidateNum;
// Path collections handed out by getPath()/getBestPath() and
// getSegPath()/getBestSegPath() respectively.
363 std::vector<TPath> m_path;
364 std::vector<TPath> m_segPath;
// Collaborators held as raw pointers.
// NOTE(review): ownership is not visible in this header - confirm who
// creates and frees these objects.
366 CThreadSlm* m_pModel;
367 CPinyinTrie* m_pPinyinTrie;
// Set by setUserDict(), returned by getUserDict().
368 CUserDict* m_pUserDict;
// Set by setHistoryMemory(), returned by getHistoryMemory().
369 CICHistory* m_pHistory;
// Written by setHistoryPower(), which stores 0..10 as given and replaces
// larger values with 3.
370 unsigned m_historyPower;
// Flag accessed through set/getFullSymbolForwarding().
374 bool m_bFullSymbolForwarding;
// Set by setGetFullSymbolOp(); dereferenced by fullSymbolOp().
376 CGetFullSymbolOp *m_pGetFullSymbolOp;
// Flag accessed through set/getFullPunctForwarding().
378 bool m_bFullPunctForwarding;
// Set by setGetFullPunctOp(); dereferenced by fullPuncOp().
379 CGetFullPunctOp *m_pGetFullPunctOp;
// Consulted by getBestSegPath() to fetch the segments when it is non-null.
381 IPySegmentor *m_pPySegmentor;
// Flags accessed through set/getNonCompleteSyllable() and
// setDynamicCandidateOrder()/getDynaCandiOrder().
383 bool m_bNonCompleteSyllable;
384 bool m_bDynaCandiOrder;
// NOTE(review): no use of these two members is visible in this part of the file.
386 unsigned m_candiStarts;
387 unsigned m_candiEnds;