3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 * The contents of this file are subject to the terms of either the GNU Lesser
8 * General Public License Version 2.1 only ("LGPL") or the Common Development and
9 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 * file except in compliance with the License. You can obtain a copy of the CDDL at
11 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 * specific language governing permissions and limitations under the License. When
14 * distributing the software, include this License Header Notice in each file and
15 * include the full text of the License in the License file as well as the
18 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 * For Covered Software in this distribution, this License shall be governed by the
21 * laws of the State of California (excluding conflict-of-law provisions).
22 * Any litigation relating to this License shall be subject to the jurisdiction of
23 * the Federal Courts of the Northern District of California and the state courts
24 * of the State of California, with venue lying in Santa Clara County, California.
28 * If you wish your version of this file to be governed by only the CDDL or only
29 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 * license." If you don't indicate a single choice of license, a recipient has the
32 * option to distribute your version of this file under either the CDDL or the LGPL
33 * Version 2.1, or to extend the choice of license to its licensees as provided
34 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 * Version 2 license, then the option applies only if the new code is made subject
36 * to such option by the copyright holder.
39 #ifndef SUNPY_LATTICE_STATES_H
40 #define SUNPY_LATTICE_STATES_H
44 #include "portability.h"
46 #include "pinyin/pinyin_seg.h"
48 typedef TLongExpFloat TSentenceScore;
51 * CSlmState represent the history. In real implementation, it's a
52 * node pointer to a state in the language model. But to save the
53 * language model size, the state node in language model do not
54 * thread the back-off pointer. Now, we just use the Word Id for
55 * the node in the language model. Later we should abstract the
56 * StateNode from language model implemetation to replace this
59 typedef CThreadSlm::TState CSlmState;
62 * A WordKey could represent a word. Define this use the unsigned int
63 * directly. Because in the future, we may adopt word class, such as
66 typedef unsigned CWordId;
69 * This class is used to record lexicon state (pinyin trie nodes)
70 * just before a bone. From the bone, it could see when arriving
71 * it, how many valid Pinyin Trie Node still could be used to search
72 * more words further, and what bone is its starting bone.
74 struct TLexiconState {
75 typedef std::vector<CPinyinTrie::TWordIdInfo> TWordIdInfoVec;
77 const CPinyinTrie::TNode *m_pPYNode;
78 TWordIdInfoVec m_words;
79 // accumulated syllables, may contain fuzzy syllables
81 // accumulated segments, may contain fuzzy segments
82 std::vector<unsigned> m_seg_path;
84 unsigned m_start : 16;
85 unsigned m_num_of_inner_fuzzies : 14;
89 TLexiconState (unsigned start,
90 const CPinyinTrie::TNode *pnode,
92 std::vector<unsigned>& seg_path,
94 : m_pPYNode(pnode), m_syls(syls), m_seg_path(seg_path), m_start(start),
95 m_num_of_inner_fuzzies(0), m_bFuzzy(fuzzy), m_bPinyin(true) {}
97 TLexiconState (unsigned start,
98 TWordIdInfoVec &words,
100 std::vector<unsigned>& seg_path,
102 : m_pPYNode(NULL), m_words(words), m_syls(syls), m_seg_path(seg_path),
103 m_start(start), m_num_of_inner_fuzzies(0), m_bFuzzy(fuzzy),
106 TLexiconState (unsigned start, unsigned wid)
107 : m_pPYNode(NULL), m_start(start), m_bPinyin(false) {
108 m_words.push_back(wid);
109 m_seg_path.push_back(start);
110 m_seg_path.push_back(start + 1);
113 const CPinyinTrie::TWordIdInfo *getWords(unsigned &num);
114 void print(std::string prefix) const;
118 * A list of Lexicon State. Every state may from different
119 * starting position. Later, when Fuzzy PinYin are added,
120 * more than one state may comes from one starting bone.
122 typedef std::vector<TLexiconState> CLexiconStates;
126 * The basic static unit used in the lattice searching
128 struct TLatticeState {
129 TSentenceScore m_score;
131 TLexiconState* m_pLexiconState;
132 TLatticeState* m_pBackTraceNode;
133 CSlmState m_slmState;
134 CWordId m_backTraceWordId;
136 TLatticeState(double score = -1.0,
138 TLexiconState* lxstPtr = NULL,
139 TLatticeState* btNodePtr = NULL,
140 CSlmState sk = CSlmState(),
141 CWordId wk = CWordId())
142 : m_score(score), m_frIdx(frIdx), m_pLexiconState(lxstPtr),
143 m_pBackTraceNode(btNodePtr), m_slmState(sk), m_backTraceWordId(wk) {}
145 /** for debug printing... */
146 void print(std::string prefix) const;
148 bool operator<(const TLatticeState& rhs) const {
149 return m_score < rhs.m_score;
152 bool operator==(const TLatticeState& rhs) const {
153 return m_score == rhs.m_score;
156 bool operator>(const TLatticeState& rhs) const {
157 return !((*this) < rhs || (*this) == rhs);
161 typedef std::vector<TLatticeState> CLatticeStateVec;
164 * A container that keeps the top K elements.
166 class CTopLatticeStates {
167 std::vector<TLatticeState> m_heap;
170 CTopLatticeStates(size_t threshold) : m_threshold(threshold) {}
172 bool push(const TLatticeState& state);
175 const TLatticeState& top() const { return m_heap[0]; }
177 size_t size() const { return m_heap.size(); }
179 typedef std::vector<TLatticeState>::iterator iterator;
181 iterator begin() { return m_heap.begin(); }
182 iterator end() { return m_heap.end(); }
186 * All lattice node on a lattice frame. This class provide beam pruning
187 * while push_back, which means at most the best MAX states are reserved,
188 * ie, weak state will may be discard while new better state are inserted,
189 * and the number MAX is arrived.
191 class CLatticeStates {
193 static const unsigned beam_width;
194 static const TSentenceScore filter_ratio_l1;
195 static const TSentenceScore filter_ratio_l2;
196 static const TSentenceScore filter_threshold_exp;
199 CLatticeStates() : m_size(0), m_maxBest(2) {}
201 void setMaxBest(size_t maxBest) { m_maxBest = maxBest; }
204 void add(const TLatticeState& state);
206 std::vector<TLatticeState> getSortedResult();
207 std::vector<TLatticeState> getFilteredResult();
209 typedef std::map<CSlmState, CTopLatticeStates> state_map;
211 friend class CLatticeStates;
212 state_map::iterator m_mainIt;
213 state_map::iterator m_mainEnd;
214 CTopLatticeStates::iterator m_childIt;
216 iterator(state_map::iterator mit, state_map::iterator mend,
217 CTopLatticeStates::iterator cit)
218 : m_mainIt(mit), m_mainEnd(mend), m_childIt(cit) {}
223 bool operator!=(const CLatticeStates::iterator& rhs);
224 TLatticeState& operator*();
225 TLatticeState* operator->();
231 void _pushScoreHeap(TSentenceScore score, CSlmState slmState);
232 void _popScoreHeap();
233 void _refreshHeapIdx(int heapIdx);
234 void _adjustUp(int node);
235 void _adjustDown(int node);
238 state_map m_stateMap;
242 std::map<CSlmState, int> m_heapIdx;
243 std::vector<std::pair<TSentenceScore, CSlmState> > m_scoreHeap;