3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 * The contents of this file are subject to the terms of either the GNU Lesser
8 * General Public License Version 2.1 only ("LGPL") or the Common Development and
9 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 * file except in compliance with the License. You can obtain a copy of the CDDL at
11 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 * specific language governing permissions and limitations under the License. When
14 * distributing the software, include this License Header Notice in each file and
15 * include the full text of the License in the License file as well as the
18 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 * For Covered Software in this distribution, this License shall be governed by the
21 * laws of the State of California (excluding conflict-of-law provisions).
22 * Any litigation relating to this License shall be subject to the jurisdiction of
23 * the Federal Courts of the Northern District of California and the state courts
24 * of the State of California, with venue lying in Santa Clara County, California.
28 * If you wish your version of this file to be governed by only the CDDL or only
29 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 * license." If you don't indicate a single choice of license, a recipient has the
32 * option to distribute your version of this file under either the CDDL or the LGPL
33 * Version 2.1, or to extend the choice of license to its licensees as provided
34 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 * Version 2 license, then the option applies only if the new code is made subject
36 * to such option by the copyright holder.
39 #ifndef _SUNPINYIN_CONTEXT_HISTORY_H
40 #define _SUNPINYIN_CONTEXT_HISTORY_H
42 #include "portability.h"
49 * A forget all history memory
51 class EXPORTED CICHistory {
53 /** don't care word id, or separator word id */
54 static const uint32_t DCWID;
56 virtual ~CICHistory();
58 virtual bool seenBefore(uint32_t wid) = 0;
61 * memorize the context stream pointed to by [its_wid, ite_wid)
63 virtual bool memorize(uint32_t* its_wid, uint32_t* ite_wid) = 0;
64 virtual void clear() = 0;
67 * remove a word id from history cache
69 virtual void forget(uint32_t wid) = 0;
70 virtual void forget(uint32_t* its_wid, uint32_t* ite_wid) = 0;
73 * @param its_wid is the first word pointer of the context stream
74 * @param ite_wid is the last (exclusive) word pointer of the context stream
75 * @return pr(*(ite_wid-1) | *its_wid, ..., *(ite_wid-2))
76 * The return value could be zero, i.e. no need to smooth the probabilities
78 virtual double pr(uint32_t* its_wid, uint32_t* ite_wid) = 0;
81 * @param its_wid is the first word pointer of the history stream
82 * @param ite_wid is the last (exclusive) word pointer of the history stream
83 * @return pr(*wid | *its_wid, ..., *(ite_wid-1))
84 * The return value could be zero, i.e. no need to smooth the probabilities
86 virtual double pr(uint32_t* its_wid,
91 * allocate a buffer, and put the context memory's content into it
92 * @param buf_ptr receives the pointer to the allocated buffer
93 * @param sz receives the size in bytes of the allocated buffer
94 * @return false on error
95 * Note: the caller must release the buffer with free(*buf_ptr) after use
98 bufferize(void** buf_ptr, size_t* sz) = 0;
101 * Load context memory according to the buf
102 * @param buf_ptr buffer pointer
103 * @param sz is the size in bytes of the buffer
104 * @return false on error
105 * Calling with a NULL buf_ptr clears the context memory
107 virtual bool loadFromBuffer(void* buf_ptr, size_t sz) = 0;
108 virtual bool loadFromFile(const char *fname) = 0;
109 virtual bool saveToFile(const char *fname = NULL) = 0;
111 virtual void addStopWords(const std::set<uint32_t>& stopWords) = 0;
113 virtual void initStopWords() = 0;
116 class EXPORTED CBigramHistory : public CICHistory {
118 static void initClass();
122 virtual ~CBigramHistory();
124 virtual bool seenBefore(uint32_t wid);
126 virtual bool memorize(uint32_t* its_wid, uint32_t* ite_wid);
127 virtual void clear();
129 virtual void forget(uint32_t wid);
130 virtual void forget(uint32_t* its_wid, uint32_t* ite_wid);
133 * @param its_wid is the first word pointer of the context stream
134 * @param ite_wid is the last (exclusive) word pointer of the context stream
135 * @return pr(*(ite_wid-1) | *(ite_wid-2))
137 virtual double pr(uint32_t* its_wid, uint32_t* ite_wid);
140 * @param its_wid is the first word pointer of the history stream
141 * @param ite_wid is the last (exclusive) word pointer of the history stream
142 * @return pr(*wid | *(ite_wid-1))
144 virtual double pr(uint32_t* its_wid,
148 virtual bool bufferize(void** buf_ptr, size_t* sz);
150 virtual bool loadFromBuffer(void* buf_ptr, size_t sz);
151 virtual bool loadFromFile(const char *fname);
152 virtual bool saveToFile(const char *fname = NULL);
154 virtual void addStopWords(const std::set<uint32_t>& stopWords);
155 virtual void initStopWords();
/** Alias for the 32-bit unsigned word id type. */
158 typedef uint32_t TWordId;
/** Ordered pair of word ids. NOTE(review): presumably (previous word, current word) -- confirm in the implementation. */
159 typedef std::pair<TWordId, TWordId> TBigram;
/** A unigram is represented by a single word id. */
160 typedef TWordId TUnigram;
/** Ordered map from a bigram to an int. NOTE(review): presumably an occurrence count (cf. m_bifreq) -- confirm. */
161 typedef std::map<TBigram, int> TBigramPool;
/** Ordered map from a unigram to an int. NOTE(review): presumably an occurrence count (cf. m_unifreq) -- confirm. */
162 typedef std::map<TUnigram, int> TUnigramPool;
/** Double-ended queue of word ids; this is the type of m_memory. */
163 typedef std::deque<TWordId> TContextMemory;
/**
 * Class-wide constants, declared here without initializers (their values
 * are not visible in this header).
 * NOTE(review): presumably the capacity of m_memory and the fraction of it
 * treated as "focus" memory -- confirm against the definitions.
 */
165 static const size_t contxt_memory_size;
166 static const double focus_memory_ratio;
/** Sequence of word ids held by this history object. */
168 TContextMemory m_memory;
/** Int value per unigram. NOTE(review): presumably frequencies of the words in m_memory -- confirm. */
169 TUnigramPool m_unifreq;
/** Int value per bigram. NOTE(review): presumably frequencies of adjacent word pairs in m_memory -- confirm. */
170 TBigramPool m_bifreq;
/** NOTE(review): presumably the path used by saveToFile() when no file name is passed -- confirm. */
172 std::string m_history_path;
/** Set of word ids. NOTE(review): presumably populated by addStopWords()/initStopWords() -- confirm. */
173 std::set<uint32_t> m_stopWords;
/**
 * Internal helpers; each takes its unigram/bigram argument by non-const
 * reference. Their bodies are not visible in this header, so the notes
 * below are assumptions to be confirmed against the implementation.
 */
/** Overload of pr() for a single bigram. NOTE(review): presumably the estimate the public pr() overloads rely on -- confirm. */
176 double pr(TBigram& bg);
/** NOTE(review): presumably return the int stored for the unigram / bigram in m_unifreq / m_bifreq -- confirm, including the result for an absent key. */
177 int uniFreq(TUnigram& ug);
178 int biFreq(TBigram& bg);
/** NOTE(review): presumably decrement the stored value for the unigram / bigram -- confirm whether entries are erased at zero. */
180 void decUniFreq(TUnigram& ug);
181 void decBiFreq(TBigram& bg);
/** NOTE(review): presumably increment the stored value for the unigram / bigram -- confirm. */
182 void incUniFreq(TUnigram& ug);
183 void incBiFreq(TBigram& bg);