2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
47 #include <arpa/inet.h>
49 #include <sys/types.h>
51 #include "ic_history.h"
53 const uint32_t CICHistory::DCWID = (uint32_t)-1;
55 CICHistory::~CICHistory()
59 const size_t CBigramHistory::contxt_memory_size = 8192;
60 const double CBigramHistory::focus_memory_ratio = 0.05;
62 //FIXME: CBigramHistory need to be thread safe
63 CBigramHistory::CBigramHistory() : m_memory(), m_unifreq(), m_bifreq()
68 CBigramHistory::~CBigramHistory()
73 CBigramHistory::memorize(uint32_t* its_wid, uint32_t* ite_wid)
75 TBigram bigram(DCWID, DCWID);
77 // First , we insert an DC word id before the context history
78 // to seperated from previous stream.
79 if (m_memory.size() == contxt_memory_size) {
81 hb.first = m_memory.front();
83 hb.second = m_memory.front();
88 m_memory.push_back(DCWID);
90 //Now trying to memorize new stream and forget oldest
91 for (; its_wid != ite_wid; ++its_wid) {
92 if (m_memory.size() == contxt_memory_size) {
94 hb.first = m_memory.front();
96 hb.second = m_memory.front();
101 bigram.first = bigram.second;
102 bigram.second = *its_wid;
103 m_memory.push_back(*its_wid);
104 incUniFreq(bigram.second);
111 CBigramHistory::clear()
119 CBigramHistory::pr(uint32_t* its_wid, uint32_t* ite_wid)
121 TBigram bigram(DCWID, DCWID);
122 if (its_wid != ite_wid) {
124 bigram.second = *ite_wid;
125 if (its_wid != ite_wid)
126 bigram.first = *(ite_wid - 1);
132 CBigramHistory::pr(uint32_t* its_wid, uint32_t* ite_wid, uint32_t wid)
134 TBigram bigram(DCWID, DCWID);
135 if (its_wid != ite_wid)
136 bigram.first = *(ite_wid - 1);
141 // htonl could be a macro, so wrap it in a func.
149 CBigramHistory::bufferize(void** buf_ptr, size_t* sz)
154 *sz = sizeof(uint32_t) * m_memory.size();
156 *buf_ptr = malloc(*sz); // malloc for C compatible
157 #ifdef WORDS_BIGENDIAN
158 std::copy(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr);
160 std::transform(m_memory.begin(),
161 m_memory.end(), (uint32_t*)*buf_ptr, swap32);
175 CBigramHistory::loadFromFile(const char *fname)
177 m_history_path = fname;
180 int fd = open(fname, O_CREAT, 0600);
182 suc = loadFromBuffer(NULL, 0);
188 void* buf = malloc(info.st_size);
191 read(fd, buf, info.st_size);
192 suc = loadFromBuffer(buf, info.st_size);
200 CBigramHistory::saveToFile(const char *fname)
203 fname = m_history_path.c_str();
208 if (bufferize(&buf, &sz) && buf) {
209 FILE* fp = fopen(fname, "wb");
211 suc = (fwrite(buf, 1, sz, fp) == sz);
220 CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz)
224 sz /= sizeof(uint32_t);
225 uint32_t *pw = (uint32_t*)buf_ptr;
228 #ifndef WORDS_BIGENDIAN
229 std::transform(pw, pw + sz, pw, swap32);
231 TBigram bigram(DCWID, DCWID);
232 for (size_t i = 0; i < sz; ++i) {
233 bigram.first = bigram.second;
234 bigram.second = *pw++;
235 m_memory.push_back(bigram.second);
236 incUniFreq(bigram.second);
244 CBigramHistory::pr(TBigram& bigram)
246 int uf0 = uniFreq(bigram.first);
247 int bf = biFreq(bigram);
248 int uf1 = uniFreq(bigram.second);
251 pr += 0.68 * double(bf) / double(uf0 + 0.5);
252 pr += 0.32 * double(uf1) /
253 double(m_memory.size() + (contxt_memory_size - m_memory.size()) / 10);
257 fprintf(stderr, "uf0:%d bf:%d uf1:%d pr(%d|%d):%lf\n", uf0, bf, uf1,
258 bigram.second, bigram.first, pr);
265 CBigramHistory::uniFreq(TUnigram& ug)
268 if (m_stopWords.find(ug) == m_stopWords.end()) {
269 TUnigramPool::iterator it = m_unifreq.find(ug);
270 if (it != m_unifreq.end()) {
272 TContextMemory::reverse_iterator rit = m_memory.rbegin();
275 rit != m_memory.rend() && i < contxt_memory_size *
279 freq += 1.0 / focus_memory_ratio;
284 //if (freq != 0) printf("uniFreq[%d]-->%d\n", ug, freq);
289 CBigramHistory::biFreq(TBigram& bg)
292 //std::set<unsigned>::const_iterator ite = m_stopWords.end();
293 if (m_stopWords.find(bg.first) == m_stopWords.end()
294 && m_stopWords.find(bg.second) == m_stopWords.end()) {
295 TBigramPool::const_iterator it = m_bifreq.find(bg);
296 if (it != m_bifreq.end()) {
298 TContextMemory::reverse_iterator re = m_memory.rbegin();
299 TContextMemory::reverse_iterator rs = re + 1;
301 rs != m_memory.rend() && i < contxt_memory_size *
304 if (*rs == bg.first && *re == bg.second)
305 freq += 1.0 / focus_memory_ratio;
311 //if (freq != 0) printf("biFreq[%d,%d]-->%d\n", bg.first, bg.second, freq);
316 CBigramHistory::decUniFreq(TUnigram& ug)
318 TUnigramPool::iterator it = m_unifreq.find(ug);
319 if (it != m_unifreq.end()) {
328 CBigramHistory::seenBefore(uint32_t wid)
330 return(wid != DCWID && m_stopWords.find(wid) == m_stopWords.end() &&
331 m_unifreq.find(wid) != m_unifreq.end());
335 CBigramHistory::decBiFreq(TBigram& bg)
337 TBigramPool::iterator it = m_bifreq.find(bg);
338 if (it != m_bifreq.end()) {
347 CBigramHistory::incUniFreq(TUnigram& ug)
350 //printf("Remebering uniFreq[%d]-->%d\n", ug, m_unifreq[ug]);
354 CBigramHistory::incBiFreq(TBigram& bg)
357 //printf("Remebering biFreq[%d,%d]-->%d\n", bg.first, bg.second, m_bifreq[bg]);
360 // so far, it's very expensive to erase a word from bigram pairs, need to design
361 // a better data structure for this.
363 // And Even though, we may also need to remove the individual characters in this
364 // word (identified by wid), which is current infeasible,
366 // Here are what we need to do:
367 // 1. get the wstring by word id from userdict
368 // 2. iterate the character in this wstring
369 // 3. get the word id from each character from system lexicon (not supported yet)
370 // 4. remove the unigrams and bigrams of each character, and the entire word
373 CBigramHistory::forget(uint32_t wid)
375 TUnigramPool::iterator uni_it = m_unifreq.find(wid);
376 if (uni_it != m_unifreq.end())
377 m_unifreq.erase(uni_it);
379 TBigramPool::iterator it = m_bifreq.begin();
380 TBigramPool::iterator ite = m_bifreq.end();
383 TBigram bigram = it->first;
385 if (bigram.first == wid || bigram.second == wid)
386 m_bifreq.erase(it++);
393 CBigramHistory::forget(uint32_t *its_wid, uint32_t *ite_wid)
395 for (; its_wid < ite_wid; ++its_wid) {
396 TBigram bigram(*its_wid, DCWID);
398 if (its_wid + 1 != ite_wid)
399 bigram.second = *(its_wid + 1);
401 TBigramPool::iterator it = m_bifreq.find(bigram);
402 if (it != m_bifreq.end())
408 CBigramHistory::addStopWords(const std::set<uint32_t>& stopWords)
410 m_stopWords.insert(stopWords.begin(), stopWords.end());
414 CBigramHistory::initStopWords()
418 m_stopWords.insert(0); //unknown world
419 m_stopWords.insert(DCWID); //seperator word id used by history memory interanlly