src/ime-core/ic_history.cpp

   1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
   3  *
   4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
   5  *
   6  * The contents of this file are subject to the terms of either the GNU Lesser
   7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
   8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
   9  * file except in compliance with the License. You can obtain a copy of the CDDL at
  10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
  11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
  12  * specific language governing permissions and limitations under the License. When
  13  * distributing the software, include this License Header Notice in each file and
  14  * include the full text of the License in the License file as well as the
  15  * following notice:
  16  *
  17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  18  * (CDDL)
  19  * For Covered Software in this distribution, this License shall be governed by the
  20  * laws of the State of California (excluding conflict-of-law provisions).
  21  * Any litigation relating to this License shall be subject to the jurisdiction of
  22  * the Federal Courts of the Northern District of California and the state courts
  23  * of the State of California, with venue lying in Santa Clara County, California.
  24  *
  25  * Contributor(s):
  26  *
  27  * If you wish your version of this file to be governed by only the CDDL or only
  28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
  29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
  30  * license." If you don't indicate a single choice of license, a recipient has the
  31  * option to distribute your version of this file under either the CDDL or the LGPL
  32  * Version 2.1, or to extend the choice of license to its licensees as provided
  33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
  34  * Version 2 license, then the option applies only if the new code is made subject
  35  * to such option by the copyright holder.
  36  */
  37
  38 #ifdef HAVE_CONFIG_H
  39 #include <config.h>
  40 #endif
  41
  42 #include <fcntl.h>
  43 #include <stdio.h>
  44 #include <stdint.h>
  45 #include <unistd.h>
  46 #include <cassert>
  47 #include <arpa/inet.h>
  48 #include <sys/stat.h>
  49 #include <sys/types.h>
  50 #include <algorithm>
  51 #include "ic_history.h"
  52
  53 const uint32_t CICHistory::DCWID = (uint32_t)-1;
  54
  55 CICHistory::~CICHistory()
  56 {
  57 }
  58
  59 const size_t CBigramHistory::contxt_memory_size = 8192;
  60 const double CBigramHistory::focus_memory_ratio = 0.05;
  61
  62 //FIXME: CBigramHistory need to be thread safe
  63 CBigramHistory::CBigramHistory() : m_memory(), m_unifreq(), m_bifreq()
  64 {
  65     initStopWords();
  66 }
  67
  68 CBigramHistory::~CBigramHistory()
  69 {
  70 }
  71
  72 bool
  73 CBigramHistory::memorize(uint32_t* its_wid, uint32_t* ite_wid)
  74 {
  75     TBigram bigram(DCWID, DCWID);
  76
  77     // First , we insert an DC word id before the context history
  78     // to seperated from previous stream.
  79     if (m_memory.size() == contxt_memory_size) {
  80         TBigram hb;
  81         hb.first = m_memory.front();
  82         m_memory.pop_front();
  83         hb.second = m_memory.front();
  84
  85         decUniFreq(hb.first);
  86         decBiFreq(hb);
  87     }
  88     m_memory.push_back(DCWID);
  89
  90     //Now trying to memorize new stream and forget oldest
  91     for (; its_wid != ite_wid; ++its_wid) {
  92         if (m_memory.size() == contxt_memory_size) {
  93             TBigram hb;
  94             hb.first = m_memory.front();
  95             m_memory.pop_front();
  96             hb.second = m_memory.front();
  97
  98             decUniFreq(hb.first);
  99             decBiFreq(hb);
 100         }
 101         bigram.first = bigram.second;
 102         bigram.second = *its_wid;
 103         m_memory.push_back(*its_wid);
 104         incUniFreq(bigram.second);
 105         incBiFreq(bigram);
 106     }
 107     return true;
 108 }
 109
 110 void
 111 CBigramHistory::clear()
 112 {
 113     m_memory.clear();
 114     m_unifreq.clear();
 115     m_bifreq.clear();
 116 }
 117
 118 double
 119 CBigramHistory::pr(uint32_t* its_wid, uint32_t* ite_wid)
 120 {
 121     TBigram bigram(DCWID, DCWID);
 122     if (its_wid != ite_wid) {
 123         --ite_wid;
 124         bigram.second = *ite_wid;
 125         if (its_wid != ite_wid)
 126             bigram.first = *(ite_wid - 1);
 127     }
 128     return pr(bigram);
 129 }
 130
 131 double
 132 CBigramHistory::pr(uint32_t* its_wid, uint32_t* ite_wid, uint32_t wid)
 133 {
 134     TBigram bigram(DCWID, DCWID);
 135     if (its_wid != ite_wid)
 136         bigram.first = *(ite_wid - 1);
 137     bigram.second = wid;
 138     return pr(bigram);
 139 }
 140
 141 // htonl could be a macro, so wrap it in a func.
 142 inline uint32_t
 143 swap32(uint32_t x)
 144 {
 145     return htonl(x);
 146 }
 147
 148 bool
 149 CBigramHistory::bufferize(void** buf_ptr, size_t* sz)
 150 {
 151     *buf_ptr = NULL;
 152     *sz = 0;
 153     try {
 154         *sz = sizeof(uint32_t) * m_memory.size();
 155         if (*sz > 0) {
 156             *buf_ptr = malloc(*sz); // malloc for C compatible
 157 #ifdef WORDS_BIGENDIAN
 158             std::copy(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr);
 159 #else
 160             std::transform(m_memory.begin(),
 161                            m_memory.end(), (uint32_t*)*buf_ptr, swap32);
 162 #endif
 163         }
 164         return true;
 165     } catch (...) {
 166         if (*buf_ptr)
 167             free(*buf_ptr);
 168         *buf_ptr = NULL;
 169         *sz = 0;
 170     }
 171     return false;
 172 }
 173
 174 bool
 175 CBigramHistory::loadFromFile(const char *fname)
 176 {
 177     m_history_path = fname;
 178
 179     bool suc = false;
 180     int fd = open(fname, O_CREAT, 0600);
 181     if (fd == -1) {
 182         suc = loadFromBuffer(NULL, 0);
 183         return suc;
 184     }
 185
 186     struct stat info;
 187     fstat(fd, &info);
 188     void* buf = malloc(info.st_size);
 189
 190     if (buf) {
 191         read(fd, buf, info.st_size);
 192         suc = loadFromBuffer(buf, info.st_size);
 193         free(buf);
 194     }
 195     close(fd);
 196     return suc;
 197 }
 198
 199 bool
 200 CBigramHistory::saveToFile(const char *fname)
 201 {
 202     if (!fname)
 203         fname = m_history_path.c_str();
 204
 205     bool suc = false;
 206     size_t sz = 0;
 207     void* buf = NULL;
 208     if (bufferize(&buf, &sz) && buf) {
 209         FILE* fp = fopen(fname, "wb");
 210         if (fp) {
 211             suc = (fwrite(buf, 1, sz, fp) == sz);
 212             fclose(fp);
 213         }
 214         free(buf);
 215     }
 216     return suc;
 217 }
 218
 219 bool
 220 CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz)
 221 {
 222     clear();
 223
 224     sz /= sizeof(uint32_t);
 225     uint32_t *pw = (uint32_t*)buf_ptr;
 226
 227     if (pw && sz > 0) {
 228 #ifndef WORDS_BIGENDIAN
 229         std::transform(pw, pw + sz, pw, swap32);
 230 #endif
 231         TBigram bigram(DCWID, DCWID);
 232         for (size_t i = 0; i < sz; ++i) {
 233             bigram.first = bigram.second;
 234             bigram.second = *pw++;
 235             m_memory.push_back(bigram.second);
 236             incUniFreq(bigram.second);
 237             incBiFreq(bigram);
 238         }
 239     }
 240     return true;
 241 }
 242
 243 double
 244 CBigramHistory::pr(TBigram& bigram)
 245 {
 246     int uf0 = uniFreq(bigram.first);
 247     int bf = biFreq(bigram);
 248     int uf1 = uniFreq(bigram.second);
 249
 250     double pr = 0.0;
 251     pr += 0.68 * double(bf) / double(uf0 + 0.5);
 252     pr += 0.32 * double(uf1) /
 253           double(m_memory.size() + (contxt_memory_size - m_memory.size()) / 10);
 254
 255 #ifdef DEBUG
 256     if (pr != 0)
 257         fprintf(stderr, "uf0:%d bf:%d uf1:%d pr(%d|%d):%lf\n", uf0, bf, uf1,
 258                 bigram.second, bigram.first, pr);
 259 #endif
 260
 261     return pr;
 262 }
 263
 264 int
 265 CBigramHistory::uniFreq(TUnigram& ug)
 266 {
 267     int freq = 0;
 268     if (m_stopWords.find(ug) == m_stopWords.end()) {
 269         TUnigramPool::iterator it = m_unifreq.find(ug);
 270         if (it != m_unifreq.end()) {
 271             freq = it->second;
 272             TContextMemory::reverse_iterator rit = m_memory.rbegin();
 273             for (int i =
 274                      0;
 275                  rit != m_memory.rend() && i < contxt_memory_size *
 276                  focus_memory_ratio;
 277                  i++) {
 278                 if (*rit == ug)
 279                     freq += 1.0 / focus_memory_ratio;
 280                 *rit++;
 281             }
 282         }
 283     }
 284     //if (freq != 0) printf("uniFreq[%d]-->%d\n", ug, freq);
 285     return freq / 2;
 286 }
 287
 288 int
 289 CBigramHistory::biFreq(TBigram& bg)
 290 {
 291     int freq = 0;
 292     //std::set<unsigned>::const_iterator ite = m_stopWords.end();
 293     if (m_stopWords.find(bg.first) == m_stopWords.end()
 294         && m_stopWords.find(bg.second) == m_stopWords.end()) {
 295         TBigramPool::const_iterator it = m_bifreq.find(bg);
 296         if (it != m_bifreq.end()) {
 297             freq = it->second;
 298             TContextMemory::reverse_iterator re = m_memory.rbegin();
 299             TContextMemory::reverse_iterator rs = re + 1;
 300             for (int i = 0;
 301                  rs != m_memory.rend() && i < contxt_memory_size *
 302                  focus_memory_ratio;
 303                  i++) {
 304                 if (*rs == bg.first && *re == bg.second)
 305                     freq += 1.0 / focus_memory_ratio;
 306                 ++rs; ++re;
 307             }
 308         }
 309     }
 310
 311     //if (freq != 0) printf("biFreq[%d,%d]-->%d\n", bg.first, bg.second, freq);
 312     return freq;
 313 }
 314
 315 void
 316 CBigramHistory::decUniFreq(TUnigram& ug)
 317 {
 318     TUnigramPool::iterator it = m_unifreq.find(ug);
 319     if (it != m_unifreq.end()) {
 320         if (it->second > 1)
 321             --(it->second);
 322         else
 323             m_unifreq.erase(it);
 324     }
 325 }
 326
 327 bool
 328 CBigramHistory::seenBefore(uint32_t wid)
 329 {
 330     return(wid != DCWID && m_stopWords.find(wid) == m_stopWords.end() &&
 331            m_unifreq.find(wid) != m_unifreq.end());
 332 }
 333
 334 void
 335 CBigramHistory::decBiFreq(TBigram& bg)
 336 {
 337     TBigramPool::iterator it = m_bifreq.find(bg);
 338     if (it != m_bifreq.end()) {
 339         if (it->second > 1)
 340             --(it->second);
 341         else
 342             m_bifreq.erase(it);
 343     }
 344 }
 345
 346 void
 347 CBigramHistory::incUniFreq(TUnigram& ug)
 348 {
 349     ++m_unifreq[ug];
 350     //printf("Remebering uniFreq[%d]-->%d\n", ug, m_unifreq[ug]);
 351 }
 352
 353 void
 354 CBigramHistory::incBiFreq(TBigram& bg)
 355 {
 356     ++m_bifreq[bg];
 357     //printf("Remebering biFreq[%d,%d]-->%d\n", bg.first, bg.second, m_bifreq[bg]);
 358 }
 359
 360 // so far, it's very expensive to erase a word from bigram pairs, need to design
 361 // a better data structure for this.
 362 //
 363 // And Even though, we may also need to remove the individual characters in this
 364 // word (identified by wid), which is current infeasible,
 365 //
 366 // Here are what we need to do:
 367 //   1. get the wstring by word id from userdict
 368 //   2. iterate the character in this wstring
 369 //   3. get the word id from each character from system lexicon (not supported yet)
 370 //   4. remove the unigrams and bigrams of each character, and the entire word
 371 //
 372 void
 373 CBigramHistory::forget(uint32_t wid)
 374 {
 375     TUnigramPool::iterator uni_it = m_unifreq.find(wid);
 376     if (uni_it != m_unifreq.end())
 377         m_unifreq.erase(uni_it);
 378
 379     TBigramPool::iterator it = m_bifreq.begin();
 380     TBigramPool::iterator ite = m_bifreq.end();
 381
 382     while (it != ite) {
 383         TBigram bigram = it->first;
 384
 385         if (bigram.first == wid || bigram.second == wid)
 386             m_bifreq.erase(it++);
 387         else
 388             ++it;
 389     }
 390 }
 391
 392 void
 393 CBigramHistory::forget(uint32_t *its_wid, uint32_t *ite_wid)
 394 {
 395     for (; its_wid < ite_wid; ++its_wid) {
 396         TBigram bigram(*its_wid, DCWID);
 397
 398         if (its_wid + 1 != ite_wid)
 399             bigram.second = *(its_wid + 1);
 400
 401         TBigramPool::iterator it = m_bifreq.find(bigram);
 402         if (it != m_bifreq.end())
 403             m_bifreq.erase(it);
 404     }
 405 }
 406
 407 void
 408 CBigramHistory::addStopWords(const std::set<uint32_t>& stopWords)
 409 {
 410     m_stopWords.insert(stopWords.begin(), stopWords.end());
 411 }
 412
 413 void
 414 CBigramHistory::initStopWords()
 415 {
 416     m_stopWords.clear();
 417
 418     m_stopWords.insert(0);     //unknown world
 419     m_stopWords.insert(DCWID); //seperator word id used by history memory interanlly
 420 }