src/storage/ngram.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include <stdio.h>
  23 #include <errno.h>
  24 #include <glib.h>
  25 #include <glib/gstdio.h>
  26 #include "memory_chunk.h"
  27 #include "novel_types.h"
  28 #include "ngram.h"
  29
  30 using namespace pinyin;
  31
  32 struct SingleGramItem{
  33     phrase_token_t m_token;
  34     guint32 m_freq;
  35 };
  36
  37 SingleGram::SingleGram(){
  38     m_chunk.set_size(sizeof(guint32));
  39     memset(m_chunk.begin(), 0, sizeof(guint32));
  40 }
  41
  42 SingleGram::SingleGram(void * buffer, size_t length){
  43     m_chunk.set_chunk(buffer, length, NULL);
  44 }
  45
  46 bool SingleGram::get_total_freq(guint32 & total) const{
  47     char * buf_begin = (char *)m_chunk.begin();
  48     total = *((guint32 *)buf_begin);
  49     return true;
  50 }
  51
  52 bool SingleGram::set_total_freq(guint32 total){
  53     char * buf_begin = (char *)m_chunk.begin();
  54     *((guint32 *)buf_begin) = total;
  55     return true;
  56 }
  57
  58 bool SingleGram::prune(){
  59     assert(false);
  60 #if 0
  61     SingleGramItem * begin = (SingleGramItem *)
  62         ((const char *)(m_chunk.begin()) + sizeof(guint32));
  63     SingleGramItem * end = (SingleGramItem *)m_chunk.end();
  64
  65     size_t nitem = 0;
  66     for ( SingleGramItem * cur = begin; cur != end; ++cur){
  67         cur->m_freq--;
  68         nitem++;
  69         if ( cur->m_freq == 0 ){
  70             size_t offset = sizeof(guint32) + (cur - begin)
  71                 * sizeof(SingleGramItem) ;
  72             m_chunk.remove_content(offset, sizeof(SingleGramItem));
  73         }
  74     }
  75     guint32 total_freq;
  76     assert(get_total_freq(total_freq));
  77     assert(set_total_freq(total_freq - nitem));
  78 #endif
  79         return true;
  80 }
  81
  82 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
  83     return lhs.m_token < rhs.m_token;
  84 }
  85
  86 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
  87     const {
  88     const SingleGramItem * begin = (const SingleGramItem *)
  89         ((const char *)(m_chunk.begin()) + sizeof(guint32));
  90     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
  91
  92     guint32 total_freq;
  93     BigramPhraseItemWithCount bigram_item_with_count;
  94     assert(get_total_freq(total_freq));
  95
  96     for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
  97         bigram_item_with_count.m_token = cur_item->m_token;
  98         bigram_item_with_count.m_count = cur_item->m_freq;
  99         bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
 100         g_array_append_val(array, bigram_item_with_count);
 101     }
 102
 103     return true;
 104 }
 105
 106 bool SingleGram::search(/* in */ PhraseIndexRange * range,
 107                         /* out */ BigramPhraseArray array) const {
 108     const SingleGramItem * begin = (const SingleGramItem *)
 109         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 110     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
 111
 112     SingleGramItem compare_item;
 113     compare_item.m_token = range->m_range_begin;
 114     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 115
 116     guint32 total_freq;
 117     BigramPhraseItem bigram_item;
 118     assert(get_total_freq(total_freq));
 119
 120     for ( ; cur_item != end; ++cur_item){
 121         if ( cur_item->m_token >= range->m_range_end )
 122             break;
 123         bigram_item.m_token = cur_item->m_token;
 124         bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
 125         g_array_append_val(array, bigram_item);
 126     }
 127
 128     return true;
 129 }
 130
 131 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
 132                               /* in */ guint32 freq){
 133     SingleGramItem * begin = (SingleGramItem *)
 134         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 135     SingleGramItem * end = (SingleGramItem *) m_chunk.end();
 136     SingleGramItem compare_item;
 137     compare_item.m_token = token;
 138     SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 139
 140     SingleGramItem insert_item;
 141     insert_item.m_token = token;
 142     insert_item.m_freq = freq;
 143     for ( ; cur_item != end; ++cur_item ){
 144         if ( cur_item->m_token > token ){
 145             size_t offset = sizeof(guint32) +
 146                 sizeof(SingleGramItem) * (cur_item - begin);
 147             m_chunk.insert_content(offset, &insert_item,
 148                                    sizeof(SingleGramItem));
 149             return true;
 150         }
 151         if ( cur_item->m_token == token ){
 152             return false;
 153         }
 154     }
 155     m_chunk.insert_content(m_chunk.size(), &insert_item,
 156                            sizeof(SingleGramItem));
 157     return true;
 158 }
 159
 160 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
 161                               /* out */ guint32 & freq){
 162     freq = 0;
 163     const SingleGramItem * begin = (const SingleGramItem *)
 164         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 165     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
 166     SingleGramItem compare_item;
 167     compare_item.m_token = token;
 168     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 169
 170     for ( ; cur_item != end; ++cur_item ){
 171         if ( cur_item->m_token > token )
 172             return false;
 173         if ( cur_item->m_token == token ){
 174             freq = cur_item -> m_freq;
 175             size_t offset = sizeof(guint32) +
 176                 sizeof(SingleGramItem) * (cur_item - begin);
 177             m_chunk.remove_content(offset, sizeof(SingleGramItem));
 178             return true;
 179         }
 180     }
 181     return false;
 182 }
 183
 184 bool SingleGram::get_freq(/* in */ phrase_token_t token,
 185                           /* out */ guint32 & freq) const {
 186     freq = 0;
 187     const SingleGramItem * begin = (const SingleGramItem *)
 188         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 189     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
 190     SingleGramItem compare_item;
 191     compare_item.m_token = token;
 192     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 193
 194     for ( ; cur_item != end; ++cur_item){
 195         if ( cur_item->m_token > token )
 196             return false;
 197         if ( cur_item->m_token == token ){
 198             freq = cur_item -> m_freq;
 199             return true;
 200         }
 201     }
 202     return false;
 203 }
 204
 205 bool SingleGram::set_freq( /* in */ phrase_token_t token,
 206                            /* in */ guint32 freq){
 207     SingleGramItem * begin = (SingleGramItem *)
 208         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 209     SingleGramItem * end = (SingleGramItem *)m_chunk.end();
 210     SingleGramItem compare_item;
 211     compare_item.m_token = token;
 212     SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 213
 214     for ( ;cur_item != end; ++cur_item){
 215         if ( cur_item->m_token > token ){
 216             return false;
 217         }
 218         if ( cur_item->m_token == token ){
 219             cur_item -> m_freq = freq;
 220             return true;
 221         }
 222     }
 223     return false;
 224 }
 225
 226 bool Bigram::load_db(const char * dbfile){
 227     reset();
 228
 229     /* create in memory db. */
 230     int ret = db_create(&m_db, NULL, 0);
 231     assert(ret == 0);
 232
 233     ret = m_db->open(m_db, NULL, NULL, NULL,
 234                      DB_HASH, DB_CREATE, 0600);
 235     if ( ret != 0 )
 236         return false;
 237
 238     /* load db into memory. */
 239     DB * tmp_db = NULL;
 240     ret = db_create(&tmp_db, NULL, 0);
 241     assert(ret == 0);
 242
 243     ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
 244                        DB_HASH, DB_RDONLY, 0600);
 245     if ( ret != 0 )
 246         return false;
 247
 248     DBC * cursorp = NULL;
 249     DBT key, data;
 250     /* Get a cursor */
 251     tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
 252
 253     /* Initialize our DBTs. */
 254     memset(&key, 0, sizeof(DBT));
 255     memset(&data, 0, sizeof(DBT));
 256
 257     /* Iterate over the database, retrieving each record in turn. */
 258     while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
 259         int ret = m_db->put(m_db, NULL, &key, &data, 0);
 260         assert(ret == 0);
 261     }
 262     assert (ret == DB_NOTFOUND);
 263
 264     /* Cursors must be closed */
 265     if ( cursorp != NULL )
 266         cursorp->c_close(cursorp);
 267
 268     if ( tmp_db != NULL )
 269         tmp_db->close(tmp_db, 0);
 270
 271     return true;
 272 }
 273
 274 bool Bigram::save_db(const char * dbfile){
 275     DB * tmp_db = NULL;
 276
 277     int ret = g_unlink(dbfile);
 278     if ( ret != 0 && errno != ENOENT)
 279         return false;
 280
 281     ret = db_create(&tmp_db, NULL, 0);
 282     assert(ret == 0);
 283
 284     ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
 285                        DB_HASH, DB_CREATE, 0600);
 286     if ( ret != 0 )
 287         return false;
 288
 289     DBC * cursorp = NULL;
 290     DBT key, data;
 291     /* Get a cursor */
 292     m_db->cursor(m_db, NULL, &cursorp, 0);
 293
 294     /* Initialize our DBTs. */
 295     memset(&key, 0, sizeof(DBT));
 296     memset(&data, 0, sizeof(DBT));
 297
 298     /* Iterate over the database, retrieving each record in turn. */
 299     while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
 300         int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
 301         assert(ret == 0);
 302     }
 303     assert (ret == DB_NOTFOUND);
 304
 305     /* Cursors must be closed */
 306     if ( cursorp != NULL )
 307         cursorp->c_close(cursorp);
 308
 309     if ( tmp_db != NULL )
 310         tmp_db->close(tmp_db, 0);
 311
 312     return true;
 313 }
 314
 315 bool Bigram::attach(const char * dbfile, guint32 flags){
 316     reset();
 317     u_int32_t db_flags = 0;
 318
 319     if ( flags & ATTACH_READONLY )
 320         db_flags |= DB_RDONLY;
 321     if ( flags & ATTACH_READWRITE )
 322         assert( !( flags & ATTACH_READONLY ) );
 323     if ( flags & ATTACH_CREATE )
 324         db_flags |= DB_CREATE;
 325
 326     if ( !dbfile )
 327         return false;
 328     int ret = db_create(&m_db, NULL, 0);
 329     if ( ret != 0 )
 330         assert(false);
 331
 332     ret = m_db->open(m_db, NULL, dbfile, NULL,
 333                      DB_HASH, db_flags, 0644);
 334     if ( ret != 0)
 335         return false;
 336
 337     return true;
 338 }
 339
 340 bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
 341     single_gram = NULL;
 342     if ( !m_db )
 343         return false;
 344
 345     DBT db_key;
 346     memset(&db_key, 0, sizeof(DBT));
 347     db_key.data = &index;
 348     db_key.size = sizeof(phrase_token_t);
 349
 350     DBT db_data;
 351     memset(&db_data, 0, sizeof(DBT));
 352     int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
 353     if ( ret != 0 )
 354         return false;
 355
 356     single_gram = new SingleGram(db_data.data, db_data.size);
 357     return true;
 358 }
 359
 360 bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
 361     if ( !m_db )
 362         return false;
 363
 364     DBT db_key;
 365     memset(&db_key, 0, sizeof(DBT));
 366     db_key.data = &index;
 367     db_key.size = sizeof(phrase_token_t);
 368     DBT db_data;
 369     memset(&db_data, 0, sizeof(DBT));
 370     db_data.data = single_gram->m_chunk.begin();
 371     db_data.size = single_gram->m_chunk.size();
 372
 373     int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
 374     return ret == 0;
 375 }
 376
 377 bool Bigram::get_all_items(GArray * items){
 378     g_array_set_size(items, 0);
 379
 380     if ( !m_db )
 381         return false;
 382
 383     DBC * cursorp = NULL;
 384     DBT key, data;
 385     int ret;
 386     /* Get a cursor */
 387     m_db->cursor(m_db, NULL, &cursorp, 0);
 388
 389     /* Initialize our DBTs. */
 390     memset(&key, 0, sizeof(DBT));
 391     memset(&data, 0, sizeof(DBT));
 392
 393     /* Iterate over the database, retrieving each record in turn. */
 394     while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
 395         assert(key.size == sizeof(phrase_token_t));
 396         phrase_token_t * token = (phrase_token_t *)key.data;
 397         g_array_append_val(items, *token);
 398     }
 399
 400     assert (ret == DB_NOTFOUND);
 401
 402     /* Cursors must be closed */
 403     if (cursorp != NULL)
 404         cursorp->c_close(cursorp);
 405
 406     return true;
 407 }
 408
 409
 410 namespace pinyin{
 411
 412 /* merge origin system info and delta user info */
 413 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
 414                        const SingleGram * user){
 415     if (NULL == system && NULL == user)
 416         return false;
 417
 418     MemoryChunk & merged_chunk = merged->m_chunk;
 419
 420     if (NULL == system) {
 421         merged_chunk.set_chunk(user->m_chunk.begin(),
 422                                user->m_chunk.size(), NULL);
 423         return true;
 424     }
 425
 426     if (NULL == user) {
 427         merged_chunk.set_chunk(system->m_chunk.begin(),
 428                                system->m_chunk.size(), NULL);
 429         return true;
 430     }
 431
 432     /* clear merged. */
 433     merged_chunk.set_size(sizeof(guint32));
 434
 435     /* merge the origin info and delta info */
 436     guint32 system_total, user_total;
 437     assert(system->get_total_freq(system_total));
 438     assert(user->get_total_freq(user_total));
 439     const guint32 merged_total = system_total + user_total;
 440     merged_chunk.set_content(0, &merged_total, sizeof(guint32));
 441
 442     const SingleGramItem * cur_system = (const SingleGramItem *)
 443         (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
 444     const SingleGramItem * system_end = (const SingleGramItem *)
 445         system->m_chunk.end();
 446
 447     const SingleGramItem * cur_user = (const SingleGramItem *)
 448         (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
 449     const SingleGramItem * user_end = (const SingleGramItem *)
 450         user->m_chunk.end();
 451
 452     while (cur_system < system_end && cur_user < user_end) {
 453
 454         if (cur_system->m_token < cur_user->m_token) {
 455             /* do append operation here */
 456             merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
 457             cur_system++;
 458         } else if (cur_system->m_token > cur_user->m_token) {
 459             /* do append operation here */
 460             merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
 461             cur_user++;
 462         } else {
 463             assert(cur_system->m_token == cur_user->m_token);
 464
 465             SingleGramItem merged_item;
 466             merged_item.m_token = cur_system->m_token;
 467             merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
 468
 469             merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
 470             cur_system++; cur_user++;
 471         }
 472     }
 473
 474     /* add remained items. */
 475     while (cur_system < system_end) {
 476         merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
 477         cur_system++;
 478     }
 479
 480     while (cur_user < user_end) {
 481         merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
 482         cur_user++;
 483     }
 484
 485     return true;
 486 }
 487
 488 };