src/storage/ngram.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include <stdio.h>
  23 #include <errno.h>
  24 #include <glib.h>
  25 #include <glib/gstdio.h>
  26 #include "memory_chunk.h"
  27 #include "novel_types.h"
  28 #include "ngram.h"
  29
  30 using namespace pinyin;
  31
  32 struct SingleGramItem{
  33     phrase_token_t m_token;
  34     guint32 m_freq;
  35 };
  36
  37 SingleGram::SingleGram(){
  38     m_chunk.set_size(sizeof(guint32));
  39     memset(m_chunk.begin(), 0, sizeof(guint32));
  40 }
  41
  42 SingleGram::SingleGram(void * buffer, size_t length){
  43     m_chunk.set_chunk(buffer, length, NULL);
  44 }
  45
  46 bool SingleGram::get_total_freq(guint32 & total) const{
  47     char * buf_begin = (char *)m_chunk.begin();
  48     total = *((guint32 *)buf_begin);
  49     return true;
  50 }
  51
  52 bool SingleGram::set_total_freq(guint32 total){
  53     char * buf_begin = (char *)m_chunk.begin();
  54     *((guint32 *)buf_begin) = total;
  55     return true;
  56 }
  57
  58 guint32 SingleGram::get_length(){
  59     /* get the number of items. */
  60     const SingleGramItem * begin = (const SingleGramItem *)
  61         ((const char *)(m_chunk.begin()) + sizeof(guint32));
  62     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
  63
  64     const guint32 length = end - begin;
  65
  66     if (0 == length) {
  67         /* no items here, total freq should be zero. */
  68         guint32 total_freq = 0;
  69         assert(get_total_freq(total_freq));
  70         assert(0 == total_freq);
  71     }
  72
  73     return length;
  74 }
  75
  76 guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
  77     guint32 removed_items = 0;
  78
  79     guint32 total_freq = 0;
  80     assert(get_total_freq(total_freq));
  81
  82     const SingleGramItem * begin = (const SingleGramItem *)
  83         ((const char *)(m_chunk.begin()) + sizeof(guint32));
  84     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
  85
  86     for (const SingleGramItem * cur = begin; cur != end; ++cur) {
  87         if ((cur->m_token & mask) != value)
  88             continue;
  89
  90         total_freq -= cur->m_freq;
  91         size_t offset = sizeof(guint32) +
  92             sizeof(SingleGramItem) * (cur - begin);
  93         m_chunk.remove_content(offset, sizeof(SingleGramItem));
  94
  95         /* update chunk end. */
  96         end = (const SingleGramItem *) m_chunk.end();
  97         ++removed_items;
  98         --cur;
  99     }
 100
 101     assert(set_total_freq(total_freq));
 102     return removed_items;
 103 }
 104
 105 bool SingleGram::prune(){
 106     assert(false);
 107 #if 0
 108     SingleGramItem * begin = (SingleGramItem *)
 109         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 110     SingleGramItem * end = (SingleGramItem *)m_chunk.end();
 111
 112     size_t nitem = 0;
 113     for ( SingleGramItem * cur = begin; cur != end; ++cur){
 114         cur->m_freq--;
 115         nitem++;
 116         if ( cur->m_freq == 0 ){
 117             size_t offset = sizeof(guint32) + (cur - begin)
 118                 * sizeof(SingleGramItem) ;
 119             m_chunk.remove_content(offset, sizeof(SingleGramItem));
 120         }
 121     }
 122     guint32 total_freq;
 123     assert(get_total_freq(total_freq));
 124     assert(set_total_freq(total_freq - nitem));
 125 #endif
 126         return true;
 127 }
 128
 129 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
 130     return lhs.m_token < rhs.m_token;
 131 }
 132
 133 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
 134     const {
 135     const SingleGramItem * begin = (const SingleGramItem *)
 136         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 137     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
 138
 139     guint32 total_freq;
 140     BigramPhraseItemWithCount bigram_item_with_count;
 141     assert(get_total_freq(total_freq));
 142
 143     for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
 144         bigram_item_with_count.m_token = cur_item->m_token;
 145         bigram_item_with_count.m_count = cur_item->m_freq;
 146         bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
 147         g_array_append_val(array, bigram_item_with_count);
 148     }
 149
 150     return true;
 151 }
 152
 153 bool SingleGram::search(/* in */ PhraseIndexRange * range,
 154                         /* out */ BigramPhraseArray array) const {
 155     const SingleGramItem * begin = (const SingleGramItem *)
 156         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 157     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
 158
 159     SingleGramItem compare_item;
 160     compare_item.m_token = range->m_range_begin;
 161     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 162
 163     guint32 total_freq;
 164     BigramPhraseItem bigram_item;
 165     assert(get_total_freq(total_freq));
 166
 167     for ( ; cur_item != end; ++cur_item){
 168         if ( cur_item->m_token >= range->m_range_end )
 169             break;
 170         bigram_item.m_token = cur_item->m_token;
 171         bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
 172         g_array_append_val(array, bigram_item);
 173     }
 174
 175     return true;
 176 }
 177
 178 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
 179                               /* in */ guint32 freq){
 180     SingleGramItem * begin = (SingleGramItem *)
 181         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 182     SingleGramItem * end = (SingleGramItem *) m_chunk.end();
 183     SingleGramItem compare_item;
 184     compare_item.m_token = token;
 185     SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 186
 187     SingleGramItem insert_item;
 188     insert_item.m_token = token;
 189     insert_item.m_freq = freq;
 190     for ( ; cur_item != end; ++cur_item ){
 191         if ( cur_item->m_token > token ){
 192             size_t offset = sizeof(guint32) +
 193                 sizeof(SingleGramItem) * (cur_item - begin);
 194             m_chunk.insert_content(offset, &insert_item,
 195                                    sizeof(SingleGramItem));
 196             return true;
 197         }
 198         if ( cur_item->m_token == token ){
 199             return false;
 200         }
 201     }
 202     m_chunk.insert_content(m_chunk.size(), &insert_item,
 203                            sizeof(SingleGramItem));
 204     return true;
 205 }
 206
 207 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
 208                               /* out */ guint32 & freq){
 209     freq = 0;
 210     const SingleGramItem * begin = (const SingleGramItem *)
 211         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 212     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
 213     SingleGramItem compare_item;
 214     compare_item.m_token = token;
 215     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 216
 217     for ( ; cur_item != end; ++cur_item ){
 218         if ( cur_item->m_token > token )
 219             return false;
 220         if ( cur_item->m_token == token ){
 221             freq = cur_item -> m_freq;
 222             size_t offset = sizeof(guint32) +
 223                 sizeof(SingleGramItem) * (cur_item - begin);
 224             m_chunk.remove_content(offset, sizeof(SingleGramItem));
 225             return true;
 226         }
 227     }
 228     return false;
 229 }
 230
 231 bool SingleGram::get_freq(/* in */ phrase_token_t token,
 232                           /* out */ guint32 & freq) const {
 233     freq = 0;
 234     const SingleGramItem * begin = (const SingleGramItem *)
 235         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 236     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
 237     SingleGramItem compare_item;
 238     compare_item.m_token = token;
 239     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 240
 241     for ( ; cur_item != end; ++cur_item){
 242         if ( cur_item->m_token > token )
 243             return false;
 244         if ( cur_item->m_token == token ){
 245             freq = cur_item -> m_freq;
 246             return true;
 247         }
 248     }
 249     return false;
 250 }
 251
 252 bool SingleGram::set_freq( /* in */ phrase_token_t token,
 253                            /* in */ guint32 freq){
 254     SingleGramItem * begin = (SingleGramItem *)
 255         ((const char *)(m_chunk.begin()) + sizeof(guint32));
 256     SingleGramItem * end = (SingleGramItem *)m_chunk.end();
 257     SingleGramItem compare_item;
 258     compare_item.m_token = token;
 259     SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
 260
 261     for ( ;cur_item != end; ++cur_item){
 262         if ( cur_item->m_token > token ){
 263             return false;
 264         }
 265         if ( cur_item->m_token == token ){
 266             cur_item -> m_freq = freq;
 267             return true;
 268         }
 269     }
 270     return false;
 271 }
 272
 273 bool Bigram::load_db(const char * dbfile){
 274     reset();
 275
 276     /* create in memory db. */
 277     int ret = db_create(&m_db, NULL, 0);
 278     assert(ret == 0);
 279
 280     ret = m_db->open(m_db, NULL, NULL, NULL,
 281                      DB_HASH, DB_CREATE, 0600);
 282     if ( ret != 0 )
 283         return false;
 284
 285     /* load db into memory. */
 286     DB * tmp_db = NULL;
 287     ret = db_create(&tmp_db, NULL, 0);
 288     assert(ret == 0);
 289
 290     if (NULL == tmp_db)
 291         return false;
 292
 293     ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
 294                        DB_HASH, DB_RDONLY, 0600);
 295     if ( ret != 0 )
 296         return false;
 297
 298     DBC * cursorp = NULL;
 299     DBT key, data;
 300
 301     /* Get a cursor */
 302     tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
 303
 304     if (NULL == cursorp)
 305         return false;
 306
 307     /* Initialize our DBTs. */
 308     memset(&key, 0, sizeof(DBT));
 309     memset(&data, 0, sizeof(DBT));
 310
 311     /* Iterate over the database, retrieving each record in turn. */
 312     while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
 313         int ret = m_db->put(m_db, NULL, &key, &data, 0);
 314         assert(ret == 0);
 315     }
 316     assert (ret == DB_NOTFOUND);
 317
 318     /* Cursors must be closed */
 319     if ( cursorp != NULL )
 320         cursorp->c_close(cursorp);
 321
 322     if ( tmp_db != NULL )
 323         tmp_db->close(tmp_db, 0);
 324
 325     return true;
 326 }
 327
 328 bool Bigram::save_db(const char * dbfile){
 329     DB * tmp_db = NULL;
 330
 331     int ret = unlink(dbfile);
 332     if ( ret != 0 && errno != ENOENT)
 333         return false;
 334
 335     ret = db_create(&tmp_db, NULL, 0);
 336     assert(ret == 0);
 337
 338     if (NULL == tmp_db)
 339         return false;
 340
 341     ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
 342                        DB_HASH, DB_CREATE, 0600);
 343     if ( ret != 0 )
 344         return false;
 345
 346     DBC * cursorp = NULL;
 347     DBT key, data;
 348     /* Get a cursor */
 349     m_db->cursor(m_db, NULL, &cursorp, 0);
 350
 351     if (NULL == cursorp)
 352         return false;
 353
 354     /* Initialize our DBTs. */
 355     memset(&key, 0, sizeof(DBT));
 356     memset(&data, 0, sizeof(DBT));
 357
 358     /* Iterate over the database, retrieving each record in turn. */
 359     while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
 360         int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
 361         assert(ret == 0);
 362     }
 363     assert (ret == DB_NOTFOUND);
 364
 365     /* Cursors must be closed */
 366     if ( cursorp != NULL )
 367         cursorp->c_close(cursorp);
 368
 369     if ( tmp_db != NULL )
 370         tmp_db->close(tmp_db, 0);
 371
 372     return true;
 373 }
 374
 375 bool Bigram::attach(const char * dbfile, guint32 flags){
 376     reset();
 377     u_int32_t db_flags = 0;
 378
 379     if ( flags & ATTACH_READONLY )
 380         db_flags |= DB_RDONLY;
 381     if ( flags & ATTACH_READWRITE )
 382         assert( !( flags & ATTACH_READONLY ) );
 383     if ( flags & ATTACH_CREATE )
 384         db_flags |= DB_CREATE;
 385
 386     if ( !dbfile )
 387         return false;
 388     int ret = db_create(&m_db, NULL, 0);
 389     if ( ret != 0 )
 390         assert(false);
 391
 392     ret = m_db->open(m_db, NULL, dbfile, NULL,
 393                      DB_HASH, db_flags, 0644);
 394     if ( ret != 0)
 395         return false;
 396
 397     return true;
 398 }
 399
 400 bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
 401     single_gram = NULL;
 402     if ( !m_db )
 403         return false;
 404
 405     DBT db_key;
 406     memset(&db_key, 0, sizeof(DBT));
 407     db_key.data = &index;
 408     db_key.size = sizeof(phrase_token_t);
 409
 410     DBT db_data;
 411     memset(&db_data, 0, sizeof(DBT));
 412     int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
 413     if ( ret != 0 )
 414         return false;
 415
 416     single_gram = new SingleGram(db_data.data, db_data.size);
 417     return true;
 418 }
 419
 420 bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
 421     if ( !m_db )
 422         return false;
 423
 424     DBT db_key;
 425     memset(&db_key, 0, sizeof(DBT));
 426     db_key.data = &index;
 427     db_key.size = sizeof(phrase_token_t);
 428     DBT db_data;
 429     memset(&db_data, 0, sizeof(DBT));
 430     db_data.data = single_gram->m_chunk.begin();
 431     db_data.size = single_gram->m_chunk.size();
 432
 433     int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
 434     return ret == 0;
 435 }
 436
 437 bool Bigram::remove(/* in */ phrase_token_t index){
 438     if ( !m_db )
 439         return false;
 440
 441     DBT db_key;
 442     memset(&db_key, 0, sizeof(DBT));
 443     db_key.data = &index;
 444     db_key.size = sizeof(phrase_token_t);
 445
 446     int ret = m_db->del(m_db, NULL, &db_key, 0);
 447     return 0 == ret;
 448 }
 449
 450 bool Bigram::get_all_items(GArray * items){
 451     g_array_set_size(items, 0);
 452
 453     if ( !m_db )
 454         return false;
 455
 456     DBC * cursorp = NULL;
 457     DBT key, data;
 458     int ret;
 459     /* Get a cursor */
 460     m_db->cursor(m_db, NULL, &cursorp, 0);
 461
 462     if (NULL == cursorp)
 463         return false;
 464
 465     /* Initialize our DBTs. */
 466     memset(&key, 0, sizeof(DBT));
 467     memset(&data, 0, sizeof(DBT));
 468
 469     /* Iterate over the database, retrieving each record in turn. */
 470     while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
 471         assert(key.size == sizeof(phrase_token_t));
 472         phrase_token_t * token = (phrase_token_t *)key.data;
 473         g_array_append_val(items, *token);
 474     }
 475
 476     assert (ret == DB_NOTFOUND);
 477
 478     /* Cursors must be closed */
 479     if (cursorp != NULL)
 480         cursorp->c_close(cursorp);
 481
 482     return true;
 483 }
 484
 485 bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
 486     GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 487
 488     if (!get_all_items(items)) {
 489         g_array_free(items, TRUE);
 490         return false;
 491     }
 492
 493     for (size_t i = 0; i < items->len; ++i) {
 494         phrase_token_t index = g_array_index(items, phrase_token_t, i);
 495
 496         if ((index & mask) == value) {
 497             assert(remove(index));
 498             continue;
 499         }
 500
 501         SingleGram * gram = NULL;
 502         assert(load(index, gram));
 503
 504         int num = gram->mask_out(mask, value);
 505         if (0 == num) {
 506             delete gram;
 507             continue;
 508         }
 509
 510         if (0 == gram->get_length()) {
 511             assert(remove(index));
 512         } else {
 513             assert(store(index, gram));
 514         }
 515
 516         delete gram;
 517     }
 518
 519     g_array_free(items, TRUE);
 520     return true;
 521 }
 522
 523
 524 namespace pinyin{
 525
 526 /* merge origin system info and delta user info */
 527 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
 528                        const SingleGram * user){
 529     if (NULL == system && NULL == user)
 530         return false;
 531
 532     MemoryChunk & merged_chunk = merged->m_chunk;
 533
 534     if (NULL == system) {
 535         merged_chunk.set_chunk(user->m_chunk.begin(),
 536                                user->m_chunk.size(), NULL);
 537         return true;
 538     }
 539
 540     if (NULL == user) {
 541         merged_chunk.set_chunk(system->m_chunk.begin(),
 542                                system->m_chunk.size(), NULL);
 543         return true;
 544     }
 545
 546     /* clear merged. */
 547     merged_chunk.set_size(sizeof(guint32));
 548
 549     /* merge the origin info and delta info */
 550     guint32 system_total, user_total;
 551     assert(system->get_total_freq(system_total));
 552     assert(user->get_total_freq(user_total));
 553     const guint32 merged_total = system_total + user_total;
 554     merged_chunk.set_content(0, &merged_total, sizeof(guint32));
 555
 556     const SingleGramItem * cur_system = (const SingleGramItem *)
 557         (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
 558     const SingleGramItem * system_end = (const SingleGramItem *)
 559         system->m_chunk.end();
 560
 561     const SingleGramItem * cur_user = (const SingleGramItem *)
 562         (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
 563     const SingleGramItem * user_end = (const SingleGramItem *)
 564         user->m_chunk.end();
 565
 566     while (cur_system < system_end && cur_user < user_end) {
 567
 568         if (cur_system->m_token < cur_user->m_token) {
 569             /* do append operation here */
 570             merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
 571             cur_system++;
 572         } else if (cur_system->m_token > cur_user->m_token) {
 573             /* do append operation here */
 574             merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
 575             cur_user++;
 576         } else {
 577             assert(cur_system->m_token == cur_user->m_token);
 578
 579             SingleGramItem merged_item;
 580             merged_item.m_token = cur_system->m_token;
 581             merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
 582
 583             merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
 584             cur_system++; cur_user++;
 585         }
 586     }
 587
 588     /* add remained items. */
 589     while (cur_system < system_end) {
 590         merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
 591         cur_system++;
 592     }
 593
 594     while (cur_user < user_end) {
 595         merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
 596         cur_user++;
 597     }
 598
 599     return true;
 600 }
 601
 602 };