src/storage/phrase_index.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include "phrase_index.h"
  23 #include "pinyin_custom2.h"
  24
  25 using namespace pinyin;
  26
  27 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
  28     m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
  29     return true;
  30 }
  31
  32 bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
  33                                        guint32 & freq){
  34     guint8 phrase_length = get_phrase_length();
  35     table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
  36
  37     bool retval = m_chunk.get_content
  38         (offset, keys, phrase_length * sizeof(ChewingKey));
  39     if ( !retval )
  40         return retval;
  41     return m_chunk.get_content
  42         (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
  43 }
  44
  45 void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
  46     guint8 phrase_length = get_phrase_length();
  47     set_n_pronunciation(get_n_pronunciation() + 1);
  48     m_chunk.set_content(m_chunk.size(), keys,
  49                         phrase_length * sizeof(ChewingKey));
  50     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
  51 }
  52
  53 void PhraseItem::remove_nth_pronunciation(size_t index){
  54     guint8 phrase_length = get_phrase_length();
  55     set_n_pronunciation(get_n_pronunciation() - 1);
  56     size_t offset = phrase_item_header + phrase_length * sizeof ( ucs4_t ) +
  57         index * (phrase_length * sizeof (ChewingKey) + sizeof(guint32));
  58     m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
  59 }
  60
  61 bool PhraseItem::get_phrase_string(ucs4_t * phrase){
  62     guint8 phrase_length = get_phrase_length();
  63     return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
  64 }
  65
  66 bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
  67     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
  68     m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
  69     return true;
  70 }
  71
  72 void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
  73                                                     ChewingKey * keys,
  74                                                     gint32 delta){
  75     guint8 phrase_length = get_phrase_length();
  76     guint8 npron = get_n_pronunciation();
  77     size_t offset = phrase_item_header + phrase_length * sizeof ( ucs4_t );
  78     char * buf_begin = (char *) m_chunk.begin();
  79     guint32 total_freq = 0;
  80     for ( int i = 0 ; i < npron ; ++i){
  81         char * chewing_begin = buf_begin + offset +
  82             i * ( phrase_length * sizeof(ChewingKey) + sizeof(guint32) );
  83         guint32 * freq = (guint32 *)(chewing_begin +
  84                                      phrase_length * sizeof(ChewingKey));
  85         total_freq += *freq;
  86         if ( 0 == pinyin_compare_with_ambiguities2
  87              (options, keys,
  88               (ChewingKey *)chewing_begin, phrase_length) ){
  89             //protect against total_freq overflow.
  90             if ( delta > 0 && total_freq > total_freq + delta )
  91                 return;
  92             *freq += delta;
  93             total_freq += delta;
  94         }
  95     }
  96 }
  97
  98
  99 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
 100     return m_total_freq;
 101 }
 102
 103 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
 104     table_offset_t offset;
 105     guint32 freq;
 106     bool result = m_phrase_index.get_content
 107         ((token & PHRASE_MASK)
 108          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 109
 110     if ( !result )
 111         return ERROR_OUT_OF_RANGE;
 112
 113     if ( 0 == offset )
 114         return ERROR_NO_ITEM;
 115
 116     result = m_phrase_content.get_content
 117         (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 118
 119     if ( !result )
 120         return ERROR_FILE_CORRUPTION;
 121
 122     //protect total_freq overflow
 123     if ( delta > 0 && m_total_freq > m_total_freq + delta )
 124         return ERROR_INTEGER_OVERFLOW;
 125
 126     freq += delta;
 127     m_total_freq += delta;
 128     m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 129
 130     return ERROR_OK;
 131 }
 132
 133 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
 134     table_offset_t offset;
 135     guint8 phrase_length;
 136     guint8 n_prons;
 137
 138     bool result = m_phrase_index.get_content
 139         ((token & PHRASE_MASK)
 140          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 141
 142     if ( !result )
 143         return ERROR_OUT_OF_RANGE;
 144
 145     if ( 0 == offset )
 146         return ERROR_NO_ITEM;
 147
 148     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 149     if ( !result )
 150         return ERROR_FILE_CORRUPTION;
 151
 152     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 153     if ( !result )
 154         return ERROR_FILE_CORRUPTION;
 155
 156     size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
 157     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
 158     return ERROR_OK;
 159 }
 160
 161 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
 162     table_offset_t offset = m_phrase_content.size();
 163     if ( 0 == offset )
 164         offset = 8;
 165     m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
 166     m_phrase_index.set_content((token & PHRASE_MASK)
 167                                * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 168     m_total_freq += item->get_unigram_frequency();
 169     return ERROR_OK;
 170 }
 171
 172 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 173     PhraseItem old_item;
 174
 175     int result = get_phrase_item(token, old_item);
 176     if (result != ERROR_OK)
 177         return result;
 178
 179     item = new PhraseItem;
 180     //implictly copy data from m_chunk_content.
 181     item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
 182
 183     const table_offset_t zero_const = 0;
 184     m_phrase_index.set_content((token & PHRASE_MASK)
 185                                * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
 186     m_total_freq -= item->get_unigram_frequency();
 187     return ERROR_OK;
 188 }
 189
 190 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
 191     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 192     if ( !sub_phrases ){
 193         sub_phrases = new SubPhraseIndex;
 194     }
 195
 196     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 197     bool retval = sub_phrases->load(chunk, 0, chunk->size());
 198     if ( !retval )
 199         return retval;
 200     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 201     return retval;
 202 }
 203
 204 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
 205     table_offset_t end;
 206     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 207     if ( !sub_phrases )
 208         return false;
 209
 210     sub_phrases->store(new_chunk, 0, end);
 211     return true;
 212 }
 213
 214 bool FacadePhraseIndex::unload(guint8 phrase_index){
 215     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 216     if ( !sub_phrases )
 217         return false;
 218     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 219     delete sub_phrases;
 220     sub_phrases = NULL;
 221     return true;
 222 }
 223
 224 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
 225                              MemoryChunk * newlog){
 226     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 227     if ( !sub_phrases )
 228         return false;
 229
 230     SubPhraseIndex old_sub_phrases;
 231     old_sub_phrases.load(oldchunk, 0, oldchunk->size());
 232     PhraseIndexLogger logger;
 233
 234     bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
 235     logger.store(newlog);
 236     return retval;
 237 }
 238
 239 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
 240     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 241     if ( !sub_phrases )
 242         return false;
 243
 244     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 245     PhraseIndexLogger logger;
 246     logger.load(log);
 247
 248     bool retval = sub_phrases->merge(&logger);
 249     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 250
 251     return retval;
 252 }
 253
 254 bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index,
 255                                         MemoryChunk * log,
 256                                         phrase_token_t mask,
 257                                         phrase_token_t value){
 258     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 259     if ( !sub_phrases )
 260         return false;
 261
 262     /* check mask and value. */
 263     phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
 264     phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
 265     if ((phrase_index & index_mask) != index_value)
 266         return false;
 267
 268     /* unload old sub phrase index */
 269     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 270
 271     /* calculate the sub phrase index mask and value. */
 272     mask &= PHRASE_MASK; value &= PHRASE_MASK;
 273
 274     /* prepare the new logger. */
 275     PhraseIndexLogger oldlogger;
 276     oldlogger.load(log);
 277     PhraseIndexLogger * newlogger = mask_out_phrase_index_logger
 278         (&oldlogger, mask, value);
 279
 280     bool retval = sub_phrases->merge(newlogger);
 281     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 282     delete newlogger;
 283
 284     return retval;
 285 }
 286
 287
 288 bool SubPhraseIndex::load(MemoryChunk * chunk,
 289                           table_offset_t offset, table_offset_t end){
 290     //save the memory chunk
 291     if ( m_chunk ){
 292         delete m_chunk;
 293         m_chunk = NULL;
 294     }
 295     m_chunk = chunk;
 296
 297     char * buf_begin = (char *)chunk->begin();
 298     chunk->get_content(offset, &m_total_freq, sizeof(guint32));
 299     offset += sizeof(guint32);
 300     table_offset_t index_one, index_two, index_three;
 301     chunk->get_content(offset, &index_one, sizeof(table_offset_t));
 302     offset += sizeof(table_offset_t);
 303     chunk->get_content(offset, &index_two, sizeof(table_offset_t));
 304     offset += sizeof(table_offset_t);
 305     chunk->get_content(offset, &index_three, sizeof(table_offset_t));
 306     offset += sizeof(table_offset_t);
 307     g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
 308     g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
 309     g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
 310     m_phrase_index.set_chunk(buf_begin + index_one,
 311                              index_two - 1 - index_one, NULL);
 312     m_phrase_content.set_chunk(buf_begin + index_two,
 313                                index_three - 1 - index_two, NULL);
 314     g_return_val_if_fail( index_three <= end, FALSE);
 315     return true;
 316 }
 317
 318 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 319                            table_offset_t offset, table_offset_t& end){
 320     new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
 321     table_offset_t index = offset + sizeof(guint32);
 322
 323     offset = index + sizeof(table_offset_t) * 3 ;
 324     new_chunk->set_content(offset, &c_separate, sizeof(char));
 325     offset += sizeof(char);
 326
 327     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 328     index += sizeof(table_offset_t);
 329     new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
 330     offset += m_phrase_index.size();
 331     new_chunk->set_content(offset, &c_separate, sizeof(char));
 332     offset += sizeof(char);
 333
 334     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 335     index += sizeof(table_offset_t);
 336
 337     new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
 338     offset += m_phrase_content.size();
 339     new_chunk->set_content(offset, &c_separate, sizeof(char));
 340     offset += sizeof(char);
 341     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 342     return true;
 343 }
 344
 345 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
 346     /* diff the header */
 347     MemoryChunk oldheader, newheader;
 348     guint32 total_freq = oldone->get_phrase_index_total_freq();
 349     oldheader.set_content(0, &total_freq, sizeof(guint32));
 350     total_freq = get_phrase_index_total_freq();
 351     newheader.set_content(0, &total_freq, sizeof(guint32));
 352     logger->append_record(LOG_MODIFY_HEADER, null_token,
 353                           &oldheader, &newheader);
 354
 355     /* diff phrase items */
 356     PhraseIndexRange oldrange, currange, range;
 357     oldone->get_range(oldrange); get_range(currange);
 358     range.m_range_begin = std_lite::min(oldrange.m_range_begin,
 359                                         currange.m_range_begin);
 360     range.m_range_end = std_lite::max(oldrange.m_range_end,
 361                                       currange.m_range_end);
 362     PhraseItem olditem, newitem;
 363
 364     for (phrase_token_t token = range.m_range_begin;
 365          token < range.m_range_end; ++token ){
 366         bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
 367         bool newretval = ERROR_OK == get_phrase_item(token, newitem);
 368
 369         if ( oldretval ){
 370             if ( newretval ) { /* compare phrase item. */
 371                 if ( olditem == newitem )
 372                     continue;
 373                 logger->append_record(LOG_MODIFY_RECORD, token,
 374                                       &(olditem.m_chunk), &(newitem.m_chunk));
 375             } else { /* remove phrase item. */
 376                 logger->append_record(LOG_REMOVE_RECORD, token,
 377                                       &(olditem.m_chunk), NULL);
 378             }
 379         } else {
 380             if ( newretval ){ /* add phrase item. */
 381                 logger->append_record(LOG_ADD_RECORD, token,
 382                                       NULL, &(newitem.m_chunk));
 383             } else { /* both empty. */
 384                 /* do nothing. */
 385             }
 386         }
 387     }
 388
 389     return true;
 390 }
 391
 392 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
 393     LOG_TYPE log_type; phrase_token_t token;
 394     MemoryChunk oldchunk, newchunk;
 395     PhraseItem olditem, newitem, item, * tmpitem;
 396
 397     while(logger->has_next_record()){
 398         bool retval = logger->next_record
 399             (log_type, token, &oldchunk, &newchunk);
 400
 401         if (!retval)
 402             break;
 403
 404         switch(log_type){
 405         case LOG_ADD_RECORD:{
 406             assert( 0 == oldchunk.size() );
 407             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 408                                       NULL);
 409             add_phrase_item(token, &newitem);
 410             break;
 411         }
 412         case LOG_REMOVE_RECORD:{
 413             assert( 0 == newchunk.size() );
 414             tmpitem = NULL;
 415             remove_phrase_item(token, tmpitem);
 416
 417             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 418                                       NULL);
 419             if (olditem != *tmpitem)
 420                 return false;
 421             delete tmpitem;
 422
 423             break;
 424         }
 425         case LOG_MODIFY_RECORD:{
 426             get_phrase_item(token, item);
 427             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 428                                       NULL);
 429             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 430                                       NULL);
 431             if (item != olditem)
 432                 return false;
 433
 434             if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
 435                 tmpitem = NULL;
 436                 remove_phrase_item(token, tmpitem);
 437                 assert(olditem == *tmpitem);
 438                 add_phrase_item(token, &newitem);
 439                 delete tmpitem;
 440             } else { /* in place editing. */
 441                 /* newchunk.size() <= item.m_chunk.size() */
 442                 /* Hack here: we assume the behaviour of get_phrase_item
 443                  * point to the actual data positon, so changes to item
 444                  * will be saved in SubPhraseIndex immediately.
 445                  */
 446                 memmove(item.m_chunk.begin(), newchunk.begin(),
 447                         newchunk.size());
 448             }
 449             break;
 450         }
 451         case LOG_MODIFY_HEADER:{
 452             guint32 total_freq = get_phrase_index_total_freq();
 453             guint32 tmp_freq = 0;
 454             assert(null_token == token);
 455             assert(oldchunk.size() == newchunk.size());
 456             oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
 457             if (total_freq != tmp_freq)
 458                 return false;
 459             newchunk.get_content(0, &tmp_freq, sizeof(guint32));
 460             m_total_freq = tmp_freq;
 461             break;
 462         }
 463         default:
 464             assert(false);
 465         }
 466     }
 467     return true;
 468 }
 469
 470 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
 471     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 472     if ( !sub_phrases ){
 473         sub_phrases = new SubPhraseIndex;
 474     }
 475
 476     char pinyin[256];
 477     char phrase[256];
 478     phrase_token_t token;
 479     size_t freq;
 480     PhraseItem * item_ptr = new PhraseItem;
 481     phrase_token_t cur_token = 0;
 482     while ( !feof(infile)){
 483         fscanf(infile, "%s", pinyin);
 484         fscanf(infile, "%s", phrase);
 485         fscanf(infile, "%u", &token);
 486         fscanf(infile, "%ld", &freq);
 487         if ( feof(infile) )
 488             break;
 489
 490         assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
 491
 492         glong written;
 493         ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL,
 494                                               &written, NULL);
 495
 496         if ( 0 == cur_token ){
 497             cur_token = token;
 498             item_ptr->set_phrase_string(written, phrase_ucs4);
 499         }
 500
 501         if ( cur_token != token ){
 502             add_phrase_item( cur_token, item_ptr);
 503             delete item_ptr;
 504             item_ptr = new PhraseItem;
 505             cur_token = token;
 506             item_ptr->set_phrase_string(written, phrase_ucs4);
 507         }
 508
 509         pinyin_option_t options = USE_TONE;
 510         FullPinyinParser2 parser;
 511         ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
 512         ChewingKeyRestVector key_rests =
 513             g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
 514
 515         parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
 516
 517         if (item_ptr->get_phrase_length() == keys->len) {
 518             item_ptr->append_pronunciation((ChewingKey *)keys->data, freq);
 519         } else {
 520             fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
 521                     pinyin, phrase);
 522         }
 523
 524         g_array_free(keys, TRUE);
 525         g_array_free(key_rests, TRUE);
 526         g_free(phrase_ucs4);
 527     }
 528
 529     add_phrase_item( cur_token, item_ptr);
 530     delete item_ptr;
 531 #if 0
 532     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
 533 #endif
 534     return true;
 535 }
 536
 537 int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
 538                                             guint8 & max_index){
 539     min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
 540     for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
 541         if ( m_sub_phrase_indices[i] ) {
 542             min_index = std_lite::min(min_index, i);
 543             max_index = std_lite::max(max_index, i);
 544         }
 545     }
 546     return ERROR_OK;
 547 }
 548
 549 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
 550     SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
 551     if ( !sub_phrase )
 552         return ERROR_NO_SUB_PHRASE_INDEX;
 553
 554     int result = sub_phrase->get_range(range);
 555     if ( result )
 556         return result;
 557
 558     range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
 559     range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
 560     return ERROR_OK;
 561 }
 562
 563 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
 564     const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
 565     const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
 566
 567     /* remove trailing zeros. */
 568     const table_offset_t * poffset = 0;
 569     for (poffset = end - 1; poffset >= begin + 1; --poffset) {
 570         if (0 !=  *poffset)
 571             break;
 572     }
 573
 574     range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
 575     range.m_range_end = poffset + 1 - begin; /* removed zeros. */
 576
 577     return ERROR_OK;
 578 }
 579
 580 bool FacadePhraseIndex::compact(){
 581     for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
 582         SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
 583         if ( !sub_phrase )
 584             continue;
 585
 586         PhraseIndexRange range;
 587         int result = sub_phrase->get_range(range);
 588         if ( result != ERROR_OK )
 589             continue;
 590
 591         SubPhraseIndex * new_sub_phrase =  new SubPhraseIndex;
 592
 593         PhraseItem item;
 594         for ( phrase_token_t token = range.m_range_begin;
 595               token < range.m_range_end;
 596               ++token ) {
 597             result = sub_phrase->get_phrase_item(token, item);
 598             if ( result != ERROR_OK )
 599                 continue;
 600             new_sub_phrase->add_phrase_item(token, &item);
 601         }
 602
 603         delete sub_phrase;
 604         m_sub_phrase_indices[index] = new_sub_phrase;
 605     }
 606     return true;
 607 }
 608
 609 bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){
 610     PhraseIndexRange range;
 611     if (ERROR_OK != get_range(range))
 612         return false;
 613
 614     /* calculate mask and value for sub phrase index. */
 615     mask &= PHRASE_MASK; value &= PHRASE_MASK;
 616
 617     for (phrase_token_t token = range.m_range_begin;
 618          token < range.m_range_end; ++token) {
 619         if ((token & mask) != value)
 620             continue;
 621
 622         PhraseItem * item = NULL;
 623         remove_phrase_item(token, item);
 624         if (item)
 625             delete item;
 626     }
 627
 628     return true;
 629 }
 630
 631 bool FacadePhraseIndex::mask_out(guint8 phrase_index,
 632                                  phrase_token_t mask,
 633                                  phrase_token_t value){
 634     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 635     if (!sub_phrases)
 636         return false;
 637
 638     /* check mask and value. */
 639     phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
 640     phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
 641
 642     if ((phrase_index & index_mask ) != index_value)
 643         return false;
 644
 645     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 646     bool retval = sub_phrases->mask_out(mask, value);
 647     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 648
 649     return retval;
 650 }
 651
 652 namespace pinyin{
 653 const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT] =
 654     {
 655         {NULL, NULL, NULL, NOT_USED},
 656         {"gb_char.table", "gb_char.bin", "gb_char.dbin", SYSTEM_FILE},
 657         {"gbk_char.table", "gbk_char.bin", "gbk_char.dbin", SYSTEM_FILE},
 658         {NULL, NULL, NULL, NOT_USED},
 659         {NULL, NULL, NULL, NOT_USED},
 660
 661         {NULL, NULL, NULL, NOT_USED},
 662         {NULL, NULL, NULL, NOT_USED},
 663         {NULL, NULL, NULL, NOT_USED},
 664         {NULL, NULL, NULL, NOT_USED},
 665         {NULL, NULL, NULL, NOT_USED},
 666
 667         {NULL, NULL, NULL, NOT_USED},
 668         {NULL, NULL, NULL, NOT_USED},
 669         {NULL, NULL, NULL, NOT_USED},
 670         {NULL, NULL, NULL, NOT_USED},
 671         {NULL, NULL, NULL, NOT_USED},
 672
 673         {NULL, NULL, "user.bin", USER_FILE}
 674     };
 675
 676
 677 static bool _peek_header(PhraseIndexLogger * logger,
 678                          guint32 & old_total_freq){
 679     old_total_freq = 0;
 680
 681     size_t header_count = 0;
 682     LOG_TYPE log_type; phrase_token_t token;
 683     MemoryChunk oldchunk, newchunk;
 684
 685     while (logger->has_next_record()) {
 686         bool retval = logger->next_record
 687             (log_type, token, &oldchunk, &newchunk);
 688
 689         if (!retval)
 690             break;
 691
 692         if (LOG_MODIFY_HEADER != log_type)
 693             continue;
 694
 695         ++header_count;
 696
 697         oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
 698     }
 699
 700     /* 1 for normal case, 0 for corrupted file. */
 701     assert(1 >= header_count);
 702
 703     return  1 == header_count? true : false;
 704 }
 705
 706 bool _compute_new_header(PhraseIndexLogger * logger,
 707                          phrase_token_t mask,
 708                          phrase_token_t value,
 709                          guint32 & new_total_freq) {
 710
 711     LOG_TYPE log_type; phrase_token_t token;
 712     MemoryChunk oldchunk, newchunk;
 713     PhraseItem olditem, newitem;
 714
 715     while(logger->has_next_record()) {
 716         bool retval = logger->next_record
 717             (log_type, token, &oldchunk, &newchunk);
 718
 719         if (!retval)
 720             break;
 721
 722         if (LOG_MODIFY_HEADER == log_type)
 723             continue;
 724
 725         if ((token & mask) == value)
 726             continue;
 727
 728         switch(log_type) {
 729         case LOG_ADD_RECORD:{
 730             assert( 0 == oldchunk.size() );
 731             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 732                                       NULL);
 733             new_total_freq += newitem.get_unigram_frequency();
 734             break;
 735         }
 736         case LOG_REMOVE_RECORD:{
 737             assert( 0 == newchunk.size() );
 738             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 739                                       NULL);
 740             new_total_freq -= olditem.get_unigram_frequency();
 741             break;
 742         }
 743         case LOG_MODIFY_RECORD:{
 744             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 745                                       NULL);
 746             new_total_freq -= olditem.get_unigram_frequency();
 747
 748             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 749                                       NULL);
 750             new_total_freq += newitem.get_unigram_frequency();
 751             break;
 752         }
 753         default:
 754             assert(false);
 755         }
 756     }
 757
 758     return true;
 759 }
 760
 761 static bool _write_header(PhraseIndexLogger * logger,
 762                           guint32 & old_total_freq,
 763                           guint32 & new_total_freq) {
 764     MemoryChunk oldheader, newheader;
 765     oldheader.set_content(0, &old_total_freq, sizeof(guint32));
 766     newheader.set_content(0, &new_total_freq, sizeof(guint32));
 767     logger->append_record(LOG_MODIFY_HEADER, null_token,
 768                           &oldheader, &newheader);
 769     return true;
 770 }
 771
 772 static bool _mask_out_records(PhraseIndexLogger * oldlogger,
 773                               phrase_token_t mask,
 774                               phrase_token_t value,
 775                               PhraseIndexLogger * newlogger) {
 776     LOG_TYPE log_type; phrase_token_t token;
 777     MemoryChunk oldchunk, newchunk;
 778
 779     while(oldlogger->has_next_record()) {
 780         bool retval = oldlogger->next_record
 781             (log_type, token, &oldchunk, &newchunk);
 782
 783         if (!retval)
 784             break;
 785
 786         if (LOG_MODIFY_HEADER == log_type)
 787             continue;
 788
 789         if ((token & mask) == value)
 790             continue;
 791
 792         newlogger->append_record(log_type, token, &oldchunk, &newchunk);
 793     }
 794
 795     return true;
 796 }
 797
 798 PhraseIndexLogger * mask_out_phrase_index_logger
 799 (PhraseIndexLogger * oldlogger, phrase_token_t mask,
 800  phrase_token_t value) {
 801     PhraseIndexLogger * newlogger = new PhraseIndexLogger;
 802     guint32 old_total_freq = 0, new_total_freq = 0;
 803
 804     /* peek the header value. */
 805     if (!_peek_header(oldlogger, old_total_freq))
 806         return newlogger;
 807
 808     new_total_freq = old_total_freq;
 809
 810     /* compute the new header based on add/modify/remove records. */
 811     oldlogger->rewind();
 812     if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
 813         return newlogger;
 814
 815     /* write out the modify header record. */
 816     _write_header(newlogger, old_total_freq, new_total_freq);
 817
 818     /* mask out the matched records. */
 819     oldlogger->rewind();
 820     _mask_out_records(oldlogger, mask, value, newlogger);
 821
 822     return newlogger;
 823 }
 824
 825 };