src/storage/phrase_index.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include "phrase_index.h"
  23 #include "pinyin_custom2.h"
  24
  25 using namespace pinyin;
  26
  27 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
  28     m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
  29     return true;
  30 }
  31
  32 bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
  33                                        guint32 & freq){
  34     guint8 phrase_length = get_phrase_length();
  35     table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
  36
  37     bool retval = m_chunk.get_content
  38         (offset, keys, phrase_length * sizeof(ChewingKey));
  39     if ( !retval )
  40         return retval;
  41     return m_chunk.get_content
  42         (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
  43 }
  44
  45 #if 0
  46 void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
  47     guint8 phrase_length = get_phrase_length();
  48     set_n_pronunciation(get_n_pronunciation() + 1);
  49     m_chunk.set_content(m_chunk.size(), keys,
  50                         phrase_length * sizeof(ChewingKey));
  51     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
  52 }
  53 #endif
  54
  55 bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
  56     guint8 phrase_length = get_phrase_length();
  57     guint8 npron = get_n_pronunciation();
  58     size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
  59     char * buf_begin = (char *) m_chunk.begin();
  60     guint32 total_freq = 0;
  61
  62     for (int i = 0; i < npron; ++i) {
  63         char * chewing_begin = buf_begin + offset +
  64             i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
  65         guint32 * freq = (guint32 *)(chewing_begin +
  66                                      phrase_length * sizeof(ChewingKey));
  67
  68         total_freq += *freq;
  69
  70         if (0 == pinyin_exact_compare2
  71             (keys, (ChewingKey *)chewing_begin, phrase_length)) {
  72             /* found the exact match pinyin keys. */
  73
  74             /* protect against total_freq overflow. */
  75             if (delta > 0 && total_freq > total_freq + delta)
  76                 return false;
  77
  78             *freq += delta;
  79             total_freq += delta;
  80             return true;
  81         }
  82     }
  83
  84     set_n_pronunciation(npron + 1);
  85     m_chunk.set_content(m_chunk.size(), keys,
  86                         phrase_length * sizeof(ChewingKey));
  87     m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32));
  88     return true;
  89 }
  90
  91 void PhraseItem::remove_nth_pronunciation(size_t index){
  92     guint8 phrase_length = get_phrase_length();
  93     set_n_pronunciation(get_n_pronunciation() - 1);
  94     size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t) +
  95         index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
  96     m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
  97 }
  98
  99 bool PhraseItem::get_phrase_string(ucs4_t * phrase){
 100     guint8 phrase_length = get_phrase_length();
 101     return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
 102 }
 103
 104 bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
 105     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
 106     m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
 107     return true;
 108 }
 109
 110 void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
 111                                                     ChewingKey * keys,
 112                                                     gint32 delta){
 113     guint8 phrase_length = get_phrase_length();
 114     guint8 npron = get_n_pronunciation();
 115     size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
 116     char * buf_begin = (char *) m_chunk.begin();
 117     guint32 total_freq = 0;
 118
 119     for (int i = 0; i < npron; ++i) {
 120         char * chewing_begin = buf_begin + offset +
 121             i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
 122         guint32 * freq = (guint32 *)(chewing_begin +
 123                                      phrase_length * sizeof(ChewingKey));
 124         total_freq += *freq;
 125
 126         if (0 == pinyin_compare_with_ambiguities2
 127             (options, keys,
 128              (ChewingKey *)chewing_begin, phrase_length)) {
 129
 130             /* protect against total_freq overflow. */
 131             if (delta > 0 && total_freq > total_freq + delta)
 132                 return;
 133
 134             *freq += delta;
 135             total_freq += delta;
 136         }
 137     }
 138 }
 139
 140
 141 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
 142     return m_total_freq;
 143 }
 144
 145 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
 146     table_offset_t offset;
 147     guint32 freq;
 148     bool result = m_phrase_index.get_content
 149         ((token & PHRASE_MASK)
 150          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 151
 152     if ( !result )
 153         return ERROR_OUT_OF_RANGE;
 154
 155     if ( 0 == offset )
 156         return ERROR_NO_ITEM;
 157
 158     result = m_phrase_content.get_content
 159         (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 160
 161     if ( !result )
 162         return ERROR_FILE_CORRUPTION;
 163
 164     //protect total_freq overflow
 165     if ( delta > 0 && m_total_freq > m_total_freq + delta )
 166         return ERROR_INTEGER_OVERFLOW;
 167
 168     freq += delta;
 169     m_total_freq += delta;
 170     m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 171
 172     return ERROR_OK;
 173 }
 174
 175 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
 176     table_offset_t offset;
 177     guint8 phrase_length;
 178     guint8 n_prons;
 179
 180     bool result = m_phrase_index.get_content
 181         ((token & PHRASE_MASK)
 182          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 183
 184     if ( !result )
 185         return ERROR_OUT_OF_RANGE;
 186
 187     if ( 0 == offset )
 188         return ERROR_NO_ITEM;
 189
 190     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 191     if ( !result )
 192         return ERROR_FILE_CORRUPTION;
 193
 194     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 195     if ( !result )
 196         return ERROR_FILE_CORRUPTION;
 197
 198     size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
 199     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
 200     return ERROR_OK;
 201 }
 202
 203 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
 204     table_offset_t offset = m_phrase_content.size();
 205     if ( 0 == offset )
 206         offset = 8;
 207     m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
 208     m_phrase_index.set_content((token & PHRASE_MASK)
 209                                * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 210     m_total_freq += item->get_unigram_frequency();
 211     return ERROR_OK;
 212 }
 213
 214 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 215     PhraseItem old_item;
 216
 217     int result = get_phrase_item(token, old_item);
 218     if (result != ERROR_OK)
 219         return result;
 220
 221     item = new PhraseItem;
 222     //implictly copy data from m_chunk_content.
 223     item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
 224
 225     const table_offset_t zero_const = 0;
 226     m_phrase_index.set_content((token & PHRASE_MASK)
 227                                * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
 228     m_total_freq -= item->get_unigram_frequency();
 229     return ERROR_OK;
 230 }
 231
 232 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
 233     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 234     if ( !sub_phrases ){
 235         sub_phrases = new SubPhraseIndex;
 236     }
 237
 238     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 239     bool retval = sub_phrases->load(chunk, 0, chunk->size());
 240     if ( !retval )
 241         return retval;
 242     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 243     return retval;
 244 }
 245
 246 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
 247     table_offset_t end;
 248     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 249     if ( !sub_phrases )
 250         return false;
 251
 252     sub_phrases->store(new_chunk, 0, end);
 253     return true;
 254 }
 255
 256 bool FacadePhraseIndex::unload(guint8 phrase_index){
 257     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 258     if ( !sub_phrases )
 259         return false;
 260     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 261     delete sub_phrases;
 262     sub_phrases = NULL;
 263     return true;
 264 }
 265
 266 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
 267                              MemoryChunk * newlog){
 268     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 269     if ( !sub_phrases )
 270         return false;
 271
 272     SubPhraseIndex old_sub_phrases;
 273     old_sub_phrases.load(oldchunk, 0, oldchunk->size());
 274     PhraseIndexLogger logger;
 275
 276     bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
 277     logger.store(newlog);
 278     return retval;
 279 }
 280
 281 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
 282     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 283     if ( !sub_phrases )
 284         return false;
 285
 286     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 287     PhraseIndexLogger logger;
 288     logger.load(log);
 289
 290     bool retval = sub_phrases->merge(&logger);
 291     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 292
 293     return retval;
 294 }
 295
 296 bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index,
 297                                         MemoryChunk * log,
 298                                         phrase_token_t mask,
 299                                         phrase_token_t value){
 300     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 301     if ( !sub_phrases )
 302         return false;
 303
 304     /* check mask and value. */
 305     phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
 306     phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
 307     if ((phrase_index & index_mask) != index_value)
 308         return false;
 309
 310     /* unload old sub phrase index */
 311     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 312
 313     /* calculate the sub phrase index mask and value. */
 314     mask &= PHRASE_MASK; value &= PHRASE_MASK;
 315
 316     /* prepare the new logger. */
 317     PhraseIndexLogger oldlogger;
 318     oldlogger.load(log);
 319     PhraseIndexLogger * newlogger = mask_out_phrase_index_logger
 320         (&oldlogger, mask, value);
 321
 322     bool retval = sub_phrases->merge(newlogger);
 323     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 324     delete newlogger;
 325
 326     return retval;
 327 }
 328
 329
 330 bool SubPhraseIndex::load(MemoryChunk * chunk,
 331                           table_offset_t offset, table_offset_t end){
 332     //save the memory chunk
 333     if ( m_chunk ){
 334         delete m_chunk;
 335         m_chunk = NULL;
 336     }
 337     m_chunk = chunk;
 338
 339     char * buf_begin = (char *)chunk->begin();
 340     chunk->get_content(offset, &m_total_freq, sizeof(guint32));
 341     offset += sizeof(guint32);
 342     table_offset_t index_one, index_two, index_three;
 343     chunk->get_content(offset, &index_one, sizeof(table_offset_t));
 344     offset += sizeof(table_offset_t);
 345     chunk->get_content(offset, &index_two, sizeof(table_offset_t));
 346     offset += sizeof(table_offset_t);
 347     chunk->get_content(offset, &index_three, sizeof(table_offset_t));
 348     offset += sizeof(table_offset_t);
 349     g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
 350     g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
 351     g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
 352     m_phrase_index.set_chunk(buf_begin + index_one,
 353                              index_two - 1 - index_one, NULL);
 354     m_phrase_content.set_chunk(buf_begin + index_two,
 355                                index_three - 1 - index_two, NULL);
 356     g_return_val_if_fail( index_three <= end, FALSE);
 357     return true;
 358 }
 359
 360 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 361                            table_offset_t offset, table_offset_t& end){
 362     new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
 363     table_offset_t index = offset + sizeof(guint32);
 364
 365     offset = index + sizeof(table_offset_t) * 3 ;
 366     new_chunk->set_content(offset, &c_separate, sizeof(char));
 367     offset += sizeof(char);
 368
 369     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 370     index += sizeof(table_offset_t);
 371     new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
 372     offset += m_phrase_index.size();
 373     new_chunk->set_content(offset, &c_separate, sizeof(char));
 374     offset += sizeof(char);
 375
 376     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 377     index += sizeof(table_offset_t);
 378
 379     new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
 380     offset += m_phrase_content.size();
 381     new_chunk->set_content(offset, &c_separate, sizeof(char));
 382     offset += sizeof(char);
 383     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 384     return true;
 385 }
 386
 387 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
 388     /* diff the header */
 389     MemoryChunk oldheader, newheader;
 390     guint32 total_freq = oldone->get_phrase_index_total_freq();
 391     oldheader.set_content(0, &total_freq, sizeof(guint32));
 392     total_freq = get_phrase_index_total_freq();
 393     newheader.set_content(0, &total_freq, sizeof(guint32));
 394     logger->append_record(LOG_MODIFY_HEADER, null_token,
 395                           &oldheader, &newheader);
 396
 397     /* diff phrase items */
 398     PhraseIndexRange oldrange, currange, range;
 399     oldone->get_range(oldrange); get_range(currange);
 400     range.m_range_begin = std_lite::min(oldrange.m_range_begin,
 401                                         currange.m_range_begin);
 402     range.m_range_end = std_lite::max(oldrange.m_range_end,
 403                                       currange.m_range_end);
 404     PhraseItem olditem, newitem;
 405
 406     for (phrase_token_t token = range.m_range_begin;
 407          token < range.m_range_end; ++token ){
 408         bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
 409         bool newretval = ERROR_OK == get_phrase_item(token, newitem);
 410
 411         if ( oldretval ){
 412             if ( newretval ) { /* compare phrase item. */
 413                 if ( olditem == newitem )
 414                     continue;
 415                 logger->append_record(LOG_MODIFY_RECORD, token,
 416                                       &(olditem.m_chunk), &(newitem.m_chunk));
 417             } else { /* remove phrase item. */
 418                 logger->append_record(LOG_REMOVE_RECORD, token,
 419                                       &(olditem.m_chunk), NULL);
 420             }
 421         } else {
 422             if ( newretval ){ /* add phrase item. */
 423                 logger->append_record(LOG_ADD_RECORD, token,
 424                                       NULL, &(newitem.m_chunk));
 425             } else { /* both empty. */
 426                 /* do nothing. */
 427             }
 428         }
 429     }
 430
 431     return true;
 432 }
 433
 434 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
 435     LOG_TYPE log_type; phrase_token_t token;
 436     MemoryChunk oldchunk, newchunk;
 437     PhraseItem olditem, newitem, item, * tmpitem;
 438
 439     while(logger->has_next_record()){
 440         bool retval = logger->next_record
 441             (log_type, token, &oldchunk, &newchunk);
 442
 443         if (!retval)
 444             break;
 445
 446         switch(log_type){
 447         case LOG_ADD_RECORD:{
 448             assert( 0 == oldchunk.size() );
 449             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 450                                       NULL);
 451             add_phrase_item(token, &newitem);
 452             break;
 453         }
 454         case LOG_REMOVE_RECORD:{
 455             assert( 0 == newchunk.size() );
 456             tmpitem = NULL;
 457             remove_phrase_item(token, tmpitem);
 458
 459             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 460                                       NULL);
 461
 462             if (olditem != *tmpitem) {
 463                 delete tmpitem;
 464                 return false;
 465             }
 466
 467             delete tmpitem;
 468
 469             break;
 470         }
 471         case LOG_MODIFY_RECORD:{
 472             get_phrase_item(token, item);
 473             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 474                                       NULL);
 475             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 476                                       NULL);
 477             if (item != olditem)
 478                 return false;
 479
 480             if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
 481                 tmpitem = NULL;
 482                 remove_phrase_item(token, tmpitem);
 483                 assert(olditem == *tmpitem);
 484                 add_phrase_item(token, &newitem);
 485                 delete tmpitem;
 486             } else { /* in place editing. */
 487                 /* newchunk.size() <= item.m_chunk.size() */
 488                 /* Hack here: we assume the behaviour of get_phrase_item
 489                  * point to the actual data positon, so changes to item
 490                  * will be saved in SubPhraseIndex immediately.
 491                  */
 492                 memmove(item.m_chunk.begin(), newchunk.begin(),
 493                         newchunk.size());
 494             }
 495             break;
 496         }
 497         case LOG_MODIFY_HEADER:{
 498             guint32 total_freq = get_phrase_index_total_freq();
 499             guint32 tmp_freq = 0;
 500             assert(null_token == token);
 501             assert(oldchunk.size() == newchunk.size());
 502             oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
 503             if (total_freq != tmp_freq)
 504                 return false;
 505             newchunk.get_content(0, &tmp_freq, sizeof(guint32));
 506             m_total_freq = tmp_freq;
 507             break;
 508         }
 509         default:
 510             assert(false);
 511         }
 512     }
 513     return true;
 514 }
 515
 516 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
 517     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 518     if ( !sub_phrases ){
 519         sub_phrases = new SubPhraseIndex;
 520     }
 521
 522     char pinyin[256];
 523     char phrase[256];
 524     phrase_token_t token;
 525     size_t freq;
 526
 527     PhraseItem * item_ptr = new PhraseItem;
 528     phrase_token_t cur_token = 0;
 529
 530     while (!feof(infile)){
 531         int num = fscanf(infile, "%s %s %u %ld",
 532                          pinyin, phrase, &token, &freq);
 533
 534         if (4 != num)
 535             continue;
 536
 537         if (feof(infile))
 538             break;
 539
 540         assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
 541
 542         glong written;
 543         ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL,
 544                                               &written, NULL);
 545
 546         if ( 0 == cur_token ){
 547             cur_token = token;
 548             item_ptr->set_phrase_string(written, phrase_ucs4);
 549         }
 550
 551         if ( cur_token != token ){
 552             add_phrase_item( cur_token, item_ptr);
 553             delete item_ptr;
 554             item_ptr = new PhraseItem;
 555             cur_token = token;
 556             item_ptr->set_phrase_string(written, phrase_ucs4);
 557         }
 558
 559         pinyin_option_t options = USE_TONE;
 560         FullPinyinParser2 parser;
 561         ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
 562         ChewingKeyRestVector key_rests =
 563             g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
 564
 565         parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
 566
 567         if (item_ptr->get_phrase_length() == keys->len) {
 568             item_ptr->add_pronunciation((ChewingKey *)keys->data, freq);
 569         } else {
 570             fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
 571                     pinyin, phrase);
 572         }
 573
 574         g_array_free(keys, TRUE);
 575         g_array_free(key_rests, TRUE);
 576         g_free(phrase_ucs4);
 577     }
 578
 579     add_phrase_item( cur_token, item_ptr);
 580     delete item_ptr;
 581 #if 0
 582     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
 583 #endif
 584     return true;
 585 }
 586
 587 int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
 588                                             guint8 & max_index){
 589     min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
 590     for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
 591         if ( m_sub_phrase_indices[i] ) {
 592             min_index = std_lite::min(min_index, i);
 593             max_index = std_lite::max(max_index, i);
 594         }
 595     }
 596     return ERROR_OK;
 597 }
 598
 599 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
 600     SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
 601     if ( !sub_phrase )
 602         return ERROR_NO_SUB_PHRASE_INDEX;
 603
 604     int result = sub_phrase->get_range(range);
 605     if ( result )
 606         return result;
 607
 608     range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
 609     range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
 610     return ERROR_OK;
 611 }
 612
 613 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
 614     const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
 615     const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
 616
 617     if (begin == end) {
 618         /* skip empty sub phrase index. */
 619         range.m_range_begin = 1;
 620         range.m_range_end = 1;
 621         return ERROR_OK;
 622     }
 623
 624     /* remove trailing zeros. */
 625     const table_offset_t * poffset = 0;
 626     for (poffset = end - 1; poffset >= begin + 1; --poffset) {
 627         if (0 !=  *poffset)
 628             break;
 629     }
 630
 631     range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
 632     range.m_range_end = poffset + 1 - begin; /* removed zeros. */
 633
 634     return ERROR_OK;
 635 }
 636
 637 bool FacadePhraseIndex::compact(){
 638     for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
 639         SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
 640         if ( !sub_phrase )
 641             continue;
 642
 643         PhraseIndexRange range;
 644         int result = sub_phrase->get_range(range);
 645         if ( result != ERROR_OK )
 646             continue;
 647
 648         SubPhraseIndex * new_sub_phrase =  new SubPhraseIndex;
 649
 650         PhraseItem item;
 651         for ( phrase_token_t token = range.m_range_begin;
 652               token < range.m_range_end;
 653               ++token ) {
 654             result = sub_phrase->get_phrase_item(token, item);
 655             if ( result != ERROR_OK )
 656                 continue;
 657             new_sub_phrase->add_phrase_item(token, &item);
 658         }
 659
 660         delete sub_phrase;
 661         m_sub_phrase_indices[index] = new_sub_phrase;
 662     }
 663     return true;
 664 }
 665
 666 bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){
 667     PhraseIndexRange range;
 668     if (ERROR_OK != get_range(range))
 669         return false;
 670
 671     /* calculate mask and value for sub phrase index. */
 672     mask &= PHRASE_MASK; value &= PHRASE_MASK;
 673
 674     for (phrase_token_t token = range.m_range_begin;
 675          token < range.m_range_end; ++token) {
 676         if ((token & mask) != value)
 677             continue;
 678
 679         PhraseItem * item = NULL;
 680         remove_phrase_item(token, item);
 681         if (item)
 682             delete item;
 683     }
 684
 685     return true;
 686 }
 687
 688 bool FacadePhraseIndex::mask_out(guint8 phrase_index,
 689                                  phrase_token_t mask,
 690                                  phrase_token_t value){
 691     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 692     if (!sub_phrases)
 693         return false;
 694
 695     /* check mask and value. */
 696     phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
 697     phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
 698
 699     if ((phrase_index & index_mask ) != index_value)
 700         return false;
 701
 702     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 703     bool retval = sub_phrases->mask_out(mask, value);
 704     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 705
 706     return retval;
 707 }
 708
 709 namespace pinyin{
 710
 711
 712 static bool _peek_header(PhraseIndexLogger * logger,
 713                          guint32 & old_total_freq){
 714     old_total_freq = 0;
 715
 716     size_t header_count = 0;
 717     LOG_TYPE log_type; phrase_token_t token;
 718     MemoryChunk oldchunk, newchunk;
 719
 720     while (logger->has_next_record()) {
 721         bool retval = logger->next_record
 722             (log_type, token, &oldchunk, &newchunk);
 723
 724         if (!retval)
 725             break;
 726
 727         if (LOG_MODIFY_HEADER != log_type)
 728             continue;
 729
 730         ++header_count;
 731
 732         oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
 733     }
 734
 735     /* 1 for normal case, 0 for corrupted file. */
 736     assert(1 >= header_count);
 737
 738     return  1 == header_count? true : false;
 739 }
 740
 741 bool _compute_new_header(PhraseIndexLogger * logger,
 742                          phrase_token_t mask,
 743                          phrase_token_t value,
 744                          guint32 & new_total_freq) {
 745
 746     LOG_TYPE log_type; phrase_token_t token;
 747     MemoryChunk oldchunk, newchunk;
 748     PhraseItem olditem, newitem;
 749
 750     while(logger->has_next_record()) {
 751         bool retval = logger->next_record
 752             (log_type, token, &oldchunk, &newchunk);
 753
 754         if (!retval)
 755             break;
 756
 757         if (LOG_MODIFY_HEADER == log_type)
 758             continue;
 759
 760         if ((token & mask) == value)
 761             continue;
 762
 763         switch(log_type) {
 764         case LOG_ADD_RECORD:{
 765             assert( 0 == oldchunk.size() );
 766             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 767                                       NULL);
 768             new_total_freq += newitem.get_unigram_frequency();
 769             break;
 770         }
 771         case LOG_REMOVE_RECORD:{
 772             assert( 0 == newchunk.size() );
 773             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 774                                       NULL);
 775             new_total_freq -= olditem.get_unigram_frequency();
 776             break;
 777         }
 778         case LOG_MODIFY_RECORD:{
 779             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 780                                       NULL);
 781             new_total_freq -= olditem.get_unigram_frequency();
 782
 783             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 784                                       NULL);
 785             new_total_freq += newitem.get_unigram_frequency();
 786             break;
 787         }
 788         default:
 789             assert(false);
 790         }
 791     }
 792
 793     return true;
 794 }
 795
 796 static bool _write_header(PhraseIndexLogger * logger,
 797                           guint32 & old_total_freq,
 798                           guint32 & new_total_freq) {
 799     MemoryChunk oldheader, newheader;
 800     oldheader.set_content(0, &old_total_freq, sizeof(guint32));
 801     newheader.set_content(0, &new_total_freq, sizeof(guint32));
 802     logger->append_record(LOG_MODIFY_HEADER, null_token,
 803                           &oldheader, &newheader);
 804     return true;
 805 }
 806
 807 static bool _mask_out_records(PhraseIndexLogger * oldlogger,
 808                               phrase_token_t mask,
 809                               phrase_token_t value,
 810                               PhraseIndexLogger * newlogger) {
 811     LOG_TYPE log_type; phrase_token_t token;
 812     MemoryChunk oldchunk, newchunk;
 813
 814     while(oldlogger->has_next_record()) {
 815         bool retval = oldlogger->next_record
 816             (log_type, token, &oldchunk, &newchunk);
 817
 818         if (!retval)
 819             break;
 820
 821         if (LOG_MODIFY_HEADER == log_type)
 822             continue;
 823
 824         if ((token & mask) == value)
 825             continue;
 826
 827         newlogger->append_record(log_type, token, &oldchunk, &newchunk);
 828     }
 829
 830     return true;
 831 }
 832
 833 PhraseIndexLogger * mask_out_phrase_index_logger
 834 (PhraseIndexLogger * oldlogger, phrase_token_t mask,
 835  phrase_token_t value) {
 836     PhraseIndexLogger * newlogger = new PhraseIndexLogger;
 837     guint32 old_total_freq = 0, new_total_freq = 0;
 838
 839     /* peek the header value. */
 840     if (!_peek_header(oldlogger, old_total_freq))
 841         return newlogger;
 842
 843     new_total_freq = old_total_freq;
 844
 845     /* compute the new header based on add/modify/remove records. */
 846     oldlogger->rewind();
 847     if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
 848         return newlogger;
 849
 850     /* write out the modify header record. */
 851     _write_header(newlogger, old_total_freq, new_total_freq);
 852
 853     /* mask out the matched records. */
 854     oldlogger->rewind();
 855     _mask_out_records(oldlogger, mask, value, newlogger);
 856
 857     return newlogger;
 858 }
 859
 860 };