src/storage/phrase_index.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include "phrase_index.h"
  23
  24 using namespace pinyin;
  25
  26 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
  27     m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
  28     return true;
  29 }
  30
  31 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
  32     guint8 phrase_length = get_phrase_length();
  33     table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  34     bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
  35     if ( !retval )
  36         return retval;
  37     return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
  38 }
  39
  40 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
  41     guint8 phrase_length = get_phrase_length();
  42     set_n_pronunciation(get_n_pronunciation() + 1);
  43     m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
  44     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
  45 }
  46
  47 void PhraseItem::remove_nth_pronunciation(size_t index){
  48     guint8 phrase_length = get_phrase_length();
  49     set_n_pronunciation(get_n_pronunciation() - 1);
  50     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  51     m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
  52 }
  53
  54 bool PhraseItem::get_phrase_string(utf16_t * phrase){
  55     guint8 phrase_length = get_phrase_length();
  56     return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  57 }
  58
  59 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
  60     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
  61     m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  62     return true;
  63 }
  64
  65 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
  66                                              PinyinKey * pinyin_keys,
  67                                              gint32 delta){
  68     guint8 phrase_length = get_phrase_length();
  69     guint8 npron = get_n_pronunciation();
  70     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
  71     char * buf_begin = (char *) m_chunk.begin();
  72     guint32 total_freq = 0;
  73     for ( int i = 0 ; i < npron ; ++i){
  74         char * pinyin_begin = buf_begin + offset +
  75             i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
  76         guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
  77         total_freq += *freq;
  78         if ( 0 == pinyin_compare_with_ambiguities
  79              (custom, pinyin_keys,
  80               (PinyinKey *)pinyin_begin, phrase_length) ){
  81             //protect against total_freq overflow.
  82             if ( delta > 0 && total_freq > total_freq + delta )
  83                 return;
  84             *freq += delta;
  85             total_freq += delta;
  86         }
  87     }
  88 }
  89
  90
  91 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
  92     return m_total_freq;
  93 }
  94
  95 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
  96     table_offset_t offset;
  97     guint32 freq;
  98     bool result = m_phrase_index.get_content
  99         ((token & PHRASE_MASK)
 100          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 101
 102     if ( !result )
 103         return ERROR_OUT_OF_RANGE;
 104
 105     if ( 0 == offset )
 106     return ERROR_NO_ITEM;
 107
 108     result = m_phrase_content.get_content
 109         (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 110
 111     if ( !result )
 112     return ERROR_FILE_CORRUPTION;
 113
 114     //protect total_freq overflow
 115     if ( delta > 0 && m_total_freq > m_total_freq + delta )
 116         return ERROR_INTEGER_OVERFLOW;
 117
 118     freq += delta;
 119     m_total_freq += delta;
 120     m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 121
 122     return ERROR_OK;
 123 }
 124
 125 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
 126     table_offset_t offset;
 127     guint8 phrase_length;
 128     guint8 n_prons;
 129
 130     bool result = m_phrase_index.get_content
 131         ((token & PHRASE_MASK)
 132          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 133
 134     if ( !result )
 135         return ERROR_OUT_OF_RANGE;
 136
 137     if ( 0 == offset )
 138     return ERROR_NO_ITEM;
 139
 140     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 141     if ( !result )
 142     return ERROR_FILE_CORRUPTION;
 143
 144     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 145     if ( !result )
 146         return ERROR_FILE_CORRUPTION;
 147
 148     size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
 149     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
 150     return ERROR_OK;
 151 }
 152
 153 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
 154     table_offset_t offset = m_phrase_content.size();
 155     if ( 0 == offset )
 156         offset = 8;
 157     m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
 158     m_phrase_index.set_content((token & PHRASE_MASK)
 159                                * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 160     m_total_freq += item->get_unigram_frequency();
 161     return ERROR_OK;
 162 }
 163
 164 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 165     PhraseItem old_item;
 166
 167     int result = get_phrase_item(token, old_item);
 168     if (result != ERROR_OK)
 169     return result;
 170
 171     item = new PhraseItem;
 172     //implictly copy data from m_chunk_content.
 173     item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
 174
 175     const table_offset_t zero_const = 0;
 176     m_phrase_index.set_content((token & PHRASE_MASK)
 177                                * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
 178     m_total_freq -= item->get_unigram_frequency();
 179     return ERROR_OK;
 180 }
 181
 182 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
 183     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 184     if ( !sub_phrases ){
 185         sub_phrases = new SubPhraseIndex;
 186     }
 187
 188     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 189     bool retval = sub_phrases->load(chunk, 0, chunk->size());
 190     if ( !retval )
 191         return retval;
 192     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 193     return retval;
 194 }
 195
 196 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
 197     table_offset_t end;
 198     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 199     if ( !sub_phrases )
 200         return false;
 201
 202     sub_phrases->store(new_chunk, 0, end);
 203     return true;
 204 }
 205
 206 bool FacadePhraseIndex::unload(guint8 phrase_index){
 207     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 208     if ( !sub_phrases )
 209         return false;
 210     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 211     delete sub_phrases;
 212     sub_phrases = NULL;
 213     return true;
 214 }
 215
 216 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
 217                              MemoryChunk * newlog){
 218     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 219     if ( !sub_phrases )
 220         return false;
 221
 222     SubPhraseIndex old_sub_phrases;
 223     old_sub_phrases.load(oldchunk, 0, oldchunk->size());
 224     PhraseIndexLogger logger;
 225
 226     bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
 227     logger.store(newlog);
 228     return retval;
 229 }
 230
 231 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
 232     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 233     if ( !sub_phrases )
 234         return false;
 235
 236     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 237     PhraseIndexLogger logger;
 238     logger.load(log);
 239
 240     bool retval = sub_phrases->merge(&logger);
 241     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 242
 243     return retval;
 244 }
 245
 246 bool SubPhraseIndex::load(MemoryChunk * chunk,
 247                           table_offset_t offset, table_offset_t end){
 248     //save the memory chunk
 249     if ( m_chunk ){
 250         delete m_chunk;
 251         m_chunk = NULL;
 252     }
 253     m_chunk = chunk;
 254
 255     char * buf_begin = (char *)chunk->begin();
 256     chunk->get_content(offset, &m_total_freq, sizeof(guint32));
 257     offset += sizeof(guint32);
 258     table_offset_t index_one, index_two, index_three;
 259     chunk->get_content(offset, &index_one, sizeof(table_offset_t));
 260     offset += sizeof(table_offset_t);
 261     chunk->get_content(offset, &index_two, sizeof(table_offset_t));
 262     offset += sizeof(table_offset_t);
 263     chunk->get_content(offset, &index_three, sizeof(table_offset_t));
 264     offset += sizeof(table_offset_t);
 265     g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
 266     g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
 267     g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
 268     m_phrase_index.set_chunk(buf_begin + index_one,
 269                              index_two - 1 - index_one, NULL);
 270     m_phrase_content.set_chunk(buf_begin + index_two,
 271                                  index_three - 1 - index_two, NULL);
 272     g_return_val_if_fail( index_three <= end, FALSE);
 273     return true;
 274 }
 275
 276 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 277                            table_offset_t offset, table_offset_t& end){
 278     new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
 279     table_offset_t index = offset + sizeof(guint32);
 280
 281     offset = index + sizeof(table_offset_t) * 3 ;
 282     new_chunk->set_content(offset, &c_separate, sizeof(char));
 283     offset += sizeof(char);
 284
 285     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 286     index += sizeof(table_offset_t);
 287     new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
 288     offset += m_phrase_index.size();
 289     new_chunk->set_content(offset, &c_separate, sizeof(char));
 290     offset += sizeof(char);
 291
 292     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 293     index += sizeof(table_offset_t);
 294
 295     new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
 296     offset += m_phrase_content.size();
 297     new_chunk->set_content(offset, &c_separate, sizeof(char));
 298     offset += sizeof(char);
 299     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 300     return true;
 301 }
 302
 303 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
 304     /* diff the header */
 305     MemoryChunk oldheader, newheader;
 306     guint32 total_freq = oldone->get_phrase_index_total_freq();
 307     oldheader.set_content(0, &total_freq, sizeof(guint32));
 308     total_freq = get_phrase_index_total_freq();
 309     newheader.set_content(0, &total_freq, sizeof(guint32));
 310     logger->append_record(LOG_MODIFY_HEADER, null_token,
 311                           &oldheader, &newheader);
 312
 313     /* diff phrase items */
 314     PhraseIndexRange oldrange, currange, range;
 315     oldone->get_range(oldrange); get_range(currange);
 316     range.m_range_begin = std_lite::min(oldrange.m_range_begin,
 317                                         currange.m_range_begin);
 318     range.m_range_end = std_lite::max(oldrange.m_range_end,
 319                                      currange.m_range_end);
 320     PhraseItem olditem, newitem;
 321
 322     for (phrase_token_t token = range.m_range_begin;
 323          token < range.m_range_end; ++token ){
 324         bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
 325         bool newretval = ERROR_OK == get_phrase_item(token, newitem);
 326
 327         if ( oldretval ){
 328             if ( newretval ) { /* compare phrase item. */
 329                 if ( olditem == newitem )
 330                     continue;
 331                 logger->append_record(LOG_MODIFY_RECORD, token,
 332                                       &(olditem.m_chunk), &(newitem.m_chunk));
 333             } else { /* remove phrase item. */
 334                 logger->append_record(LOG_REMOVE_RECORD, token,
 335                                       &(olditem.m_chunk), NULL);
 336             }
 337         } else {
 338             if ( newretval ){ /* add phrase item. */
 339                 logger->append_record(LOG_ADD_RECORD, token,
 340                                       NULL, &(newitem.m_chunk));
 341             } else { /* both empty. */
 342                     /* do nothing. */
 343             }
 344         }
 345     }
 346
 347     return true;
 348 }
 349
 350 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
 351     LOG_TYPE log_type; phrase_token_t token;
 352     MemoryChunk oldchunk, newchunk;
 353     PhraseItem olditem, newitem, item, * tmpitem;
 354
 355     while(logger->has_next_record()){
 356         logger->next_record(log_type, token, &oldchunk, &newchunk);
 357
 358         switch(log_type){
 359         case LOG_ADD_RECORD:{
 360             assert( 0 == oldchunk.size() );
 361             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 362                                       NULL);
 363             add_phrase_item(token, &newitem);
 364             break;
 365         }
 366         case LOG_REMOVE_RECORD:{
 367             assert( 0 == newchunk.size() );
 368             tmpitem = NULL;
 369             remove_phrase_item(token, tmpitem);
 370
 371             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 372                                    NULL);
 373             if (olditem != *tmpitem)
 374                 return false;
 375             delete tmpitem;
 376
 377             break;
 378         }
 379         case LOG_MODIFY_RECORD:{
 380             get_phrase_item(token, item);
 381             olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
 382                                       NULL);
 383             newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
 384                                       NULL);
 385             if (item != olditem)
 386                 return false;
 387
 388             if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
 389                 tmpitem = NULL;
 390                 remove_phrase_item(token, tmpitem);
 391                 assert(olditem == *tmpitem);
 392                 add_phrase_item(token, &newitem);
 393                 delete tmpitem;
 394             } else { /* in place editing. */
 395                 /* newchunk.size() <= item.m_chunk.size() */
 396                 /* Hack here: we assume the behaviour of get_phrase_item
 397                  * point to the actual data positon, so changes to item
 398                  * will be saved in SubPhraseIndex immediately.
 399                  */
 400                 memmove(item.m_chunk.begin(), newchunk.begin(),
 401                         newchunk.size());
 402             }
 403             break;
 404         }
 405         case LOG_MODIFY_HEADER:{
 406             guint32 total_freq = get_phrase_index_total_freq();
 407             guint32 tmp_freq = 0;
 408             assert(null_token == token);
 409             assert(oldchunk.size() == newchunk.size());
 410             oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
 411             if (total_freq != tmp_freq)
 412                 return false;
 413             newchunk.get_content(0, &tmp_freq, sizeof(guint32));
 414             m_total_freq = tmp_freq;
 415             break;
 416         }
 417         default:
 418             assert(false);
 419         }
 420     }
 421     return true;
 422 }
 423
 424 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
 425     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 426     if ( !sub_phrases ){
 427         sub_phrases = new SubPhraseIndex;
 428     }
 429
 430     char pinyin[256];
 431     char phrase[256];
 432     phrase_token_t token;
 433     size_t freq;
 434     PhraseItem * item_ptr = new PhraseItem;
 435     phrase_token_t cur_token = 0;
 436     while ( !feof(infile)){
 437         fscanf(infile, "%s", pinyin);
 438         fscanf(infile, "%s", phrase);
 439         fscanf(infile, "%u", &token);
 440         fscanf(infile, "%ld", &freq);
 441         if ( feof(infile) )
 442             break;
 443
 444         assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
 445
 446         glong written;
 447         utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
 448                                                &written, NULL);
 449
 450         if ( 0 == cur_token ){
 451             cur_token = token;
 452             item_ptr->set_phrase_string(written, phrase_utf16);
 453         }
 454
 455         if ( cur_token != token ){
 456             add_phrase_item( cur_token, item_ptr);
 457             delete item_ptr;
 458             item_ptr = new PhraseItem;
 459             cur_token = token;
 460             item_ptr->set_phrase_string(written, phrase_utf16);
 461         }
 462
 463         PinyinDefaultParser parser;
 464         NullPinyinValidator validator;
 465         PinyinKeyVector keys;
 466         PinyinKeyPosVector poses;
 467
 468         keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
 469         poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
 470         parser.parse(validator, keys, poses, pinyin);
 471
 472         assert ( item_ptr->get_phrase_length() == keys->len );
 473         item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
 474
 475         g_array_free(keys, TRUE);
 476         g_array_free(poses, TRUE);
 477         g_free(phrase_utf16);
 478     }
 479
 480     add_phrase_item( cur_token, item_ptr);
 481     delete item_ptr;
 482     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
 483     return true;
 484 }
 485
 486 int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
 487                                             guint8 & max_index){
 488     min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
 489     for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
 490         if ( m_sub_phrase_indices[i] ) {
 491             min_index = std_lite::min(min_index, i);
 492             max_index = std_lite::max(max_index, i);
 493         }
 494     }
 495     return ERROR_OK;
 496 }
 497
 498 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
 499     SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
 500     if ( !sub_phrase )
 501         return ERROR_NO_SUB_PHRASE_INDEX;
 502
 503     int result = sub_phrase->get_range(range);
 504     if ( result )
 505         return result;
 506
 507     range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
 508     range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
 509     return ERROR_OK;
 510 }
 511
 512 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
 513     const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
 514     const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
 515
 516     range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
 517     range.m_range_end = end - begin;
 518
 519     return ERROR_OK;
 520 }
 521
 522 bool FacadePhraseIndex::compat(){
 523     for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
 524         SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
 525         if ( !sub_phrase )
 526             continue;
 527
 528         SubPhraseIndex * new_sub_phrase =  new SubPhraseIndex;
 529         PhraseIndexRange range;
 530         int result = sub_phrase->get_range(range);
 531         if ( result != ERROR_OK ) {
 532             delete new_sub_phrase;
 533             continue;
 534         }
 535
 536         PhraseItem item;
 537         for ( phrase_token_t token = range.m_range_begin;
 538               token < range.m_range_end;
 539               ++token ) {
 540             result = sub_phrase->get_phrase_item(token, item);
 541             if ( result != ERROR_OK )
 542                 continue;
 543             new_sub_phrase->add_phrase_item(token, &item);
 544         }
 545
 546         delete sub_phrase;
 547         m_sub_phrase_indices[index] = new_sub_phrase;
 548     }
 549     return true;
 550 }