src/storage/phrase_index.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  20  */
  21
  22 #include "phrase_index.h"
  23
  24 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
  25     m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
  26     return true;
  27 }
  28
  29 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
  30     guint8 phrase_length = get_phrase_length();
  31     table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  32     bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
  33     if ( !retval )
  34         return retval;
  35     return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
  36 }
  37
  38 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
  39     guint8 phrase_length = get_phrase_length();
  40     set_n_pronunciation(get_n_pronunciation() + 1);
  41     m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
  42     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
  43 }
  44
  45 void PhraseItem::remove_nth_pronunciation(size_t index){
  46     guint8 phrase_length = get_phrase_length();
  47     set_n_pronunciation(get_n_pronunciation() - 1);
  48     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  49     m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
  50 }
  51
  52 bool PhraseItem::get_phrase_string(utf16_t * phrase){
  53     guint8 phrase_length = get_phrase_length();
  54     return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  55 }
  56
  57 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
  58     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
  59     m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  60     return true;
  61 }
  62
  63 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
  64                                              PinyinKey * pinyin_keys,
  65                                              gint32 delta){
  66     guint8 phrase_length = get_phrase_length();
  67     guint8 npron = get_n_pronunciation();
  68     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
  69     char * buf_begin = (char *) m_chunk.begin();
  70     guint32 total_freq = 0;
  71     for ( int i = 0 ; i < npron ; ++i){
  72         char * pinyin_begin = buf_begin + offset +
  73             i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
  74         guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
  75         total_freq += *freq;
  76         if ( 0 == pinyin_compare_with_ambiguities(custom,
  77                                                   (PinyinKey *)pinyin_begin,
  78                                                   pinyin_keys,
  79                                                   phrase_length)){
  80             //protect against total_freq overflow.
  81             if ( delta > 0 && total_freq > total_freq + delta )
  82                 return;
  83             *freq += delta;
  84             total_freq += delta;
  85         }
  86     }
  87 }
  88
  89
  90 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
  91     return m_total_freq;
  92 }
  93
  94 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
  95     table_offset_t offset;
  96     guint32 freq;
  97     bool result = m_phrase_index.get_content
  98         ((token & PHRASE_MASK)
  99          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 100
 101     if ( !result )
 102         return ERROR_OUT_OF_RANGE;
 103
 104     if ( 0 == offset )
 105     return ERROR_NO_ITEM;
 106
 107     result = m_phrase_content.get_content
 108         (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 109
 110     if ( !result )
 111     return ERROR_FILE_CORRUPTION;
 112
 113     //protect total_freq overflow
 114     if ( delta > 0 && m_total_freq > m_total_freq + delta )
 115         return ERROR_INTEGER_OVERFLOW;
 116
 117     freq += delta;
 118     m_total_freq += delta;
 119     m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 120
 121     return ERROR_OK;
 122 }
 123
 124 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
 125     table_offset_t offset;
 126     guint8 phrase_length;
 127     guint8 n_prons;
 128
 129     bool result = m_phrase_index.get_content
 130         ((token & PHRASE_MASK)
 131          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 132
 133     if ( !result )
 134         return ERROR_OUT_OF_RANGE;
 135
 136     if ( 0 == offset )
 137     return ERROR_NO_ITEM;
 138
 139     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 140     if ( !result )
 141     return ERROR_FILE_CORRUPTION;
 142
 143     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 144     if ( !result )
 145         return ERROR_FILE_CORRUPTION;
 146
 147     size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
 148     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
 149     return ERROR_OK;
 150 }
 151
 152 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
 153     table_offset_t offset = m_phrase_content.size();
 154     if ( 0 == offset )
 155         offset = 8;
 156     m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
 157     m_phrase_index.set_content((token & PHRASE_MASK)
 158                                * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 159     m_total_freq += item->get_unigram_frequency();
 160     return ERROR_OK;
 161 }
 162
 163 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 164     PhraseItem old_item;
 165
 166     int result = get_phrase_item(token, old_item);
 167     if (result != ERROR_OK)
 168     return result;
 169
 170     item = new PhraseItem;
 171     //implictly copy data from m_chunk_content.
 172     item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
 173
 174     const table_offset_t zero_const = 0;
 175     m_phrase_index.set_content((token & PHRASE_MASK)
 176                                * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
 177     m_total_freq -= item->get_unigram_frequency();
 178     return ERROR_OK;
 179 }
 180
 181 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
 182     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 183     if ( !sub_phrases ){
 184         sub_phrases = new SubPhraseIndex;
 185     }
 186
 187     bool retval = sub_phrases->load(chunk, 0, chunk->size());
 188     if ( !retval )
 189         return retval;
 190     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 191     return retval;
 192 }
 193
 194 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
 195     table_offset_t end;
 196     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 197     if ( !sub_phrases )
 198         return false;
 199
 200     sub_phrases->store(new_chunk, 0, end);
 201     return true;
 202 }
 203
 204 bool FacadePhraseIndex::unload(guint8 phrase_index){
 205     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 206     if ( !sub_phrases )
 207         return false;
 208     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 209     delete sub_phrases;
 210     sub_phrases = NULL;
 211     return true;
 212 }
 213
 214 bool SubPhraseIndex::load(MemoryChunk * chunk,
 215                           table_offset_t offset, table_offset_t end){
 216     //save the memory chunk
 217     if ( m_chunk ){
 218         delete m_chunk;
 219         m_chunk = NULL;
 220     }
 221     m_chunk = chunk;
 222
 223     char * buf_begin = (char *)chunk->begin();
 224     chunk->get_content(offset, &m_total_freq, sizeof(guint32));
 225     offset += sizeof(guint32);
 226     table_offset_t index_one, index_two, index_three;
 227     chunk->get_content(offset, &index_one, sizeof(table_offset_t));
 228     offset += sizeof(table_offset_t);
 229     chunk->get_content(offset, &index_two, sizeof(table_offset_t));
 230     offset += sizeof(table_offset_t);
 231     chunk->get_content(offset, &index_three, sizeof(table_offset_t));
 232     offset += sizeof(table_offset_t);
 233     g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
 234     g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
 235     g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
 236     m_phrase_index.set_chunk(buf_begin + index_one,
 237                              index_two - 1 - index_one, NULL);
 238     m_phrase_content.set_chunk(buf_begin + index_two,
 239                                  index_three - 1 - index_two, NULL);
 240     g_return_val_if_fail( index_three <= end, FALSE);
 241     return true;
 242 }
 243
 244 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 245                            table_offset_t offset, table_offset_t& end){
 246     new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
 247     table_offset_t index = offset + sizeof(guint32);
 248
 249     offset = index + sizeof(table_offset_t) * 3 ;
 250     new_chunk->set_content(offset, &c_separate, sizeof(char));
 251     offset += sizeof(char);
 252
 253     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 254     index += sizeof(table_offset_t);
 255     new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
 256     offset += m_phrase_index.size();
 257     new_chunk->set_content(offset, &c_separate, sizeof(char));
 258     offset += sizeof(char);
 259
 260     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 261     index += sizeof(table_offset_t);
 262
 263     new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
 264     offset += m_phrase_content.size();
 265     new_chunk->set_content(offset, &c_separate, sizeof(char));
 266     offset += sizeof(char);
 267     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 268     return true;
 269 }
 270
 271 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
 272     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 273     if ( !sub_phrases ){
 274         sub_phrases = new SubPhraseIndex;
 275     }
 276
 277     char pinyin[256];
 278     char phrase[256];
 279     phrase_token_t token;
 280     size_t freq;
 281     PhraseItem * item_ptr = new PhraseItem;
 282     phrase_token_t cur_token = 0;
 283     while ( !feof(infile)){
 284         fscanf(infile, "%s", pinyin);
 285         fscanf(infile, "%s", phrase);
 286         fscanf(infile, "%ld", &token);
 287         fscanf(infile, "%ld", &freq);
 288         if ( feof(infile) )
 289             break;
 290
 291         glong written;
 292         utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
 293                                                &written, NULL);
 294
 295         if ( 0 == cur_token ){
 296             cur_token = token;
 297             item_ptr->set_phrase_string(written, phrase_utf16);
 298         }
 299
 300         if ( cur_token != token ){
 301             add_phrase_item( cur_token, item_ptr);
 302             delete item_ptr;
 303             item_ptr = new PhraseItem;
 304             cur_token = token;
 305             item_ptr->set_phrase_string(written, phrase_utf16);
 306         }
 307
 308         PinyinDefaultParser parser;
 309         NullPinyinValidator validator;
 310         PinyinKeyVector keys;
 311         PinyinKeyPosVector poses;
 312
 313         keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
 314         poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
 315         parser.parse(validator, keys, poses, pinyin);
 316
 317         assert ( item_ptr->get_phrase_length() == keys->len );
 318         item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
 319
 320         g_array_free(keys, TRUE);
 321         g_array_free(poses, TRUE);
 322         g_free(phrase_utf16);
 323     }
 324
 325     add_phrase_item( cur_token, item_ptr);
 326     delete item_ptr;
 327     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
 328     return true;
 329 }
 330
 331 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
 332     SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
 333     if ( !sub_phrase )
 334         return ERROR_NO_SUB_PHRASE_INDEX;
 335
 336     int result = sub_phrase->get_range(range);
 337     if ( result )
 338         return result;
 339
 340     range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
 341     range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
 342     return ERROR_OK;
 343 }
 344
 345 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
 346     const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
 347     const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
 348
 349     range.m_range_begin = 0;
 350     range.m_range_end = end - begin;
 351
 352     return ERROR_OK;
 353 }