src/storage/phrase_index.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  20  */
  21
  22 #include "phrase_index.h"
  23
  24 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
  25     m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
  26     return true;
  27 }
  28
  29 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
  30     guint8 phrase_length = get_phrase_length();
  31     table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  32     bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
  33     if ( !retval )
  34         return retval;
  35     return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
  36 }
  37
  38 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
  39     guint8 phrase_length = get_phrase_length();
  40     set_n_pronunciation(get_n_pronunciation() + 1);
  41     m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
  42     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
  43 }
  44
  45 void PhraseItem::remove_nth_pronunciation(size_t index){
  46     guint8 phrase_length = get_phrase_length();
  47     set_n_pronunciation(get_n_pronunciation() - 1);
  48     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  49     m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
  50 }
  51
  52 bool PhraseItem::get_phrase_string(utf16_t * phrase){
  53     guint8 phrase_length = get_phrase_length();
  54     return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  55 }
  56
  57 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
  58     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
  59     m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  60     return true;
  61 }
  62
  63 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
  64                                              PinyinKey * pinyin_keys,
  65                                              gint32 delta){
  66     guint8 phrase_length = get_phrase_length();
  67     guint8 npron = get_n_pronunciation();
  68     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
  69     char * buf_begin = (char *) m_chunk.begin();
  70     guint32 total_freq = 0;
  71     for ( int i = 0 ; i < npron ; ++i){
  72         char * pinyin_begin = buf_begin + offset +
  73             i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
  74         guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
  75         total_freq += *freq;
  76         if ( 0 == pinyin_compare_with_ambiguities(custom,
  77                                                   (PinyinKey *)pinyin_begin,
  78                                                   pinyin_keys,
  79                                                   phrase_length)){
  80             //protect against total_freq overflow.
  81             if ( delta > 0 && total_freq > total_freq + delta )
  82                 return;
  83             *freq += delta;
  84             total_freq += delta;
  85         }
  86     }
  87 }
  88
  89
  90 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
  91     return m_total_freq;
  92 }
  93
  94 bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
  95     table_offset_t offset;
  96     guint32 freq;
  97     bool result = m_phrase_index.get_content
  98         ((token & PHRASE_MASK)
  99          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 100
 101     if ( !result)
 102         return result;
 103
 104     if ( 0 == offset )
 105         return false;
 106
 107     result = m_phrase_content.get_content
 108         (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 109     //protect total_freq overflow
 110     if ( delta > 0 && m_total_freq > m_total_freq + delta )
 111         return false;
 112     freq += delta;
 113     m_total_freq += delta;
 114     return m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 115 }
 116
 117 bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
 118     table_offset_t offset;
 119     guint8 phrase_length;
 120     guint8 n_prons;
 121
 122     bool result = m_phrase_index.get_content
 123         ((token & PHRASE_MASK)
 124          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 125
 126     if ( !result )
 127         return result;
 128
 129     if ( 0 == offset )
 130         return false;
 131
 132     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 133     if ( !result )
 134         return result;
 135
 136     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 137     if ( !result )
 138         return result;
 139
 140     size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
 141     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
 142     return true;
 143 }
 144
 145 bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
 146     table_offset_t offset = m_phrase_content.size();
 147     if ( 0 == offset )
 148         offset = 8;
 149     m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
 150     m_phrase_index.set_content((token & PHRASE_MASK)
 151                                * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 152     m_total_freq += item->get_unigram_frequency();
 153     return true;
 154 }
 155
 156 bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 157     table_offset_t offset;
 158     guint8 phrase_length;
 159     guint8 n_prons;
 160
 161     bool result = m_phrase_index.get_content
 162         ((token & PHRASE_MASK)
 163          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 164
 165     if ( !result )
 166         return result;
 167
 168     if ( 0 == offset )
 169         return false;
 170
 171     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 172     if ( !result )
 173         return result;
 174
 175     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 176     if ( !result )
 177         return result;
 178
 179     size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
 180     item = new PhraseItem;
 181     //implictly copy data from m_chunk_content.
 182     item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length);
 183
 184     const table_offset_t zero_const = 0;
 185     m_phrase_index.set_content((token & PHRASE_MASK)
 186                                * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
 187     m_total_freq -= item->get_unigram_frequency();
 188     return true;
 189 }
 190
 191 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
 192     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 193     if ( !sub_phrases ){
 194         sub_phrases = new SubPhraseIndex;
 195     }
 196
 197     bool retval = sub_phrases->load(chunk, 0, chunk->size());
 198     if ( !retval )
 199         return retval;
 200     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 201     return retval;
 202 }
 203
 204 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
 205     table_offset_t end;
 206     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 207     if ( !sub_phrases )
 208         return false;
 209
 210     sub_phrases->store(new_chunk, 0, end);
 211     return true;
 212 }
 213
 214 bool FacadePhraseIndex::unload(guint8 phrase_index){
 215     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 216     if ( !sub_phrases )
 217         return false;
 218     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 219     delete sub_phrases;
 220     sub_phrases = NULL;
 221     return true;
 222 }
 223
 224 bool SubPhraseIndex::load(MemoryChunk * chunk,
 225                           table_offset_t offset, table_offset_t end){
 226     //save the memory chunk
 227     if ( m_chunk ){
 228         delete m_chunk;
 229         m_chunk = NULL;
 230     }
 231     m_chunk = chunk;
 232
 233     char * buf_begin = (char *)chunk->begin();
 234     chunk->get_content(offset, &m_total_freq, sizeof(guint32));
 235     offset += sizeof(guint32);
 236     table_offset_t index_one, index_two, index_three;
 237     chunk->get_content(offset, &index_one, sizeof(table_offset_t));
 238     offset += sizeof(table_offset_t);
 239     chunk->get_content(offset, &index_two, sizeof(table_offset_t));
 240     offset += sizeof(table_offset_t);
 241     chunk->get_content(offset, &index_three, sizeof(table_offset_t));
 242     offset += sizeof(table_offset_t);
 243     g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
 244     g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
 245     g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
 246     m_phrase_index.set_chunk(buf_begin + index_one,
 247                              index_two - 1 - index_one, NULL);
 248     m_phrase_content.set_chunk(buf_begin + index_two,
 249                                  index_three - 1 - index_two, NULL);
 250     g_return_val_if_fail( index_three <= end, FALSE);
 251     return true;
 252 }
 253
 254 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 255                            table_offset_t offset, table_offset_t& end){
 256     new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
 257     table_offset_t index = offset + sizeof(guint32);
 258
 259     offset = index + sizeof(table_offset_t) * 3 ;
 260     new_chunk->set_content(offset, &c_separate, sizeof(char));
 261     offset += sizeof(char);
 262
 263     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 264     index += sizeof(table_offset_t);
 265     new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
 266     offset += m_phrase_index.size();
 267     new_chunk->set_content(offset, &c_separate, sizeof(char));
 268     offset += sizeof(char);
 269
 270     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 271     index += sizeof(table_offset_t);
 272
 273     new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
 274     offset += m_phrase_content.size();
 275     new_chunk->set_content(offset, &c_separate, sizeof(char));
 276     offset += sizeof(char);
 277     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 278     return true;
 279 }
 280
 281 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
 282     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 283     if ( !sub_phrases ){
 284         sub_phrases = new SubPhraseIndex;
 285     }
 286
 287     char pinyin[256];
 288     char phrase[256];
 289     phrase_token_t token;
 290     size_t freq;
 291     PhraseItem * item_ptr = new PhraseItem;
 292     phrase_token_t cur_token = 0;
 293     while ( !feof(infile)){
 294         fscanf(infile, "%s", pinyin);
 295         fscanf(infile, "%s", phrase);
 296         fscanf(infile, "%ld", &token);
 297         fscanf(infile, "%ld", &freq);
 298         if ( feof(infile) )
 299             break;
 300
 301         glong written;
 302         utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
 303                                                &written, NULL);
 304
 305         if ( 0 == cur_token ){
 306             cur_token = token;
 307             item_ptr->set_phrase_string(written, phrase_utf16);
 308         }
 309
 310         if ( cur_token != token ){
 311             add_phrase_item( cur_token, item_ptr);
 312             delete item_ptr;
 313             item_ptr = new PhraseItem;
 314             cur_token = token;
 315             item_ptr->set_phrase_string(written, phrase_utf16);
 316         }
 317
 318         PinyinDefaultParser parser;
 319         NullPinyinValidator validator;
 320         PinyinKeyVector keys;
 321         PinyinKeyPosVector poses;
 322
 323         keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
 324         poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
 325         parser.parse(validator, keys, poses, pinyin);
 326
 327         assert ( item_ptr->get_phrase_length() == keys->len );
 328         item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
 329
 330         g_array_free(keys, TRUE);
 331         g_array_free(poses, TRUE);
 332         g_free(phrase_utf16);
 333     }
 334
 335     add_phrase_item( cur_token, item_ptr);
 336     delete item_ptr;
 337     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
 338     return true;
 339 }