src/storage/phrase_index.cpp

   1 /*
   2  *  novel-pinyin,
   3  *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
   4  *  Based On Markov Model.
   5  *
   6  *  Copyright (C) 2006-2007 Peng Wu
   7  *
   8  *  This program is free software; you can redistribute it and/or modify
   9  *  it under the terms of the GNU General Public License as published by
  10  *  the Free Software Foundation; either version 2 of the License, or
  11  *  (at your option) any later version.
  12  *
  13  *  This program is distributed in the hope that it will be useful,
  14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  *  GNU General Public License for more details.
  17  *
  18  *  You should have received a copy of the GNU General Public License
  19  *  along with this program; if not, write to the Free Software
  20  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  21  */
  22
  23 #include "phrase_index.h"
  24
  25 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
  26     m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
  27     return true;
  28 }
  29
  30 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
  31     guint8 phrase_length = get_phrase_length();
  32     table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  33     bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
  34     if ( !retval )
  35         return retval;
  36     return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
  37 }
  38
  39 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
  40     guint8 phrase_length = get_phrase_length();
  41     set_n_pronunciation(get_n_pronunciation() + 1);
  42     m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
  43     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
  44 }
  45
  46 void PhraseItem::remove_nth_pronunciation(size_t index){
  47     guint8 phrase_length = get_phrase_length();
  48     set_n_pronunciation(get_n_pronunciation() - 1);
  49     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
  50     m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
  51 }
  52
  53 bool PhraseItem::get_phrase_string(utf16_t * phrase){
  54     guint8 phrase_length = get_phrase_length();
  55     return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  56 }
  57
  58 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
  59     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
  60     m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
  61     return true;
  62 }
  63
  64 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
  65                                              PinyinKey * pinyin_keys,
  66                                              gint32 delta){
  67     guint8 phrase_length = get_phrase_length();
  68     guint8 npron = get_n_pronunciation();
  69     size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
  70     char * buf_begin = (char *) m_chunk.begin();
  71     guint32 total_freq = 0;
  72     for ( int i = 0 ; i < npron ; ++i){
  73         char * pinyin_begin = buf_begin + offset +
  74             i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
  75         guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
  76         total_freq += *freq;
  77         if ( 0 == pinyin_compare_with_ambiguities(custom,
  78                                                   (PinyinKey *)pinyin_begin,
  79                                                   pinyin_keys,
  80                                                   phrase_length)){
  81             //protect against total_freq overflow.
  82             if ( delta > 0 && total_freq > total_freq + delta )
  83                 return;
  84             *freq += delta;
  85             total_freq += delta;
  86         }
  87     }
  88 }
  89
  90
  91 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
  92     return m_total_freq;
  93 }
  94
  95 bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
  96     table_offset_t offset;
  97     guint32 freq;
  98     bool result = m_phrase_index.get_content
  99         ((token & PHRASE_MASK)
 100          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 101
 102     if ( !result)
 103         return result;
 104
 105     if ( 0 == offset )
 106         return false;
 107
 108     result = m_phrase_content.get_content
 109         (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 110     //protect total_freq overflow
 111     if ( delta > 0 && m_total_freq > m_total_freq + delta )
 112         return false;
 113     freq += delta;
 114     m_total_freq += delta;
 115     return m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 116 }
 117
 118 bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
 119     table_offset_t offset;
 120     guint8 phrase_length;
 121     guint8 n_prons;
 122
 123     bool result = m_phrase_index.get_content
 124         ((token & PHRASE_MASK)
 125          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 126
 127     if ( !result )
 128         return result;
 129
 130     if ( 0 == offset )
 131         return false;
 132
 133     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 134     if ( !result )
 135         return result;
 136
 137     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 138     if ( !result )
 139         return result;
 140
 141     size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
 142     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
 143     return true;
 144 }
 145
 146 bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
 147     table_offset_t offset = m_phrase_content.size();
 148     if ( 0 == offset )
 149         offset = 8;
 150     m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
 151     m_phrase_index.set_content((token & PHRASE_MASK)
 152                                * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 153     m_total_freq += item->get_unigram_frequency();
 154     return true;
 155 }
 156
 157 bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 158     table_offset_t offset;
 159     guint8 phrase_length;
 160     guint8 n_prons;
 161
 162     bool result = m_phrase_index.get_content
 163         ((token & PHRASE_MASK)
 164          * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
 165
 166     if ( !result )
 167         return result;
 168
 169     if ( 0 == offset )
 170         return false;
 171
 172     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
 173     if ( !result )
 174         return result;
 175
 176     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
 177     if ( !result )
 178         return result;
 179
 180     size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
 181     item = new PhraseItem;
 182     //implictly copy data from m_chunk_content.
 183     item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length);
 184
 185     const table_offset_t zero_const = 0;
 186     m_phrase_index.set_content((token & PHRASE_MASK)
 187                                * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
 188     m_total_freq -= item->get_unigram_frequency();
 189     return true;
 190 }
 191
 192 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
 193     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 194     if ( !sub_phrases ){
 195         sub_phrases = new SubPhraseIndex;
 196     }
 197
 198     bool retval = sub_phrases->load(chunk, 0, chunk->size());
 199     if ( !retval )
 200         return retval;
 201     m_total_freq += sub_phrases->get_phrase_index_total_freq();
 202     return retval;
 203 }
 204
 205 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
 206     table_offset_t end;
 207     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 208     if ( !sub_phrases )
 209         return false;
 210
 211     sub_phrases->store(new_chunk, 0, end);
 212     return true;
 213 }
 214
 215 bool FacadePhraseIndex::unload(guint8 phrase_index){
 216     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 217     if ( !sub_phrases )
 218         return false;
 219     m_total_freq -= sub_phrases->get_phrase_index_total_freq();
 220     delete sub_phrases;
 221     sub_phrases = NULL;
 222     return true;
 223 }
 224
 225 bool SubPhraseIndex::load(MemoryChunk * chunk,
 226                           table_offset_t offset, table_offset_t end){
 227     //save the memory chunk
 228     if ( m_chunk ){
 229         delete m_chunk;
 230         m_chunk = NULL;
 231     }
 232     m_chunk = chunk;
 233
 234     char * buf_begin = (char *)chunk->begin();
 235     chunk->get_content(offset, &m_total_freq, sizeof(guint32));
 236     offset += sizeof(guint32);
 237     table_offset_t index_one, index_two, index_three;
 238     chunk->get_content(offset, &index_one, sizeof(table_offset_t));
 239     offset += sizeof(table_offset_t);
 240     chunk->get_content(offset, &index_two, sizeof(table_offset_t));
 241     offset += sizeof(table_offset_t);
 242     chunk->get_content(offset, &index_three, sizeof(table_offset_t));
 243     offset += sizeof(table_offset_t);
 244     g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
 245     g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
 246     g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
 247     m_phrase_index.set_chunk(buf_begin + index_one,
 248                              index_two - 1 - index_one, NULL);
 249     m_phrase_content.set_chunk(buf_begin + index_two,
 250                                  index_three - 1 - index_two, NULL);
 251     g_return_val_if_fail( index_three <= end, FALSE);
 252     return true;
 253 }
 254
 255 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 256                            table_offset_t offset, table_offset_t& end){
 257     new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
 258     table_offset_t index = offset + sizeof(guint32);
 259
 260     offset = index + sizeof(table_offset_t) * 3 ;
 261     new_chunk->set_content(offset, &c_separate, sizeof(char));
 262     offset += sizeof(char);
 263
 264     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 265     index += sizeof(table_offset_t);
 266     new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
 267     offset += m_phrase_index.size();
 268     new_chunk->set_content(offset, &c_separate, sizeof(char));
 269     offset += sizeof(char);
 270
 271     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 272     index += sizeof(table_offset_t);
 273
 274     new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
 275     offset += m_phrase_content.size();
 276     new_chunk->set_content(offset, &c_separate, sizeof(char));
 277     offset += sizeof(char);
 278     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
 279     return true;
 280 }
 281
 282 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
 283     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
 284     if ( !sub_phrases ){
 285         sub_phrases = new SubPhraseIndex;
 286     }
 287
 288     char pinyin[256];
 289     char phrase[256];
 290     phrase_token_t token;
 291     size_t freq;
 292     PhraseItem * item_ptr = new PhraseItem;
 293     phrase_token_t cur_token = 0;
 294     while ( !feof(infile)){
 295         fscanf(infile, "%s", pinyin);
 296         fscanf(infile, "%s", phrase);
 297         fscanf(infile, "%ld", &token);
 298         fscanf(infile, "%ld", &freq);
 299         if ( feof(infile) )
 300             break;
 301
 302         glong written;
 303         utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
 304                                                &written, NULL);
 305
 306         if ( 0 == cur_token ){
 307             cur_token = token;
 308             item_ptr->set_phrase_string(written, phrase_utf16);
 309         }
 310
 311         if ( cur_token != token ){
 312             add_phrase_item( cur_token, item_ptr);
 313             delete item_ptr;
 314             item_ptr = new PhraseItem;
 315             cur_token = token;
 316             item_ptr->set_phrase_string(written, phrase_utf16);
 317         }
 318
 319         PinyinDefaultParser parser;
 320         NullPinyinValidator validator;
 321         PinyinKeyVector keys;
 322         PinyinKeyPosVector poses;
 323
 324         keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
 325         poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
 326         parser.parse(validator, keys, poses, pinyin);
 327
 328         assert ( item_ptr->get_phrase_length() == keys->len );
 329         item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
 330
 331         g_array_free(keys, TRUE);
 332         g_array_free(poses, TRUE);
 333         g_free(phrase_utf16);
 334     }
 335
 336     add_phrase_item( cur_token, item_ptr);
 337     delete item_ptr;
 338     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
 339     return true;
 340 }