src/storage/phrase_index.h

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #ifndef PHRASE_INDEX_H
  23 #define PHRASE_INDEX_H
  24
  25 #include <stdio.h>
  26 #include <glib.h>
  27 #include "novel_types.h"
  28 #include "chewing_key.h"
  29 #include "pinyin_parser2.h"
  30 #include "pinyin_phrase2.h"
  31 #include "memory_chunk.h"
  32 #include "phrase_index_logger.h"
  33
  34 /**
  35  * Phrase Index File Format
  36  *
  37  * Indirect Index: Index by Token
  38  * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  39  * + Phrase Offset + Phrase Offset + Phrase Offset + ......  +
  40  * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  41  * Phrase Content:
  42  * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  43  * + Phrase Length + number of  Pronunciations  + Uni-gram Frequency+
  44  * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  45  * + Phrase String(UCS2) + n Pronunciations with Frequency +
  46  * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  47  */
  48
  49 namespace pinyin{
  50
  51 class PinyinLookup;
  52
  53 /* Store delta info by phrase index logger in user home directory.
  54  */
  55
  56 const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
  57
  58 class PhraseItem{
  59     friend class SubPhraseIndex;
  60 private:
  61     MemoryChunk m_chunk;
  62     bool set_n_pronunciation(guint8 n_prouns);
  63 public:
  64     /* Null Constructor */
  65     PhraseItem(){
  66         m_chunk.set_size(phrase_item_header);
  67         memset(m_chunk.begin(), 0, m_chunk.size());
  68     }
  69
  70 #if 0
  71     PhraseItem(MemoryChunk & chunk){
  72         m_chunk.set_content(0, chunk->begin(), chunk->size());
  73         assert ( m_chunk.size() >= phrase_item_header);
  74     }
  75 #endif
  76
  77     /* functions */
  78     guint8 get_phrase_length(){
  79         char * buf_begin = (char *)m_chunk.begin();
  80         return (*(guint8 *)buf_begin);
  81     }
  82
  83     guint8 get_n_pronunciation(){
  84         char * buf_begin = ( char *) m_chunk.begin();
  85         return (*(guint8 *)(buf_begin + sizeof(guint8)));
  86     }
  87
  88     guint32 get_unigram_frequency(){
  89         char * buf_begin = (char *)m_chunk.begin();
  90         return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
  91     }
  92
  93     gfloat get_pronunciation_possibility(pinyin_option_t options,
  94                                   ChewingKey * keys){
  95         guint8 phrase_length = get_phrase_length();
  96         guint8 npron = get_n_pronunciation();
  97         size_t offset = phrase_item_header + phrase_length * sizeof (utf16_t);
  98         char * buf_begin = (char *)m_chunk.begin();
  99         guint32 matched = 0, total_freq =0;
 100         for ( int i = 0 ; i < npron ; ++i){
 101             char * chewing_begin = buf_begin + offset +
 102                 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
 103             guint32 * freq = (guint32 *)(chewing_begin +
 104                                          phrase_length * sizeof(ChewingKey));
 105             total_freq += *freq;
 106             if ( 0 == pinyin_compare_with_ambiguities2
 107                  (options,  keys,
 108                   (ChewingKey *)chewing_begin,phrase_length) ){
 109                 matched += *freq;
 110             }
 111         }
 112         // use preprocessor to avoid zero freq, in gen_pinyin_table.
 113         /*
 114         if ( 0 == total_freq )
 115             return 0.1;
 116         */
 117         gfloat retval = matched / (gfloat) total_freq;
 118         /*
 119         if ( 0 == retval )
 120             return 0.03;
 121         */
 122         return retval;
 123     }
 124
 125     void increase_pronunciation_possibility(pinyin_option_t options,
 126                                      ChewingKey * keys,
 127                                      gint32 delta);
 128
 129     bool get_phrase_string(utf16_t * phrase);
 130     bool set_phrase_string(guint8 phrase_length, utf16_t * phrase);
 131     bool get_nth_pronunciation(size_t index,
 132                                /* out */ ChewingKey * keys,
 133                                /* out */ guint32 & freq);
 134     /* Normally don't change the first pronunciation,
 135      * which decides the token number.
 136      */
 137     void append_pronunciation(ChewingKey * keys, guint32 freq);
 138     void remove_nth_pronunciation(size_t index);
 139
 140     bool operator == (const PhraseItem & rhs) const{
 141         if (m_chunk.size() != rhs.m_chunk.size())
 142             return false;
 143         return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
 144                       m_chunk.size()) == 0;
 145     }
 146
 147     bool operator != (const PhraseItem & rhs) const{
 148         return ! (*this == rhs);
 149     }
 150 };
 151
 152 /*
 153  *  In Sub Phrase Index, token == (token & PHRASE_MASK).
 154  */
 155
 156 class SubPhraseIndex{
 157 private:
 158     guint32 m_total_freq;
 159     MemoryChunk m_phrase_index;
 160     MemoryChunk m_phrase_content;
 161     MemoryChunk * m_chunk;
 162 public:
 163     SubPhraseIndex():m_total_freq(0){
 164         m_chunk = NULL;
 165     }
 166
 167     ~SubPhraseIndex(){
 168         reset();
 169     }
 170
 171     void reset(){
 172         if ( m_chunk ){
 173             delete m_chunk;
 174             m_chunk = NULL;
 175         }
 176     }
 177
 178     /* binary memory chunk load/store method */
 179     bool load(MemoryChunk * chunk,
 180               table_offset_t offset, table_offset_t end);
 181     bool store(MemoryChunk * new_chunk,
 182                table_offset_t offset, table_offset_t & end);
 183
 184     /* switch to logger format to reduce user storage */
 185     bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
 186     bool merge(PhraseIndexLogger * logger);
 187
 188     /* get token range in this sub phrase */
 189     int get_range(/* out */ PhraseIndexRange & range);
 190
 191     /* Zero-gram */
 192     guint32 get_phrase_index_total_freq();
 193     int add_unigram_frequency(phrase_token_t token, guint32 delta);
 194
 195     /* get_phrase_item function can't modify the phrase item size,
 196      * but can increment the freq of the special pronunciation,
 197      * or change the content without size increasing.
 198      */
 199     int get_phrase_item(phrase_token_t token, PhraseItem & item);
 200     int add_phrase_item(phrase_token_t token, PhraseItem * item);
 201     /* remove_phrase_item will substract item->get_unigram_frequency()
 202      * from m_total_freq
 203      */
 204     int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
 205
 206 };
 207
 208 class FacadePhraseIndex{
 209     friend class PinyinLookup;
 210 private:
 211     guint32 m_total_freq;
 212     SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
 213 public:
 214     FacadePhraseIndex(){
 215         m_total_freq = 0;
 216         memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
 217     }
 218
 219     ~FacadePhraseIndex(){
 220         for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
 221             if ( m_sub_phrase_indices[i] ){
 222                 delete m_sub_phrase_indices[i];
 223                 m_sub_phrase_indices[i] = NULL;
 224             }
 225         }
 226     }
 227
 228     /* load/store single sub phrase index, according to the config files. */
 229     bool load_text(guint8 phrase_index, FILE * infile);
 230     bool load(guint8 phrase_index, MemoryChunk * chunk);
 231     bool store(guint8 phrase_index, MemoryChunk * new_chunk);
 232     bool unload(guint8 phrase_index);
 233
 234     /* load/store logger format.
 235        the ownership of oldchunk and log is transfered to here. */
 236     bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
 237               MemoryChunk * newlog);
 238     bool merge(guint8 phrase_index, MemoryChunk * log);
 239
 240     /* compat all SubPhraseIndex m_phrase_content memory usage. */
 241     bool compat();
 242
 243     /* get all available sub phrase indices. */
 244     int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
 245
 246     /* get each sub phrase token range with phrase_index added */
 247     int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
 248
 249     /* Zero-gram */
 250     guint32 get_phrase_index_total_freq(){
 251         return m_total_freq;
 252     }
 253
 254     int add_unigram_frequency(phrase_token_t token, guint32 delta){
 255         guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
 256         SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
 257         if ( !sub_phrase )
 258             return ERROR_NO_SUB_PHRASE_INDEX;
 259         m_total_freq += delta;
 260         return sub_phrase->add_unigram_frequency(token, delta);
 261     }
 262
 263     /* get_phrase_item function can't modify the phrase item */
 264     int get_phrase_item(phrase_token_t token, PhraseItem & item){
 265         guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
 266         SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
 267         if ( !sub_phrase )
 268             return ERROR_NO_SUB_PHRASE_INDEX;
 269         return sub_phrase->get_phrase_item(token, item);
 270     }
 271
 272     int add_phrase_item(phrase_token_t token, PhraseItem * item){
 273         guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
 274         SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
 275         if ( !sub_phrase ){
 276             sub_phrase = new SubPhraseIndex;
 277         }
 278         m_total_freq += item->get_unigram_frequency();
 279         return sub_phrase->add_phrase_item(token, item);
 280     }
 281
 282     int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
 283         guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
 284         SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
 285         if ( !sub_phrase ){
 286             return ERROR_NO_SUB_PHRASE_INDEX;
 287         }
 288         int result = sub_phrase->remove_phrase_item(token, item);
 289         if ( result )
 290             return result;
 291         m_total_freq -= item->get_unigram_frequency();
 292         return result;
 293     }
 294
 295 };
 296
 297 };
 298
 299 #endif