3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #ifndef PHRASE_INDEX_H
23 #define PHRASE_INDEX_H
27 #include "novel_types.h"
28 #include "chewing_key.h"
29 #include "pinyin_parser2.h"
30 #include "pinyin_phrase2.h"
31 #include "memory_chunk.h"
32 #include "phrase_index_logger.h"
35 * Phrase Index File Format
37 * Indirect Index: Index by Token
38 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
39 * + Phrase Offset + Phrase Offset + Phrase Offset + ...... +
40 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
42 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
43 * + Phrase Length + number of Pronunciations + Uni-gram Frequency+
44 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45 * + Phrase String(UCS2) + n Pronunciations +
46 * ++++++++++++++++++++++++++++++++++++++++++
53 /* Store delta info by phrase index logger in user home directory.
// Size in bytes of the fixed header that starts every phrase item: one
// guint8 phrase length, one guint8 pronunciation count and one guint32
// uni-gram frequency, in that order.
56 const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
// SubPhraseIndex is a friend and may reach the non-public members of this class.
59 friend class SubPhraseIndex;
// Declared here, defined elsewhere. Presumably rewrites the pronunciation
// count field of the header to n_prouns - TODO confirm against the definition.
62 bool set_n_pronunciation(guint8 n_prouns);
64 /* Null Constructor */
// Size the chunk to the fixed header and zero it, so the phrase length,
// pronunciation count and uni-gram frequency fields all read back as 0.
66 m_chunk.set_size(phrase_item_header);
67 memset(m_chunk.begin(), 0, m_chunk.size());
// Builds a phrase item from an existing memory chunk: the buffer of chunk
// (begin() and size()) is passed to m_chunk.set_content with a first
// argument of 0 (presumably the destination offset - TODO confirm).
// The resulting chunk must hold at least the fixed header; this is enforced
// with assert.
71 PhraseItem(MemoryChunk & chunk){
// chunk is a reference, so its members are accessed with the dot operator
// (the arrow operator only applies to pointers).
72 m_chunk.set_content(0, chunk.begin(), chunk.size());
73 assert ( m_chunk.size() >= phrase_item_header);
// Returns the phrase length: the first guint8 of the item buffer.
// Other members scale this value by sizeof(utf16_t) for the phrase string
// and by sizeof(ChewingKey) for each pronunciation.
78 guint8 get_phrase_length(){
79 char * buf_begin = (char *)m_chunk.begin();
80 return (*(guint8 *)buf_begin);
// Returns the number of pronunciations: the guint8 stored immediately after
// the phrase length field.
83 guint8 get_n_pronunciation(){
84 char * buf_begin = ( char *) m_chunk.begin();
85 return (*(guint8 *)(buf_begin + sizeof(guint8)));
// Returns the uni-gram frequency: the guint32 stored after the two guint8
// header fields.
// NOTE(review): the value is read through a cast pointer at an offset of two
// guint8 from the buffer start; whether that address is suitably aligned for
// guint32 depends on what m_chunk.begin() returns - confirm on the targets
// that matter.
88 guint32 get_unigram_frequency(){
89 char * buf_begin = (char *)m_chunk.begin();
90 return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
// Computes the ratio matched / total_freq (as gfloat) over the pronunciation
// records stored in this item.
// NOTE(review): the remaining parameter(s) and the statements that accumulate
// matched and total_freq are not visible in this listing. Presumably
// total_freq sums the frequency of every record and matched sums the
// frequencies of the records accepted by pinyin_compare_with_ambiguities2 -
// TODO confirm.
93 gfloat get_pronunciation_possibility(pinyin_option_t options,
95 guint8 phrase_length = get_phrase_length();
96 guint8 npron = get_n_pronunciation();
// Pronunciation records begin after the fixed header and the phrase string
// (phrase_length utf16_t units).
97 size_t offset = phrase_item_header + phrase_length * sizeof (utf16_t);
98 char * buf_begin = (char *)m_chunk.begin();
99 guint32 matched = 0, total_freq =0;
100 for ( int i = 0 ; i < npron ; ++i){
// Record i is phrase_length ChewingKey entries followed by one guint32
// frequency; chewing_begin points at the keys and freq at the frequency.
101 char * chewing_begin = buf_begin + offset +
102 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
103 guint32 * freq = (guint32 *)(chewing_begin +
104 phrase_length * sizeof(ChewingKey));
// A result of 0 from the comparison selects this record.
106 if ( 0 == pinyin_compare_with_ambiguities2
108 (ChewingKey *)chewing_begin,phrase_length) ){
112 // use preprocessor to avoid zero freq, in gen_pinyin_table.
// A zero total is handled separately ahead of the division - TODO confirm
// what is returned in that case.
114 if ( 0 == total_freq )
117 gfloat retval = matched / (gfloat) total_freq;
// Member declarations; the definitions are not part of this header section,
// so the notes below are assumptions taken from the signatures - TODO confirm.
// increase_pronunciation_possibility: presumably raises the stored frequency
// of the pronunciations matching the given keys (remaining parameters are not
// visible here).
// get_phrase_string and set_phrase_string: read or write the phrase text as
// utf16_t units; the setter also takes the new phrase length.
// get_nth_pronunciation: outputs the keys and frequency of record index.
// remove_nth_pronunciation: presumably drops record index from the item.
125 void increase_pronunciation_possibility(pinyin_option_t options,
129 bool get_phrase_string(utf16_t * phrase);
130 bool set_phrase_string(guint8 phrase_length, utf16_t * phrase);
131 bool get_nth_pronunciation(size_t index,
132 /* out */ ChewingKey * keys,
133 /* out */ guint32 & freq);
134 /* Normally don't change the first pronunciation,
135 * which decides the token number.
137 void append_pronunciation(ChewingKey * keys, guint32 freq);
138 void remove_nth_pronunciation(size_t index);
// Equality of two phrase items, based on the raw chunk contents: the chunk
// sizes are compared first, then the bytes are compared with memcmp over
// m_chunk.size() bytes.
140 bool operator == (const PhraseItem & rhs) const{
// NOTE(review): the statement taken on a size mismatch is not visible in this
// listing - presumably it returns false; TODO confirm.
141 if (m_chunk.size() != rhs.m_chunk.size())
143 return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
144 m_chunk.size()) == 0;
// Inequality is defined as the negation of operator ==.
147 bool operator != (const PhraseItem & rhs) const{
148 return ! (*this == rhs);
153 * In Sub Phrase Index, token == (token & PHRASE_MASK).
// Phrase index for one sub phrase library. Declares binary load/store,
// logger based diff/merge, a token range query, uni-gram frequency updates
// and phrase item lookup, insertion and removal.
156 class SubPhraseIndex{
// Total frequency kept for this sub index; the constructor starts it at 0.
158 guint32 m_total_freq;
// Presumably the token-indexed table of phrase offsets and the phrase item
// storage described in the file format comment - TODO confirm.
159 MemoryChunk m_phrase_index;
160 MemoryChunk m_phrase_content;
// NOTE(review): who owns this chunk is not visible here; verify against load.
161 MemoryChunk * m_chunk;
// Default constructor; initialises m_total_freq to 0 (the rest of the body is
// not visible in this listing).
163 SubPhraseIndex():m_total_freq(0){
178 /* binary memory chunk load/store method */
// load takes a chunk and an offset/end pair; store takes the destination
// chunk and a start offset and reports the end offset through a reference.
179 bool load(MemoryChunk * chunk,
180 table_offset_t offset, table_offset_t end);
181 bool store(MemoryChunk * new_chunk,
182 table_offset_t offset, table_offset_t & end);
184 /* switch to logger format to reduce user storage */
// diff takes an older sub index and a logger; merge takes a logger.
185 bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
186 bool merge(PhraseIndexLogger * logger);
188 /* get token range in this sub phrase */
189 int get_range(/* out */ PhraseIndexRange & range);
// Accessor and updater for frequencies; add_unigram_frequency returns an int
// status code.
192 guint32 get_phrase_index_total_freq();
193 int add_unigram_frequency(phrase_token_t token, guint32 delta);
195 /* get_phrase_item function can't modify the phrase item size,
196 * but can increment the freq of the special pronunciation,
197 * or change the content without size increasing.
199 int get_phrase_item(phrase_token_t token, PhraseItem & item);
200 int add_phrase_item(phrase_token_t token, PhraseItem * item);
201 /* remove_phrase_item will subtract item->get_unigram_frequency()
204 int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
// Front end over up to PHRASE_INDEX_LIBRARY_COUNT SubPhraseIndex instances.
// The token based methods of this class pick the instance with
// PHRASE_INDEX_LIBRARY_INDEX(token) and forward the call to it.
208 class FacadePhraseIndex{
// PinyinLookup is a friend and may reach the non-public members.
209 friend class PinyinLookup;
// Total frequency tracked at facade level; adjusted by add_unigram_frequency,
// add_phrase_item and remove_phrase_item.
211 guint32 m_total_freq;
// One slot per sub phrase library; allocated entries are deleted by the
// destructor.
212 SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
// Constructor body (its signature is not visible in this listing): clears
// every slot so that no sub index is present initially.
216 memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
// Destructor: walks all PHRASE_INDEX_LIBRARY_COUNT slots, deletes every
// allocated sub phrase index and resets its slot to NULL.
219 ~FacadePhraseIndex(){
220 for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
221 if ( m_sub_phrase_indices[i] ){
222 delete m_sub_phrase_indices[i];
223 m_sub_phrase_indices[i] = NULL;
228 /* load/store single sub phrase index, according to the config files. */
// Each call names the target sub phrase index by its guint8 phrase_index.
229 bool load_text(guint8 phrase_index, FILE * infile);
230 bool load(guint8 phrase_index, MemoryChunk * chunk);
231 bool store(guint8 phrase_index, MemoryChunk * new_chunk);
232 bool unload(guint8 phrase_index);
234 /* load/store logger format.
235 the ownership of oldchunk and log is transferred to here. */
236 bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
237 MemoryChunk * newlog);
238 bool merge(guint8 phrase_index, MemoryChunk * log);
240 /* compact all SubPhraseIndex m_phrase_content memory usage. */
243 /* get all available sub phrase indices. */
244 int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
246 /* get each sub phrase token range with phrase_index added */
247 int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
// Accessor for the facade level total frequency (body not visible in this
// listing - presumably returns m_total_freq; TODO confirm).
250 guint32 get_phrase_index_total_freq(){
// Adds delta to the uni-gram frequency of token: selects the sub index with
// PHRASE_INDEX_LIBRARY_INDEX(token), adds delta to the facade total and
// forwards the update to that sub index, returning its result.
254 int add_unigram_frequency(phrase_token_t token, guint32 delta){
255 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
256 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
// Error path; its condition is not visible in this listing - presumably taken
// when the slot holds no sub index. TODO confirm.
258 return ERROR_NO_SUB_PHRASE_INDEX;
// NOTE(review): the facade total is raised before the sub index reports its
// result; confirm that a failing forward call cannot leave the two totals
// out of step.
259 m_total_freq += delta;
260 return sub_phrase->add_unigram_frequency(token, delta);
263 /* get_phrase_item function can't modify the phrase item */
// Looks up the phrase item for token: selects the sub index with
// PHRASE_INDEX_LIBRARY_INDEX(token) and forwards the call, returning the
// result of the sub index.
264 int get_phrase_item(phrase_token_t token, PhraseItem & item){
265 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
266 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
// Error path; its condition is not visible in this listing - presumably taken
// when the slot holds no sub index. TODO confirm.
268 return ERROR_NO_SUB_PHRASE_INDEX;
269 return sub_phrase->get_phrase_item(token, item);
// Adds item under token to the sub index selected by
// PHRASE_INDEX_LIBRARY_INDEX(token) and returns the result of that sub index.
272 int add_phrase_item(phrase_token_t token, PhraseItem * item){
273 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
// Bound by reference so that a newly created sub index is stored back into
// m_sub_phrase_indices.
274 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
// Creation of the sub index; the condition guarding it is not visible in this
// listing - presumably only when the slot is empty. TODO confirm.
276 sub_phrase = new SubPhraseIndex;
// The facade total grows by the item frequency before the insert result is
// known.
278 m_total_freq += item->get_unigram_frequency();
279 return sub_phrase->add_phrase_item(token, item);
282 int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
283 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
284 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
286 return ERROR_NO_SUB_PHRASE_INDEX;
288 int result = sub_phrase->remove_phrase_item(token, item);
291 m_total_freq -= item->get_unigram_frequency();