3 * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
4 * Based On Markov Model.
6 * Copyright (C) 2006-2007 Peng Wu
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 #include "phrase_index.h"
// Stores the pronunciation count of this phrase item.
// The one-byte count n_prouns is written at byte offset sizeof(guint8) of
// m_chunk, i.e. directly after the one-byte phrase length that
// set_phrase_string writes at offset 0.
// NOTE(review): the rest of the body, including the value returned, is not
// visible in this excerpt.
25 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
26 m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
// Reads the index-th pronunciation record of this phrase item.
//   index  - zero-based record number.
//   pinyin - output buffer; phrase_length PinyinKey values are requested into it.
//   freq   - output; the record's guint32 frequency is requested into it.
// The visible return statement yields the result of the frequency read.
// NOTE(review): no visible check that index is below the pronunciation count.
30 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
31 guint8 phrase_length = get_phrase_length();
// Records start after the item header and the UTF-16 phrase text; each record
// is phrase_length PinyinKeys followed by one guint32 frequency.
32 table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
// Fetch the keys of the record; how retval is used is on lines not shown here.
33 bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
// The frequency sits immediately after the keys of the same record.
36 return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
// Appends one pronunciation record (keys + frequency) to this phrase item.
//   pinyin - phrase_length PinyinKey values are read from this pointer.
//   freq   - frequency stored with the record.
39 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
40 guint8 phrase_length = get_phrase_length();
// Bump the stored record count first.
// NOTE(review): set_n_pronunciation takes a guint8, so the incremented value
// is narrowed to one byte; a count already at 255 would wrap to 0.
41 set_n_pronunciation(get_n_pronunciation() + 1);
// Writing at offset m_chunk.size() places the data at the current end of the
// chunk: first the keys, then the frequency (size() is re-read for the
// second call).
42 m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
43 m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
// Removes the index-th pronunciation record from this phrase item.
//   index - zero-based record number.
// NOTE(review): the visible lines neither check that index is below the
// current count nor that the count is non-zero before decrementing it.
46 void PhraseItem::remove_nth_pronunciation(size_t index){
47 guint8 phrase_length = get_phrase_length();
// Decrement the stored record count.
48 set_n_pronunciation(get_n_pronunciation() - 1);
// Byte offset of the record: header, then UTF-16 phrase text, then index
// whole records (phrase_length PinyinKeys + one guint32 each).
49 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
// Drop exactly one record's worth of bytes at that offset.
50 m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
// Copies the phrase text of this item into the caller's buffer.
//   phrase - output buffer; phrase_length * sizeof(utf16_t) bytes are
//            requested into it, so it must hold at least phrase_length
//            utf16_t units. No terminator is written by the visible code.
// Returns the result of m_chunk.get_content.
53 bool PhraseItem::get_phrase_string(utf16_t * phrase){
54 guint8 phrase_length = get_phrase_length();
// The text starts right after the fixed item header.
55 return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Stores the phrase length and the phrase text in this item.
//   phrase_length - number of utf16_t units in phrase.
//   phrase        - UTF-16 text; phrase_length units are read from it.
// NOTE(review): the results of both set_content calls are not used in the
// visible lines, and the returned value is on lines not shown here.
58 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
// Byte 0 of the chunk holds the phrase length.
59 m_chunk.set_content(0, &phrase_length, sizeof(guint8));
// The text itself goes right after the fixed item header.
60 m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Walks every pronunciation record of this item and compares its keys with
// pinyin_keys under the ambiguity settings in custom.
// NOTE(review): the remaining parameter(s) (a `delta` is used below), the
// arguments after the stored keys in the comparison call, and the bodies of
// both `if` statements are on lines not shown in this excerpt, so what is
// done on a match and on overflow cannot be confirmed from here.
64 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
65 PinyinKey * pinyin_keys,
67 guint8 phrase_length = get_phrase_length();
68 guint8 npron = get_n_pronunciation();
// Byte offset of the first pronunciation record (after header and phrase text).
69 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
// Raw pointer to the start of the chunk; records are addressed directly
// through it below rather than copied out.
70 char * buf_begin = (char *) m_chunk.begin();
71 guint32 total_freq = 0;
72 for ( int i = 0 ; i < npron ; ++i){
// Start of the i-th record: its keys come first ...
73 char * pinyin_begin = buf_begin + offset +
74 i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
// ... followed by its guint32 frequency, addressed in place.
75 guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
// A comparison result of 0 selects the branch below.
77 if ( 0 == pinyin_compare_with_ambiguities(custom,
78 (PinyinKey *)pinyin_begin,
81 //protect against total_freq overflow.
// For a positive delta, a sum smaller than total_freq means the guint32
// addition wrapped around.
82 if ( delta > 0 && total_freq > total_freq + delta )
// Accessor whose guint32 result FacadePhraseIndex adds to / subtracts from its
// own m_total_freq when a sub index is loaded or unloaded.
// NOTE(review): the body is not visible in this excerpt; presumably it
// returns this sub index's m_total_freq -- confirm.
91 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
// Adds delta to the running total m_total_freq and writes a frequency value
// back into the stored item for token.
//   token - phrase token; only the bits selected by PHRASE_MASK pick the slot.
//   delta - amount added to m_total_freq.
// Returns the result of the final m_phrase_content.set_content call on the
// visible path.
// NOTE(review): the declaration of `freq`, the handling of failed reads, the
// statement that changes `freq`, and the action taken on overflow are on
// lines not shown in this excerpt.
95 bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
96 table_offset_t offset;
// m_phrase_index is an array of table_offset_t indexed by the masked token;
// fetch the content offset of this token's item.
98 bool result = m_phrase_index.get_content
99 ((token & PHRASE_MASK)
100 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
// The guint32 frequency follows the two one-byte fields at the start of the
// item (phrase length and pronunciation count).
108 result = m_phrase_content.get_content
109 (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
110 //protect total_freq overflow
// For a positive delta, a sum smaller than m_total_freq means the addition wrapped.
111 if ( delta > 0 && m_total_freq > m_total_freq + delta )
114 m_total_freq += delta;
115 return m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
// Points item at the stored bytes of the phrase item identified by token.
//   token - phrase token; only the bits selected by PHRASE_MASK pick the slot.
//   item  - output; its m_chunk is set to the item's bytes inside
//           m_phrase_content.
// NOTE(review): the declaration of `n_prons`, the checks on `result`, and the
// returned value are on lines not shown in this excerpt.
118 bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
119 table_offset_t offset;
120 guint8 phrase_length;
// Look up the item's offset into m_phrase_content.
123 bool result = m_phrase_index.get_content
124 ((token & PHRASE_MASK)
125 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
// First byte of the item: phrase length; second byte: pronunciation count.
133 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
137 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
// Total item size: header + UTF-16 text + n_prons records of
// (phrase_length PinyinKeys + one guint32 frequency).
141 size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
// The chunk is given a pointer into m_phrase_content's buffer, not a copy.
// NOTE(review): the NULL third argument presumably means "no free function",
// i.e. item does not own the memory -- confirm against MemoryChunk::set_chunk.
142 item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
// Appends item's bytes to m_phrase_content and records their offset in the
// index slot of token.
//   token - phrase token; only the bits selected by PHRASE_MASK pick the slot.
//   item  - item whose chunk bytes are copied in.
// NOTE(review): lines between the offset computation and the copy, and the
// returned value, are not visible in this excerpt. remove_phrase_item writes
// 0 into an index slot when an item is removed, so confirm that an offset of
// 0 (empty m_phrase_content) can never be stored here.
146 bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
// The new item goes at the current end of the content area.
147 table_offset_t offset = m_phrase_content.size();
150 m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
// Store the item's offset in the table_offset_t slot for the masked token.
151 m_phrase_index.set_content((token & PHRASE_MASK)
152 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
// Keep the running total in step with the newly added item.
153 m_total_freq += item->get_unigram_frequency();
// Detaches the phrase item for token from the index and hands a copy of it
// back to the caller.
//   token - phrase token; only the bits selected by PHRASE_MASK pick the slot.
//   item  - output; set to a PhraseItem allocated with `new` that holds a copy
//           of the stored bytes.
// NOTE(review): the caller presumably becomes responsible for deleting item --
// confirm. The visible lines only zero the index slot; the item's bytes are
// not removed from m_phrase_content. The declaration of `n_prons`, the checks
// on `result`, and the returned value are on lines not shown in this excerpt.
157 bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
158 table_offset_t offset;
159 guint8 phrase_length;
// Look up the item's offset into m_phrase_content.
162 bool result = m_phrase_index.get_content
163 ((token & PHRASE_MASK)
164 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
// First byte of the item: phrase length; second byte: pronunciation count.
172 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
176 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
// Total item size: header + UTF-16 text + n_prons records of
// (phrase_length PinyinKeys + one guint32 frequency).
180 size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
181 item = new PhraseItem;
182 //implicitly copy data from m_phrase_content.
183 item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length);
// Overwrite the token's index slot with a zero offset.
185 const table_offset_t zero_const = 0;
186 m_phrase_index.set_content((token & PHRASE_MASK)
187 * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
// Keep the running total in step with the removed item.
188 m_total_freq -= item->get_unigram_frequency();
// Loads one sub phrase index from a binary chunk into slot phrase_index.
//   phrase_index - slot number in m_sub_phrase_indices.
//   chunk        - serialized sub index; handed to SubPhraseIndex::load.
// NOTE(review): the condition guarding the allocation, the handling of
// retval, and the returned value are on lines not shown in this excerpt.
192 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
// Reference to the slot's pointer, so assigning to it updates the table itself.
193 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
195 sub_phrases = new SubPhraseIndex;
// Parse the whole chunk: byte range [0, chunk->size()).
198 bool retval = sub_phrases->load(chunk, 0, chunk->size());
// Fold the sub index's total frequency into the facade-wide total.
201 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Serializes the sub phrase index in slot phrase_index into new_chunk,
// starting at offset 0.
//   phrase_index - slot number in m_sub_phrase_indices.
//   new_chunk    - destination chunk passed to SubPhraseIndex::store.
// NOTE(review): the declaration of `end`, any check that the slot is
// populated, and the returned value are on lines not shown in this excerpt;
// the result of sub_phrases->store is not used on the visible line.
205 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
207 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
211 sub_phrases->store(new_chunk, 0, end);
// Unloads the sub phrase index in slot phrase_index.
//   phrase_index - slot number in m_sub_phrase_indices.
// NOTE(review): only the frequency bookkeeping is visible here; any check
// that the slot is populated, the release of the sub index, and the returned
// value are on lines not shown in this excerpt.
215 bool FacadePhraseIndex::unload(guint8 phrase_index){
216 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
// Take this sub index's contribution back out of the facade-wide total.
219 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
// Attaches this sub index to a serialized image held in chunk.
//   chunk  - memory chunk containing the image.
//   offset - byte position in chunk where the image header starts.
//   end    - upper bound that the third stored offset must not exceed.
// Header layout read below: one guint32 total frequency, then three
// table_offset_t values (index_one, index_two, index_three), then a
// c_separate byte. The three offsets are applied to the start of the chunk
// buffer (buf_begin), not to the `offset` parameter.
// NOTE(review): the lines that "save the memory chunk" and the final return
// are not visible in this excerpt.
225 bool SubPhraseIndex::load(MemoryChunk * chunk,
226 table_offset_t offset, table_offset_t end){
227 //save the memory chunk
234 char * buf_begin = (char *)chunk->begin();
// NOTE(review): none of the four get_content results below is checked, so a
// chunk that is too short leaves these variables unverified.
235 chunk->get_content(offset, &m_total_freq, sizeof(guint32));
236 offset += sizeof(guint32);
237 table_offset_t index_one, index_two, index_three;
238 chunk->get_content(offset, &index_one, sizeof(table_offset_t));
239 offset += sizeof(table_offset_t);
240 chunk->get_content(offset, &index_two, sizeof(table_offset_t));
241 offset += sizeof(table_offset_t);
242 chunk->get_content(offset, &index_three, sizeof(table_offset_t));
243 offset += sizeof(table_offset_t);
// A separator byte must follow the header and precede index_two and index_three.
// NOTE(review): index_two and index_three are used to read memory here before
// index_three is compared with `end` at the bottom of the function; consider
// validating the offsets before dereferencing.
244 g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
245 g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
246 g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
// Phrase index table: bytes [index_one, index_two - 1), i.e. up to but not
// including the separator. The chunk is pointed at the buffer in place.
247 m_phrase_index.set_chunk(buf_begin + index_one,
248 index_two - 1 - index_one, NULL);
// Phrase content area: bytes [index_two, index_three - 1), likewise in place.
249 m_phrase_content.set_chunk(buf_begin + index_two,
250 index_three - 1 - index_two, NULL);
251 g_return_val_if_fail( index_three <= end, FALSE);
// Serializes this sub index into new_chunk starting at offset.
//   new_chunk - destination chunk.
//   offset    - byte position where the image starts.
//   end       - reference parameter.
// Image layout written below:
//   guint32 m_total_freq
//   3 x table_offset_t   (filled in as the sections are written)
//   c_separate, phrase index bytes
//   c_separate, phrase content bytes
//   c_separate
// The three stored offsets are positions within new_chunk: start of the
// phrase index, start of the phrase content, and the position just past the
// final separator.
// NOTE(review): `end` is not assigned in any visible line and the returned
// value is not shown in this excerpt -- confirm that callers get a valid end.
// The results of the set_content calls are not checked.
255 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
256 table_offset_t offset, table_offset_t& end){
257 new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
// `index` walks the three table_offset_t header slots.
258 table_offset_t index = offset + sizeof(guint32);
// Skip the three header slots, then write the first separator.
260 offset = index + sizeof(table_offset_t) * 3 ;
261 new_chunk->set_content(offset, &c_separate, sizeof(char));
262 offset += sizeof(char);
// Slot 1: start of the phrase index table.
264 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
265 index += sizeof(table_offset_t);
266 new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
267 offset += m_phrase_index.size();
268 new_chunk->set_content(offset, &c_separate, sizeof(char));
269 offset += sizeof(char);
// Slot 2: start of the phrase content area.
271 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
272 index += sizeof(table_offset_t);
274 new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
275 offset += m_phrase_content.size();
276 new_chunk->set_content(offset, &c_separate, sizeof(char));
277 offset += sizeof(char);
// Slot 3: position just past the final separator.
278 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
// Builds the sub phrase index in slot phrase_index from a text stream.
//   phrase_index - slot number in m_sub_phrase_indices.
//   infile       - open stream; each record is read as four whitespace-
//                  separated fields: pinyin, phrase, token, frequency.
// Consecutive records with the same token are gathered into one PhraseItem as
// separate pronunciations; when the token changes, the finished item is added
// under the previous token.
// NOTE(review): the declarations of `pinyin`, `phrase`, `freq` and `written`,
// several statements inside the loop (including updates of cur_token and any
// release of the previous item_ptr), and the returned value are on lines not
// shown in this excerpt.
282 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
283 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
285 sub_phrases = new SubPhraseIndex;
290 phrase_token_t token;
292 PhraseItem * item_ptr = new PhraseItem;
// cur_token == 0 is used below as "no record seen yet".
293 phrase_token_t cur_token = 0;
// NOTE(review): feof() only turns true after a read has failed, and none of
// the fscanf results is checked, so a trailing blank line or malformed record
// is processed with stale or unset values. "%s" has no field width, so an
// over-long field can overrun `pinyin`/`phrase`. "%ld" expects a long*; confirm
// that phrase_token_t and the type of `freq` match on all target platforms.
294 while ( !feof(infile)){
295 fscanf(infile, "%s", pinyin);
296 fscanf(infile, "%s", phrase);
297 fscanf(infile, "%ld", &token);
298 fscanf(infile, "%ld", &freq);
// Convert the UTF-8 phrase to UTF-16; the buffer is released with g_free below.
303 utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
// First record: initialise the pending item with this phrase.
306 if ( 0 == cur_token ){
308 item_ptr->set_phrase_string(written, phrase_utf16);
// Token changed: store the finished item under the previous token and start
// a fresh item for the new phrase.
311 if ( cur_token != token ){
312 add_phrase_item( cur_token, item_ptr);
314 item_ptr = new PhraseItem;
316 item_ptr->set_phrase_string(written, phrase_utf16);
// Parse the pinyin text into keys (and their positions) for this record.
319 PinyinDefaultParser parser;
320 NullPinyinValidator validator;
321 PinyinKeyVector keys;
322 PinyinKeyPosVector poses;
324 keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
325 poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
326 parser.parse(validator, keys, poses, pinyin);
// One key per phrase character is required; with the standard assert macro
// this check disappears when NDEBUG is defined.
328 assert ( item_ptr->get_phrase_length() == keys->len );
329 item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
// Release the per-record temporaries.
331 g_array_free(keys, TRUE);
332 g_array_free(poses, TRUE);
333 g_free(phrase_utf16);
// Store the item that was still pending when the input ended.
336 add_phrase_item( cur_token, item_ptr);
// Fold the new sub index's total frequency into the facade-wide total.
338 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();