3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "phrase_index.h"
24 using namespace pinyin;
// Stores the pronunciation count of this phrase item.
// The count n_prouns is written as a single guint8 into m_chunk at byte
// offset sizeof(guint8), i.e. directly after the first byte of the chunk.
// NOTE(review): the lines following the write (the bool return value and
// the closing brace) are not visible in this view.
26 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
27 m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
// Reads one pronunciation record of this phrase item out of m_chunk.
// Records are laid out after the item header (phrase_item_header bytes) and
// the phrase text (phrase_length utf16_t units); each record consists of
// phrase_length PinyinKey values followed by one guint32 frequency.
//   index  - zero-based record number; it is not range-checked in the
//            visible code.
//   pinyin - output buffer receiving phrase_length PinyinKey values.
//   freq   - output, receives the guint32 frequency of the record.
// Returns the result of the final m_chunk.get_content() call.
31 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
32 guint8 phrase_length = get_phrase_length();
// Byte offset of the requested record inside the chunk.
33 table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
34 bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
// NOTE(review): the original lines between the key read above and the
// frequency read below are missing from this view; presumably they return
// early when retval is false - confirm against the full file.
37 return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
40 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
41 guint8 phrase_length = get_phrase_length();
42 set_n_pronunciation(get_n_pronunciation() + 1);
43 m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
44 m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
47 void PhraseItem::remove_nth_pronunciation(size_t index){
48 guint8 phrase_length = get_phrase_length();
49 set_n_pronunciation(get_n_pronunciation() - 1);
50 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
51 m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
54 bool PhraseItem::get_phrase_string(utf16_t * phrase){
55 guint8 phrase_length = get_phrase_length();
56 return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Stores the phrase text of this item.
// phrase_length is written as one guint8 at byte 0 of m_chunk, and
// phrase_length utf16_t units are copied from phrase to offset
// phrase_item_header.
// NOTE(review): the bool return statement and the closing brace are not
// visible in this view.
59 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
60 m_chunk.set_content(0, &phrase_length, sizeof(guint8));
61 m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Walks every pronunciation record of this item directly inside the m_chunk
// buffer and compares its keys with pinyin_compare_with_ambiguities() under
// the settings in custom; a result of 0 selects the record.
// NOTE(review): this view is incomplete. The remaining parameter(s) (the
// body uses a value named delta), the remaining arguments of the compare
// call, and the statements that update the frequencies are not visible -
// consult the full file before changing this function.
65 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
66 PinyinKey * pinyin_keys,
68 guint8 phrase_length = get_phrase_length();
69 guint8 npron = get_n_pronunciation();
// Records start right after the header and the phrase text.
70 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
71 char * buf_begin = (char *) m_chunk.begin();
72 guint32 total_freq = 0;
73 for ( int i = 0 ; i < npron ; ++i){
// Start of the i-th record: phrase_length keys followed by a guint32.
74 char * pinyin_begin = buf_begin + offset +
75 i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
// freq points into the chunk buffer, so writes through it modify the
// stored record in place.
76 guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
78 if ( 0 == pinyin_compare_with_ambiguities(custom,
79 (PinyinKey *)pinyin_begin,
82 //protect against total_freq overflow.
// For a positive delta, a sum smaller than total_freq means the unsigned
// addition wrapped around.
83 if ( delta > 0 && total_freq > total_freq + delta )
// Accessor returning a guint32 total frequency for this sub phrase index.
// NOTE(review): the body is not visible in this view; presumably it returns
// m_total_freq - confirm against the full file.
92 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
// Adjusts the unigram frequency of the phrase identified by token by delta
// and adds delta to m_total_freq.
// Visible error returns: ERROR_OUT_OF_RANGE, ERROR_NO_ITEM,
// ERROR_FILE_CORRUPTION and ERROR_INTEGER_OVERFLOW.
// NOTE(review): several original lines are missing from this view (the
// declaration of freq, the conditions guarding the error returns, the
// update of freq and the final return) - confirm against the full file.
96 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
97 table_offset_t offset;
// m_phrase_index is a table of table_offset_t entries indexed by the
// masked token; the entry is the item's offset inside m_phrase_content.
99 bool result = m_phrase_index.get_content
100 ((token & PHRASE_MASK)
101 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
104 return ERROR_OUT_OF_RANGE;
107 return ERROR_NO_ITEM;
// The frequency is stored after two guint8 fields at the start of the item.
109 result = m_phrase_content.get_content
110 (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
113 return ERROR_FILE_CORRUPTION;
115 //protect total_freq overflow
// A wrapped unsigned sum is smaller than the original value.
116 if ( delta > 0 && m_total_freq > m_total_freq + delta )
117 return ERROR_INTEGER_OVERFLOW;
120 m_total_freq += delta;
// Write the frequency back to the same position it was read from.
121 m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
// Looks up the phrase item for token and makes item refer to it.
// item.m_chunk is pointed at the bytes inside m_phrase_content via
// set_chunk(), so item addresses the stored data rather than a separate
// buffer.
// Visible error returns: ERROR_OUT_OF_RANGE, ERROR_NO_ITEM and
// ERROR_FILE_CORRUPTION.
// NOTE(review): the declaration of n_prons, the conditions guarding the
// error returns and the final return are not visible in this view.
126 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
127 table_offset_t offset;
128 guint8 phrase_length;
// Fetch the item's offset from the index table entry of the masked token.
131 bool result = m_phrase_index.get_content
132 ((token & PHRASE_MASK)
133 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
136 return ERROR_OUT_OF_RANGE;
139 return ERROR_NO_ITEM;
// First byte of the item: phrase length.
141 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
143 return ERROR_FILE_CORRUPTION;
// Second byte of the item: number of pronunciations.
145 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
147 return ERROR_FILE_CORRUPTION;
// Total item size: header + phrase text + n_prons records of
// (phrase_length keys + one guint32 frequency).
149 size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
150 item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
// Stores a copy of item's chunk in m_phrase_content, records its offset in
// the m_phrase_index entry of the masked token, and adds the item's unigram
// frequency to m_total_freq.
// NOTE(review): two original lines after the offset initialisation and the
// final return are missing from this view; the offset may be adjusted
// there - confirm against the full file.
154 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
// Start from the current end of the content buffer.
155 table_offset_t offset = m_phrase_content.size();
158 m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
159 m_phrase_index.set_content((token & PHRASE_MASK)
160 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
161 m_total_freq += item->get_unigram_frequency();
// Removes the phrase item for token from the index.
// A new PhraseItem is allocated with new, filled with a copy of the stored
// bytes and returned through item; the index entry of the token is then
// overwritten with 0 and the item's unigram frequency is subtracted from
// m_total_freq. The bytes in m_phrase_content are not touched by the
// visible code.
// NOTE(review): the caller presumably becomes responsible for deleting
// item - confirm. The declaration of old_item, the statement taken when
// get_phrase_item() fails and the final return are not visible here.
165 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
168 int result = get_phrase_item(token, old_item);
169 if (result != ERROR_OK)
172 item = new PhraseItem;
173 //implicitly copy data from m_chunk_content.
174 item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
// An index entry of 0 marks the token as having no item.
176 const table_offset_t zero_const = 0;
177 m_phrase_index.set_content((token & PHRASE_MASK)
178 * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
179 m_total_freq -= item->get_unigram_frequency();
// Loads the sub phrase index in slot phrase_index from chunk.
// A SubPhraseIndex is created with new, loaded from the whole chunk
// (offset 0 up to chunk->size()), and its total frequency is added to the
// facade's m_total_freq.
// NOTE(review): the condition guarding the allocation, the handling of a
// failed load and the return statement are not visible in this view.
183 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
// Reference to the slot, so the assignment below updates the table itself.
184 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
186 sub_phrases = new SubPhraseIndex;
189 bool retval = sub_phrases->load(chunk, 0, chunk->size());
192 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Serialises the sub phrase index in slot phrase_index into new_chunk,
// starting at offset 0, by delegating to SubPhraseIndex::store().
// NOTE(review): the declaration of end, any check of sub_phrases and the
// return statement are not visible in this view.
196 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
198 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
202 sub_phrases->store(new_chunk, 0, end);
// Unloads the sub phrase index in slot phrase_index; the visible code
// subtracts its total frequency from the facade's m_total_freq.
// NOTE(review): any check of sub_phrases, the release of the sub index and
// the return statement are not visible in this view.
206 bool FacadePhraseIndex::unload(guint8 phrase_index){
207 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
210 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
// Computes the difference between the sub phrase index in slot phrase_index
// and an older serialised version of it.
//   oldchunk - serialised older index; loaded into a temporary
//              SubPhraseIndex.
//   newlog   - receives the change log written by PhraseIndexLogger::store().
// NOTE(review): any check of sub_phrases and the return statement are not
// visible in this view; the result of old_sub_phrases.load() is ignored in
// the visible code.
216 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
217 MemoryChunk * newlog){
218 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
222 SubPhraseIndex old_sub_phrases;
223 old_sub_phrases.load(oldchunk, 0, oldchunk->size());
224 PhraseIndexLogger logger;
226 bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
227 logger.store(newlog);
// Applies a change log to the sub phrase index in slot phrase_index and
// returns the result of SubPhraseIndex::merge().
// NOTE(review): the statements that check sub_phrases and that load log
// into logger are not visible in this view.
231 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
232 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
236 PhraseIndexLogger logger;
239 return sub_phrases->merge(&logger);
// Attaches this sub phrase index to serialised data inside chunk.
// Layout read starting at offset: a guint32 total frequency, three
// table_offset_t values (index_one, index_two, index_three), then a
// c_separate byte. The bytes before index_two and before index_three must
// also be c_separate. m_phrase_index and m_phrase_content are pointed at
// the ranges inside the chunk's buffer via set_chunk().
// Returns FALSE through g_return_val_if_fail() when a check fails.
// NOTE(review): the original lines after the "save the memory chunk"
// comment and the final return are not visible in this view.
// NOTE(review): the index_three <= end check runs only after the offsets
// have already been used to read separator bytes and set up the chunks.
242 bool SubPhraseIndex::load(MemoryChunk * chunk,
243 table_offset_t offset, table_offset_t end){
244 //save the memory chunk
251 char * buf_begin = (char *)chunk->begin();
252 chunk->get_content(offset, &m_total_freq, sizeof(guint32));
253 offset += sizeof(guint32);
254 table_offset_t index_one, index_two, index_three;
255 chunk->get_content(offset, &index_one, sizeof(table_offset_t));
256 offset += sizeof(table_offset_t);
257 chunk->get_content(offset, &index_two, sizeof(table_offset_t));
258 offset += sizeof(table_offset_t);
259 chunk->get_content(offset, &index_three, sizeof(table_offset_t));
260 offset += sizeof(table_offset_t);
// Separator bytes delimit the offset table and the two data sections.
261 g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
262 g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
263 g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
// Each section ends one byte before the next start offset, which excludes
// the trailing separator.
264 m_phrase_index.set_chunk(buf_begin + index_one,
265 index_two - 1 - index_one, NULL);
266 m_phrase_content.set_chunk(buf_begin + index_two,
267 index_three - 1 - index_two, NULL);
268 g_return_val_if_fail( index_three <= end, FALSE);
// Serialises this sub phrase index into new_chunk starting at offset.
// Layout written:
//   guint32 m_total_freq
//   three table_offset_t slots (filled in below through index)
//   c_separate
//   phrase index data, c_separate
//   phrase content data, c_separate
// Slot 1 holds the start of the phrase index data, slot 2 the start of the
// phrase content data, slot 3 the offset just past the last separator.
// NOTE(review): the statements that set the out parameter end and return
// are not visible in this view.
272 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
273 table_offset_t offset, table_offset_t& end){
274 new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
// index tracks the next offset-table slot to fill.
275 table_offset_t index = offset + sizeof(guint32);
// Skip over the three slots; data starts after them.
277 offset = index + sizeof(table_offset_t) * 3 ;
278 new_chunk->set_content(offset, &c_separate, sizeof(char));
279 offset += sizeof(char);
// Slot 1: start of the phrase index data.
281 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
282 index += sizeof(table_offset_t);
283 new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
284 offset += m_phrase_index.size();
285 new_chunk->set_content(offset, &c_separate, sizeof(char));
286 offset += sizeof(char);
// Slot 2: start of the phrase content data.
288 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
289 index += sizeof(table_offset_t);
291 new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
292 offset += m_phrase_content.size();
293 new_chunk->set_content(offset, &c_separate, sizeof(char));
294 offset += sizeof(char);
// Slot 3: offset just past the final separator.
295 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
// Records the differences between oldone and this index in logger.
// Every token in the union of both token ranges is looked up in both
// indices; the visible branches append a LOG_MODIFY_RECORD,
// LOG_REMOVE_RECORD or LOG_ADD_RECORD carrying the old and/or new item
// chunk.
// NOTE(review): the lines testing oldretval, the statement taken when
// olditem == newitem, the closing braces and the return are not visible in
// this view.
299 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
300 PhraseIndexRange oldrange, currange, range;
301 oldone->get_range(oldrange); get_range(currange);
// Union of the two ranges: smallest begin, largest end.
302 range.m_range_begin = std_lite::min(oldrange.m_range_begin,
303 currange.m_range_begin);
304 range.m_range_end = std_lite::max(oldrange.m_range_end,
305 currange.m_range_end);
306 PhraseItem olditem, newitem;
308 for (phrase_token_t token = range.m_range_begin;
309 token < range.m_range_end; ++token ){
// A lookup counts as present only when get_phrase_item() returns ERROR_OK.
310 bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
311 bool newretval = ERROR_OK == get_phrase_item(token, newitem);
314 if ( newretval ) { /* compare phrase item. */
315 if ( olditem == newitem )
317 logger->append_record(LOG_MODIFY_RECORD, token,
318 &(olditem.m_chunk), &(newitem.m_chunk));
319 } else { /* remove phrase item. */
320 logger->append_record(LOG_REMOVE_RECORD, token,
321 &(olditem.m_chunk), NULL);
324 if ( newretval ){ /* add phrase item. */
325 logger->append_record(LOG_ADD_RECORD, token,
326 NULL, &(newitem.m_chunk));
327 } else { /* both empty. */
// Replays the records of logger against this index.
// Visible handling per record type:
//   LOG_ADD_RECORD    - asserts the old chunk is empty, points newitem at
//                       the new chunk and calls add_phrase_item().
//   LOG_REMOVE_RECORD - asserts the new chunk is empty, calls
//                       remove_phrase_item() and compares the removed item
//                       with the logged old item.
//   LOG_MODIFY_RECORD - fetches the stored item; when the new chunk is
//                       larger than the stored one the item is removed and
//                       re-added, otherwise the new bytes are moved over
//                       the stored bytes in place.
// NOTE(review): many original lines are missing from this view (the switch
// statement, the trailing arguments of set_chunk() and memmove(), the
// statement taken when the items differ, break statements and the return).
// remove_phrase_item() returns a PhraseItem allocated with new through
// tmpitem; whether it is deleted here is not visible - confirm.
336 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
337 LOG_TYPE log_type; phrase_token_t token;
338 MemoryChunk oldchunk, newchunk;
339 PhraseItem olditem, newitem, item, * tmpitem;
341 while(logger->has_next_record()){
342 logger->next_record(log_type, token, &oldchunk, &newchunk);
345 case LOG_ADD_RECORD:{
346 assert( 0 == oldchunk.size() );
347 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
349 add_phrase_item(token, &newitem);
352 case LOG_REMOVE_RECORD:{
353 assert( 0 == newchunk.size() );
355 remove_phrase_item(token, tmpitem);
357 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
359 if (olditem != *tmpitem)
365 case LOG_MODIFY_RECORD:{
366 get_phrase_item(token, item);
367 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
369 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
374 if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
376 remove_phrase_item(token, tmpitem);
377 assert(olditem == *tmpitem);
378 add_phrase_item(token, &newitem);
380 } else { /* in place editing. */
381 /* newchunk.size() <= item.m_chunk.size() */
382 /* Hack here: we assume the behaviour of get_phrase_item
383 * point to the actual data positon, so changes to item
384 * will be saved in SubPhraseIndex immediately.
386 memmove(item.m_chunk.begin(), newchunk.begin(),
// Builds the sub phrase index in slot phrase_index from a text file.
// Each record is read as four whitespace-separated fields: pinyin, phrase,
// token and freq. Consecutive records with the same token are collected
// into one PhraseItem as additional pronunciations; when the token changes
// the finished item is added under the previous token and a new item is
// started. After the loop the last item is added and the sub index's total
// frequency is added to the facade's m_total_freq.
// NOTE(review): the declarations of pinyin, phrase, freq and written, the
// remaining arguments of g_utf8_to_utf16(), the updates of cur_token, the
// release of finished items and the return are not visible in this view.
// NOTE(review): both "%s" conversions carry no field width, so a field
// longer than its buffer would overflow it; the fscanf() results are not
// checked in the visible code, and the loop is driven by feof() alone.
397 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
398 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
400 sub_phrases = new SubPhraseIndex;
405 phrase_token_t token;
407 PhraseItem * item_ptr = new PhraseItem;
// cur_token == 0 means no record has been seen yet.
408 phrase_token_t cur_token = 0;
409 while ( !feof(infile)){
410 fscanf(infile, "%s", pinyin);
411 fscanf(infile, "%s", phrase);
412 fscanf(infile, "%u", &token);
413 fscanf(infile, "%ld", &freq);
// The library index encoded in the token must match the slot being loaded.
417 assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
420 utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
// First record: initialise the item's phrase text.
423 if ( 0 == cur_token ){
425 item_ptr->set_phrase_string(written, phrase_utf16);
// Token changed: flush the finished item and start a new one.
428 if ( cur_token != token ){
429 add_phrase_item( cur_token, item_ptr);
431 item_ptr = new PhraseItem;
433 item_ptr->set_phrase_string(written, phrase_utf16);
436 PinyinDefaultParser parser;
437 NullPinyinValidator validator;
438 PinyinKeyVector keys;
439 PinyinKeyPosVector poses;
441 keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
442 poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
443 parser.parse(validator, keys, poses, pinyin);
// The parsed key count must equal the phrase length.
445 assert ( item_ptr->get_phrase_length() == keys->len );
446 item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
448 g_array_free(keys, TRUE);
449 g_array_free(poses, TRUE);
450 g_free(phrase_utf16);
// Flush the last item collected by the loop.
453 add_phrase_item( cur_token, item_ptr);
455 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
// Reports the token range of the sub phrase index in slot phrase_index.
// The range obtained from SubPhraseIndex::get_range() is converted with
// PHRASE_INDEX_MAKE_TOKEN(phrase_index, ...) on both bounds.
// Returns ERROR_NO_SUB_PHRASE_INDEX on the visible error path.
// NOTE(review): the condition guarding that return, the handling of result
// and the final return are not visible in this view.
459 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
460 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
462 return ERROR_NO_SUB_PHRASE_INDEX;
464 int result = sub_phrase->get_range(range);
468 range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
469 range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
// Reports the token range covered by this sub index.
// m_range_begin is fixed at 1; m_range_end is the number of table_offset_t
// entries in m_phrase_index (pointer difference end - begin).
// NOTE(review): the return statement is not visible in this view.
473 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
474 const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
475 const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
477 range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
478 range.m_range_end = end - begin;
483 bool FacadePhraseIndex::compat(){
484 for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
485 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
489 SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
490 PhraseIndexRange range;
491 int result = sub_phrase->get_range(range);
492 if ( result != ERROR_OK ) {
493 delete new_sub_phrase;
498 for ( phrase_token_t token = range.m_range_begin;
499 token < range.m_range_end;
501 result = sub_phrase->get_phrase_item(token, item);
502 if ( result != ERROR_OK )
504 new_sub_phrase->add_phrase_item(token, &item);
508 m_sub_phrase_indices[index] = new_sub_phrase;