3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #include "phrase_index.h"
23 #include "pinyin_custom2.h"
25 using namespace pinyin;
/* Record how many pronunciations this phrase item holds.
 * The count is written as a single guint8 at byte offset sizeof(guint8)
 * of m_chunk, i.e. directly after the byte that set_phrase_string() uses
 * for the phrase length.
 * NOTE(review): the embedded numbering jumps from 28 to 32, so the return
 * statement and closing brace of this function are not visible here. */
27 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
28 m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
/* Read the index-th pronunciation record of this phrase item.
 *
 * index: zero-based record number; no range check is visible here.
 * keys:  receives phrase_length ChewingKey values, so the caller must
 *        supply room for at least that many.
 * freq:  receives the guint32 stored behind the keys of that record.
 *        NOTE(review): the line declaring this parameter (embedded
 *        line 33) is missing from this listing; it is only seen as
 *        "&freq" below.
 *
 * Returns the result of the final m_chunk.get_content() call. */
32 bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
34 guint8 phrase_length = get_phrase_length();
// Records follow the item header and the phrase characters; each record
// is phrase_length ChewingKeys plus one guint32.
35 table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
37 bool retval = m_chunk.get_content
38 (offset, keys, phrase_length * sizeof(ChewingKey));
// NOTE(review): embedded lines 39-40 are not visible; presumably they
// bail out when retval is false -- confirm.
41 return m_chunk.get_content
42 (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
/* Append one pronunciation record (keys followed by freq) to the end of
 * this phrase item.
 *
 * keys: phrase_length ChewingKey values are copied from here.
 * freq: guint32 stored right behind the copied keys.
 *
 * NOTE(review): the stored count is a guint8 (see set_n_pronunciation),
 * so incrementing a count of 255 would wrap; nothing here guards that. */
45 void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
46 guint8 phrase_length = get_phrase_length();
// Bump the stored count first, then grow the chunk.
47 set_n_pronunciation(get_n_pronunciation() + 1);
// m_chunk.size() is re-read for each call, so freq lands after the keys.
48 m_chunk.set_content(m_chunk.size(), keys,
49 phrase_length * sizeof(ChewingKey));
50 m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
/* Delete the index-th pronunciation record from this phrase item.
 *
 * index: zero-based record number.
 *
 * NOTE(review): the count is decremented before the removal and no check
 * that index is below the current count is visible; with a count of 0 the
 * guint8 count would wrap. Confirm callers validate index. */
53 void PhraseItem::remove_nth_pronunciation(size_t index){
54 guint8 phrase_length = get_phrase_length();
55 set_n_pronunciation(get_n_pronunciation() - 1);
// Byte offset of the record: header, phrase characters, then index
// whole records of (phrase_length keys + one guint32).
56 size_t offset = phrase_item_header + phrase_length * sizeof ( ucs4_t ) +
57 index * (phrase_length * sizeof (ChewingKey) + sizeof(guint32));
// Cut exactly one record out of the chunk.
58 m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
/* Copy the phrase characters of this item into phrase.
 *
 * phrase: destination buffer; phrase_length * sizeof(ucs4_t) bytes are
 *         copied, so it must hold at least get_phrase_length() elements.
 *         No terminator is written by the visible code.
 *
 * Returns the result of m_chunk.get_content(). */
61 bool PhraseItem::get_phrase_string(ucs4_t * phrase){
62 guint8 phrase_length = get_phrase_length();
63 return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
/* Store the phrase length and the phrase characters in this item.
 *
 * phrase_length: number of ucs4_t characters; written as one guint8 at
 *                byte offset 0 of m_chunk.
 * phrase:        phrase_length characters, written at phrase_item_header.
 *
 * NOTE(review): embedded lines 69-71 (return statement, closing brace)
 * are not visible in this listing. */
66 bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
67 m_chunk.set_content(0, &phrase_length, sizeof(guint8));
68 m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
/* Walk all pronunciation records of this item and act on those whose keys
 * match under pinyin_compare_with_ambiguities2().
 *
 * options: pinyin options; the call that consumes them is only partly
 *          visible.
 * NOTE(review): the remaining parameter lines (embedded 73-74) and lines
 * 85, 87 and 91-98 are missing from this listing. "delta" is used below
 * but its declaration, and what happens to *freq and total_freq after the
 * overflow guard, cannot be seen here. */
72 void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
75 guint8 phrase_length = get_phrase_length();
76 guint8 npron = get_n_pronunciation();
// Offset of the first pronunciation record inside the chunk.
77 size_t offset = phrase_item_header + phrase_length * sizeof ( ucs4_t );
// The records are accessed in place through raw pointers into m_chunk.
78 char * buf_begin = (char *) m_chunk.begin();
79 guint32 total_freq = 0;
80 for ( int i = 0 ; i < npron ; ++i){
// Start of the i-th record (its ChewingKey array).
81 char * chewing_begin = buf_begin + offset +
82 i * ( phrase_length * sizeof(ChewingKey) + sizeof(guint32) );
// The guint32 frequency sits right behind the keys of that record.
83 guint32 * freq = (guint32 *)(chewing_begin +
84 phrase_length * sizeof(ChewingKey));
// A result of 0 from the comparison selects this record.
86 if ( 0 == pinyin_compare_with_ambiguities2
88 (ChewingKey *)chewing_begin, phrase_length) ){
89 //protect against total_freq overflow.
// A positive delta that makes the sum compare below total_freq means the
// addition wrapped around.
90 if ( delta > 0 && total_freq > total_freq + delta )
/* Accessor for the total frequency of this sub phrase index.
 * NOTE(review): the body (embedded lines 100-102) is not visible in this
 * listing; presumably it returns m_total_freq -- confirm. */
99 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
/* Add delta to the unigram frequency of the phrase identified by token
 * and to the running total m_total_freq.
 *
 * token: phrase token; only (token & PHRASE_MASK) selects the index slot.
 * delta: amount to add.
 *
 * Returns one of the ERROR_* codes seen below.
 * NOTE(review): the conditions guarding each error return, the
 * declaration of "freq" and the update of "freq" itself (embedded lines
 * 105, 109-110, 112-113, 118-119, 125-126, 129-132) are missing from this
 * listing. */
103 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
104 table_offset_t offset;
// Look up where the phrase item lives inside m_phrase_content.
106 bool result = m_phrase_index.get_content
107 ((token & PHRASE_MASK)
108 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
// presumably taken when the index lookup above fails -- confirm.
111 return ERROR_OUT_OF_RANGE;
// presumably taken when the slot holds offset 0 (remove_phrase_item
// writes 0 into a slot) -- confirm.
114 return ERROR_NO_ITEM;
// The frequency is read from behind two leading guint8 bytes of the item.
116 result = m_phrase_content.get_content
117 (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
120 return ERROR_FILE_CORRUPTION;
122 //protect total_freq overflow
// Both operands are guint32, so a wrapped sum compares below m_total_freq.
123 if ( delta > 0 && m_total_freq > m_total_freq + delta )
124 return ERROR_INTEGER_OVERFLOW;
127 m_total_freq += delta;
// Write the frequency back to the same position it was read from.
128 m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
/* Locate the phrase item for token and make item refer to it.
 *
 * token: phrase token; only (token & PHRASE_MASK) selects the index slot.
 * item:  on success its m_chunk is pointed at the bytes inside
 *        m_phrase_content (no copy is made), so writes through item
 *        modify this index and the view depends on m_phrase_content's
 *        buffer staying where it is.
 *
 * Returns one of the ERROR_* codes seen below.
 * NOTE(review): the guard conditions for the error returns, the
 * declaration of "n_prons" and the final return (embedded lines 136-137,
 * 141-142, 144-145, 149, 153, 158-160) are missing from this listing. */
133 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
134 table_offset_t offset;
135 guint8 phrase_length;
// Look up where the phrase item lives inside m_phrase_content.
138 bool result = m_phrase_index.get_content
139 ((token & PHRASE_MASK)
140 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
143 return ERROR_OUT_OF_RANGE;
146 return ERROR_NO_ITEM;
// First byte of the item: phrase length.
148 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
150 return ERROR_FILE_CORRUPTION;
// Second byte of the item: number of pronunciations.
152 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
154 return ERROR_FILE_CORRUPTION;
// Total item size: header, phrase characters, then n_prons records of
// (phrase_length keys + one guint32).
156 size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
// NOTE(review): the meaning of the NULL third argument of set_chunk is not
// shown here; presumably it is a "no free function" marker -- confirm.
157 item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
/* Append a phrase item to this index and register it under token.
 *
 * token: phrase token; only (token & PHRASE_MASK) selects the index slot.
 * item:  its chunk bytes are copied to the end of m_phrase_content.
 *
 * NOTE(review): embedded lines 163-164 and 169-171 are missing from this
 * listing, so any adjustment of "offset" before use and the returned
 * value cannot be seen. No overflow guard on m_total_freq is visible. */
161 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
// New items go to the current end of the content area.
162 table_offset_t offset = m_phrase_content.size();
165 m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
// Point the token's index slot at the freshly written item.
166 m_phrase_index.set_content((token & PHRASE_MASK)
167 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
168 m_total_freq += item->get_unigram_frequency();
/* Unregister the phrase item for token and hand a copy of it to the
 * caller.
 *
 * token: phrase token; only (token & PHRASE_MASK) selects the index slot.
 * item:  set to a PhraseItem allocated with new that holds a copy of the
 *        removed bytes. No matching delete is visible in this function,
 *        so presumably the caller owns it -- confirm.
 *
 * The item's bytes are not erased from m_phrase_content by the visible
 * code; only the index slot is zeroed.
 * NOTE(review): embedded lines 173-174, 177-178, 182 and 187-189 are
 * missing (declaration of old_item, the action on failure, the return). */
172 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
175 int result = get_phrase_item(token, old_item);
176 if (result != ERROR_OK)
179 item = new PhraseItem;
180 //implicitly copy data from m_chunk_content.
181 item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
// Overwrite the token's slot with offset 0.
183 const table_offset_t zero_const = 0;
184 m_phrase_index.set_content((token & PHRASE_MASK)
185 * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
186 m_total_freq -= item->get_unigram_frequency();
/* Load a binary sub phrase index from chunk into slot phrase_index.
 *
 * phrase_index: slot in m_sub_phrase_indices.
 * chunk:        source data, parsed from offset 0 to chunk->size().
 *
 * The facade total m_total_freq is kept in sync by subtracting the sub
 * index total before the load and adding the new total afterwards.
 * NOTE(review): embedded lines 192, 194-195, 198-199 and 201-203 are
 * missing; presumably the allocation is guarded by a null check and a
 * failed load returns early -- confirm. */
190 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
// Reference to the slot, so the assignment below updates the table.
191 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
193 sub_phrases = new SubPhraseIndex;
196 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
197 bool retval = sub_phrases->load(chunk, 0, chunk->size());
200 m_total_freq += sub_phrases->get_phrase_index_total_freq();
/* Serialize the sub phrase index in slot phrase_index into new_chunk,
 * starting at offset 0.
 * NOTE(review): embedded lines 205, 207-209 and 211-213 are missing; the
 * declaration of "end", any null check on the slot and the return value
 * cannot be seen here. */
204 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
206 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
210 sub_phrases->store(new_chunk, 0, end);
/* Drop the sub phrase index in slot phrase_index from the facade.
 * The visible code removes the sub index total from m_total_freq.
 * NOTE(review): embedded lines 216-217 and 219-223 are missing;
 * presumably they null-check the slot, delete the sub index and reset the
 * slot -- confirm. */
214 bool FacadePhraseIndex::unload(guint8 phrase_index){
215 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
218 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
/* Produce a change log that turns the index stored in oldchunk into the
 * sub phrase index currently held in slot phrase_index.
 *
 * oldchunk: serialized older version of the sub phrase index.
 * newlog:   receives the serialized log records.
 *
 * NOTE(review): the result of old_sub_phrases.load() is discarded, so a
 * malformed oldchunk is diffed anyway. Embedded lines 227-229, 233 and
 * 236-238 are missing (any null check on the slot, the return). */
224 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
225 MemoryChunk * newlog){
226 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
// Temporary index that parses the old data.
230 SubPhraseIndex old_sub_phrases;
231 old_sub_phrases.load(oldchunk, 0, oldchunk->size());
232 PhraseIndexLogger logger;
234 bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
235 logger.store(newlog);
/* Apply a change log to the sub phrase index in slot phrase_index.
 *
 * log: serialized log records.
 *
 * m_total_freq is kept in sync by subtracting the sub index total before
 * the merge and adding the new total afterwards.
 * NOTE(review): embedded lines 241-243, 246-247 and 250-253 are missing;
 * presumably the logger is loaded from "log" before the merge -- confirm. */
239 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
240 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
244 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
245 PhraseIndexLogger logger;
248 bool retval = sub_phrases->merge(&logger);
249 m_total_freq += sub_phrases->get_phrase_index_total_freq();
/* Parse a serialized sub phrase index out of chunk.
 *
 * chunk:  source data.
 * offset: byte position where the serialized index starts.
 * end:    upper bound that the third stored offset must not exceed.
 *
 * Layout read here (mirrors what SubPhraseIndex::store writes): a guint32
 * total frequency, three table_offset_t values, a c_separate byte, the
 * phrase index table, a c_separate byte, the phrase content, a c_separate
 * byte. m_phrase_index and m_phrase_content are set up as views into the
 * buffer of chunk rather than copies.
 *
 * NOTE(review): "index_three <= end" is checked only after
 * buf_begin + index_two - 1 and buf_begin + index_three - 1 have been
 * dereferenced, and the results of chunk->get_content() are ignored; for
 * an untrusted or truncated file the reads may go out of bounds. Embedded
 * lines 257-262 and 281-283 are missing from this listing. */
254 bool SubPhraseIndex::load(MemoryChunk * chunk,
255 table_offset_t offset, table_offset_t end){
256 //save the memory chunk
263 char * buf_begin = (char *)chunk->begin();
// Header: total frequency followed by three stored offsets.
264 chunk->get_content(offset, &m_total_freq, sizeof(guint32));
265 offset += sizeof(guint32);
266 table_offset_t index_one, index_two, index_three;
267 chunk->get_content(offset, &index_one, sizeof(table_offset_t));
268 offset += sizeof(table_offset_t);
269 chunk->get_content(offset, &index_two, sizeof(table_offset_t));
270 offset += sizeof(table_offset_t);
271 chunk->get_content(offset, &index_three, sizeof(table_offset_t));
272 offset += sizeof(table_offset_t);
// Each section is expected to be preceded by a separator byte.
273 g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
274 g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
275 g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
// Section sizes exclude the one-byte separator that follows them.
276 m_phrase_index.set_chunk(buf_begin + index_one,
277 index_two - 1 - index_one, NULL);
278 m_phrase_content.set_chunk(buf_begin + index_two,
279 index_three - 1 - index_two, NULL);
280 g_return_val_if_fail( index_three <= end, FALSE);
/* Serialize this sub phrase index into new_chunk.
 *
 * new_chunk: destination.
 * offset:    byte position where writing starts.
 * end:       reference parameter.
 *            NOTE(review): no visible line assigns to it; embedded lines
 *            308-310 are missing, so confirm whether the final offset is
 *            reported through it.
 *
 * Layout written: guint32 m_total_freq, three table_offset_t slots, a
 * c_separate byte, the phrase index table, a c_separate byte, the phrase
 * content, a c_separate byte. The three slots are filled in as the
 * position of each section becomes known. */
284 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
285 table_offset_t offset, table_offset_t& end){
286 new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
// "index" walks over the three offset slots behind the total frequency.
287 table_offset_t index = offset + sizeof(guint32);
// Skip the three slots and emit the first separator.
289 offset = index + sizeof(table_offset_t) * 3 ;
290 new_chunk->set_content(offset, &c_separate, sizeof(char));
291 offset += sizeof(char);
// Slot 1: start of the phrase index table.
293 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
294 index += sizeof(table_offset_t);
295 new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
296 offset += m_phrase_index.size();
297 new_chunk->set_content(offset, &c_separate, sizeof(char));
298 offset += sizeof(char);
// Slot 2: start of the phrase content.
300 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
301 index += sizeof(table_offset_t);
303 new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
304 offset += m_phrase_content.size();
305 new_chunk->set_content(offset, &c_separate, sizeof(char));
306 offset += sizeof(char);
// Slot 3: position just past the final separator.
307 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
/* Record in logger the changes that turn oldone into this index.
 *
 * oldone: the older sub phrase index to compare against.
 * logger: receives one LOG_MODIFY_HEADER record for the total frequency
 *         and then add / remove / modify records per token.
 *
 * NOTE(review): the branch structure around the per-token records
 * (embedded lines 329, 334-335, 338, 344-345, 350-357) is missing from
 * this listing; the comments below describe only what the visible lines
 * do. */
311 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
312 /* diff the header */
313 MemoryChunk oldheader, newheader;
314 guint32 total_freq = oldone->get_phrase_index_total_freq();
315 oldheader.set_content(0, &total_freq, sizeof(guint32));
316 total_freq = get_phrase_index_total_freq();
317 newheader.set_content(0, &total_freq, sizeof(guint32));
// The header record is appended unconditionally by the visible code.
318 logger->append_record(LOG_MODIFY_HEADER, null_token,
319 &oldheader, &newheader);
321 /* diff phrase items */
322 PhraseIndexRange oldrange, currange, range;
323 oldone->get_range(oldrange); get_range(currange);
// Scan the union of both token ranges.
324 range.m_range_begin = std_lite::min(oldrange.m_range_begin,
325 currange.m_range_begin);
326 range.m_range_end = std_lite::max(oldrange.m_range_end,
327 currange.m_range_end);
328 PhraseItem olditem, newitem;
330 for (phrase_token_t token = range.m_range_begin;
331 token < range.m_range_end; ++token ){
// A token counts as present when the lookup yields ERROR_OK.
332 bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
333 bool newretval = ERROR_OK == get_phrase_item(token, newitem);
336 if ( newretval ) { /* compare phrase item. */
337 if ( olditem == newitem )
339 logger->append_record(LOG_MODIFY_RECORD, token,
340 &(olditem.m_chunk), &(newitem.m_chunk));
341 } else { /* remove phrase item. */
342 logger->append_record(LOG_REMOVE_RECORD, token,
343 &(olditem.m_chunk), NULL);
346 if ( newretval ){ /* add phrase item. */
347 logger->append_record(LOG_ADD_RECORD, token,
348 NULL, &(newitem.m_chunk));
349 } else { /* both empty. */
/* Replay the records held by logger against this index.
 *
 * logger: source of (log_type, token, oldchunk, newchunk) records.
 *
 * Per record type, as far as the visible lines show:
 *  - LOG_ADD_RECORD: asserts the old chunk is empty and adds the new item.
 *  - LOG_REMOVE_RECORD: asserts the new chunk is empty, removes the item
 *    and compares the removed item with the logged old item.
 *  - LOG_MODIFY_RECORD: when the new data is larger than the stored item
 *    the item is removed and re-added, otherwise the new bytes are moved
 *    over the stored item in place.
 *  - LOG_MODIFY_HEADER: compares the logged old total frequency with the
 *    current one and then takes the logged new total as m_total_freq.
 *
 * NOTE(review): many lines are missing from this listing (embedded 362,
 * 366-370, 374, 376-377, 380, 382, 384, 386-390, 394, 396-399, 401, 405,
 * 411, 413-416, 424, 427-435), including the switch statement, the
 * reactions to mismatches, the release of tmpitem and the return value.
 * The return values of add_phrase_item, remove_phrase_item and
 * get_phrase_item are not checked in the visible calls. */
358 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
359 LOG_TYPE log_type; phrase_token_t token;
360 MemoryChunk oldchunk, newchunk;
361 PhraseItem olditem, newitem, item, * tmpitem;
363 while(logger->has_next_record()){
364 bool retval = logger->next_record
365 (log_type, token, &oldchunk, &newchunk);
371 case LOG_ADD_RECORD:{
372 assert( 0 == oldchunk.size() );
// newitem becomes a view on the logged bytes.
373 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
375 add_phrase_item(token, &newitem);
378 case LOG_REMOVE_RECORD:{
379 assert( 0 == newchunk.size() );
// remove_phrase_item allocates tmpitem with new.
381 remove_phrase_item(token, tmpitem);
383 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
385 if (olditem != *tmpitem)
391 case LOG_MODIFY_RECORD:{
392 get_phrase_item(token, item);
393 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
395 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
400 if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
402 remove_phrase_item(token, tmpitem);
403 assert(olditem == *tmpitem);
404 add_phrase_item(token, &newitem);
406 } else { /* in place editing. */
407 /* newchunk.size() <= item.m_chunk.size() */
408 /* Hack here: we assume the behaviour of get_phrase_item
409 * point to the actual data positon, so changes to item
410 * will be saved in SubPhraseIndex immediately.
412 memmove(item.m_chunk.begin(), newchunk.begin(),
417 case LOG_MODIFY_HEADER:{
418 guint32 total_freq = get_phrase_index_total_freq();
419 guint32 tmp_freq = 0;
420 assert(null_token == token);
421 assert(oldchunk.size() == newchunk.size());
422 oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
423 if (total_freq != tmp_freq)
425 newchunk.get_content(0, &tmp_freq, sizeof(guint32));
426 m_total_freq = tmp_freq;
/* Build the sub phrase index in slot phrase_index from a text table.
 *
 * phrase_index: slot in m_sub_phrase_indices.
 * infile:       input stream; each entry is read as four whitespace
 *               separated fields: pinyin, phrase, token, frequency.
 *
 * Consecutive entries sharing a token are gathered into one PhraseItem,
 * each contributing one pronunciation; the item is added when the token
 * changes and once more after the loop.
 *
 * NOTE(review): the buffer declarations and several statements (embedded
 * lines 438, 440-443, 445, 453-455, 457-458, 460-461, 463, 465-466, 469,
 * 471, 473-474, 480, 482, 485, 487-489, 492-494, 496-497, 499-502) are
 * missing from this listing. From the visible lines:
 *  - fscanf "%s" has no field width, so an over-long token in the input
 *    can overrun the pinyin / phrase buffers;
 *  - no fscanf return value is checked and the loop is driven by feof(),
 *    which presumably needs an end-of-file check in the missing lines to
 *    avoid processing a stale last entry -- confirm;
 *  - "%ld" requires freq to be a long; its declaration is not visible;
 *  - the release of phrase_ucs4 and item_ptr is not visible. */
436 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
437 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
439 sub_phrases = new SubPhraseIndex;
444 phrase_token_t token;
446 PhraseItem * item_ptr = new PhraseItem;
// cur_token == 0 marks that no entry has been seen yet.
447 phrase_token_t cur_token = 0;
448 while ( !feof(infile)){
449 fscanf(infile, "%s", pinyin);
450 fscanf(infile, "%s", phrase);
451 fscanf(infile, "%u", &token);
452 fscanf(infile, "%ld", &freq);
// Every token in the file must belong to the library being loaded.
456 assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
459 ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL,
// First entry: start filling the initial item.
462 if ( 0 == cur_token ){
464 item_ptr->set_phrase_string(written, phrase_ucs4);
// Token changed: flush the gathered item and start a new one.
467 if ( cur_token != token ){
468 add_phrase_item( cur_token, item_ptr);
470 item_ptr = new PhraseItem;
472 item_ptr->set_phrase_string(written, phrase_ucs4);
// Parse the pinyin text into chewing keys.
475 pinyin_option_t options = USE_TONE;
476 FullPinyinParser2 parser;
477 ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
478 ChewingKeyRestVector key_rests =
479 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
481 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
// Only accept the pronunciation when it has one key per phrase character.
483 if (item_ptr->get_phrase_length() == keys->len) {
484 item_ptr->append_pronunciation((ChewingKey *)keys->data, freq);
486 fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
490 g_array_free(keys, TRUE);
491 g_array_free(key_rests, TRUE);
// Flush the item gathered for the last token.
495 add_phrase_item( cur_token, item_ptr);
498 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
/* Report the lowest and highest slot numbers that hold a sub phrase
 * index.
 *
 * min_index: set to the smallest occupied slot; stays at
 *            PHRASE_INDEX_LIBRARY_COUNT when no slot is occupied.
 * max_index: set to the largest occupied slot; stays at 0 when no slot is
 *            occupied.
 *            NOTE(review): the line declaring max_index (embedded 504)
 *            and the return statement (510-514) are missing from this
 *            listing. */
503 int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
// Start from an empty range and widen it for every occupied slot.
505 min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
506 for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
507 if ( m_sub_phrase_indices[i] ) {
508 min_index = std_lite::min(min_index, i);
509 max_index = std_lite::max(max_index, i);
/* Report the token range of the sub phrase index in slot phrase_index,
 * expressed as full tokens.
 *
 * phrase_index: slot in m_sub_phrase_indices.
 * range:        receives the range from SubPhraseIndex::get_range with
 *               both ends passed through PHRASE_INDEX_MAKE_TOKEN.
 *
 * Returns ERROR_NO_SUB_PHRASE_INDEX on the path shown below.
 * NOTE(review): the guard for that return and the handling of "result"
 * (embedded lines 517, 519, 521-523, 526-528) are missing from this
 * listing; presumably the guard is a null check on sub_phrase -- confirm. */
515 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
516 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
518 return ERROR_NO_SUB_PHRASE_INDEX;
520 int result = sub_phrase->get_range(range);
// Combine the library index with the range ends to form full tokens.
524 range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
525 range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
/* Report the token range covered by this sub phrase index.
 *
 * range: m_range_begin is set to 1 and m_range_end to the number of
 *        table_offset_t slots held by m_phrase_index.
 *
 * NOTE(review): embedded lines 532 and 535-538 are missing, so any
 * emptiness check and the returned code cannot be seen here. */
529 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
530 const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
531 const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
533 range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
// Pointer difference gives the slot count.
534 range.m_range_end = end - begin;
/* Rebuild every sub phrase index by copying its reachable phrase items
 * into a fresh SubPhraseIndex and installing that in the slot.
 *
 * NOTE(review): embedded lines 542-544, 550-553, 556, 559, 561-563 and
 * 565-569 are missing from this listing. In particular the release of the
 * replaced sub_phrase, the declaration of "item", the loop increment and
 * the return value are not visible -- confirm the old index is freed. */
539 bool FacadePhraseIndex::compact(){
540 for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
541 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
545 SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
546 PhraseIndexRange range;
547 int result = sub_phrase->get_range(range);
// Without a usable range the replacement is discarded again.
548 if ( result != ERROR_OK ) {
549 delete new_sub_phrase;
// Copy over every token that still resolves to an item.
554 for ( phrase_token_t token = range.m_range_begin;
555 token < range.m_range_end;
557 result = sub_phrase->get_phrase_item(token, item);
558 if ( result != ERROR_OK )
560 new_sub_phrase->add_phrase_item(token, &item);
564 m_sub_phrase_indices[index] = new_sub_phrase;
/* Per-library file table with PHRASE_INDEX_LIBRARY_COUNT entries, indexed
 * by phrase library number. The visible initializer has 16 rows: rows 1
 * and 2 name the gb_char / gbk_char files as SYSTEM_FILE, row 15 names
 * "user.bin" as USER_FILE, all others are NOT_USED.
 * NOTE(review): the field names of pinyin_table_info_t are not visible
 * here; judging by the file suffixes the three strings are presumably the
 * text table, the binary index and the binary diff file -- confirm. The
 * opening and closing brace lines of the initializer are missing from
 * this listing. */
570 const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT] =
572 {NULL, NULL, NULL, NOT_USED},
573 {"gb_char.table", "gb_char.bin", "gb_char.dbin", SYSTEM_FILE},
574 {"gbk_char.table", "gbk_char.bin", "gbk_char.dbin", SYSTEM_FILE},
575 {NULL, NULL, NULL, NOT_USED},
576 {NULL, NULL, NULL, NOT_USED},
578 {NULL, NULL, NULL, NOT_USED},
579 {NULL, NULL, NULL, NOT_USED},
580 {NULL, NULL, NULL, NOT_USED},
581 {NULL, NULL, NULL, NOT_USED},
582 {NULL, NULL, NULL, NOT_USED},
584 {NULL, NULL, NULL, NOT_USED},
585 {NULL, NULL, NULL, NOT_USED},
586 {NULL, NULL, NULL, NOT_USED},
587 {NULL, NULL, NULL, NOT_USED},
588 {NULL, NULL, NULL, NOT_USED},
// Last row: only the third string is set.
590 {NULL, NULL, "user.bin", USER_FILE}
/* Scan logger for its LOG_MODIFY_HEADER record and extract the old total
 * frequency stored in it.
 *
 * logger:         log to scan; it is iterated with has_next_record /
 *                 next_record, so its read position is advanced.
 * old_total_freq: receives the guint32 at offset 0 of the header
 *                 record's old chunk.
 *
 * Returns true only when exactly one header record was counted.
 * NOTE(review): embedded lines 596-597, 601, 605-608, 610-613, 615-616
 * and 619 are missing; the increment of header_count and the handling of
 * a failed next_record() cannot be seen here. */
594 static bool _peek_header(PhraseIndexLogger * logger,
595 guint32 & old_total_freq){
598 size_t header_count = 0;
599 LOG_TYPE log_type; phrase_token_t token;
600 MemoryChunk oldchunk, newchunk;
602 while (logger->has_next_record()) {
603 bool retval = logger->next_record
604 (log_type, token, &oldchunk, &newchunk);
// Records of any other type are of no interest to this pass.
609 if (LOG_MODIFY_HEADER != log_type)
614 oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
617 /* 1 for normal case, 0 for corrupted file. */
618 assert(1 >= header_count);
620 return 1 == header_count? true : false;
/* Walk the records of logger and adjust new_total_freq by the unigram
 * frequencies of the items that the records add, remove or modify.
 *
 * logger:         log to scan; its read position is advanced.
 * value:          compared with (token & mask) for every record.
 * new_total_freq: running total, updated in place: an added item's
 *                 frequency is added, a removed item's is subtracted, a
 *                 modified item contributes minus old plus new.
 *
 * NOTE(review): the line declaring "mask" (embedded 624) and lines 627,
 * 631, 635-638, 640-641, 643-645 and others up to 677 are missing, so it
 * cannot be seen here whether records matching the mask are the ones
 * processed or the ones skipped, nor what is returned. Unlike the sibling
 * helpers _peek_header, _write_header and _mask_out_records this function
 * is not declared static. */
623 bool _compute_new_header(PhraseIndexLogger * logger,
625 phrase_token_t value,
626 guint32 & new_total_freq) {
628 LOG_TYPE log_type; phrase_token_t token;
629 MemoryChunk oldchunk, newchunk;
630 PhraseItem olditem, newitem;
632 while(logger->has_next_record()) {
633 bool retval = logger->next_record
634 (log_type, token, &oldchunk, &newchunk);
639 if (LOG_MODIFY_HEADER == log_type)
642 if ((token & mask) == value)
646 case LOG_ADD_RECORD:{
647 assert( 0 == oldchunk.size() );
// The items are views on the logged bytes, used only to read frequencies.
648 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
650 new_total_freq += newitem.get_unigram_frequency();
653 case LOG_REMOVE_RECORD:{
654 assert( 0 == newchunk.size() );
655 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
657 new_total_freq -= olditem.get_unigram_frequency();
660 case LOG_MODIFY_RECORD:{
661 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
663 new_total_freq -= olditem.get_unigram_frequency();
665 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
667 new_total_freq += newitem.get_unigram_frequency();
/* Append a LOG_MODIFY_HEADER record to logger that carries the old and
 * the new total frequency.
 *
 * logger:         log that receives the record.
 * old_total_freq: stored as a guint32 at offset 0 of the old chunk.
 * new_total_freq: stored as a guint32 at offset 0 of the new chunk.
 *
 * The record is tagged with null_token, which is what
 * SubPhraseIndex::merge asserts for header records.
 * NOTE(review): embedded lines 686-688 (return statement, closing brace)
 * are not visible in this listing. */
678 static bool _write_header(PhraseIndexLogger * logger,
679 guint32 & old_total_freq,
680 guint32 & new_total_freq) {
681 MemoryChunk oldheader, newheader;
682 oldheader.set_content(0, &old_total_freq, sizeof(guint32));
683 newheader.set_content(0, &new_total_freq, sizeof(guint32));
684 logger->append_record(LOG_MODIFY_HEADER, null_token,
685 &oldheader, &newheader);
/* Copy records from oldlogger to newlogger, filtering on the token.
 *
 * oldlogger: source log; its read position is advanced.
 * value:     compared with (token & mask) for every record.
 * newlogger: receives the records that pass the filter.
 *
 * NOTE(review): the line declaring "mask" (embedded 690) and lines 695,
 * 699-702, 704-705, 707-708 and 710-714 are missing. Presumably header
 * records and records with (token & mask) == value are skipped and all
 * others copied, as the name suggests -- confirm against the full file. */
689 static bool _mask_out_records(PhraseIndexLogger * oldlogger,
691 phrase_token_t value,
692 PhraseIndexLogger * newlogger) {
693 LOG_TYPE log_type; phrase_token_t token;
694 MemoryChunk oldchunk, newchunk;
696 while(oldlogger->has_next_record()) {
697 bool retval = oldlogger->next_record
698 (log_type, token, &oldchunk, &newchunk);
703 if (LOG_MODIFY_HEADER == log_type)
706 if ((token & mask) == value)
709 newlogger->append_record(log_type, token, &oldchunk, &newchunk);
/* Build a new log from oldlogger with the records selected by mask and
 * value filtered out and the header record recomputed.
 *
 * oldlogger: source log; it is scanned by each of the three passes below.
 * mask:      bit mask applied to each record's token.
 * value:     compared with the masked token.
 *
 * Steps: read the old total from the header, derive the new total from
 * the records, write a fresh header into the new log, then copy the
 * records that pass the filter.
 *
 * NOTE(review): newlogger is allocated with new before the first check;
 * the statements taken when _peek_header or _compute_new_header fail
 * (embedded lines 723-724, 730-731) are missing, so confirm it is not
 * leaked on those paths. Embedded lines 728 and 736 are missing as well;
 * presumably they reset the read position of oldlogger between passes --
 * confirm. The final return (738 onward) is not visible either. */
715 PhraseIndexLogger * mask_out_phrase_index_logger
716 (PhraseIndexLogger * oldlogger, phrase_token_t mask,
717 phrase_token_t value) {
718 PhraseIndexLogger * newlogger = new PhraseIndexLogger;
719 guint32 old_total_freq = 0, new_total_freq = 0;
721 /* peek the header value. */
722 if (!_peek_header(oldlogger, old_total_freq))
// The new total starts from the old one and is adjusted per record.
725 new_total_freq = old_total_freq;
727 /* compute the new header based on add/modify/remove records. */
729 if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
732 /* write out the modify header record. */
733 _write_header(newlogger, old_total_freq, new_total_freq);
735 /* mask out the matched records. */
737 _mask_out_records(oldlogger, mask, value, newlogger);