3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #include "phrase_index.h"
23 #include "pinyin_custom2.h"
25 using namespace pinyin;
// Store the pronunciation count of this phrase item.
// The count is written as one guint8 at byte offset sizeof(guint8) of
// m_chunk, i.e. directly after the phrase-length byte at offset 0.
// NOTE(review): the remainder of the body (return statement) is not visible
// in this view.
27 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
28 m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
// Read the index-th pronunciation record of this phrase item.
// Each record is phrase_length ChewingKeys followed by one guint32 frequency;
// records start after the item header and the phrase_length ucs4_t characters.
// keys must have room for phrase_length ChewingKey entries.
// Returns the result of the final m_chunk.get_content call.
// NOTE(review): `freq` is declared on a parameter line not visible here, and
// the handling of `retval` between the two reads is also not visible.
32 bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
34 guint8 phrase_length = get_phrase_length();
// Byte offset of the requested record inside m_chunk.
35 table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
// First copy out the keys of the record.
37 bool retval = m_chunk.get_content
38 (offset, keys, phrase_length * sizeof(ChewingKey));
// Then read the frequency stored right after the keys.
41 return m_chunk.get_content
42 (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
// Append one pronunciation record (keys followed by freq) to the end of the
// item and bump the stored pronunciation count by one.
// keys must point to phrase_length ChewingKey entries.
// No search for an existing identical pronunciation is done here.
// NOTE(review): the new count is passed through a guint8 parameter, so it
// presumably wraps once 255 pronunciations are exceeded -- confirm.
46 void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
47 guint8 phrase_length = get_phrase_length();
48 set_n_pronunciation(get_n_pronunciation() + 1);
// Both writes target the current end of the chunk; the second one relies on
// m_chunk.size() having grown after the keys were written.
49 m_chunk.set_content(m_chunk.size(), keys,
50 phrase_length * sizeof(ChewingKey));
51 m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
// Add `delta` to the frequency of the pronunciation matching `keys`, or
// append a new pronunciation record with frequency `delta`.
// The visible code walks all stored records, compares each with `keys` using
// pinyin_exact_compare2, and guards against guint32 overflow of total_freq.
// The tail appends a new record and increments the pronunciation count.
// NOTE(review): several lines are not visible in this view (accumulation of
// total_freq, the update of the matched frequency, the return statements), so
// the exact control flow between the loop and the append must be confirmed.
55 bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
56 guint8 phrase_length = get_phrase_length();
57 guint8 npron = get_n_pronunciation();
// Offset of the first pronunciation record (after header and phrase string).
58 size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
59 char * buf_begin = (char *) m_chunk.begin();
60 guint32 total_freq = 0;
62 for (int i = 0; i < npron; ++i) {
// Start of the i-th record: keys first, then the guint32 frequency.
63 char * chewing_begin = buf_begin + offset +
64 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
// Points straight into the chunk buffer, so writes through it edit the item.
65 guint32 * freq = (guint32 *)(chewing_begin +
66 phrase_length * sizeof(ChewingKey));
70 if (0 == pinyin_exact_compare2
71 (keys, (ChewingKey *)chewing_begin, phrase_length)) {
72 /* found the exact match pinyin keys. */
74 /* protect against total_freq overflow. */
// Unsigned wrap-around test: the sum is smaller than an operand on overflow.
75 if (delta > 0 && total_freq > total_freq + delta)
// No matching record: append a new one carrying `delta` as its frequency.
84 set_n_pronunciation(npron + 1);
85 m_chunk.set_content(m_chunk.size(), keys,
86 phrase_length * sizeof(ChewingKey));
87 m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32));
// Remove the index-th pronunciation record and decrement the stored count.
// NOTE(review): the visible code does not check that index is below the
// current count, nor that the count is non-zero before decrementing.
91 void PhraseItem::remove_nth_pronunciation(size_t index){
92 guint8 phrase_length = get_phrase_length();
93 set_n_pronunciation(get_n_pronunciation() - 1);
// Byte offset of the record to drop; one record is keys plus a guint32.
94 size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t) +
95 index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
96 m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
// Copy the phrase characters of this item into `phrase`.
// Reads phrase_length ucs4_t values starting at phrase_item_header; `phrase`
// must have room for that many elements. No terminator is written here.
// Returns the result of m_chunk.get_content.
99 bool PhraseItem::get_phrase_string(ucs4_t * phrase){
100 guint8 phrase_length = get_phrase_length();
101 return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
// Store the phrase length and the phrase characters in this item.
// The length byte goes to offset 0 and the phrase_length ucs4_t characters
// go to offset phrase_item_header.
// NOTE(review): the return statement is not visible in this view.
104 bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
105 m_chunk.set_content(0, &phrase_length, sizeof(guint8));
106 m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
// Walk every stored pronunciation record and test it against the given keys
// with pinyin_compare_with_ambiguities2 under `options`; the visible code
// then guards against guint32 overflow of total_freq before a frequency
// change.
// NOTE(review): the remaining parameters (keys and delta), the accumulation
// of total_freq and the actual frequency update are on lines not visible in
// this view -- confirm before relying on details.
110 void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
113 guint8 phrase_length = get_phrase_length();
114 guint8 npron = get_n_pronunciation();
// Offset of the first pronunciation record (after header and phrase string).
115 size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
116 char * buf_begin = (char *) m_chunk.begin();
117 guint32 total_freq = 0;
119 for (int i = 0; i < npron; ++i) {
// Start of the i-th record: keys first, then the guint32 frequency.
120 char * chewing_begin = buf_begin + offset +
121 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
// Points into the chunk buffer itself.
122 guint32 * freq = (guint32 *)(chewing_begin +
123 phrase_length * sizeof(ChewingKey));
126 if (0 == pinyin_compare_with_ambiguities2
128 (ChewingKey *)chewing_begin, phrase_length)) {
130 /* protect against total_freq overflow. */
// Unsigned wrap-around test: the sum is smaller than an operand on overflow.
131 if (delta > 0 && total_freq > total_freq + delta)
// Accessor for the total frequency counter of this sub phrase index.
// NOTE(review): the body is not visible in this view; presumably it returns
// m_total_freq -- confirm.
141 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
// Add `delta` to the unigram frequency of the phrase identified by `token`
// and to the running total m_total_freq.
// Returns ERROR_OUT_OF_RANGE, ERROR_NO_ITEM, ERROR_FILE_CORRUPTION or
// ERROR_INTEGER_OVERFLOW on the failure paths visible below.
// NOTE(review): the conditions guarding the first three error returns, the
// declaration and update of `freq`, and the success return are on lines not
// visible in this view.
145 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
146 table_offset_t offset;
// Look up where the item lives: the index is an array of table_offset_t
// addressed by the low (PHRASE_MASK) bits of the token.
148 bool result = m_phrase_index.get_content
149 ((token & PHRASE_MASK)
150 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
153 return ERROR_OUT_OF_RANGE;
156 return ERROR_NO_ITEM;
// The frequency sits after the two one-byte header fields of the item.
158 result = m_phrase_content.get_content
159 (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
162 return ERROR_FILE_CORRUPTION;
164 //protect total_freq overflow
165 if ( delta > 0 && m_total_freq > m_total_freq + delta )
166 return ERROR_INTEGER_OVERFLOW;
169 m_total_freq += delta;
// Write the frequency back to the same position it was read from.
170 m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
// Look up the phrase item for `token` and make `item` refer to it.
// `item.m_chunk` is pointed directly at the bytes inside m_phrase_content
// (no copy is made here), so the item aliases the index storage.
// Returns ERROR_OUT_OF_RANGE, ERROR_NO_ITEM or ERROR_FILE_CORRUPTION on the
// failure paths visible below.
// NOTE(review): the conditions guarding the error returns, the declaration of
// `n_prons` and the success return are on lines not visible in this view.
175 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
176 table_offset_t offset;
177 guint8 phrase_length;
// The index is an array of table_offset_t addressed by the low token bits.
180 bool result = m_phrase_index.get_content
181 ((token & PHRASE_MASK)
182 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
185 return ERROR_OUT_OF_RANGE;
188 return ERROR_NO_ITEM;
// First header byte: phrase length.
190 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
192 return ERROR_FILE_CORRUPTION;
// Second header byte: number of pronunciations.
194 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
196 return ERROR_FILE_CORRUPTION;
// Total item size: header, phrase characters, then n_prons records of
// (phrase_length keys + one guint32 frequency).
198 size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
199 item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
// Store `item` for `token`: the item bytes are appended at the current end of
// m_phrase_content, the index slot of the token is set to that offset, and
// the unigram frequency of the item is added to m_total_freq.
// NOTE(review): two lines between the offset computation and the write, and
// the return statement, are not visible in this view. The visible code does
// not reclaim the bytes of any item previously stored for the same token.
203 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
204 table_offset_t offset = m_phrase_content.size();
207 m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
208 m_phrase_index.set_content((token & PHRASE_MASK)
209 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
210 m_total_freq += item->get_unigram_frequency();
// Remove the phrase item for `token` from the index and hand a copy of it to
// the caller through `item`.
// `item` is set to a PhraseItem allocated with new; the visible code does not
// free it, so presumably the caller must delete it -- confirm.
// The index slot of the token is overwritten with 0 and the unigram frequency
// of the item is subtracted from m_total_freq; the item bytes themselves stay
// in m_phrase_content.
// NOTE(review): the declaration of `old_item`, the early return on lookup
// failure and the final return are on lines not visible in this view.
214 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
217 int result = get_phrase_item(token, old_item);
218 if (result != ERROR_OK)
221 item = new PhraseItem;
222 //implicitly copy data from m_chunk_content.
223 item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
// An offset of zero in the index marks the token as having no item.
225 const table_offset_t zero_const = 0;
226 m_phrase_index.set_content((token & PHRASE_MASK)
227 * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
228 m_total_freq -= item->get_unigram_frequency();
// Load the sub phrase index in slot `phrase_index` from `chunk`.
// The facade total is adjusted by removing the old total of the slot before
// the load and adding the new total afterwards.
// NOTE(review): the condition under which a new SubPhraseIndex is allocated,
// the handling of `retval` and the return are on lines not visible here.
232 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
// Reference to the slot so the allocation below is stored in the table.
233 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
235 sub_phrases = new SubPhraseIndex;
238 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
// The whole chunk is handed over: offsets 0 .. chunk->size().
239 bool retval = sub_phrases->load(chunk, 0, chunk->size());
242 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Serialize the sub phrase index in slot `phrase_index` into `new_chunk`,
// starting at offset 0.
// NOTE(review): the declaration of `end`, any check that the slot is
// populated, and the return are on lines not visible in this view.
246 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
248 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
252 sub_phrases->store(new_chunk, 0, end);
// Unload the sub phrase index in slot `phrase_index`; the visible code
// subtracts the total of the slot from the facade total.
// NOTE(review): the null check, the release of the sub index and the return
// are on lines not visible in this view.
256 bool FacadePhraseIndex::unload(guint8 phrase_index){
257 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
260 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
// Compute the change log between `oldchunk` (an older serialized state) and
// the current sub phrase index in slot `phrase_index`; the log is written to
// `newlog`.
// NOTE(review): the result of old_sub_phrases.load is not checked in the
// visible code; the null check on the slot and the return are on lines not
// visible in this view.
266 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
267 MemoryChunk * newlog){
268 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
// Temporary index built from the old serialized data.
272 SubPhraseIndex old_sub_phrases;
273 old_sub_phrases.load(oldchunk, 0, oldchunk->size());
274 PhraseIndexLogger logger;
276 bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
277 logger.store(newlog);
// Apply a change log to the sub phrase index in slot `phrase_index`.
// The facade total is kept in sync by subtracting the total of the slot
// before the merge and adding it back afterwards.
// NOTE(review): the step that fills `logger` from `log`, the null check on
// the slot and the return are on lines not visible in this view.
281 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
282 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
286 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
287 PhraseIndexLogger logger;
290 bool retval = sub_phrases->merge(&logger);
291 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Apply a change log to slot `phrase_index` after filtering the log with
// mask/value through mask_out_phrase_index_logger.
// The library-index bits of mask and value must agree with `phrase_index`;
// only the PHRASE_MASK bits are passed on to the filter.
// NOTE(review): `newlogger` comes from mask_out_phrase_index_logger, which
// allocates it with new; its release is not visible here -- confirm it is
// deleted. The remaining parameters, the loading of `oldlogger`, the early
// returns and the final return are on lines not visible in this view.
296 bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index,
299 phrase_token_t value){
300 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
304 /* check mask and value. */
305 phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
306 phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
307 if ((phrase_index & index_mask) != index_value)
310 /* unload old sub phrase index */
311 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
313 /* calculate the sub phrase index mask and value. */
314 mask &= PHRASE_MASK; value &= PHRASE_MASK;
316 /* prepare the new logger. */
317 PhraseIndexLogger oldlogger;
319 PhraseIndexLogger * newlogger = mask_out_phrase_index_logger
320 (&oldlogger, mask, value);
322 bool retval = sub_phrases->merge(newlogger);
323 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Attach this sub phrase index to serialized data inside `chunk`.
// Layout read starting at `offset`: a guint32 total frequency, three
// table_offset_t values (index_one, index_two, index_three), then a
// c_separate byte. m_phrase_index covers [index_one, index_two - 1) and
// m_phrase_content covers [index_two, index_three - 1); both are set to
// point into the buffer of `chunk` rather than copying it.
// Returns FALSE through g_return_val_if_fail when a separator byte is wrong
// or index_three exceeds `end`.
// NOTE(review): the `index_three <= end` check runs only after index_two and
// index_three have already been used to read from the buffer, and the results
// of the get_content calls are ignored; offsets from a corrupted file are
// therefore dereferenced before validation. Lines that save the chunk and the
// final return are not visible in this view.
330 bool SubPhraseIndex::load(MemoryChunk * chunk,
331 table_offset_t offset, table_offset_t end){
332 //save the memory chunk
339 char * buf_begin = (char *)chunk->begin();
340 chunk->get_content(offset, &m_total_freq, sizeof(guint32));
341 offset += sizeof(guint32);
342 table_offset_t index_one, index_two, index_three;
343 chunk->get_content(offset, &index_one, sizeof(table_offset_t));
344 offset += sizeof(table_offset_t);
345 chunk->get_content(offset, &index_two, sizeof(table_offset_t));
346 offset += sizeof(table_offset_t);
347 chunk->get_content(offset, &index_three, sizeof(table_offset_t));
348 offset += sizeof(table_offset_t);
// A separator byte follows the header and precedes index_two and index_three.
349 g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
350 g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
351 g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
// The "- 1" excludes the separator byte that ends each section.
352 m_phrase_index.set_chunk(buf_begin + index_one,
353 index_two - 1 - index_one, NULL);
354 m_phrase_content.set_chunk(buf_begin + index_two,
355 index_three - 1 - index_two, NULL);
356 g_return_val_if_fail( index_three <= end, FALSE);
// Serialize this sub phrase index into `new_chunk` starting at `offset`.
// Written layout: guint32 total frequency, three table_offset_t slots,
// separator, phrase index bytes, separator, phrase content bytes, separator.
// The three slots receive, in order, the offset where the phrase index
// starts, the offset where the phrase content starts, and the offset just
// past the final separator.
// NOTE(review): the assignment to `end` and the return are on lines not
// visible in this view.
360 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
361 table_offset_t offset, table_offset_t& end){
362 new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
// `index` tracks the next offset slot to fill in the header.
363 table_offset_t index = offset + sizeof(guint32);
// Skip over the three offset slots; they are filled in as sections are laid out.
365 offset = index + sizeof(table_offset_t) * 3 ;
366 new_chunk->set_content(offset, &c_separate, sizeof(char));
367 offset += sizeof(char);
// Slot 1: start of the phrase index section.
369 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
370 index += sizeof(table_offset_t);
371 new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
372 offset += m_phrase_index.size();
373 new_chunk->set_content(offset, &c_separate, sizeof(char));
374 offset += sizeof(char);
// Slot 2: start of the phrase content section.
376 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
377 index += sizeof(table_offset_t);
379 new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
380 offset += m_phrase_content.size();
381 new_chunk->set_content(offset, &c_separate, sizeof(char));
382 offset += sizeof(char);
// Slot 3: offset just past the last separator.
383 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
// Record into `logger` the changes that turn `oldone` into this index.
// A LOG_MODIFY_HEADER record with the old and new total frequency is always
// appended first. Then every token in the union of both token ranges is
// compared and a MODIFY, REMOVE or ADD record is appended as needed.
// NOTE(review): the branch on `oldretval`, the skip of unchanged items and
// the return are on lines not visible in this view.
387 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
388 /* diff the header */
389 MemoryChunk oldheader, newheader;
390 guint32 total_freq = oldone->get_phrase_index_total_freq();
391 oldheader.set_content(0, &total_freq, sizeof(guint32));
392 total_freq = get_phrase_index_total_freq();
393 newheader.set_content(0, &total_freq, sizeof(guint32));
394 logger->append_record(LOG_MODIFY_HEADER, null_token,
395 &oldheader, &newheader);
397 /* diff phrase items */
398 PhraseIndexRange oldrange, currange, range;
399 oldone->get_range(oldrange); get_range(currange);
// Union of the two ranges so tokens present on either side are visited.
400 range.m_range_begin = std_lite::min(oldrange.m_range_begin,
401 currange.m_range_begin);
402 range.m_range_end = std_lite::max(oldrange.m_range_end,
403 currange.m_range_end);
404 PhraseItem olditem, newitem;
406 for (phrase_token_t token = range.m_range_begin;
407 token < range.m_range_end; ++token ){
// A lookup that does not return ERROR_OK is treated as "no item".
408 bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
409 bool newretval = ERROR_OK == get_phrase_item(token, newitem);
412 if ( newretval ) { /* compare phrase item. */
413 if ( olditem == newitem )
415 logger->append_record(LOG_MODIFY_RECORD, token,
416 &(olditem.m_chunk), &(newitem.m_chunk));
417 } else { /* remove phrase item. */
418 logger->append_record(LOG_REMOVE_RECORD, token,
419 &(olditem.m_chunk), NULL);
422 if ( newretval ){ /* add phrase item. */
423 logger->append_record(LOG_ADD_RECORD, token,
424 NULL, &(newitem.m_chunk));
425 } else { /* both empty. */
// Replay the records of `logger` against this index.
// ADD records insert the new item, REMOVE records remove the item and compare
// it with the logged old item, MODIFY records either re-add a larger item or
// overwrite the stored bytes in place, and MODIFY_HEADER records replace
// m_total_freq after comparing the logged old total with the current one.
// NOTE(review): the switch head, the third argument of the set_chunk calls,
// the handling of mismatches, the release of `tmpitem` (allocated by
// remove_phrase_item) and the returns are on lines not visible in this view.
// Return values of add_phrase_item / remove_phrase_item / get_phrase_item are
// not checked in the visible lines.
434 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
435 LOG_TYPE log_type; phrase_token_t token;
436 MemoryChunk oldchunk, newchunk;
437 PhraseItem olditem, newitem, item, * tmpitem;
439 while(logger->has_next_record()){
440 bool retval = logger->next_record
441 (log_type, token, &oldchunk, &newchunk);
447 case LOG_ADD_RECORD:{
448 assert( 0 == oldchunk.size() );
449 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
451 add_phrase_item(token, &newitem);
454 case LOG_REMOVE_RECORD:{
455 assert( 0 == newchunk.size() );
457 remove_phrase_item(token, tmpitem);
459 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
462 if (olditem != *tmpitem) {
471 case LOG_MODIFY_RECORD:{
472 get_phrase_item(token, item);
473 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
475 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
480 if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
482 remove_phrase_item(token, tmpitem);
483 assert(olditem == *tmpitem);
484 add_phrase_item(token, &newitem);
486 } else { /* in place editing. */
487 /* newchunk.size() <= item.m_chunk.size() */
488 /* Hack here: we assume get_phrase_item makes item
489 * point to the actual data position, so changes to item
490 * will be saved in SubPhraseIndex immediately.
492 memmove(item.m_chunk.begin(), newchunk.begin(),
497 case LOG_MODIFY_HEADER:{
498 guint32 total_freq = get_phrase_index_total_freq();
499 guint32 tmp_freq = 0;
500 assert(null_token == token);
501 assert(oldchunk.size() == newchunk.size());
502 oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
503 if (total_freq != tmp_freq)
505 newchunk.get_content(0, &tmp_freq, sizeof(guint32));
506 m_total_freq = tmp_freq;
// Build the sub phrase index in slot `phrase_index` from a text table.
// Each record read by fscanf is: pinyin, phrase, token, frequency.
// Consecutive records with the same token are collected into one PhraseItem
// as additional pronunciations; when the token changes, the finished item is
// added under the previous token and a fresh item is started. The last item
// is added after the loop and the total of the slot is added to the facade
// total.
// A record whose parsed key count differs from the phrase length is reported
// on stderr instead of being added.
// NOTE(review): the declarations of `pinyin`, `phrase`, `freq` and `written`,
// the check of `num`, and the release of `phrase_ucs4` and of finished
// items are on lines not visible in this view. Points to confirm:
//   - "%s" has no field width, so it is only safe if the buffers are large
//     enough for any input line;
//   - "%ld" requires `freq` to be a long;
//   - looping on !feof() only ends after a read has failed, so the fscanf
//     result must be checked inside the loop;
//   - cur_token == 0 is used as the "no item yet" marker.
516 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
517 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
519 sub_phrases = new SubPhraseIndex;
524 phrase_token_t token;
527 PhraseItem * item_ptr = new PhraseItem;
528 phrase_token_t cur_token = 0;
530 while (!feof(infile)){
531 int num = fscanf(infile, "%s %s %u %ld",
532 pinyin, phrase, &token, &freq);
540 assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
543 ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL,
// First record: start filling the initial item.
546 if ( 0 == cur_token ){
548 item_ptr->set_phrase_string(written, phrase_ucs4);
// Token changed: flush the finished item and start a new one.
551 if ( cur_token != token ){
552 add_phrase_item( cur_token, item_ptr);
554 item_ptr = new PhraseItem;
556 item_ptr->set_phrase_string(written, phrase_ucs4);
// Parse the pinyin string (with tones) into chewing keys.
559 pinyin_option_t options = USE_TONE;
560 FullPinyinParser2 parser;
561 ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
562 ChewingKeyRestVector key_rests =
563 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
565 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
// Only accept the pronunciation when there is one key per phrase character.
567 if (item_ptr->get_phrase_length() == keys->len) {
568 item_ptr->add_pronunciation((ChewingKey *)keys->data, freq);
570 fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
574 g_array_free(keys, TRUE);
575 g_array_free(key_rests, TRUE);
// Flush the last collected item.
579 add_phrase_item( cur_token, item_ptr);
582 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
// Report the smallest and largest slot numbers that hold a sub phrase index.
// If no slot is populated, min_index keeps PHRASE_INDEX_LIBRARY_COUNT and
// max_index keeps 0, as initialized below.
// NOTE(review): the `max_index` parameter line and the return are on lines
// not visible in this view.
587 int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
589 min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
590 for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
591 if ( m_sub_phrase_indices[i] ) {
592 min_index = std_lite::min(min_index, i);
593 max_index = std_lite::max(max_index, i);
// Get the token range of the sub phrase index in slot `phrase_index`.
// The range reported by the sub index is converted into full tokens by
// combining both bounds with `phrase_index` through PHRASE_INDEX_MAKE_TOKEN.
// Returns ERROR_NO_SUB_PHRASE_INDEX on the failure path visible below.
// NOTE(review): the condition guarding that return (presumably an empty
// slot), the check of `result` and the final return are on lines not visible
// in this view.
599 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
600 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
602 return ERROR_NO_SUB_PHRASE_INDEX;
604 int result = sub_phrase->get_range(range);
608 range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
609 range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
// Compute the token range [m_range_begin, m_range_end) of this sub index.
// m_phrase_index is treated as an array of table_offset_t. The range always
// begins at 1; for an empty index it is [1, 1). Otherwise the array is
// scanned backwards from the last entry and the end is derived from the
// position where the scan stops.
// NOTE(review): the emptiness test, the body of the backward scan
// (presumably a break on the first non-zero entry) and the returns are on
// lines not visible in this view.
613 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
614 const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
615 const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
618 /* skip empty sub phrase index. */
619 range.m_range_begin = 1;
620 range.m_range_end = 1;
624 /* remove trailing zeros. */
625 const table_offset_t * poffset = 0;
// Entry 0 is never examined: the scan stops at begin + 1.
626 for (poffset = end - 1; poffset >= begin + 1; --poffset) {
631 range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
632 range.m_range_end = poffset + 1 - begin; /* removed zeros. */
// Rebuild every populated sub phrase index into a freshly allocated one that
// contains only the items still reachable through the index, then install
// the new sub index in the slot.
// NOTE(review): the null check on the slot, the declaration of `item`, the
// statements after the two `result != ERROR_OK` tests, the release of the old
// sub index and the return are on lines not visible in this view -- confirm
// that the replaced SubPhraseIndex is deleted.
637 bool FacadePhraseIndex::compact(){
638 for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
639 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
643 PhraseIndexRange range;
644 int result = sub_phrase->get_range(range);
645 if ( result != ERROR_OK )
648 SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
// Copy each item that can still be looked up into the new sub index.
651 for ( phrase_token_t token = range.m_range_begin;
652 token < range.m_range_end;
654 result = sub_phrase->get_phrase_item(token, item);
655 if ( result != ERROR_OK )
657 new_sub_phrase->add_phrase_item(token, &item);
661 m_sub_phrase_indices[index] = new_sub_phrase;
// Remove phrase items from this sub index, selected by comparing
// (token & mask) with value over the whole token range.
// Only the PHRASE_MASK bits of mask and value are used.
// NOTE(review): the statement following the `(token & mask) != value` test
// (presumably a continue, so that matching tokens are removed), the release
// of the item returned by remove_phrase_item and the returns are on lines
// not visible in this view.
666 bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){
667 PhraseIndexRange range;
668 if (ERROR_OK != get_range(range))
671 /* calculate mask and value for sub phrase index. */
672 mask &= PHRASE_MASK; value &= PHRASE_MASK;
674 for (phrase_token_t token = range.m_range_begin;
675 token < range.m_range_end; ++token) {
676 if ((token & mask) != value)
// remove_phrase_item allocates the PhraseItem it hands back through `item`.
679 PhraseItem * item = NULL;
680 remove_phrase_item(token, item);
// Mask out phrase items of the sub index in slot `phrase_index`.
// The library-index bits of mask and value must agree with `phrase_index`.
// The facade total is kept in sync by subtracting the total of the slot
// before the operation and adding it back afterwards.
// NOTE(review): the `mask` parameter line, the null check on the slot, the
// early return on mismatch and the final return are on lines not visible in
// this view.
688 bool FacadePhraseIndex::mask_out(guint8 phrase_index,
690 phrase_token_t value){
691 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
695 /* check mask and value. */
696 phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
697 phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
699 if ((phrase_index & index_mask ) != index_value)
702 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
703 bool retval = sub_phrases->mask_out(mask, value);
704 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Scan the records of `logger` for the LOG_MODIFY_HEADER record and read the
// old total frequency stored in it into `old_total_freq`.
// Returns true only when exactly one header record was counted; more than
// one trips the assert.
// NOTE(review): the check of `retval`, the statement following the
// `LOG_MODIFY_HEADER != log_type` test and the increment of header_count are
// on lines not visible in this view. The scan consumes the record cursor of
// the logger.
712 static bool _peek_header(PhraseIndexLogger * logger,
713 guint32 & old_total_freq){
716 size_t header_count = 0;
717 LOG_TYPE log_type; phrase_token_t token;
718 MemoryChunk oldchunk, newchunk;
720 while (logger->has_next_record()) {
721 bool retval = logger->next_record
722 (log_type, token, &oldchunk, &newchunk);
727 if (LOG_MODIFY_HEADER != log_type)
732 oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
735 /* 1 for normal case, 0 for corrupted file. */
736 assert(1 >= header_count);
738 return 1 == header_count? true : false;
// Adjust `new_total_freq` according to the add / remove / modify records of
// `logger`: ADD adds the unigram frequency of the new item, REMOVE subtracts
// that of the old item, MODIFY subtracts the old one and adds the new one.
// Header records and records selected by the `(token & mask) == value` test
// get separate handling.
// NOTE(review): the `mask` parameter line, the statements following the two
// tests (presumably continue), the switch head, the third argument of the
// set_chunk calls and the returns are on lines not visible in this view.
// Unlike the sibling helpers _peek_header, _write_header and
// _mask_out_records, this function is not declared static.
741 bool _compute_new_header(PhraseIndexLogger * logger,
743 phrase_token_t value,
744 guint32 & new_total_freq) {
746 LOG_TYPE log_type; phrase_token_t token;
747 MemoryChunk oldchunk, newchunk;
748 PhraseItem olditem, newitem;
750 while(logger->has_next_record()) {
751 bool retval = logger->next_record
752 (log_type, token, &oldchunk, &newchunk);
757 if (LOG_MODIFY_HEADER == log_type)
760 if ((token & mask) == value)
764 case LOG_ADD_RECORD:{
765 assert( 0 == oldchunk.size() );
766 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
768 new_total_freq += newitem.get_unigram_frequency();
771 case LOG_REMOVE_RECORD:{
772 assert( 0 == newchunk.size() );
773 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
775 new_total_freq -= olditem.get_unigram_frequency();
778 case LOG_MODIFY_RECORD:{
779 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
781 new_total_freq -= olditem.get_unigram_frequency();
783 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
785 new_total_freq += newitem.get_unigram_frequency();
// Append one LOG_MODIFY_HEADER record to `logger` carrying the old and the
// new total frequency, each stored as a guint32 at offset 0 of its chunk.
// Both frequency parameters are only read in the visible lines.
// NOTE(review): the return statement is not visible in this view.
796 static bool _write_header(PhraseIndexLogger * logger,
797 guint32 & old_total_freq,
798 guint32 & new_total_freq) {
799 MemoryChunk oldheader, newheader;
800 oldheader.set_content(0, &old_total_freq, sizeof(guint32));
801 newheader.set_content(0, &new_total_freq, sizeof(guint32));
802 logger->append_record(LOG_MODIFY_HEADER, null_token,
803 &oldheader, &newheader);
// Copy records from `oldlogger` to `newlogger`, giving separate handling to
// header records and to records selected by `(token & mask) == value`.
// NOTE(review): the `mask` parameter line, the check of `retval`, the
// statements following the two tests (presumably continue, so those records
// are dropped) and the return are on lines not visible in this view.
807 static bool _mask_out_records(PhraseIndexLogger * oldlogger,
809 phrase_token_t value,
810 PhraseIndexLogger * newlogger) {
811 LOG_TYPE log_type; phrase_token_t token;
812 MemoryChunk oldchunk, newchunk;
814 while(oldlogger->has_next_record()) {
815 bool retval = oldlogger->next_record
816 (log_type, token, &oldchunk, &newchunk);
821 if (LOG_MODIFY_HEADER == log_type)
824 if ((token & mask) == value)
827 newlogger->append_record(log_type, token, &oldchunk, &newchunk);
// Build a new logger from `oldlogger` with records filtered by mask/value.
// Steps: read the old total frequency from the header record, compute the
// new total from the add / modify / remove records, write a fresh header
// record, then copy the remaining records through _mask_out_records.
// The returned logger is allocated with new here; the visible code does not
// free it, so presumably the caller owns it -- confirm.
// NOTE(review): the statements following the two failure tests, any rewind
// of `oldlogger` between the passes and the return are on lines not visible
// in this view; confirm the new logger is not leaked on the failure paths.
833 PhraseIndexLogger * mask_out_phrase_index_logger
834 (PhraseIndexLogger * oldlogger, phrase_token_t mask,
835 phrase_token_t value) {
836 PhraseIndexLogger * newlogger = new PhraseIndexLogger;
837 guint32 old_total_freq = 0, new_total_freq = 0;
839 /* peek the header value. */
840 if (!_peek_header(oldlogger, old_total_freq))
843 new_total_freq = old_total_freq;
845 /* compute the new header based on add/modify/remove records. */
847 if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
850 /* write out the modify header record. */
851 _write_header(newlogger, old_total_freq, new_total_freq);
853 /* mask out the matched records. */
855 _mask_out_records(oldlogger, mask, value, newlogger);