*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include "phrase_index.h"
+#include "pinyin_custom2.h"
using namespace pinyin;
return true;
}
-bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
+bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
+ guint32 & freq){
guint8 phrase_length = get_phrase_length();
- table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
- bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
+ table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
+
+ bool retval = m_chunk.get_content
+ (offset, keys, phrase_length * sizeof(ChewingKey));
if ( !retval )
return retval;
- return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
+ return m_chunk.get_content
+ (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
}
-void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
+#if 0
+void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
guint8 phrase_length = get_phrase_length();
set_n_pronunciation(get_n_pronunciation() + 1);
- m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
+ m_chunk.set_content(m_chunk.size(), keys,
+ phrase_length * sizeof(ChewingKey));
m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
}
+#endif
+
+bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
+ guint8 phrase_length = get_phrase_length();
+ guint8 npron = get_n_pronunciation();
+ size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
+ char * buf_begin = (char *) m_chunk.begin();
+ guint32 total_freq = 0;
+
+ for (int i = 0; i < npron; ++i) {
+ char * chewing_begin = buf_begin + offset +
+ i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ guint32 * freq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+
+ total_freq += *freq;
+
+ if (0 == pinyin_exact_compare2
+ (keys, (ChewingKey *)chewing_begin, phrase_length)) {
+ /* found the exact match pinyin keys. */
+
+ /* protect against total_freq overflow. */
+ if (delta > 0 && total_freq > total_freq + delta)
+ return false;
+
+ *freq += delta;
+ total_freq += delta;
+ return true;
+ }
+ }
+
+ set_n_pronunciation(npron + 1);
+ m_chunk.set_content(m_chunk.size(), keys,
+ phrase_length * sizeof(ChewingKey));
+ m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32));
+ return true;
+}
void PhraseItem::remove_nth_pronunciation(size_t index){
guint8 phrase_length = get_phrase_length();
set_n_pronunciation(get_n_pronunciation() - 1);
- size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
- m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
+ size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t) +
+ index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
}
-bool PhraseItem::get_phrase_string(utf16_t * phrase){
+bool PhraseItem::get_phrase_string(ucs4_t * phrase){
guint8 phrase_length = get_phrase_length();
- return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
+ return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
}
-bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
+bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
m_chunk.set_content(0, &phrase_length, sizeof(guint8));
- m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
+ m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
return true;
}
-void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
- PinyinKey * pinyin_keys,
- gint32 delta){
+void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
+ ChewingKey * keys,
+ gint32 delta){
guint8 phrase_length = get_phrase_length();
guint8 npron = get_n_pronunciation();
- size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
+ size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
char * buf_begin = (char *) m_chunk.begin();
guint32 total_freq = 0;
- for ( int i = 0 ; i < npron ; ++i){
- char * pinyin_begin = buf_begin + offset +
- i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
- guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
+
+ for (int i = 0; i < npron; ++i) {
+ char * chewing_begin = buf_begin + offset +
+ i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ guint32 * freq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
total_freq += *freq;
- if ( 0 == pinyin_compare_with_ambiguities(custom,
- (PinyinKey *)pinyin_begin,
- pinyin_keys,
- phrase_length)){
- //protect against total_freq overflow.
- if ( delta > 0 && total_freq > total_freq + delta )
+
+ if (0 == pinyin_compare_with_ambiguities2
+ (options, keys,
+ (ChewingKey *)chewing_begin, phrase_length)) {
+
+ /* protect against total_freq overflow. */
+ if (delta > 0 && total_freq > total_freq + delta)
return;
+
*freq += delta;
total_freq += delta;
}
return ERROR_OUT_OF_RANGE;
if ( 0 == offset )
- return ERROR_NO_ITEM;
+ return ERROR_NO_ITEM;
result = m_phrase_content.get_content
(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
if ( !result )
- return ERROR_FILE_CORRUPTION;
+ return ERROR_FILE_CORRUPTION;
//protect total_freq overflow
if ( delta > 0 && m_total_freq > m_total_freq + delta )
return ERROR_OUT_OF_RANGE;
if ( 0 == offset )
- return ERROR_NO_ITEM;
+ return ERROR_NO_ITEM;
result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
if ( !result )
- return ERROR_FILE_CORRUPTION;
+ return ERROR_FILE_CORRUPTION;
result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
if ( !result )
return ERROR_FILE_CORRUPTION;
- size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+ size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
return ERROR_OK;
}
int result = get_phrase_item(token, old_item);
if (result != ERROR_OK)
- return result;
+ return result;
item = new PhraseItem;
//implictly copy data from m_chunk_content.
if ( !sub_phrases ){
sub_phrases = new SubPhraseIndex;
}
-
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
bool retval = sub_phrases->load(chunk, 0, chunk->size());
if ( !retval )
return retval;
if ( !sub_phrases )
return false;
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
PhraseIndexLogger logger;
logger.load(log);
- return sub_phrases->merge(&logger);
+ bool retval = sub_phrases->merge(&logger);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+ return retval;
}
+bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index,
+ MemoryChunk * log,
+ phrase_token_t mask,
+ phrase_token_t value){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases )
+ return false;
+
+ /* check mask and value. */
+ phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
+ phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
+ if ((phrase_index & index_mask) != index_value)
+ return false;
+
+ /* unload old sub phrase index */
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+
+ /* calculate the sub phrase index mask and value. */
+ mask &= PHRASE_MASK; value &= PHRASE_MASK;
+
+ /* prepare the new logger. */
+ PhraseIndexLogger oldlogger;
+ oldlogger.load(log);
+ PhraseIndexLogger * newlogger = mask_out_phrase_index_logger
+ (&oldlogger, mask, value);
+
+ bool retval = sub_phrases->merge(newlogger);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+ delete newlogger;
+
+ return retval;
+}
+
+
bool SubPhraseIndex::load(MemoryChunk * chunk,
table_offset_t offset, table_offset_t end){
//save the memory chunk
m_phrase_index.set_chunk(buf_begin + index_one,
index_two - 1 - index_one, NULL);
m_phrase_content.set_chunk(buf_begin + index_two,
- index_three - 1 - index_two, NULL);
+ index_three - 1 - index_two, NULL);
g_return_val_if_fail( index_three <= end, FALSE);
return true;
}
return true;
}
+bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+ /* diff the header */
+ MemoryChunk oldheader, newheader;
+ guint32 total_freq = oldone->get_phrase_index_total_freq();
+ oldheader.set_content(0, &total_freq, sizeof(guint32));
+ total_freq = get_phrase_index_total_freq();
+ newheader.set_content(0, &total_freq, sizeof(guint32));
+ logger->append_record(LOG_MODIFY_HEADER, null_token,
+ &oldheader, &newheader);
+
+ /* diff phrase items */
+ PhraseIndexRange oldrange, currange, range;
+ oldone->get_range(oldrange); get_range(currange);
+ range.m_range_begin = std_lite::min(oldrange.m_range_begin,
+ currange.m_range_begin);
+ range.m_range_end = std_lite::max(oldrange.m_range_end,
+ currange.m_range_end);
+ PhraseItem olditem, newitem;
+
+ for (phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end; ++token ){
+ bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
+ bool newretval = ERROR_OK == get_phrase_item(token, newitem);
+
+ if ( oldretval ){
+ if ( newretval ) { /* compare phrase item. */
+ if ( olditem == newitem )
+ continue;
+ logger->append_record(LOG_MODIFY_RECORD, token,
+ &(olditem.m_chunk), &(newitem.m_chunk));
+ } else { /* remove phrase item. */
+ logger->append_record(LOG_REMOVE_RECORD, token,
+ &(olditem.m_chunk), NULL);
+ }
+ } else {
+ if ( newretval ){ /* add phrase item. */
+ logger->append_record(LOG_ADD_RECORD, token,
+ NULL, &(newitem.m_chunk));
+ } else { /* both empty. */
+ /* do nothing. */
+ }
+ }
+ }
+
+ return true;
+}
+
+bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+ PhraseItem olditem, newitem, item, * tmpitem;
+
+ while(logger->has_next_record()){
+ bool retval = logger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ switch(log_type){
+ case LOG_ADD_RECORD:{
+ assert( 0 == oldchunk.size() );
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ add_phrase_item(token, &newitem);
+ break;
+ }
+ case LOG_REMOVE_RECORD:{
+ assert( 0 == newchunk.size() );
+ tmpitem = NULL;
+ remove_phrase_item(token, tmpitem);
+
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+
+ if (olditem != *tmpitem) {
+ delete tmpitem;
+ return false;
+ }
+
+ delete tmpitem;
+
+ break;
+ }
+ case LOG_MODIFY_RECORD:{
+ get_phrase_item(token, item);
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ if (item != olditem)
+ return false;
+
+ if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
+ tmpitem = NULL;
+ remove_phrase_item(token, tmpitem);
+ assert(olditem == *tmpitem);
+ add_phrase_item(token, &newitem);
+ delete tmpitem;
+ } else { /* in place editing. */
+ /* newchunk.size() <= item.m_chunk.size() */
+ /* Hack here: we assume the behaviour of get_phrase_item
+ * point to the actual data positon, so changes to item
+ * will be saved in SubPhraseIndex immediately.
+ */
+ memmove(item.m_chunk.begin(), newchunk.begin(),
+ newchunk.size());
+ }
+ break;
+ }
+ case LOG_MODIFY_HEADER:{
+ guint32 total_freq = get_phrase_index_total_freq();
+ guint32 tmp_freq = 0;
+ assert(null_token == token);
+ assert(oldchunk.size() == newchunk.size());
+ oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ if (total_freq != tmp_freq)
+ return false;
+ newchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ m_total_freq = tmp_freq;
+ break;
+ }
+ default:
+ assert(false);
+ }
+ }
+ return true;
+}
+
bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
if ( !sub_phrases ){
char phrase[256];
phrase_token_t token;
size_t freq;
+
PhraseItem * item_ptr = new PhraseItem;
phrase_token_t cur_token = 0;
- while ( !feof(infile)){
- fscanf(infile, "%s", pinyin);
- fscanf(infile, "%s", phrase);
- fscanf(infile, "%u", &token);
- fscanf(infile, "%ld", &freq);
- if ( feof(infile) )
+
+ while (!feof(infile)){
+ int num = fscanf(infile, "%s %s %u %ld",
+ pinyin, phrase, &token, &freq);
+
+ if (4 != num)
+ continue;
+
+ if (feof(infile))
break;
assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
glong written;
- utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
- &written, NULL);
+ ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL,
+ &written, NULL);
if ( 0 == cur_token ){
cur_token = token;
- item_ptr->set_phrase_string(written, phrase_utf16);
+ item_ptr->set_phrase_string(written, phrase_ucs4);
}
if ( cur_token != token ){
delete item_ptr;
item_ptr = new PhraseItem;
cur_token = token;
- item_ptr->set_phrase_string(written, phrase_utf16);
+ item_ptr->set_phrase_string(written, phrase_ucs4);
}
- PinyinDefaultParser parser;
- NullPinyinValidator validator;
- PinyinKeyVector keys;
- PinyinKeyPosVector poses;
-
- keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
- poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
- parser.parse(validator, keys, poses, pinyin);
+ pinyin_option_t options = USE_TONE;
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
- assert ( item_ptr->get_phrase_length() == keys->len );
- item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
+ if (item_ptr->get_phrase_length() == keys->len) {
+ item_ptr->add_pronunciation((ChewingKey *)keys->data, freq);
+ } else {
+ fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
+ pinyin, phrase);
+ }
g_array_free(keys, TRUE);
- g_array_free(poses, TRUE);
- g_free(phrase_utf16);
+ g_array_free(key_rests, TRUE);
+ g_free(phrase_ucs4);
}
add_phrase_item( cur_token, item_ptr);
delete item_ptr;
+#if 0
m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
+#endif
return true;
}
+int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
+ guint8 & max_index){
+ min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
+ for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
+ if ( m_sub_phrase_indices[i] ) {
+ min_index = std_lite::min(min_index, i);
+ max_index = std_lite::max(max_index, i);
+ }
+ }
+ return ERROR_OK;
+}
+
int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
if ( !sub_phrase )
const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
+ if (begin == end) {
+ /* skip empty sub phrase index. */
+ range.m_range_begin = 1;
+ range.m_range_end = 1;
+ return ERROR_OK;
+ }
+
+ /* remove trailing zeros. */
+ const table_offset_t * poffset = 0;
+ for (poffset = end - 1; poffset >= begin + 1; --poffset) {
+ if (0 != *poffset)
+ break;
+ }
+
range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
- range.m_range_end = end - begin;
+ range.m_range_end = poffset + 1 - begin; /* removed zeros. */
return ERROR_OK;
}
-bool FacadePhraseIndex::compat(){
+bool FacadePhraseIndex::compact(){
for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
if ( !sub_phrase )
continue;
- SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
PhraseIndexRange range;
int result = sub_phrase->get_range(range);
- if ( result != ERROR_OK ) {
- delete new_sub_phrase;
+ if ( result != ERROR_OK )
continue;
- }
+
+ SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
PhraseItem item;
for ( phrase_token_t token = range.m_range_begin;
}
return true;
}
+
+bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){
+ PhraseIndexRange range;
+ if (ERROR_OK != get_range(range))
+ return false;
+
+ /* calculate mask and value for sub phrase index. */
+ mask &= PHRASE_MASK; value &= PHRASE_MASK;
+
+ for (phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end; ++token) {
+ if ((token & mask) != value)
+ continue;
+
+ PhraseItem * item = NULL;
+ remove_phrase_item(token, item);
+ if (item)
+ delete item;
+ }
+
+ return true;
+}
+
+bool FacadePhraseIndex::mask_out(guint8 phrase_index,
+ phrase_token_t mask,
+ phrase_token_t value){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if (!sub_phrases)
+ return false;
+
+ /* check mask and value. */
+ phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
+ phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
+
+ if ((phrase_index & index_mask ) != index_value)
+ return false;
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+ bool retval = sub_phrases->mask_out(mask, value);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+ return retval;
+}
+
+namespace pinyin{
+
+
+static bool _peek_header(PhraseIndexLogger * logger,
+ guint32 & old_total_freq){
+ old_total_freq = 0;
+
+ size_t header_count = 0;
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+
+ while (logger->has_next_record()) {
+ bool retval = logger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ if (LOG_MODIFY_HEADER != log_type)
+ continue;
+
+ ++header_count;
+
+ oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
+ }
+
+ /* 1 for normal case, 0 for corrupted file. */
+ assert(1 >= header_count);
+
+ return 1 == header_count? true : false;
+}
+
+bool _compute_new_header(PhraseIndexLogger * logger,
+ phrase_token_t mask,
+ phrase_token_t value,
+ guint32 & new_total_freq) {
+
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+ PhraseItem olditem, newitem;
+
+ while(logger->has_next_record()) {
+ bool retval = logger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ if (LOG_MODIFY_HEADER == log_type)
+ continue;
+
+ if ((token & mask) == value)
+ continue;
+
+ switch(log_type) {
+ case LOG_ADD_RECORD:{
+ assert( 0 == oldchunk.size() );
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ new_total_freq += newitem.get_unigram_frequency();
+ break;
+ }
+ case LOG_REMOVE_RECORD:{
+ assert( 0 == newchunk.size() );
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+ new_total_freq -= olditem.get_unigram_frequency();
+ break;
+ }
+ case LOG_MODIFY_RECORD:{
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+ new_total_freq -= olditem.get_unigram_frequency();
+
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ new_total_freq += newitem.get_unigram_frequency();
+ break;
+ }
+ default:
+ assert(false);
+ }
+ }
+
+ return true;
+}
+
+static bool _write_header(PhraseIndexLogger * logger,
+ guint32 & old_total_freq,
+ guint32 & new_total_freq) {
+ MemoryChunk oldheader, newheader;
+ oldheader.set_content(0, &old_total_freq, sizeof(guint32));
+ newheader.set_content(0, &new_total_freq, sizeof(guint32));
+ logger->append_record(LOG_MODIFY_HEADER, null_token,
+ &oldheader, &newheader);
+ return true;
+}
+
+static bool _mask_out_records(PhraseIndexLogger * oldlogger,
+ phrase_token_t mask,
+ phrase_token_t value,
+ PhraseIndexLogger * newlogger) {
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+
+ while(oldlogger->has_next_record()) {
+ bool retval = oldlogger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ if (LOG_MODIFY_HEADER == log_type)
+ continue;
+
+ if ((token & mask) == value)
+ continue;
+
+ newlogger->append_record(log_type, token, &oldchunk, &newchunk);
+ }
+
+ return true;
+}
+
+PhraseIndexLogger * mask_out_phrase_index_logger
+(PhraseIndexLogger * oldlogger, phrase_token_t mask,
+ phrase_token_t value) {
+ PhraseIndexLogger * newlogger = new PhraseIndexLogger;
+ guint32 old_total_freq = 0, new_total_freq = 0;
+
+ /* peek the header value. */
+ if (!_peek_header(oldlogger, old_total_freq))
+ return newlogger;
+
+ new_total_freq = old_total_freq;
+
+ /* compute the new header based on add/modify/remove records. */
+ oldlogger->rewind();
+ if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
+ return newlogger;
+
+ /* write out the modify header record. */
+ _write_header(newlogger, old_total_freq, new_total_freq);
+
+ /* mask out the matched records. */
+ oldlogger->rewind();
+ _mask_out_records(oldlogger, mask, value, newlogger);
+
+ return newlogger;
+}
+
+};