Allow libpinyin to build in cross compile mode.
[platform/upstream/libpinyin.git] / src / storage / phrase_index.cpp
index 59b166f..5fe61c2 100644 (file)
  *  
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 
 #include "phrase_index.h"
+#include "pinyin_custom2.h"
 
 using namespace pinyin;
 
@@ -28,60 +29,108 @@ bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
     return true;
 }
 
-bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
+bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
+                                       guint32 & freq){
     guint8 phrase_length = get_phrase_length();
-    table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
-    bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
+    table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
+
+    bool retval = m_chunk.get_content
+        (offset, keys, phrase_length * sizeof(ChewingKey));
     if ( !retval )
        return retval;
-    return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
+    return m_chunk.get_content
+        (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
 }
 
-void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
+#if 0
+void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
     guint8 phrase_length = get_phrase_length();
     set_n_pronunciation(get_n_pronunciation() + 1);
-    m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
+    m_chunk.set_content(m_chunk.size(), keys,
+                        phrase_length * sizeof(ChewingKey));
     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
 }
+#endif
+
+/* Add delta to the frequency of the stored pronunciation that matches
+ * keys exactly; when no stored pronunciation matches, append keys as a
+ * new pronunciation whose frequency is delta.
+ *
+ * keys points to get_phrase_length() chewing keys.
+ *
+ * Returns true on success.  Returns false, leaving the item untouched,
+ * when adding delta would overflow the guint32 running frequency sum or
+ * when the guint8 pronunciation counter is already at its maximum.
+ */
+bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
+    guint8 phrase_length = get_phrase_length();
+    guint8 npron = get_n_pronunciation();
+    /* one record is phrase_length keys followed by a guint32 frequency. */
+    size_t keys_size = phrase_length * sizeof(ChewingKey);
+    size_t record_size = keys_size + sizeof(guint32);
+    size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
+    char * buf_begin = (char *) m_chunk.begin();
+    guint32 total_freq = 0;
+
+    for (int i = 0; i < npron; ++i) {
+        char * chewing_begin = buf_begin + offset + i * record_size;
+        guint32 * freq = (guint32 *)(chewing_begin + keys_size);
+
+        total_freq += *freq;
+
+        if (0 == pinyin_exact_compare2
+            (keys, (ChewingKey *)chewing_begin, phrase_length)) {
+            /* found the exact match pinyin keys. */
+
+            /* protect against total_freq overflow; total_freq is the sum
+             * up to and including the matched record here. */
+            if (delta > 0 && total_freq > total_freq + delta)
+                return false;
+
+            *freq += delta;
+            return true;
+        }
+    }
+
+    /* no match: a new pronunciation record has to be appended. */
+
+    /* the counter is a guint8, npron + 1 would wrap around to zero. */
+    if (G_MAXUINT8 == npron)
+        return false;
+
+    /* total_freq now covers every stored pronunciation. */
+    if (delta > 0 && total_freq > total_freq + delta)
+        return false;
+
+    set_n_pronunciation(npron + 1);
+    m_chunk.set_content(m_chunk.size(), keys, keys_size);
+    m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32));
+    return true;
+}
 
 void PhraseItem::remove_nth_pronunciation(size_t index){
     guint8 phrase_length = get_phrase_length();
     set_n_pronunciation(get_n_pronunciation() - 1);
-    size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
-    m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
+    size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t) +
+        index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+    m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
 }
 
-bool PhraseItem::get_phrase_string(utf16_t * phrase){
+bool PhraseItem::get_phrase_string(ucs4_t * phrase){
     guint8 phrase_length = get_phrase_length();
-    return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
+    return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
 }
 
-bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
+bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
     m_chunk.set_content(0, &phrase_length, sizeof(guint8));
-    m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
+    m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
     return true;
 }
 
-void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
-                                            PinyinKey * pinyin_keys,
-                                            gint32 delta){
+void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
+                                                    ChewingKey * keys,
+                                                    gint32 delta){
     guint8 phrase_length = get_phrase_length();
     guint8 npron = get_n_pronunciation();
-    size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
+    size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
     char * buf_begin = (char *) m_chunk.begin();
     guint32 total_freq = 0;
-    for ( int i = 0 ; i < npron ; ++i){
-       char * pinyin_begin = buf_begin + offset +
-           i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
-       guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
+
+    for (int i = 0; i < npron; ++i) {
+       char * chewing_begin = buf_begin + offset +
+           i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+       guint32 * freq = (guint32 *)(chewing_begin +
+                                     phrase_length * sizeof(ChewingKey));
        total_freq += *freq;
-       if ( 0 == pinyin_compare_with_ambiguities(custom,
-                                                 (PinyinKey *)pinyin_begin,
-                                                 pinyin_keys,
-                                                 phrase_length)){
-           //protect against total_freq overflow.
-           if ( delta > 0 && total_freq > total_freq + delta )
+
+       if (0 == pinyin_compare_with_ambiguities2
+            (options, keys,
+             (ChewingKey *)chewing_begin, phrase_length)) {
+
+           /* protect against total_freq overflow. */
+           if (delta > 0 && total_freq > total_freq + delta)
                return;
+
            *freq += delta;
            total_freq += delta;
        }
@@ -104,13 +153,13 @@ int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
        return ERROR_OUT_OF_RANGE;
 
     if ( 0 == offset )
-    return ERROR_NO_ITEM;
+        return ERROR_NO_ITEM;
 
     result = m_phrase_content.get_content
        (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
 
     if ( !result )
-    return ERROR_FILE_CORRUPTION;
+        return ERROR_FILE_CORRUPTION;
 
     //protect total_freq overflow
     if ( delta > 0 && m_total_freq > m_total_freq + delta )
@@ -136,17 +185,17 @@ int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
        return ERROR_OUT_OF_RANGE;
 
     if ( 0 == offset )
-    return ERROR_NO_ITEM;
+        return ERROR_NO_ITEM;
 
     result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
     if ( !result ) 
-    return ERROR_FILE_CORRUPTION;
+        return ERROR_FILE_CORRUPTION;
     
     result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
     if ( !result ) 
        return ERROR_FILE_CORRUPTION;
 
-    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+    size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
     return ERROR_OK;
 }
@@ -167,7 +216,7 @@ int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item
 
     int result = get_phrase_item(token, old_item);
     if (result != ERROR_OK)
-    return result;
+        return result;
 
     item = new PhraseItem;
     //implictly copy data from m_chunk_content.
@@ -185,7 +234,8 @@ bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
     if ( !sub_phrases ){
        sub_phrases = new SubPhraseIndex;
     }
-    
+
+    m_total_freq -= sub_phrases->get_phrase_index_total_freq();
     bool retval = sub_phrases->load(chunk, 0, chunk->size());
     if ( !retval )
        return retval;
@@ -233,12 +283,50 @@ bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
     if ( !sub_phrases )
         return false;
 
+    m_total_freq -= sub_phrases->get_phrase_index_total_freq();
     PhraseIndexLogger logger;
     logger.load(log);
 
-    return sub_phrases->merge(&logger);
+    bool retval = sub_phrases->merge(&logger);
+    m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+    return retval;
 }
 
+/* Merge the records of log into the sub phrase index phrase_index,
+ * after filtering the log through mask_out_phrase_index_logger with
+ * mask and value.
+ *
+ * Returns false when the sub phrase index does not exist or when the
+ * library index bits of mask/value do not select phrase_index;
+ * otherwise returns the result of SubPhraseIndex::merge.
+ */
+bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index,
+                                        MemoryChunk * log,
+                                        phrase_token_t mask,
+                                        phrase_token_t value){
+    SubPhraseIndex * target = m_sub_phrase_indices[phrase_index];
+    if (NULL == target)
+        return false;
+
+    /* the library index part of mask/value must select this index. */
+    const phrase_token_t library_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
+    const phrase_token_t library_value = PHRASE_INDEX_LIBRARY_INDEX(value);
+    if (library_value != (phrase_index & library_mask))
+        return false;
+
+    /* only the PHRASE_MASK bits are passed on to the logger filter. */
+    const phrase_token_t token_mask = mask & PHRASE_MASK;
+    const phrase_token_t token_value = value & PHRASE_MASK;
+
+    /* build the filtered logger from the raw log. */
+    PhraseIndexLogger full_log;
+    full_log.load(log);
+    PhraseIndexLogger * filtered_log = mask_out_phrase_index_logger
+        (&full_log, token_mask, token_value);
+
+    /* take the old total out of the facade total, merge, then put the
+     * updated total back in. */
+    m_total_freq -= target->get_phrase_index_total_freq();
+    const bool merged = target->merge(filtered_log);
+    m_total_freq += target->get_phrase_index_total_freq();
+
+    delete filtered_log;
+    return merged;
+}
+
+
 bool SubPhraseIndex::load(MemoryChunk * chunk, 
                          table_offset_t offset, table_offset_t end){
     //save the memory chunk
@@ -264,7 +352,7 @@ bool SubPhraseIndex::load(MemoryChunk * chunk,
     m_phrase_index.set_chunk(buf_begin + index_one, 
                             index_two - 1 - index_one, NULL);
     m_phrase_content.set_chunk(buf_begin + index_two, 
-                                index_three - 1 - index_two, NULL);
+                               index_three - 1 - index_two, NULL);
     g_return_val_if_fail( index_three <= end, FALSE);
     return true;
 }
@@ -296,6 +384,135 @@ bool SubPhraseIndex::store(MemoryChunk * new_chunk,
     return true;
 }
 
+/* Append to logger the records that describe how this sub phrase index
+ * differs from oldone: one LOG_MODIFY_HEADER record holding both total
+ * frequencies, then one add/remove/modify record for every token whose
+ * phrase item differs between the two indices.  Always returns true.
+ */
+bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+    /* header record: old and new total frequency. */
+    MemoryChunk oldheader, newheader;
+    guint32 old_total = oldone->get_phrase_index_total_freq();
+    guint32 new_total = get_phrase_index_total_freq();
+    oldheader.set_content(0, &old_total, sizeof(guint32));
+    newheader.set_content(0, &new_total, sizeof(guint32));
+    logger->append_record(LOG_MODIFY_HEADER, null_token,
+                          &oldheader, &newheader);
+
+    /* walk the union of both token ranges. */
+    PhraseIndexRange oldrange, currange, range;
+    oldone->get_range(oldrange);
+    get_range(currange);
+    range.m_range_begin = std_lite::min(oldrange.m_range_begin,
+                                        currange.m_range_begin);
+    range.m_range_end = std_lite::max(oldrange.m_range_end,
+                                      currange.m_range_end);
+
+    PhraseItem olditem, newitem;
+    for (phrase_token_t token = range.m_range_begin;
+         token < range.m_range_end; ++token) {
+        const bool had_old =
+            ERROR_OK == oldone->get_phrase_item(token, olditem);
+        const bool has_new =
+            ERROR_OK == get_phrase_item(token, newitem);
+
+        /* present in neither index: nothing to record. */
+        if (!had_old && !has_new)
+            continue;
+
+        /* only in this index: the item was added. */
+        if (!had_old) {
+            logger->append_record(LOG_ADD_RECORD, token,
+                                  NULL, &(newitem.m_chunk));
+            continue;
+        }
+
+        /* only in oldone: the item was removed. */
+        if (!has_new) {
+            logger->append_record(LOG_REMOVE_RECORD, token,
+                                  &(olditem.m_chunk), NULL);
+            continue;
+        }
+
+        /* present in both: record it only when the content differs. */
+        if (olditem == newitem)
+            continue;
+        logger->append_record(LOG_MODIFY_RECORD, token,
+                              &(olditem.m_chunk), &(newitem.m_chunk));
+    }
+
+    return true;
+}
+
+/* Replay the records of logger onto this sub phrase index.
+ *
+ * The old content carried by remove, modify and header records is
+ * checked against the current state of the index.
+ *
+ * Returns true when every record was applied.  Returns false as soon as
+ * a record does not match the current content, refers to a phrase item
+ * that can not be fetched or removed, or has an unknown type.  Nothing
+ * is rolled back on failure.
+ */
+bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
+    LOG_TYPE log_type; phrase_token_t token;
+    MemoryChunk oldchunk, newchunk;
+    PhraseItem olditem, newitem, item, * tmpitem;
+
+    while(logger->has_next_record()){
+        bool retval = logger->next_record
+            (log_type, token, &oldchunk, &newchunk);
+
+        if (!retval)
+            break;
+
+        switch(log_type){
+        case LOG_ADD_RECORD:{
+            assert( 0 == oldchunk.size() );
+            newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+                                      NULL);
+            add_phrase_item(token, &newitem);
+            break;
+        }
+        case LOG_REMOVE_RECORD:{
+            assert( 0 == newchunk.size() );
+            tmpitem = NULL;
+            remove_phrase_item(token, tmpitem);
+
+            /* remove_phrase_item leaves tmpitem NULL when the token has
+             * no phrase item; there is nothing to compare then. */
+            if (NULL == tmpitem)
+                return false;
+
+            olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+                                      NULL);
+
+            if (olditem != *tmpitem) {
+                delete tmpitem;
+                return false;
+            }
+
+            delete tmpitem;
+
+            break;
+        }
+        case LOG_MODIFY_RECORD:{
+            /* on failure item would still hold the content of an
+             * earlier record, so do not compare against it. */
+            if (ERROR_OK != get_phrase_item(token, item))
+                return false;
+
+            olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+                                      NULL);
+            newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+                                      NULL);
+            if (item != olditem)
+                return false;
+
+            if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
+                tmpitem = NULL;
+                remove_phrase_item(token, tmpitem);
+                if (NULL == tmpitem)
+                    return false;
+                assert(olditem == *tmpitem);
+                add_phrase_item(token, &newitem);
+                delete tmpitem;
+            } else { /* in place editing. */
+                /* newchunk.size() <= item.m_chunk.size() */
+                /* Hack here: we assume the behaviour of get_phrase_item
+                 * point to the actual data position, so changes to item
+                 * will be saved in SubPhraseIndex immediately.
+                 */
+                memmove(item.m_chunk.begin(), newchunk.begin(),
+                        newchunk.size());
+            }
+            break;
+        }
+        case LOG_MODIFY_HEADER:{
+            guint32 total_freq = get_phrase_index_total_freq();
+            guint32 tmp_freq = 0;
+            assert(null_token == token);
+            assert(oldchunk.size() == newchunk.size());
+            oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
+            if (total_freq != tmp_freq)
+                return false;
+            newchunk.get_content(0, &tmp_freq, sizeof(guint32));
+            m_total_freq = tmp_freq;
+            break;
+        }
+        default:
+            /* unknown record type: fail also when asserts are
+             * compiled out. */
+            assert(false);
+            return false;
+        }
+    }
+    return true;
+}
+
 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
     SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
     if ( !sub_phrases ){
@@ -306,25 +523,29 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
     char phrase[256];
     phrase_token_t token;
     size_t freq;
+
     PhraseItem * item_ptr = new PhraseItem;
     phrase_token_t cur_token = 0;
-    while ( !feof(infile)){
-        fscanf(infile, "%s", pinyin);
-        fscanf(infile, "%s", phrase);
-        fscanf(infile, "%u", &token);
-       fscanf(infile, "%ld", &freq);
-       if ( feof(infile) )
+
+    while (!feof(infile)){
+        int num = fscanf(infile, "%s %s %u %ld",
+                         pinyin, phrase, &token, &freq);
+
+        if (4 != num)
+            continue;
+
+       if (feof(infile))
            break;
 
         assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
 
        glong written;
-       utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL, 
-                                              &written, NULL);
+       ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL, 
+                                              &written, NULL);
        
        if ( 0 == cur_token ){
            cur_token = token;
-           item_ptr->set_phrase_string(written, phrase_utf16);
+           item_ptr->set_phrase_string(written, phrase_ucs4);
        }
 
        if ( cur_token != token ){
@@ -332,32 +553,49 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
            delete item_ptr;
            item_ptr = new PhraseItem;
            cur_token = token;
-           item_ptr->set_phrase_string(written, phrase_utf16);
+           item_ptr->set_phrase_string(written, phrase_ucs4);
        }
 
-       PinyinDefaultParser parser;
-       NullPinyinValidator validator;
-       PinyinKeyVector keys;
-       PinyinKeyPosVector poses;
-       
-       keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
-       poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
-       parser.parse(validator, keys, poses, pinyin);
+        pinyin_option_t options = USE_TONE;
+       FullPinyinParser2 parser;
+       ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+       ChewingKeyRestVector key_rests =
+            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+       parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
        
-       assert ( item_ptr->get_phrase_length() == keys->len );
-       item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
+       if (item_ptr->get_phrase_length() == keys->len) {
+            item_ptr->add_pronunciation((ChewingKey *)keys->data, freq);
+        } else {
+            fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
+                    pinyin, phrase);
+        }
 
        g_array_free(keys, TRUE);
-       g_array_free(poses, TRUE);
-       g_free(phrase_utf16);
+       g_array_free(key_rests, TRUE);
+       g_free(phrase_ucs4);
     }
 
     add_phrase_item( cur_token, item_ptr);
     delete item_ptr;
+#if 0
     m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
+#endif
     return true;
 }
 
+/* Report the smallest and the largest slot of m_sub_phrase_indices that
+ * holds a sub phrase index.  When no slot is populated min_index is
+ * PHRASE_INDEX_LIBRARY_COUNT and max_index is 0.
+ * Always returns ERROR_OK.
+ */
+int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
+                                            guint8 & max_index){
+    min_index = PHRASE_INDEX_LIBRARY_COUNT;
+    max_index = 0;
+
+    for (guint8 slot = 0; slot < PHRASE_INDEX_LIBRARY_COUNT; ++slot) {
+        if (!m_sub_phrase_indices[slot])
+            continue;
+
+        if (slot < min_index)
+            min_index = slot;
+        if (slot > max_index)
+            max_index = slot;
+    }
+
+    return ERROR_OK;
+}
+
 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
     SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
     if ( !sub_phrase )
@@ -376,25 +614,38 @@ int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
     const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
     const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
 
+    if (begin == end) {
+        /* skip empty sub phrase index. */
+        range.m_range_begin = 1;
+        range.m_range_end = 1;
+        return ERROR_OK;
+    }
+
+    /* remove trailing zeros. */
+    const table_offset_t * poffset = 0;
+    for (poffset = end - 1; poffset >= begin + 1; --poffset) {
+        if (0 !=  *poffset)
+            break;
+    }
+
     range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
-    range.m_range_end = end - begin;
+    range.m_range_end = poffset + 1 - begin; /* removed zeros. */
 
     return ERROR_OK;
 }
 
-bool FacadePhraseIndex::compat(){
+bool FacadePhraseIndex::compact(){
     for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
         SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
         if ( !sub_phrase )
             continue;
 
-        SubPhraseIndex * new_sub_phrase =  new SubPhraseIndex;
         PhraseIndexRange range;
         int result = sub_phrase->get_range(range);
-        if ( result != ERROR_OK ) {
-            delete new_sub_phrase;
+        if ( result != ERROR_OK )
             continue;
-        }
+
+        SubPhraseIndex * new_sub_phrase =  new SubPhraseIndex;
 
         PhraseItem item;
         for ( phrase_token_t token = range.m_range_begin;
@@ -411,3 +662,199 @@ bool FacadePhraseIndex::compat(){
     }
     return true;
 }
+
+/* Remove every phrase item in the range of this sub phrase index whose
+ * token satisfies (token & mask) == value, with mask and value first
+ * reduced to their PHRASE_MASK bits.
+ * Returns false when the range can not be obtained, true otherwise.
+ */
+bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){
+    PhraseIndexRange range;
+    const int status = get_range(range);
+    if (ERROR_OK != status)
+        return false;
+
+    /* only the PHRASE_MASK bits are compared against the tokens. */
+    mask &= PHRASE_MASK;
+    value &= PHRASE_MASK;
+
+    for (phrase_token_t token = range.m_range_begin;
+         token < range.m_range_end; ++token) {
+        if ((token & mask) == value) {
+            PhraseItem * removed = NULL;
+            remove_phrase_item(token, removed);
+            /* deleting a NULL pointer is a no-op. */
+            delete removed;
+        }
+    }
+
+    return true;
+}
+
+/* Run SubPhraseIndex::mask_out(mask, value) on the sub phrase index
+ * phrase_index and keep m_total_freq in step with its total frequency.
+ *
+ * Returns false when the sub phrase index does not exist or when the
+ * library index bits of mask/value do not select phrase_index;
+ * otherwise returns the result of SubPhraseIndex::mask_out.
+ */
+bool FacadePhraseIndex::mask_out(guint8 phrase_index,
+                                 phrase_token_t mask,
+                                 phrase_token_t value){
+    SubPhraseIndex * target = m_sub_phrase_indices[phrase_index];
+    if (NULL == target)
+        return false;
+
+    /* the library index part of mask/value must select this index. */
+    const phrase_token_t library_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
+    const phrase_token_t library_value = PHRASE_INDEX_LIBRARY_INDEX(value);
+    if (library_value != (phrase_index & library_mask))
+        return false;
+
+    /* swap the old sub index total for the new one around the call. */
+    m_total_freq -= target->get_phrase_index_total_freq();
+    const bool masked = target->mask_out(mask, value);
+    m_total_freq += target->get_phrase_index_total_freq();
+
+    return masked;
+}
+
+namespace pinyin{
+
+
+/* Scan the remaining records of logger for LOG_MODIFY_HEADER records
+ * and store the old total frequency of the last one in old_total_freq
+ * (0 when there is none).
+ * Returns true when exactly one header record was seen.
+ */
+static bool _peek_header(PhraseIndexLogger * logger,
+                         guint32 & old_total_freq){
+    LOG_TYPE kind; phrase_token_t tok;
+    MemoryChunk before, after;
+    size_t headers_seen = 0;
+
+    old_total_freq = 0;
+
+    /* stop at the end of the log or at the first unreadable record. */
+    while (logger->has_next_record() &&
+           logger->next_record(kind, tok, &before, &after)) {
+        if (LOG_MODIFY_HEADER == kind) {
+            ++headers_seen;
+            before.get_content(0, &old_total_freq, sizeof(guint32));
+        }
+    }
+
+    /* 1 for normal case, 0 for corrupted file. */
+    assert(1 >= headers_seen);
+
+    return 1 == headers_seen;
+}
+
+/* Adjust new_total_freq by the unigram frequencies of the remaining
+ * records of logger: frequencies of new items are added, frequencies of
+ * old items are subtracted.  Header records and records whose token
+ * satisfies (token & mask) == value are skipped.
+ * Always returns true.
+ */
+bool _compute_new_header(PhraseIndexLogger * logger,
+                         phrase_token_t mask,
+                         phrase_token_t value,
+                         guint32 & new_total_freq) {
+
+    LOG_TYPE kind; phrase_token_t tok;
+    MemoryChunk before, after;
+    PhraseItem olditem, newitem;
+
+    while (logger->has_next_record() &&
+           logger->next_record(kind, tok, &before, &after)) {
+
+        /* the header is rewritten separately. */
+        if (LOG_MODIFY_HEADER == kind)
+            continue;
+
+        /* masked out records do not contribute. */
+        if ((tok & mask) == value)
+            continue;
+
+        const bool drops_old =
+            LOG_REMOVE_RECORD == kind || LOG_MODIFY_RECORD == kind;
+        const bool brings_new =
+            LOG_ADD_RECORD == kind || LOG_MODIFY_RECORD == kind;
+
+        /* any other record type is unexpected here. */
+        if (!drops_old && !brings_new)
+            assert(false);
+
+        if (LOG_ADD_RECORD == kind)
+            assert( 0 == before.size() );
+        if (LOG_REMOVE_RECORD == kind)
+            assert( 0 == after.size() );
+
+        if (drops_old) {
+            olditem.m_chunk.set_chunk(before.begin(), before.size(),
+                                      NULL);
+            new_total_freq -= olditem.get_unigram_frequency();
+        }
+
+        if (brings_new) {
+            newitem.m_chunk.set_chunk(after.begin(), after.size(),
+                                      NULL);
+            new_total_freq += newitem.get_unigram_frequency();
+        }
+    }
+
+    return true;
+}
+
+/* Append one LOG_MODIFY_HEADER record to logger that carries
+ * old_total_freq as old content and new_total_freq as new content.
+ * Always returns true.
+ */
+static bool _write_header(PhraseIndexLogger * logger,
+                          guint32 & old_total_freq,
+                          guint32 & new_total_freq) {
+    MemoryChunk before;
+    before.set_content(0, &old_total_freq, sizeof(guint32));
+
+    MemoryChunk after;
+    after.set_content(0, &new_total_freq, sizeof(guint32));
+
+    logger->append_record(LOG_MODIFY_HEADER, null_token, &before, &after);
+    return true;
+}
+
+/* Copy the remaining records of oldlogger to newlogger, leaving out
+ * header records and records whose token satisfies
+ * (token & mask) == value.  Always returns true.
+ */
+static bool _mask_out_records(PhraseIndexLogger * oldlogger,
+                              phrase_token_t mask,
+                              phrase_token_t value,
+                              PhraseIndexLogger * newlogger) {
+    LOG_TYPE kind; phrase_token_t tok;
+    MemoryChunk before, after;
+
+    /* stop at the end of the log or at the first unreadable record. */
+    while (oldlogger->has_next_record() &&
+           oldlogger->next_record(kind, tok, &before, &after)) {
+        const bool is_header = LOG_MODIFY_HEADER == kind;
+        if (is_header || (tok & mask) == value)
+            continue;
+
+        newlogger->append_record(kind, tok, &before, &after);
+    }
+
+    return true;
+}
+
+/* Build a new logger from oldlogger without the records whose token
+ * satisfies (token & mask) == value, and with a header record whose new
+ * total frequency is recomputed from the records that are kept.
+ *
+ * The returned logger is allocated with new and has to be deleted by
+ * the caller.  It is returned empty when oldlogger does not contain
+ * exactly one header record.
+ */
+PhraseIndexLogger * mask_out_phrase_index_logger
+(PhraseIndexLogger * oldlogger, phrase_token_t mask,
+ phrase_token_t value) {
+    PhraseIndexLogger * result = new PhraseIndexLogger;
+    guint32 header_old_freq = 0;
+
+    /* peek the header value. */
+    if (_peek_header(oldlogger, header_old_freq)) {
+        guint32 header_new_freq = header_old_freq;
+
+        /* compute the new header based on add/modify/remove records. */
+        oldlogger->rewind();
+        if (_compute_new_header(oldlogger, mask, value, header_new_freq)) {
+            /* write out the modify header record. */
+            _write_header(result, header_old_freq, header_new_freq);
+
+            /* mask out the matched records. */
+            oldlogger->rewind();
+            _mask_out_records(oldlogger, mask, value, result);
+        }
+    }
+
+    return result;
+}
+
+};