write test case for phrase index logger
author Peng Wu <alexepico@gmail.com>
Mon, 22 Aug 2011 10:23:12 +0000 (18:23 +0800)
committer Peng Wu <alexepico@gmail.com>
Mon, 22 Aug 2011 10:37:35 +0000 (18:37 +0800)
src/storage/phrase_index.cpp
src/storage/phrase_index_logger.h
tests/storage/test_phrase_index_logger.cpp

index 551716980c6bef9b6c0fde56f5cb2687e8e1c0df..b4339048794ea8812bbd22ec4ea946d5302ade14 100644 (file)
@@ -185,7 +185,8 @@ bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
     if ( !sub_phrases ){
        sub_phrases = new SubPhraseIndex;
     }
-    
+
+    m_total_freq -= sub_phrases->get_phrase_index_total_freq();
     bool retval = sub_phrases->load(chunk, 0, chunk->size());
     if ( !retval )
        return retval;
@@ -233,10 +234,14 @@ bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
     if ( !sub_phrases )
         return false;
 
+    m_total_freq -= sub_phrases->get_phrase_index_total_freq();
     PhraseIndexLogger logger;
     logger.load(log);
 
-    return sub_phrases->merge(&logger);
+    bool retval = sub_phrases->merge(&logger);
+    m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+    return retval;
 }
 
 bool SubPhraseIndex::load(MemoryChunk * chunk, 
@@ -297,6 +302,16 @@ bool SubPhraseIndex::store(MemoryChunk * new_chunk,
 }
 
 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+    /* diff the header */
+    MemoryChunk oldheader, newheader;
+    guint32 total_freq = oldone->get_phrase_index_total_freq();
+    oldheader.set_content(0, &total_freq, sizeof(guint32));
+    total_freq = get_phrase_index_total_freq();
+    newheader.set_content(0, &total_freq, sizeof(guint32));
+    logger->append_record(LOG_MODIFY_HEADER, null_token,
+                          &oldheader, &newheader);
+
+    /* diff phrase items */
     PhraseIndexRange oldrange, currange, range;
     oldone->get_range(oldrange); get_range(currange);
     range.m_range_begin = std_lite::min(oldrange.m_range_begin,
@@ -388,10 +403,23 @@ bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
             }
             break;
         }
+        case LOG_MODIFY_HEADER:{
+            guint32 total_freq = get_phrase_index_total_freq();
+            guint32 tmp_freq = 0;
+            assert(null_token == token);
+            assert(oldchunk.size() == newchunk.size());
+            oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
+            if (total_freq != tmp_freq)
+                return false;
+            newchunk.get_content(0, &tmp_freq, sizeof(guint32));
+            m_total_freq = tmp_freq;
+            break;
+        }
         default:
             assert(false);
         }
     }
+    return true;
 }
 
 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
index 3cff9b89036c5729e8eb75245a17bcd797134923..95f8e8bc395b7ff0348cc482040b41a9bb0172de 100644 (file)
@@ -31,6 +31,8 @@
  *  File Format
  *  Logger Record type: add/remove/modify
  *
+ *  Modify Header: header/null token/len/old data chunk/new data chunk
+ *
  *  Add Record:    add/token/len/data chunk
  *  Remove Record: remove/token/len/data chunk
  *  Modify Record: modify/token/old len/new len/old data chunk/new data chunk
@@ -41,8 +43,9 @@ namespace pinyin{
 
 enum LOG_TYPE{
     LOG_ADD_RECORD = 1,
-    LOG_REMOVE_RECORD = 2,
-    LOG_MODIFY_RECORD = 3
+    LOG_REMOVE_RECORD, /* implicitly 2, same value as before this change */
+    LOG_MODIFY_RECORD, /* implicitly 3, same value as before this change */
+    LOG_MODIFY_HEADER /* implicitly 4; new record type, keep it last so the values above do not shift */
 };
 
 class PhraseIndexLogger{
@@ -127,6 +130,19 @@ public:
             offset += newlen;
             break;
         }
+        case LOG_MODIFY_HEADER:{
+            assert(token == null_token);
+            size_t len = 0;
+            m_chunk->get_content(offset, &len, sizeof(size_t));
+            offset += sizeof(size_t);
+            oldone->set_content(0, ((char *)m_chunk->begin()) + offset,
+                                len);
+            offset += len;
+            newone->set_content(0, ((char *)m_chunk->begin()) + offset,
+                                len);
+            offset += len;
+            break;
+        }
         default:
             assert(false);
         }
@@ -178,9 +194,24 @@ public:
             chunk.set_content(offset, &newlen, sizeof(size_t));
             offset += sizeof(size_t);
             chunk.set_content(offset, oldone->begin(), oldone->size());
-            offset += oldone->size();
+            offset += oldlen;
             chunk.set_content(offset, newone->begin(), newone->size());
-            offset += newone->size();
+            offset += newlen;
+            break;
+        }
+        case LOG_MODIFY_HEADER:{
+            assert(NULL != oldone);
+            assert(NULL != newone);
+            assert(null_token == token);
+            size_t oldlen = oldone->size();
+            size_t newlen = newone->size();
+            assert(oldlen == newlen);
+            chunk.set_content(offset, &oldlen, sizeof(size_t));
+            offset += sizeof(size_t);
+            chunk.set_content(offset, oldone->begin(), oldone->size());
+            offset += oldlen;
+            chunk.set_content(offset, newone->begin(), newone->size());
+            offset += newlen;
             break;
         }
         default:
index 4248db486a37bf4029eddfa1081336c8d6eea9ec..965d2c6f8900315ba817bbe7778202e6df88147c 100644 (file)
 #include "pinyin.h"
 
 
+/* Round-trip test for the phrase index logger: modify sub-index 1, write a
+ * diff log against the original data file, then replay that log on a fresh
+ * load and store the result. Nothing is asserted; results are printed and
+ * written to /tmp for manual comparison.
+ *
+ * TODO: check whether gb_char.bin and gb_char2.bin should be the same. */
+
 int main(int argc, char * argv[]){
-    
+    FacadePhraseIndex phrase_index;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk); /* chunk is not freed here; presumably load() keeps it -- TODO confirm */
+
+    PhraseIndexRange range;
+    phrase_index.get_range(1, range);
+    for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) {
+        phrase_index.add_unigram_frequency(i, 1); /* called with delta 1 for every token in [begin, end) */
+    }
+
+    printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); /* total after the increments */
+
+    MemoryChunk * new_chunk = new MemoryChunk;
+    phrase_index.store(1, new_chunk); /* snapshot of the modified sub-index */
+    new_chunk->save("/tmp/gb_char.bin");
+    delete new_chunk;
+
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin"); /* original data again, used as the "old" side of the diff */
+    new_chunk = new MemoryChunk;
+    phrase_index.diff(1, chunk, new_chunk); /* log goes into new_chunk; chunk is not freed here -- ownership TODO confirm */
+    new_chunk->save("/tmp/gb_char.dbin");
+    delete new_chunk;
+
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk); /* reload sub-index 1 from the original data file */
+    new_chunk = new MemoryChunk;
+    new_chunk->load("/tmp/gb_char.dbin");
+    phrase_index.merge(1, new_chunk); /* replay the saved log; return value ignored, new_chunk not freed here -- ownership TODO confirm */
+    chunk = new MemoryChunk;
+    phrase_index.store(1, chunk);
+    chunk->save("/tmp/gb_char2.bin"); /* result of original + log, for comparison with /tmp/gb_char.bin */
+    delete chunk;
+
+    printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); /* total after the merge; compare with the first print */
+
     return 0;
 }