if ( !sub_phrases ){
sub_phrases = new SubPhraseIndex;
}
-
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
bool retval = sub_phrases->load(chunk, 0, chunk->size());
if ( !retval )
return retval;
if ( !sub_phrases )
return false;
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
PhraseIndexLogger logger;
logger.load(log);
- return sub_phrases->merge(&logger);
+ bool retval = sub_phrases->merge(&logger);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+ return retval;
}
bool SubPhraseIndex::load(MemoryChunk * chunk,
}
bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+ /* diff the header */
+ MemoryChunk oldheader, newheader;
+ guint32 total_freq = oldone->get_phrase_index_total_freq();
+ oldheader.set_content(0, &total_freq, sizeof(guint32));
+ total_freq = get_phrase_index_total_freq();
+ newheader.set_content(0, &total_freq, sizeof(guint32));
+ logger->append_record(LOG_MODIFY_HEADER, null_token,
+ &oldheader, &newheader);
+
+ /* diff phrase items */
PhraseIndexRange oldrange, currange, range;
oldone->get_range(oldrange); get_range(currange);
range.m_range_begin = std_lite::min(oldrange.m_range_begin,
}
break;
}
+ case LOG_MODIFY_HEADER:{
+ guint32 total_freq = get_phrase_index_total_freq();
+ guint32 tmp_freq = 0;
+ assert(null_token == token);
+ assert(oldchunk.size() == newchunk.size());
+ oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ if (total_freq != tmp_freq)
+ return false;
+ newchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ m_total_freq = tmp_freq;
+ break;
+ }
default:
assert(false);
}
}
+ return true;
}
bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
* File Format
* Logger Record type: add/remove/modify
*
+ * Modify Header: header/null token/len/old data chunk/new data chunk (one len; old and new chunks have the same size)
+ *
* Add Record: add/token/len/data chunk
* Remove Record: remove/token/len/data chunk
* Modify Record: modify/token/old len/new len/old data chunk/new data chunk
enum LOG_TYPE{
LOG_ADD_RECORD = 1,
- LOG_REMOVE_RECORD = 2,
- LOG_MODIFY_RECORD = 3
+ LOG_REMOVE_RECORD, /* implicitly 2: follows LOG_ADD_RECORD = 1 */
+ LOG_MODIFY_RECORD, /* implicitly 3 */
+ LOG_MODIFY_HEADER /* implicitly 4: record carrying the old and new header data, logged with the null token */
};
class PhraseIndexLogger{
offset += newlen;
break;
}
+ case LOG_MODIFY_HEADER:{
+ assert(token == null_token);
+ size_t len = 0;
+ m_chunk->get_content(offset, &len, sizeof(size_t));
+ offset += sizeof(size_t);
+ oldone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ len);
+ offset += len;
+ newone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ len);
+ offset += len;
+ break;
+ }
default:
assert(false);
}
chunk.set_content(offset, &newlen, sizeof(size_t));
offset += sizeof(size_t);
chunk.set_content(offset, oldone->begin(), oldone->size());
- offset += oldone->size();
+ offset += oldlen;
chunk.set_content(offset, newone->begin(), newone->size());
- offset += newone->size();
+ offset += newlen;
+ break;
+ }
+ case LOG_MODIFY_HEADER:{
+ assert(NULL != oldone);
+ assert(NULL != newone);
+ assert(null_token == token);
+ size_t oldlen = oldone->size();
+ size_t newlen = newone->size();
+ assert(oldlen == newlen);
+ chunk.set_content(offset, &oldlen, sizeof(size_t));
+ offset += sizeof(size_t);
+ chunk.set_content(offset, oldone->begin(), oldone->size());
+ offset += oldlen;
+ chunk.set_content(offset, newone->begin(), newone->size());
+ offset += newlen;
break;
}
default:
#include "pinyin.h"
+/* TODO: check whether gb_char.bin and gb_char2.bin should be the same. */
+
int main(int argc, char * argv[]){
-
+ FacadePhraseIndex phrase_index;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+
+ PhraseIndexRange range;
+ phrase_index.get_range(1, range);
+ for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) {
+ phrase_index.add_unigram_frequency(i, 1);
+ }
+
+ printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq());
+
+ MemoryChunk * new_chunk = new MemoryChunk;
+ phrase_index.store(1, new_chunk);
+ new_chunk->save("/tmp/gb_char.bin");
+ delete new_chunk;
+
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ new_chunk = new MemoryChunk;
+ phrase_index.diff(1, chunk, new_chunk);
+ new_chunk->save("/tmp/gb_char.dbin");
+ delete new_chunk;
+
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+ new_chunk = new MemoryChunk;
+ new_chunk->load("/tmp/gb_char.dbin");
+ phrase_index.merge(1, new_chunk);
+ chunk = new MemoryChunk;
+ phrase_index.store(1, chunk);
+ chunk->save("/tmp/gb_char2.bin");
+ delete chunk;
+
+ printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq());
+
return 0;
}