From: Peng Wu Date: Tue, 14 Sep 2010 03:05:19 +0000 (+0800) Subject: re-factor gen_unigram X-Git-Tag: 0.2.99~264 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ea1c658ff751273429667537d70a4473eb884886;p=platform%2Fupstream%2Flibpinyin.git re-factor gen_unigram --- diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp index d7fb4fd..ce23180 100644 --- a/src/storage/phrase_index.cpp +++ b/src/storage/phrase_index.cpp @@ -346,7 +346,7 @@ int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){ const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin(); const table_offset_t * end = (const table_offset_t *)m_phrase_index.end(); - range.m_range_begin = 0; + range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */ range.m_range_end = end - begin; return ERROR_OK; diff --git a/src/training/gen_unigram.cpp b/src/training/gen_unigram.cpp index ec35fc5..1c70665 100644 --- a/src/training/gen_unigram.cpp +++ b/src/training/gen_unigram.cpp @@ -40,13 +40,20 @@ int main(int argc, char * argv[]){ chunk->load("../../data/gbk_char.bin"); phrase_index.load(2, chunk); - for ( size_t i = 16777217; i <= 16870566; ++i){ - phrase_index.add_unigram_frequency(i, 1); + PhraseIndexRange range; + int result = phrase_index.get_range(1, range); + if ( result == ERROR_OK ) { + for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i){ + phrase_index.add_unigram_frequency(i, 1); + } } #if 0 - for ( size_t i = 33554433; i <= 33570193 ; ++i){ - phrase_index.add_unigram_frequency(i, 1); + int result = phrase_index.get_range(2, range); + if ( result == ERROR_OK ) { + for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i){ + phrase_index.add_unigram_frequency(i, 1); + } } #endif