4 #include "memory_chunk.h"
5 #include "novel_types.h"
6 #include "phrase_index.h"
9 /* export interpolation model as textual format */
/* Forward declarations of the two section writers; both are defined further
 * down and are called from main() with stdout as the output stream. */
11 void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
12 void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
14 /* consider moving the following function to utils/storage/utility.h */
/* Converts a phrase token into a text string.  gen_bigram() releases the
 * returned strings with g_free(). */
15 char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token);
/* Writes the literal line "\data" followed by a newline to file.
 * The result of fprintf() is not checked. */
17 void begin_data(FILE * file){
18 fprintf(file, "\\data\n");
/* Writes the literal line "\end" followed by a newline to file.
 * The result of fprintf() is not checked. */
21 void end_data(FILE * file){
22 fprintf(file, "\\end\n");
/*
 * Program entry point.  In the lines visible here it loads two phrase
 * libraries from binary files, attaches the bigram database and then writes
 * the unigram and bigram sections to stdout.  argc and argv are not
 * referenced in the visible lines.
 */
25 int main(int argc, char * argv[]){
/* Relative path of the bigram database passed to bigram.attach() below. */
27 const char * bigram_filename = "../../data/bigram.db";
29 FacadePhraseIndex phrase_index;
/* Load gb_char.bin into a fresh MemoryChunk and hand it to phrase_index
 * with index 1 (presumably the phrase library slot -- TODO confirm against
 * FacadePhraseIndex).
 * NOTE(review): the results of chunk->load() and phrase_index.load() are
 * not checked in the visible lines. */
32 MemoryChunk * chunk = new MemoryChunk;
33 chunk->load("../../data/gb_char.bin");
34 phrase_index.load(1, chunk);
36 //gbk_char binary file
/* chunk is re-pointed at a new MemoryChunk without deleting the previous
 * one; presumably phrase_index.load() took ownership of it -- TODO confirm. */
37 chunk = new MemoryChunk;
38 chunk->load("../../data/gbk_char.bin");
39 phrase_index.load(2, chunk);
/* The second argument is NULL; presumably that means no user database,
 * which matches the assert on user_items in gen_bigram() -- TODO confirm.
 * NOTE(review): the result of attach() is not checked here. */
42 bigram.attach(bigram_filename, NULL);
/* Emit the 1-gram section, then the 2-gram section, on stdout. */
46 gen_unigram(stdout, &phrase_index);
47 gen_bigram(stdout, &phrase_index, &bigram);
/*
 * Writes the "\1-gram" section to output: one "\item <phrase> count <freq>"
 * line per phrase item found while scanning every phrase library.
 *
 * output:       stream that receives the text.
 * phrase_index: index used to look up each token and to render it as text.
 */
53 void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
54 fprintf(output, "\\1-gram\n");
/* Visit every library slot from 0 up to PHRASE_INDEX_LIBRARY_COUNT - 1. */
55 for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
56 /* Generate each phrase index library */
/* Token range of library i, built from token_min and token_max. */
57 const phrase_token_t min = PHRASE_INDEX_MAKE_TOKEN(i, token_min);
58 const phrase_token_t max = PHRASE_INDEX_MAKE_TOKEN(i, token_max);
/* Scans tokens from min up to, but not including, max. */
61 for ( size_t j = min; j < max; j++) {
62 int result = phrase_index->get_phrase_item(j, item);
/* Three lookup outcomes are distinguished before the assert; the statements
 * executed for the first two conditions are not visible in this excerpt. */
63 if ( result == ERROR_NO_SUB_PHRASE_INDEX ||
64 result == ERROR_OUT_OF_RANGE)
66 if ( result == ERROR_NO_ITEM )
/* Any other outcome than ERROR_OK is treated as a programming error. */
68 assert( result == ERROR_OK);
70 size_t freq = item.get_unigram_frequency();
71 char * phrase = token_to_string(phrase_index, j);
/* NOTE(review): freq is a size_t but is printed with %d; %zu is the
 * conversion that matches size_t.  The release of phrase is not visible in
 * this excerpt -- verify it is freed with g_free(). */
73 fprintf(output, "\\item %s count %d\n", phrase, freq);
/*
 * Writes the "\2-gram" section to output: for every token returned in the
 * system item list, one "\item <word1> <word2> count <freq>" line per entry
 * retrieved from that token's system SingleGram.
 *
 * output:       stream that receives the text.
 * phrase_index: index used to render tokens as text.
 * bigram:       bigram store that supplies the item list and the per-token
 *               SingleGram objects.
 */
80 void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
81 fprintf(output, "\\2-gram\n");
83 /* Retrieve all user items. */
/* Two token arrays are created and passed to get_all_items(); the user
 * array is asserted to be empty and freed straight away. */
84 GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
85 GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
87 bigram->get_all_items(system_items, user_items);
88 assert(0 == user_items->len);
89 g_array_free(user_items, TRUE);
/* NOTE(review): buffer is not used in the visible lines of this function. */
92 utf16_t buffer[MAX_PHRASE_LENGTH];
/* NOTE(review): i is a signed int while GArray's len is unsigned, so this
 * is a signed/unsigned comparison. */
94 for(int i = 0; i < system_items->len; i++){
95 phrase_token_t token = g_array_index(system_items, phrase_token_t, i);
/* system is dereferenced after load(), so load() presumably receives the
 * pointers by reference and fills them in -- TODO confirm.
 * NOTE(review): system is not checked for NULL before use in the visible
 * lines, and the release of system/user is not visible either. */
96 SingleGram * system = NULL, * user = NULL;
97 bigram->load(token, system, user);
/* Collect every (token, count) entry of the system SingleGram. */
100 BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
101 system->retrieve_all(array);
102 for(int j = 0; j < array->len; j++) {
103 BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
/* word1 is rendered again on every inner iteration although token does not
 * change inside this loop. */
105 char * word1 = token_to_string(phrase_index, token);
106 char * word2 = token_to_string(phrase_index, item->m_token);
107 guint32 freq = item->m_count;
/* NOTE(review): freq is an unsigned guint32 printed with %d; %u matches
 * the type. */
110 fprintf(output, "\\item %s %s count %d\n", word1, word2, freq);
/* Both strings come from token_to_string() and are released here. */
112 g_free(word1); g_free(word2);
/* Free the per-token entry array, including its element storage. */
115 g_array_free(array, TRUE);
/* Free the list of system tokens, including its element storage. */
118 g_array_free(system_items, TRUE);
/*
 * Looks token up in a static table of special tokens; the visible entry
 * pairs sentence_start with the text "<start>".  The return statements are
 * not visible in this excerpt.
 */
121 static const char * special_token_to_string(phrase_token_t token){
/* Presumably a member of the local token_pair struct used below; the
 * struct header is not visible in this excerpt. */
123 phrase_token_t token;
127 static const token_pair tokens [] = {
128 {sentence_start, "<start>"},
/* Walk the table until an entry whose token is zero, so the table
 * presumably ends with a zero-token terminator entry (not visible here). */
132 const token_pair * pair = tokens;
133 while (pair->token) {
134 if ( token == pair->token )
/*
 * Renders token as a text string.
 *
 * Tokens whose library index is 0 are treated as special tokens and yield a
 * g_strdup() copy of the special_token_to_string() result.  Other tokens
 * are looked up in phrase_index and their UTF-16 phrase is converted to
 * UTF-8 with g_utf16_to_utf8().  Both paths produce GLib-allocated memory,
 * so the caller is expected to release the result with g_free().
 *
 * The declarations of item and phrase, the statement executed when the
 * lookup fails, and the final return are not visible in this excerpt.
 */
141 char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token) {
/* Scratch space that receives the UTF-16 phrase text. */
143 utf16_t buffer[MAX_PHRASE_LENGTH];
146 /* deal with the special phrase index, for "<start>..." */
147 if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
148 return g_strdup(special_token_to_string(token));
151 int result = phrase_index->get_phrase_item(token, item);
152 if (result != ERROR_OK)
/* Copy the phrase into buffer and convert exactly length UTF-16 units. */
155 item.get_phrase_string(buffer);
156 guint8 length = item.get_phrase_length();
157 phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);