utils/storage/export_interpolation.cpp

   1 #include <stdio.h>
   2 #include <assert.h>
   3 #include <glib.h>
   4 #include "memory_chunk.h"
   5 #include "novel_types.h"
   6 #include "phrase_index.h"
   7 #include "ngram.h"
   8
/* Export the interpolation model in a textual format. */
  10
  11 void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
  12 void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
  13
  14 /* consider moving the following function to utils/storage/utility.h */
  15 char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token);
  16
  17 void begin_data(FILE * file){
  18     fprintf(file, "\\data\n");
  19 }
  20
  21 void end_data(FILE * file){
  22     fprintf(file, "\\end\n");
  23 }
  24
  25 int main(int argc, char * argv[]){
  26     FILE * file = stdout;
  27     const char * bigram_filename = "../../data/bigram.db";
  28
  29     FacadePhraseIndex phrase_index;
  30
  31     //gb_char binary file
  32     MemoryChunk * chunk = new MemoryChunk;
  33     chunk->load("../../data/gb_char.bin");
  34     phrase_index.load(1, chunk);
  35
  36     //gbk_char binary file
  37     chunk = new MemoryChunk;
  38     chunk->load("../../data/gbk_char.bin");
  39     phrase_index.load(2, chunk);
  40
  41     Bigram bigram;
  42     bigram.attach(bigram_filename, NULL);
  43
  44     begin_data(file);
  45
  46     gen_unigram(stdout, &phrase_index);
  47     gen_bigram(stdout, &phrase_index, &bigram);
  48
  49     end_data(stdout);
  50     return 0;
  51 }
  52
  53 void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
  54     fprintf(output, "\\1-gram\n");
  55     for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
  56         /* Generate each phrase index library */
  57         const phrase_token_t min = PHRASE_INDEX_MAKE_TOKEN(i, token_min);
  58         const phrase_token_t max = PHRASE_INDEX_MAKE_TOKEN(i, token_max);
  59
  60         PhraseItem item;
  61         for ( size_t j = min; j < max; j++) {
  62             int result = phrase_index->get_phrase_item(j, item);
  63             if ( result == ERROR_NO_SUB_PHRASE_INDEX ||
  64                  result == ERROR_OUT_OF_RANGE)
  65                 break;
  66             if ( result == ERROR_NO_ITEM )
  67                 continue;
  68             assert( result == ERROR_OK);
  69
  70             size_t freq = item.get_unigram_frequency();
  71             char * phrase = token_to_string(phrase_index, j);
  72             if ( phrase )
  73                 fprintf(output, "\\item %s count %d\n", phrase, freq);
  74
  75             g_free(phrase);
  76         }
  77     }
  78 }
  79
  80 void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
  81     fprintf(output, "\\2-gram\n");
  82
  83     /* Retrieve all user items. */
  84     GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
  85     GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
  86
  87     bigram->get_all_items(system_items, user_items);
  88     assert(0 == user_items->len);
  89     g_array_free(user_items, TRUE);
  90
  91     PhraseItem item;
  92     utf16_t buffer[MAX_PHRASE_LENGTH];
  93
  94     for(int i = 0; i < system_items->len; i++){
  95         phrase_token_t token = g_array_index(system_items, phrase_token_t, i);
  96         SingleGram * system = NULL, * user = NULL;
  97         bigram->load(token, system, user);
  98         assert(NULL == user);
  99
 100         BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
 101         system->retrieve_all(array);
 102         for(int j = 0; j < array->len; j++) {
 103             BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
 104
 105             char * word1 = token_to_string(phrase_index, token);
 106             char * word2 = token_to_string(phrase_index, item->m_token);
 107             guint32 freq = item->m_count;
 108
 109             if ( word1 && word2)
 110                 fprintf(output, "\\item %s %s count %d\n", word1, word2, freq);
 111
 112             g_free(word1); g_free(word2);
 113         }
 114
 115         g_array_free(array, TRUE);
 116     }
 117
 118     g_array_free(system_items, TRUE);
 119 }
 120
 121 static const char * special_token_to_string(phrase_token_t token){
 122     struct token_pair{
 123         phrase_token_t token;
 124         const char * string;
 125     };
 126
 127     static const token_pair tokens [] = {
 128         {sentence_start, "<start>"},
 129         {0, NULL}
 130     };
 131
 132     const token_pair * pair = tokens;
 133     while (pair->token) {
 134         if ( token == pair->token )
 135             return pair->string;
 136     }
 137
 138     return NULL;
 139 }
 140
 141 char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token) {
 142     PhraseItem item;
 143     utf16_t buffer[MAX_PHRASE_LENGTH];
 144
 145     gchar * phrase;
 146     /* deal with the special phrase index, for "<start>..." */
 147     if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
 148         return g_strdup(special_token_to_string(token));
 149     }
 150
 151     int result = phrase_index->get_phrase_item(token, item);
 152     if (result != ERROR_OK)
 153         return NULL;
 154
 155     item.get_phrase_string(buffer);
 156     guint8 length = item.get_phrase_length();
 157     phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
 158     return phrase;
 159 }