src/lookup/phrase_lookup.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2010 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include <math.h>
  23 #include "stl_lite.h"
  24 #include "novel_types.h"
  25 #include "phrase_index.h"
  26 #include "facade_phrase_table2.h"
  27 #include "ngram.h"
  28 #include "phrase_lookup.h"
  29
  30 using namespace pinyin;
  31
  32
  33 const gfloat PhraseLookup::bigram_lambda = LAMBDA_PARAMETER;
  34 const gfloat PhraseLookup::unigram_lambda = 1 - LAMBDA_PARAMETER;
  35
  36
  37 static bool populate_prefixes(GPtrArray * steps_index,
  38                               GPtrArray * steps_content) {
  39
  40     lookup_key_t initial_key = sentence_start;
  41     lookup_value_t initial_value(log(1));
  42     initial_value.m_handles[1] = sentence_start;
  43
  44     LookupStepContent initial_step_content = (LookupStepContent)
  45         g_ptr_array_index(steps_content, 0);
  46     g_array_append_val(initial_step_content, initial_value);
  47
  48     LookupStepIndex initial_step_index = (LookupStepIndex)
  49         g_ptr_array_index(steps_index, 0);
  50     g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
  51                         GUINT_TO_POINTER(initial_step_content->len - 1));
  52
  53     return true;
  54 }
  55
  56 static bool init_steps(GPtrArray * steps_index,
  57                        GPtrArray * steps_content,
  58                        int nstep) {
  59
  60     /* add null start step */
  61     g_ptr_array_set_size(steps_index, nstep);
  62     g_ptr_array_set_size(steps_content, nstep);
  63
  64     for ( int i = 0; i < nstep; ++i ){
  65         /* initialize steps_index */
  66         g_ptr_array_index(steps_index, i) = g_hash_table_new
  67             (g_direct_hash, g_direct_equal);
  68         /* initialize steps_content */
  69         g_ptr_array_index(steps_content, i) = g_array_new
  70             (FALSE, FALSE, sizeof(lookup_value_t));
  71     }
  72
  73     return true;
  74 }
  75
  76 static void clear_steps(GPtrArray * steps_index,
  77                         GPtrArray * steps_content){
  78     /* clear steps_index */
  79     for ( size_t i = 0; i < steps_index->len; ++i){
  80         GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
  81         g_hash_table_destroy(table);
  82         g_ptr_array_index(steps_index, i) = NULL;
  83     }
  84
  85     /* free steps_content */
  86     for ( size_t i = 0; i < steps_content->len; ++i){
  87         GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
  88         g_array_free(array, TRUE);
  89         g_ptr_array_index(steps_content, i) = NULL;
  90     }
  91 }
  92
  93 PhraseLookup::PhraseLookup(FacadePhraseTable2 * phrase_table,
  94                            FacadePhraseIndex * phrase_index,
  95                            Bigram * system_bigram,
  96                            Bigram * user_bigram){
  97     m_phrase_table = phrase_table;
  98     m_phrase_index = phrase_index;
  99     m_system_bigram = system_bigram;
 100     m_user_bigram = user_bigram;
 101
 102     m_steps_index = g_ptr_array_new();
 103     m_steps_content = g_ptr_array_new();
 104 }
 105
 106 PhraseLookup::~PhraseLookup(){
 107     clear_steps(m_steps_index, m_steps_content);
 108     g_ptr_array_free(m_steps_index, TRUE);
 109     g_ptr_array_free(m_steps_content, TRUE);
 110 }
 111
 112 bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
 113                                   MatchResults & results){
 114     m_sentence_length = sentence_length;
 115     m_sentence = sentence;
 116     int nstep = m_sentence_length + 1;
 117
 118     clear_steps(m_steps_index, m_steps_content);
 119
 120     init_steps(m_steps_index, m_steps_content, nstep);
 121
 122     populate_prefixes(m_steps_index, m_steps_content);
 123
 124     PhraseTokens tokens;
 125     memset(tokens, 0, sizeof(PhraseTokens));
 126     m_phrase_index->prepare_tokens(tokens);
 127
 128     for ( int i = 0; i < nstep - 1; ++i ){
 129         for ( int m = i + 1; m < nstep; ++m ){
 130
 131             /* do one phrase table search. */
 132             int result = m_phrase_table->search(m - i, sentence + i, tokens);
 133
 134             /* found next phrase */
 135             if ( result & SEARCH_OK ) {
 136                 search_bigram2(i, tokens),
 137                     search_unigram2(i, tokens);
 138             }
 139
 140             /* no longer phrase */
 141             if (!(result & SEARCH_CONTINUED))
 142                 break;
 143         }
 144     }
 145
 146     m_phrase_index->destroy_tokens(tokens);
 147
 148     return final_step(results);
 149 }
 150
 151 #if 0
 152
 153 bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
 154
 155     LookupStepContent lookup_content = (LookupStepContent)
 156         g_ptr_array_index(m_steps_content, nstep);
 157     if ( 0 == lookup_content->len )
 158         return false;
 159
 160     lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
 161     /* find the maximum node */
 162     for ( size_t i = 1; i < lookup_content->len; ++i ){
 163         lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
 164         if ( cur_value->m_poss > max_value->m_poss )
 165             max_value = cur_value;
 166     }
 167
 168     return unigram_gen_next_step(nstep, max_value, token);
 169 }
 170
 171 bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
 172     bool found = false;
 173
 174     LookupStepContent lookup_content = (LookupStepContent)
 175         g_ptr_array_index(m_steps_content, nstep);
 176     if ( 0 == lookup_content->len )
 177         return false;
 178
 179     for ( size_t i = 0; i < lookup_content->len; ++i ){
 180         lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
 181         phrase_token_t index_token = cur_value->m_handles[1];
 182         SingleGram * system, * user;
 183         m_system_bigram->load(index_token, system);
 184         m_user_bigram->load(index_token, user);
 185
 186         if ( !merge_single_gram(&m_merged_single_gram, system, user) )
 187             continue;
 188
 189         guint32 freq;
 190         if ( m_merged_single_gram.get_freq(token, freq) ){
 191             guint32 total_freq;
 192             m_merged_single_gram.get_total_freq(total_freq);
 193             gfloat bigram_poss = freq / (gfloat) total_freq;
 194             found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
 195         }
 196
 197         if (system)
 198             delete system;
 199         if (user)
 200             delete user;
 201     }
 202
 203     return found;
 204 }
 205
 206 #endif
 207
 208 bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
 209     bool found = false;
 210
 211     LookupStepContent lookup_content = (LookupStepContent)
 212         g_ptr_array_index(m_steps_content, nstep);
 213     if ( 0 == lookup_content->len )
 214         return found;
 215
 216     /* find the maximum node */
 217     lookup_value_t * max_value = &g_array_index
 218         (lookup_content, lookup_value_t, 0);
 219
 220     for (size_t i = 1; i < lookup_content->len; ++i) {
 221         lookup_value_t * cur_value = &g_array_index
 222             (lookup_content, lookup_value_t, i);
 223         if (cur_value->m_poss > max_value->m_poss)
 224             max_value = cur_value;
 225     }
 226
 227     /* iterate over tokens */
 228     for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
 229         GArray * array = tokens[n];
 230         if (NULL == array)
 231             continue;
 232
 233         /* just skip the loop when the length is zero. */
 234         for (size_t k = 0; k < array->len; ++k) {
 235             phrase_token_t token =
 236                 g_array_index(array, phrase_token_t, k);
 237
 238             found = unigram_gen_next_step
 239                 (nstep, max_value, token) || found;
 240         }
 241     }
 242
 243     return found;
 244 }
 245
 246 bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
 247     bool found = false;
 248
 249     LookupStepContent lookup_content = (LookupStepContent)
 250         g_ptr_array_index(m_steps_content, nstep);
 251     if (0 == lookup_content->len)
 252         return found;
 253
 254     for (size_t i = 0; i < lookup_content->len; ++i) {
 255         lookup_value_t * cur_value = &g_array_index
 256             (lookup_content, lookup_value_t, i);
 257         phrase_token_t index_token = cur_value->m_handles[1];
 258
 259         SingleGram * system = NULL, * user = NULL;
 260         m_system_bigram->load(index_token, system);
 261         m_user_bigram->load(index_token, user);
 262
 263         if (!merge_single_gram
 264             (&m_merged_single_gram, system, user))
 265             continue;
 266
 267         /* iterate over tokens */
 268         for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
 269             GArray * array = tokens[n];
 270             if (NULL == array)
 271                 continue;
 272
 273             /* just skip the loop when the length is zero. */
 274             for (size_t k = 0; k < array->len; ++k) {
 275                 phrase_token_t token =
 276                     g_array_index(array, phrase_token_t, k);
 277
 278                 guint32 freq = 0;
 279                 if (m_merged_single_gram.get_freq(token, freq)) {
 280                     guint32 total_freq = 0;
 281                     m_merged_single_gram.get_total_freq(total_freq);
 282
 283                     gfloat bigram_poss = freq / (gfloat) total_freq;
 284                     found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
 285                 }
 286             }
 287         }
 288
 289         if (system)
 290             delete system;
 291         if (user)
 292             delete user;
 293     }
 294
 295     return found;
 296 }
 297
 298 bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
 299 phrase_token_t token){
 300
 301     if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
 302         return false;
 303
 304     size_t phrase_length = m_cache_phrase_item.get_phrase_length();
 305     gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
 306         m_phrase_index->get_phrase_index_total_freq();
 307     if ( elem_poss < DBL_EPSILON )
 308         return false;
 309
 310     lookup_value_t next_value;
 311     next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
 312     next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
 313     next_value.m_last_step = nstep;
 314
 315     return save_next_step(nstep + phrase_length, cur_value, &next_value);
 316 }
 317
 318 bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
 319
 320     if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
 321         return false;
 322
 323     size_t phrase_length = m_cache_phrase_item.get_phrase_length();
 324     gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
 325         (gdouble) m_phrase_index->get_phrase_index_total_freq();
 326
 327     if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
 328         return false;
 329
 330     lookup_value_t next_value;
 331     next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
 332     next_value.m_poss = cur_value->m_poss +
 333         log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
 334     next_value.m_last_step = nstep;
 335
 336     return save_next_step(nstep + phrase_length, cur_value, &next_value);
 337 }
 338
 339 bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
 340
 341     LookupStepIndex next_lookup_index = (LookupStepIndex)
 342         g_ptr_array_index(m_steps_index, next_step_pos);
 343     LookupStepContent next_lookup_content = (LookupStepContent)
 344         g_ptr_array_index(m_steps_content, next_step_pos);
 345
 346     lookup_key_t next_key = next_value->m_handles[1];
 347
 348     gpointer key = NULL, value = NULL;
 349     gboolean lookup_result = g_hash_table_lookup_extended
 350         (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
 351
 352     if (!lookup_result){
 353         g_array_append_val(next_lookup_content, *next_value);
 354         g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
 355                             GUINT_TO_POINTER(next_lookup_content->len - 1));
 356         return true;
 357     }else{
 358         size_t step_index = GPOINTER_TO_UINT(value);
 359         lookup_value_t * orig_next_value = &g_array_index
 360             (next_lookup_content, lookup_value_t, step_index);
 361
 362         if ( orig_next_value->m_poss < next_value->m_poss ){
 363             orig_next_value->m_handles[0] = next_value->m_handles[0];
 364             assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
 365             orig_next_value->m_poss = next_value->m_poss;
 366             orig_next_value->m_last_step = next_value->m_last_step;
 367             return true;
 368         }
 369         return false;
 370     }
 371 }
 372
 373 bool PhraseLookup::final_step(MatchResults & results ){
 374
 375     /* reset results */
 376     g_array_set_size(results, m_steps_content->len - 1);
 377     for ( size_t i = 0; i < results->len; ++i ){
 378         phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
 379         *token = null_token;
 380     }
 381
 382     /* find max element */
 383     size_t last_step_pos = m_steps_content->len - 1;
 384     LookupStepContent last_step_content =  (LookupStepContent) g_ptr_array_index
 385         (m_steps_content, last_step_pos);
 386     if ( last_step_content->len == 0 )
 387         return false;
 388
 389     lookup_value_t * max_value = &g_array_index
 390         (last_step_content, lookup_value_t, 0);
 391     for ( size_t i = 1; i < last_step_content->len; ++i ){
 392         lookup_value_t * cur_value = &g_array_index
 393             (last_step_content, lookup_value_t, i);
 394         if ( cur_value->m_poss > max_value->m_poss )
 395             max_value = cur_value;
 396     }
 397
 398     /* backtracing */
 399     while( true ){
 400         int cur_step_pos = max_value->m_last_step;
 401         if ( -1 == cur_step_pos )
 402             break;
 403
 404         phrase_token_t * token = &g_array_index
 405             (results, phrase_token_t, cur_step_pos);
 406         *token = max_value->m_handles[1];
 407
 408         phrase_token_t last_token = max_value->m_handles[0];
 409         LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
 410
 411         gpointer key = NULL, value = NULL;
 412         gboolean result = g_hash_table_lookup_extended
 413             (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
 414         if ( !result )
 415             return false;
 416
 417         LookupStepContent lookup_step_content = (LookupStepContent)
 418             g_ptr_array_index(m_steps_content, cur_step_pos);
 419         max_value = &g_array_index
 420             (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
 421     }
 422
 423     /* no need to reverse the result */
 424     return true;
 425 }