3 * Library to deal with pinyin.
5 * Copyright (C) 2010 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "novel_types.h"
25 #include "phrase_index.h"
26 #include "facade_phrase_table2.h"
28 #include "phrase_lookup.h"
30 using namespace pinyin;
33 const gfloat PhraseLookup::bigram_lambda = LAMBDA_PARAMETER;
34 const gfloat PhraseLookup::unigram_lambda = 1 - LAMBDA_PARAMETER;
37 static bool populate_prefixes(GPtrArray * steps_index,
38 GPtrArray * steps_content) {
40 lookup_key_t initial_key = sentence_start;
41 lookup_value_t initial_value(log(1));
42 initial_value.m_handles[1] = sentence_start;
44 LookupStepContent initial_step_content = (LookupStepContent)
45 g_ptr_array_index(steps_content, 0);
46 g_array_append_val(initial_step_content, initial_value);
48 LookupStepIndex initial_step_index = (LookupStepIndex)
49 g_ptr_array_index(steps_index, 0);
50 g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
51 GUINT_TO_POINTER(initial_step_content->len - 1));
56 static bool init_steps(GPtrArray * steps_index,
57 GPtrArray * steps_content,
60 /* add null start step */
61 g_ptr_array_set_size(steps_index, nstep);
62 g_ptr_array_set_size(steps_content, nstep);
64 for ( int i = 0; i < nstep; ++i ){
65 /* initialize steps_index */
66 g_ptr_array_index(steps_index, i) = g_hash_table_new
67 (g_direct_hash, g_direct_equal);
68 /* initialize steps_content */
69 g_ptr_array_index(steps_content, i) = g_array_new
70 (FALSE, FALSE, sizeof(lookup_value_t));
76 static void clear_steps(GPtrArray * steps_index,
77 GPtrArray * steps_content){
78 /* clear steps_index */
79 for ( size_t i = 0; i < steps_index->len; ++i){
80 GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
81 g_hash_table_destroy(table);
82 g_ptr_array_index(steps_index, i) = NULL;
85 /* free steps_content */
86 for ( size_t i = 0; i < steps_content->len; ++i){
87 GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
88 g_array_free(array, TRUE);
89 g_ptr_array_index(steps_content, i) = NULL;
93 PhraseLookup::PhraseLookup(FacadePhraseTable2 * phrase_table,
94 FacadePhraseIndex * phrase_index,
95 Bigram * system_bigram,
96 Bigram * user_bigram){
97 m_phrase_table = phrase_table;
98 m_phrase_index = phrase_index;
99 m_system_bigram = system_bigram;
100 m_user_bigram = user_bigram;
102 m_steps_index = g_ptr_array_new();
103 m_steps_content = g_ptr_array_new();
106 PhraseLookup::~PhraseLookup(){
107 clear_steps(m_steps_index, m_steps_content);
108 g_ptr_array_free(m_steps_index, TRUE);
109 g_ptr_array_free(m_steps_content, TRUE);
112 bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
113 MatchResults & results){
114 m_sentence_length = sentence_length;
115 m_sentence = sentence;
116 int nstep = m_sentence_length + 1;
118 clear_steps(m_steps_index, m_steps_content);
120 init_steps(m_steps_index, m_steps_content, nstep);
122 populate_prefixes(m_steps_index, m_steps_content);
125 memset(tokens, 0, sizeof(PhraseTokens));
126 m_phrase_index->prepare_tokens(tokens);
128 for ( int i = 0; i < nstep - 1; ++i ){
129 for ( int m = i + 1; m < nstep; ++m ){
131 /* do one phrase table search. */
132 int result = m_phrase_table->search(m - i, sentence + i, tokens);
134 /* found next phrase */
135 if ( result & SEARCH_OK ) {
136 search_bigram2(i, tokens),
137 search_unigram2(i, tokens);
140 /* no longer phrase */
141 if (!(result & SEARCH_CONTINUED))
146 m_phrase_index->destroy_tokens(tokens);
148 return final_step(results);
153 bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
155 LookupStepContent lookup_content = (LookupStepContent)
156 g_ptr_array_index(m_steps_content, nstep);
157 if ( 0 == lookup_content->len )
160 lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
161 /* find the maximum node */
162 for ( size_t i = 1; i < lookup_content->len; ++i ){
163 lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
164 if ( cur_value->m_poss > max_value->m_poss )
165 max_value = cur_value;
168 return unigram_gen_next_step(nstep, max_value, token);
171 bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
174 LookupStepContent lookup_content = (LookupStepContent)
175 g_ptr_array_index(m_steps_content, nstep);
176 if ( 0 == lookup_content->len )
179 for ( size_t i = 0; i < lookup_content->len; ++i ){
180 lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
181 phrase_token_t index_token = cur_value->m_handles[1];
182 SingleGram * system, * user;
183 m_system_bigram->load(index_token, system);
184 m_user_bigram->load(index_token, user);
186 if ( !merge_single_gram(&m_merged_single_gram, system, user) )
190 if ( m_merged_single_gram.get_freq(token, freq) ){
192 m_merged_single_gram.get_total_freq(total_freq);
193 gfloat bigram_poss = freq / (gfloat) total_freq;
194 found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
208 bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
211 LookupStepContent lookup_content = (LookupStepContent)
212 g_ptr_array_index(m_steps_content, nstep);
213 if ( 0 == lookup_content->len )
216 /* find the maximum node */
217 lookup_value_t * max_value = &g_array_index
218 (lookup_content, lookup_value_t, 0);
220 for (size_t i = 1; i < lookup_content->len; ++i) {
221 lookup_value_t * cur_value = &g_array_index
222 (lookup_content, lookup_value_t, i);
223 if (cur_value->m_poss > max_value->m_poss)
224 max_value = cur_value;
227 /* iterate over tokens */
228 for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
229 GArray * array = tokens[n];
233 /* just skip the loop when the length is zero. */
234 for (size_t k = 0; k < array->len; ++k) {
235 phrase_token_t token =
236 g_array_index(array, phrase_token_t, k);
238 found = unigram_gen_next_step
239 (nstep, max_value, token) || found;
246 bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
249 LookupStepContent lookup_content = (LookupStepContent)
250 g_ptr_array_index(m_steps_content, nstep);
251 if (0 == lookup_content->len)
254 for (size_t i = 0; i < lookup_content->len; ++i) {
255 lookup_value_t * cur_value = &g_array_index
256 (lookup_content, lookup_value_t, i);
257 phrase_token_t index_token = cur_value->m_handles[1];
259 SingleGram * system = NULL, * user = NULL;
260 m_system_bigram->load(index_token, system);
261 m_user_bigram->load(index_token, user);
263 if (!merge_single_gram
264 (&m_merged_single_gram, system, user))
267 /* iterate over tokens */
268 for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
269 GArray * array = tokens[n];
273 /* just skip the loop when the length is zero. */
274 for (size_t k = 0; k < array->len; ++k) {
275 phrase_token_t token =
276 g_array_index(array, phrase_token_t, k);
279 if (m_merged_single_gram.get_freq(token, freq)) {
280 guint32 total_freq = 0;
281 m_merged_single_gram.get_total_freq(total_freq);
283 gfloat bigram_poss = freq / (gfloat) total_freq;
284 found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
298 bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
299 phrase_token_t token){
301 if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
304 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
305 gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
306 m_phrase_index->get_phrase_index_total_freq();
307 if ( elem_poss < DBL_EPSILON )
310 lookup_value_t next_value;
311 next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
312 next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
313 next_value.m_last_step = nstep;
315 return save_next_step(nstep + phrase_length, cur_value, &next_value);
318 bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
320 if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
323 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
324 gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
325 (gdouble) m_phrase_index->get_phrase_index_total_freq();
327 if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
330 lookup_value_t next_value;
331 next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
332 next_value.m_poss = cur_value->m_poss +
333 log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
334 next_value.m_last_step = nstep;
336 return save_next_step(nstep + phrase_length, cur_value, &next_value);
339 bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
341 LookupStepIndex next_lookup_index = (LookupStepIndex)
342 g_ptr_array_index(m_steps_index, next_step_pos);
343 LookupStepContent next_lookup_content = (LookupStepContent)
344 g_ptr_array_index(m_steps_content, next_step_pos);
346 lookup_key_t next_key = next_value->m_handles[1];
348 gpointer key = NULL, value = NULL;
349 gboolean lookup_result = g_hash_table_lookup_extended
350 (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
353 g_array_append_val(next_lookup_content, *next_value);
354 g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
355 GUINT_TO_POINTER(next_lookup_content->len - 1));
358 size_t step_index = GPOINTER_TO_UINT(value);
359 lookup_value_t * orig_next_value = &g_array_index
360 (next_lookup_content, lookup_value_t, step_index);
362 if ( orig_next_value->m_poss < next_value->m_poss ){
363 orig_next_value->m_handles[0] = next_value->m_handles[0];
364 assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
365 orig_next_value->m_poss = next_value->m_poss;
366 orig_next_value->m_last_step = next_value->m_last_step;
373 bool PhraseLookup::final_step(MatchResults & results ){
376 g_array_set_size(results, m_steps_content->len - 1);
377 for ( size_t i = 0; i < results->len; ++i ){
378 phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
382 /* find max element */
383 size_t last_step_pos = m_steps_content->len - 1;
384 LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index
385 (m_steps_content, last_step_pos);
386 if ( last_step_content->len == 0 )
389 lookup_value_t * max_value = &g_array_index
390 (last_step_content, lookup_value_t, 0);
391 for ( size_t i = 1; i < last_step_content->len; ++i ){
392 lookup_value_t * cur_value = &g_array_index
393 (last_step_content, lookup_value_t, i);
394 if ( cur_value->m_poss > max_value->m_poss )
395 max_value = cur_value;
400 int cur_step_pos = max_value->m_last_step;
401 if ( -1 == cur_step_pos )
404 phrase_token_t * token = &g_array_index
405 (results, phrase_token_t, cur_step_pos);
406 *token = max_value->m_handles[1];
408 phrase_token_t last_token = max_value->m_handles[0];
409 LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
411 gpointer key = NULL, value = NULL;
412 gboolean result = g_hash_table_lookup_extended
413 (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
417 LookupStepContent lookup_step_content = (LookupStepContent)
418 g_ptr_array_index(m_steps_content, cur_step_pos);
419 max_value = &g_array_index
420 (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
423 /* no need to reverse the result */