3 * Library to deal with pinyin.
5 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 #include "facade_chewing_table.h"
24 #include "pinyin_lookup2.h"
27 using namespace pinyin;
30 const gfloat PinyinLookup2::bigram_lambda = lambda;
31 const gfloat PinyinLookup2::unigram_lambda = 1 - lambda;
34 /* internal definition */
35 static const size_t nbeam = 32;
37 static bool dump_max_value(GPtrArray * values){
41 const lookup_value_t * max =
42 (const lookup_value_t *) g_ptr_array_index(values, 0);
44 for (size_t i = 1; i < values->len; ++i) {
45 const lookup_value_t * cur =
46 (const lookup_value_t *) g_ptr_array_index(values, i);
48 if (cur->m_poss > max->m_poss)
52 printf("max value: %f\n", max->m_poss);
57 static bool dump_all_values(GPtrArray * values) {
62 for (size_t i = 0; i < values->len; ++i) {
63 const lookup_value_t * cur =
64 (const lookup_value_t *) g_ptr_array_index(values, i);
66 printf("%f\t", cur->m_poss);
73 /* populate the candidates. */
74 static bool populate_candidates(/* out */ GPtrArray * candidates,
75 /* in */ LookupStepContent step) {
76 g_ptr_array_set_size(candidates, 0);
81 for (size_t i = 0; i < step->len; ++i) {
82 lookup_value_t * value = &g_array_index
83 (step, lookup_value_t, i);
85 g_ptr_array_add(candidates, value);
88 /* dump_max_value(candidates); */
93 static bool lookup_value_less_than(lookup_value_t * lhs, lookup_value_t * rhs){
94 return lhs->m_poss < rhs->m_poss;
97 /* use maximum heap to get the topest results. */
98 static bool get_top_results(/* out */ GPtrArray * topresults,
99 /* in */ GPtrArray * candidates) {
100 g_ptr_array_set_size(topresults, 0);
102 if (0 == candidates->len)
105 lookup_value_t ** begin =
106 (lookup_value_t **) &g_ptr_array_index(candidates, 0);
107 lookup_value_t ** end =
108 (lookup_value_t **) &g_ptr_array_index(candidates, candidates->len);
110 std_lite::make_heap(begin, end, lookup_value_less_than);
112 while (end != begin) {
113 lookup_value_t * one = *begin;
114 g_ptr_array_add(topresults, one);
116 std_lite::pop_heap(begin, end, lookup_value_less_than);
119 if (topresults->len >= nbeam)
123 /* dump_all_values(topresults); */
128 static bool populate_prefixes(GPtrArray * steps_index,
129 GPtrArray * steps_content,
130 TokenVector prefixes) {
131 assert(prefixes->len > 0);
133 for (size_t i = 0; i < prefixes->len; ++i) {
134 phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
135 lookup_key_t initial_key = token;
136 lookup_value_t initial_value(log(1));
137 initial_value.m_handles[1] = token;
139 LookupStepContent initial_step_content = (LookupStepContent)
140 g_ptr_array_index(steps_content, 0);
141 initial_step_content = g_array_append_val
142 (initial_step_content, initial_value);
144 LookupStepIndex initial_step_index = (LookupStepIndex)
145 g_ptr_array_index(steps_index, 0);
146 g_hash_table_insert(initial_step_index,
147 GUINT_TO_POINTER(initial_key),
148 GUINT_TO_POINTER(initial_step_content->len - 1));
154 static bool init_steps(GPtrArray * steps_index,
155 GPtrArray * steps_content,
157 /* add null start step */
158 g_ptr_array_set_size(steps_index, nstep);
159 g_ptr_array_set_size(steps_content, nstep);
161 for (int i = 0; i < nstep; ++i) {
162 /* initialize steps_index */
163 g_ptr_array_index(steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
164 /* initialize steps_content */
165 g_ptr_array_index(steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
171 static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){
172 /* clear steps_index */
173 for ( size_t i = 0; i < steps_index->len; ++i){
174 GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
175 g_hash_table_destroy(table);
176 g_ptr_array_index(steps_index, i) = NULL;
179 /* clear steps_content */
180 for ( size_t i = 0; i < steps_content->len; ++i){
181 GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
182 g_array_free(array, TRUE);
183 g_ptr_array_index(steps_content, i) = NULL;
188 PinyinLookup2::PinyinLookup2(const gfloat lambda,
189 pinyin_option_t options,
190 FacadeChewingTable * pinyin_table,
191 FacadePhraseIndex * phrase_index,
192 Bigram * system_bigram,
193 Bigram * user_bigram)
194 : bigram_lambda(lambda),
195 unigram_lambda(1. - lambda)
198 m_pinyin_table = pinyin_table;
199 m_phrase_index = phrase_index;
200 m_system_bigram = system_bigram;
201 m_user_bigram = user_bigram;
203 m_steps_index = g_ptr_array_new();
204 m_steps_content = g_ptr_array_new();
207 PinyinLookup2::~PinyinLookup2(){
208 clear_steps(m_steps_index, m_steps_content);
209 g_ptr_array_free(m_steps_index, TRUE);
210 g_ptr_array_free(m_steps_content, TRUE);
214 bool PinyinLookup2::get_best_match(TokenVector prefixes,
215 ChewingKeyVector keys,
216 CandidateConstraints constraints,
217 MatchResults & results){
218 m_constraints = constraints;
220 int nstep = keys->len + 1;
222 clear_steps(m_steps_index, m_steps_content);
224 init_steps(m_steps_index, m_steps_content, nstep);
226 populate_prefixes(m_steps_index, m_steps_content, prefixes);
228 PhraseIndexRanges ranges;
229 memset(ranges, 0, sizeof(PhraseIndexRanges));
230 m_phrase_index->prepare_ranges(ranges);
232 GPtrArray * candidates = g_ptr_array_new();
233 GPtrArray * topresults = g_ptr_array_new();
235 /* begin the viterbi beam search. */
236 for ( int i = 0; i < nstep - 1; ++i ){
237 lookup_constraint_t * cur_constraint = &g_array_index
238 (m_constraints, lookup_constraint_t, i);
240 if (CONSTRAINT_NOSEARCH == cur_constraint->m_type)
243 LookupStepContent step = (LookupStepContent)
244 g_ptr_array_index(m_steps_content, i);
246 populate_candidates(candidates, step);
247 get_top_results(topresults, candidates);
249 if (0 == topresults->len)
252 for ( int m = i + 1; m < nstep; ++m ){
253 const int len = m - i;
254 if (len > MAX_PHRASE_LENGTH)
257 lookup_constraint_t * next_constraint = &g_array_index
258 (m_constraints, lookup_constraint_t, m - 1);
260 if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
263 ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data;
264 /* do one pinyin table search. */
265 int result = m_pinyin_table->search(len, pinyin_keys + i, ranges);
267 if (result & SEARCH_OK) {
268 /* assume topresults always contains items. */
269 search_bigram2(topresults, i, ranges),
270 search_unigram2(topresults, i, ranges);
273 /* poke the next constraint. */
275 if (CONSTRAINT_ONESTEP == next_constraint->m_type)
278 /* no longer pinyin */
279 if (!(result & SEARCH_CONTINUED))
284 m_phrase_index->destroy_ranges(ranges);
286 g_ptr_array_free(candidates, TRUE);
287 g_ptr_array_free(topresults, TRUE);
289 return final_step(results);
292 bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
293 PhraseIndexRanges ranges) {
295 if (0 == topresults->len)
298 lookup_value_t * max = (lookup_value_t *)
299 g_ptr_array_index(topresults, 0);
301 lookup_constraint_t * constraint =
302 &g_array_index(m_constraints, lookup_constraint_t, nstep);
304 if (CONSTRAINT_ONESTEP == constraint->m_type) {
305 return unigram_gen_next_step(nstep, max, constraint->m_token);
310 if (NO_CONSTRAINT == constraint->m_type) {
311 for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
312 GArray * array = ranges[m];
313 if ( !array ) continue;
315 for ( size_t n = 0; n < array->len; ++n){
316 PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
317 for ( phrase_token_t token = range->m_range_begin;
318 token != range->m_range_end; ++token){
319 found = unigram_gen_next_step(nstep, max, token)|| found;
328 bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
329 PhraseIndexRanges ranges) {
331 lookup_constraint_t * constraint =
332 &g_array_index(m_constraints, lookup_constraint_t, nstep);
335 BigramPhraseArray bigram_phrase_items = g_array_new
336 (FALSE, FALSE, sizeof(BigramPhraseItem));
338 for (size_t i = 0; i < topresults->len; ++i) {
339 lookup_value_t * value = (lookup_value_t *)
340 g_ptr_array_index(topresults, i);
342 phrase_token_t index_token = value->m_handles[1];
344 SingleGram * system = NULL, * user = NULL;
345 m_system_bigram->load(index_token, system);
346 m_user_bigram->load(index_token, user);
348 if ( !merge_single_gram(&m_merged_single_gram, system, user) )
351 if ( CONSTRAINT_ONESTEP == constraint->m_type ){
352 phrase_token_t token = constraint->m_token;
355 if( m_merged_single_gram.get_freq(token, freq) ){
357 m_merged_single_gram.get_total_freq(total_freq);
358 gfloat bigram_poss = freq / (gfloat) total_freq;
359 found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
363 if (NO_CONSTRAINT == constraint->m_type) {
364 for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
365 GArray * array = ranges[m];
366 if ( !array ) continue;
368 for ( size_t n = 0; n < array->len; ++n){
369 PhraseIndexRange * range =
370 &g_array_index(array, PhraseIndexRange, n);
372 g_array_set_size(bigram_phrase_items, 0);
373 m_merged_single_gram.search(range, bigram_phrase_items);
374 for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
375 BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
376 found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found;
387 g_array_free(bigram_phrase_items, TRUE);
392 bool PinyinLookup2::unigram_gen_next_step(int nstep,
393 lookup_value_t * cur_step,
394 phrase_token_t token) {
396 if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
399 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
400 gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
401 m_phrase_index->get_phrase_index_total_freq();
402 if ( elem_poss < DBL_EPSILON )
405 ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
406 gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
407 if (pinyin_poss < FLT_EPSILON )
410 lookup_value_t next_step;
411 next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
412 next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
413 next_step.m_last_step = nstep;
415 return save_next_step(nstep + phrase_length, cur_step, &next_step);
418 bool PinyinLookup2::bigram_gen_next_step(int nstep,
419 lookup_value_t * cur_step,
420 phrase_token_t token,
421 gfloat bigram_poss) {
423 if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
426 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
427 gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
428 (gdouble) m_phrase_index->get_phrase_index_total_freq();
429 if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
432 ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
433 gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
434 if ( pinyin_poss < FLT_EPSILON )
437 lookup_value_t next_step;
438 next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
439 next_step.m_poss = cur_step->m_poss +
440 log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss);
441 next_step.m_last_step = nstep;
443 return save_next_step(nstep + phrase_length, cur_step, &next_step);
446 bool PinyinLookup2::save_next_step(int next_step_pos,
447 lookup_value_t * cur_step,
448 lookup_value_t * next_step){
450 lookup_key_t next_key = next_step->m_handles[1];
451 LookupStepIndex next_lookup_index = (LookupStepIndex)
452 g_ptr_array_index(m_steps_index, next_step_pos);
453 LookupStepContent next_lookup_content = (LookupStepContent)
454 g_ptr_array_index(m_steps_content, next_step_pos);
456 gpointer key = NULL, value = NULL;
457 gboolean lookup_result = g_hash_table_lookup_extended
458 (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
460 if ( !lookup_result ){
461 g_array_append_val(next_lookup_content, *next_step);
462 g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
465 size_t step_index = GPOINTER_TO_UINT(value);
466 lookup_value_t * orig_next_value = &g_array_index
467 (next_lookup_content, lookup_value_t, step_index);
469 if ( orig_next_value->m_poss < next_step->m_poss) {
470 /* found better result. */
471 orig_next_value->m_handles[0] = next_step->m_handles[0];
472 assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
473 orig_next_value->m_poss = next_step->m_poss;
474 orig_next_value->m_last_step = next_step->m_last_step;
482 bool PinyinLookup2::final_step(MatchResults & results){
485 g_array_set_size(results, m_steps_content->len - 1);
486 for (size_t i = 0; i < results->len; ++i){
487 phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
491 /* find max element */
492 size_t last_step_pos = m_steps_content->len - 1;
493 GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
494 if ( last_step_array->len == 0 )
497 lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
498 for ( size_t i = 1; i < last_step_array->len; ++i){
499 lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
500 if ( cur_value->m_poss > max_value->m_poss )
501 max_value = cur_value;
506 int cur_step_pos = max_value->m_last_step;
507 if ( -1 == cur_step_pos )
510 phrase_token_t * token = &g_array_index
511 (results, phrase_token_t, cur_step_pos);
512 *token = max_value->m_handles[1];
514 phrase_token_t last_token = max_value->m_handles[0];
515 LookupStepIndex lookup_step_index = (LookupStepIndex)
516 g_ptr_array_index(m_steps_index, cur_step_pos);
518 gpointer key = NULL, value = NULL;
519 gboolean result = g_hash_table_lookup_extended
520 (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
524 LookupStepContent lookup_step_content = (LookupStepContent)
525 g_ptr_array_index(m_steps_content, cur_step_pos);
526 max_value = &g_array_index
527 (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
530 /* no need to reverse the result */
535 bool PinyinLookup2::train_result2(ChewingKeyVector keys,
536 CandidateConstraints constraints,
537 MatchResults results) {
538 const guint32 initial_seed = 23 * 3;
539 const guint32 expand_factor = 2;
540 const guint32 unigram_factor = 7;
541 const guint32 pinyin_factor = 1;
542 const guint32 ceiling_seed = 23 * 15 * 64;
544 /* begin training based on constraints and results. */
545 bool train_next = false;
546 ChewingKey * pinyin_keys = (ChewingKey *) keys->data;
548 phrase_token_t last_token = sentence_start;
549 /* constraints->len + 1 == results->len */
550 for (size_t i = 0; i < constraints->len; ++i) {
551 phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
552 if (null_token == *token)
555 lookup_constraint_t * constraint = &g_array_index
556 (constraints, lookup_constraint_t, i);
557 if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
558 if (CONSTRAINT_ONESTEP == constraint->m_type) {
559 assert(*token == constraint->m_token);
565 guint32 seed = initial_seed;
566 /* train bi-gram first, and get train seed. */
568 SingleGram * user = NULL;
569 m_user_bigram->load(last_token, user);
571 guint32 total_freq = 0;
573 user = new SingleGram;
575 assert(user->get_total_freq(total_freq));
578 /* compute train factor */
579 if (!user->get_freq(*token, freq)) {
580 assert(user->insert_freq(*token, 0));
583 seed = std_lite::max(freq, initial_seed);
584 seed *= expand_factor;
585 seed = std_lite::min(seed, ceiling_seed);
588 /* protect against total_freq overflow */
589 if (seed > 0 && total_freq > total_freq + seed)
592 assert(user->set_total_freq(total_freq + seed));
593 /* if total_freq is not overflow, then freq won't overflow. */
594 assert(user->set_freq(*token, freq + seed));
595 assert(m_user_bigram->store(last_token, user));
602 m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
603 m_cache_phrase_item.increase_pronunciation_possibility
604 (m_options, pinyin_keys + i, seed * pinyin_factor);
605 m_phrase_index->add_unigram_frequency
606 (*token, seed * unigram_factor);
614 int PinyinLookup2::add_constraint(CandidateConstraints constraints,
616 phrase_token_t token) {
618 if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
621 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
622 if ( index + phrase_length > constraints->len )
625 for (size_t i = index; i < index + phrase_length; ++i){
626 clear_constraint(constraints, i);
629 /* store one step constraint */
630 lookup_constraint_t * constraint = &g_array_index
631 (constraints, lookup_constraint_t, index);
632 constraint->m_type = CONSTRAINT_ONESTEP;
633 constraint->m_token = token;
635 /* propagate no search constraint */
636 for (size_t i = 1; i < phrase_length; ++i){
637 constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
638 constraint->m_type = CONSTRAINT_NOSEARCH;
639 constraint->m_constraint_step = index;
642 return phrase_length;
645 bool PinyinLookup2::clear_constraint(CandidateConstraints constraints,
647 if (index < 0 || index >= constraints->len)
650 lookup_constraint_t * constraint = &g_array_index
651 (constraints, lookup_constraint_t, index);
653 if (NO_CONSTRAINT == constraint->m_type)
656 if (CONSTRAINT_NOSEARCH == constraint->m_type){
657 index = constraint->m_constraint_step;
658 constraint = &g_array_index(constraints, lookup_constraint_t, index);
661 /* now var constraint points to the one step constraint. */
662 assert(constraint->m_type == CONSTRAINT_ONESTEP);
664 phrase_token_t token = constraint->m_token;
665 if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
668 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
669 for ( size_t i = 0; i < phrase_length; ++i){
670 if (index + i >= constraints->len)
673 constraint = &g_array_index
674 (constraints, lookup_constraint_t, index + i);
675 constraint->m_type = NO_CONSTRAINT;
681 bool PinyinLookup2::validate_constraint(CandidateConstraints constraints,
682 ChewingKeyVector keys) {
683 /* resize constraints array first */
684 size_t constraints_length = constraints->len;
686 if ( keys->len > constraints_length ){
687 g_array_set_size(constraints, keys->len);
689 /* initialize new element */
690 for( size_t i = constraints_length; i < keys->len; ++i){
691 lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
692 constraint->m_type = NO_CONSTRAINT;
695 }else if (keys->len < constraints_length ){
697 g_array_set_size(constraints, keys->len);
700 for ( size_t i = 0; i < constraints->len; ++i){
701 lookup_constraint_t * constraint = &g_array_index
702 (constraints, lookup_constraint_t, i);
704 /* handle one step constraint */
705 if ( constraint->m_type == CONSTRAINT_ONESTEP ){
707 phrase_token_t token = constraint->m_token;
708 m_phrase_index->get_phrase_item(token, m_cache_phrase_item);
709 size_t phrase_length = m_cache_phrase_item.get_phrase_length();
711 /* clear too long constraint */
712 if (i + phrase_length > constraints->len){
713 clear_constraint(constraints, i);
717 ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
718 /* clear invalid pinyin */
719 gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i);
720 if (pinyin_poss < FLT_EPSILON)
721 clear_constraint(constraints, i);