3 * Library to deal with pinyin.
5 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 #include <glib/gstdio.h>
27 #include "pinyin_internal.h"
29 /* a glue layer for input method integration. */
/* Shared state behind the input-method glue API: the option flags, the three
 * key parsers, the pinyin/phrase tables and phrase index, the system and user
 * bigrams, and the two lookup engines.
 * NOTE(review): the member list visible here is incomplete; code below also
 * uses m_system_dir, m_user_dir and m_modified on this struct. */
31 struct _pinyin_context_t{
32 pinyin_option_t m_options;
34 FullPinyinParser2 * m_full_pinyin_parser;
35 DoublePinyinParser2 * m_double_pinyin_parser;
36 ChewingParser2 * m_chewing_parser;
38 FacadeChewingTable * m_pinyin_table;
39 FacadePhraseTable * m_phrase_table;
40 FacadePhraseIndex * m_phrase_index;
41 Bigram * m_system_bigram;
42 Bigram * m_user_bigram;
44 PinyinLookup * m_pinyin_lookup;
45 PhraseLookup * m_phrase_lookup;
/* Checks the on-disk format of the user directory: loads "<userdir>/version"
 * and compares it against LIBPINYIN_FORMAT_VERSION (including the terminating
 * NUL, hence strlen + 1). Per the comments below, stale user files are cleaned
 * up when the version does not match.
 * NOTE(review): the comparison call and the actual removal calls are not
 * visible in this listing; only their arguments / file names are. */
52 static bool check_format(const char * userdir){
53 gchar * filename = g_build_filename
54 (userdir, "version", NULL);
57 bool exists = chunk.load(filename);
61 (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
62 strlen(LIBPINYIN_FORMAT_VERSION) + 1));
69 /* clean up files, if version mis-matches. */
/* index 0 is skipped: the loop starts at library 1. */
70 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
71 const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
73 if (NOT_USED == table_info->m_file_type)
76 if (NULL == table_info->m_user_filename)
79 const char * userfilename = table_info->m_user_filename;
81 /* remove dbin file. */
82 filename = g_build_filename(userdir, userfilename, NULL);
/* the user bigram database is addressed as well. */
87 filename = g_build_filename
88 (userdir, "user.db", NULL);
/* Writes LIBPINYIN_FORMAT_VERSION (with its terminating NUL) into
 * "<userdir>/version"; retval holds the result of the save. */
95 static bool mark_version(const char * userdir){
96 gchar * filename = g_build_filename
97 (userdir, "version", NULL);
99 chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
100 strlen(LIBPINYIN_FORMAT_VERSION) + 1);
101 bool retval = chunk.save(filename);
/* Creates and populates a pinyin context.
 *
 * systemdir: directory holding pinyin_index.bin, phrase_index.bin, bigram.db.
 * userdir:   directory holding the per-user data (version file, user.db, ...).
 *
 * Both directory strings are duplicated with g_strdup, so the context keeps
 * its own copies. Options default to USE_TONE.
 * NOTE(review): what happens after the "open ... failed!" messages (return
 * value, clean-up) is not visible in this listing. */
106 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
107 pinyin_context_t * context = new pinyin_context_t;
109 context->m_options = USE_TONE;
111 context->m_system_dir = g_strdup(systemdir);
112 context->m_user_dir = g_strdup(userdir);
113 context->m_modified = false;
/* verify the user data format before loading anything from it. */
115 check_format(context->m_user_dir);
/* pinyin (chewing key) table from the system directory. */
117 context->m_pinyin_table = new FacadeChewingTable;
118 MemoryChunk * chunk = new MemoryChunk;
119 gchar * filename = g_build_filename
120 (context->m_system_dir, "pinyin_index.bin", NULL);
121 if (!chunk->load(filename)) {
122 fprintf(stderr, "open %s failed!\n", filename);
127 context->m_pinyin_table->load(context->m_options, chunk, NULL);
129 context->m_full_pinyin_parser = new FullPinyinParser2;
130 context->m_double_pinyin_parser = new DoublePinyinParser2;
131 context->m_chewing_parser = new ChewingParser2;
/* phrase table from the system directory. */
133 context->m_phrase_table = new FacadePhraseTable;
134 chunk = new MemoryChunk;
135 filename = g_build_filename(context->m_system_dir, "phrase_index.bin", NULL);
136 if (!chunk->load(filename)) {
137 fprintf(stderr, "open %s failed!\n", filename);
141 context->m_phrase_table->load(chunk, NULL);
143 context->m_phrase_index = new FacadePhraseIndex;
145 /* hack here: directly call load phrase library. */
146 pinyin_load_phrase_library(context, 1);
/* system bigram is attached read-only; user bigram is loaded from user.db. */
148 context->m_system_bigram = new Bigram;
149 filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
150 context->m_system_bigram->attach(filename, ATTACH_READONLY);
153 context->m_user_bigram = new Bigram;
154 filename = g_build_filename(context->m_user_dir, "user.db", NULL);
155 context->m_user_bigram->load_db(filename);
/* the lookup engines share the tables, index and bigrams created above. */
158 context->m_pinyin_lookup = new PinyinLookup
159 ( context->m_options, context->m_pinyin_table,
160 context->m_phrase_index, context->m_system_bigram,
161 context->m_user_bigram);
163 context->m_phrase_lookup = new PhraseLookup
164 (context->m_phrase_table, context->m_phrase_index,
165 context->m_system_bigram, context->m_user_bigram);
/* Loads phrase library `index` (must be < PHRASE_INDEX_LIBRARY_COUNT) into
 * context->m_phrase_index, according to its entry in pinyin_phrase_files:
 *  - SYSTEM_FILE: load the system bin file, then merge the user's log file.
 *  - USER_FILE:   load the user's bin file, or create an empty sub phrase
 *                 index when it cannot be loaded. */
170 bool pinyin_load_phrase_library(pinyin_context_t * context,
172 assert(index < PHRASE_INDEX_LIBRARY_COUNT);
173 const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
175 if (SYSTEM_FILE == table_info->m_file_type) {
176 /* system phrase library */
177 MemoryChunk * chunk = new MemoryChunk;
179 const char * systemfilename = table_info->m_system_filename;
180 /* check bin file in system dir. */
181 gchar * chunkfilename = g_build_filename(context->m_system_dir,
182 systemfilename, NULL);
/* NOTE(review): the result of this load is not checked before the chunk is
 * handed to the phrase index. */
183 chunk->load(chunkfilename);
184 g_free(chunkfilename);
186 context->m_phrase_index->load(index, chunk);
188 const char * userfilename = table_info->m_user_filename;
190 chunkfilename = g_build_filename(context->m_user_dir,
/* the user-side file for a system library is a change log, not a full copy. */
193 MemoryChunk * log = new MemoryChunk;
194 log->load(chunkfilename);
195 g_free(chunkfilename);
197 /* merge the chunk log. */
198 context->m_phrase_index->merge(index, log);
202 if (USER_FILE == table_info->m_file_type) {
203 /* user phrase library */
204 MemoryChunk * chunk = new MemoryChunk;
205 const char * userfilename = table_info->m_user_filename;
207 gchar * chunkfilename = g_build_filename(context->m_user_dir,
210 /* check bin file exists. if not, create a new one. */
211 if (chunk->load(chunkfilename)) {
212 context->m_phrase_index->load(index, chunk);
215 context->m_phrase_index->create_sub_phrase(index);
218 g_free(chunkfilename);
/* Unloads phrase library `index` from the context's phrase index.
 * Per the comment below, gb_char.bin is exempt; the guard implementing that
 * exemption is not visible in this listing. */
225 bool pinyin_unload_phrase_library(pinyin_context_t * context,
227 /* gb_char.bin can't be unloaded. */
231 assert(index < PHRASE_INDEX_LIBRARY_COUNT);
233 context->m_phrase_index->unload(index);
/* Persists user data to context->m_user_dir.
 *
 * Guarded by m_user_dir being set and m_modified being true. For every phrase
 * library (starting at 1) that has a sub phrase index and a user file name:
 *  - SYSTEM_FILE: diff the in-memory library against the system bin file and
 *                 save the resulting log.
 *  - USER_FILE:   store the whole library.
 * Then saves the user bigram, writes the version marker and clears m_modified.
 * Each file is first written to a temporary name and then rename()d over the
 * final name.
 * NOTE(review): in the visible lines the results of save()/save_db() and
 * rename() are not checked. */
238 bool pinyin_save(pinyin_context_t * context){
239 if (!context->m_user_dir)
242 if (!context->m_modified)
245 context->m_phrase_index->compact();
247 /* skip the reserved zero phrase library. */
248 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
249 PhraseIndexRange range;
250 int retval = context->m_phrase_index->get_range(i, range);
252 if (ERROR_NO_SUB_PHRASE_INDEX == retval)
255 const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
257 if (NOT_USED == table_info->m_file_type)
260 const char * userfilename = table_info->m_user_filename;
262 if (NULL == userfilename)
265 if (SYSTEM_FILE == table_info->m_file_type) {
266 /* system phrase library */
267 MemoryChunk * chunk = new MemoryChunk;
268 MemoryChunk * log = new MemoryChunk;
269 const char * systemfilename = table_info->m_system_filename;
271 /* check bin file in system dir. */
272 gchar * chunkfilename = g_build_filename(context->m_system_dir,
273 systemfilename, NULL);
274 chunk->load(chunkfilename);
275 g_free(chunkfilename);
/* log receives the difference between library i and the system chunk. */
276 context->m_phrase_index->diff(i, chunk, log);
278 const char * userfilename = table_info->m_user_filename;
279 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
281 gchar * tmppathname = g_build_filename(context->m_user_dir,
285 gchar * chunkpathname = g_build_filename(context->m_user_dir,
/* write to the temporary path, then move it into place. */
287 log->save(tmppathname);
288 rename(tmppathname, chunkpathname);
289 g_free(chunkpathname);
294 if (USER_FILE == table_info->m_file_type) {
295 /* user phrase library */
296 MemoryChunk * chunk = new MemoryChunk;
297 context->m_phrase_index->store(i, chunk);
299 const char * userfilename = table_info->m_user_filename;
300 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
301 gchar * tmppathname = g_build_filename(context->m_user_dir,
305 gchar * chunkpathname = g_build_filename(context->m_user_dir,
308 chunk->save(tmppathname);
309 rename(tmppathname, chunkpathname);
310 g_free(chunkpathname);
/* user bigram: same write-to-temp-then-rename scheme. */
316 gchar * tmpfilename = g_build_filename(context->m_user_dir,
317 "user.db.tmp", NULL);
319 gchar * filename = g_build_filename(context->m_user_dir, "user.db", NULL);
320 context->m_user_bigram->save_db(tmpfilename);
321 rename(tmpfilename, filename);
325 mark_version(context->m_user_dir);
327 context->m_modified = false;
/* Selects the keyboard scheme used by the context's double pinyin parser. */
331 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
332 DoublePinyinScheme scheme){
333 context->m_double_pinyin_parser->set_scheme(scheme);
/* Selects the keyboard scheme used by the context's chewing parser. */
337 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
338 ChewingScheme scheme){
339 context->m_chewing_parser->set_scheme(scheme);
/* Tears down a context created by pinyin_init: deletes the parsers, tables,
 * phrase index, bigrams and lookup engines, and frees the duplicated
 * directory strings.
 * NOTE(review): this does not call pinyin_save in the visible lines; callers
 * presumably save first if they want changes persisted - confirm. */
344 void pinyin_fini(pinyin_context_t * context){
345 delete context->m_full_pinyin_parser;
346 delete context->m_double_pinyin_parser;
347 delete context->m_chewing_parser;
348 delete context->m_pinyin_table;
349 delete context->m_phrase_table;
350 delete context->m_phrase_index;
351 delete context->m_system_bigram;
352 delete context->m_user_bigram;
353 delete context->m_pinyin_lookup;
354 delete context->m_phrase_lookup;
356 g_free(context->m_system_dir);
357 g_free(context->m_user_dir);
358 context->m_modified = false;
/* Stores the new option flags in the context and pushes them to the pinyin
 * table and the pinyin lookup so all three stay in sync. */
361 /* copy from options to context->m_options. */
362 bool pinyin_set_options(pinyin_context_t * context,
363 pinyin_option_t options){
364 context->m_options = options;
365 context->m_pinyin_table->set_options(context->m_options);
366 context->m_pinyin_lookup->set_options(context->m_options);
/* Allocates a per-input-session instance bound to `context`.
 * The raw full pinyin string starts as NULL; the prefix, key, key-rest,
 * constraint and match-result arrays start empty. The GArrays are created
 * non-zero-terminated and without clearing new elements (FALSE, FALSE). */
371 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
372 pinyin_instance_t * instance = new pinyin_instance_t;
373 instance->m_context = context;
375 instance->m_raw_full_pinyin = NULL;
377 instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
378 instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
379 instance->m_pinyin_key_rests =
380 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
381 instance->m_constraints = g_array_new
382 (FALSE, FALSE, sizeof(lookup_constraint_t));
383 instance->m_match_results =
384 g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
/* Releases what pinyin_alloc_instance created: the raw pinyin string and the
 * five GArrays (element storage included, hence TRUE). The context itself is
 * not touched here. */
389 void pinyin_free_instance(pinyin_instance_t * instance){
390 g_free(instance->m_raw_full_pinyin);
391 g_array_free(instance->m_prefixes, TRUE);
392 g_array_free(instance->m_pinyin_keys, TRUE);
393 g_array_free(instance->m_pinyin_key_rests, TRUE);
394 g_array_free(instance->m_constraints, TRUE);
395 g_array_free(instance->m_match_results, TRUE);
/* Brings instance->m_constraints to the same length as m_pinyin_keys.
 * Entries added by the resize are initialised to NO_CONSTRAINT (the array
 * does not clear new elements), existing entries are kept, and the result is
 * then validated against the keys by the pinyin lookup. */
401 static bool pinyin_update_constraints(pinyin_instance_t * instance){
402 pinyin_context_t * & context = instance->m_context;
403 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
404 CandidateConstraints & constraints = instance->m_constraints;
/* remember the old length so only the newly added tail is initialised. */
406 size_t key_len = constraints->len;
407 g_array_set_size(constraints, pinyin_keys->len);
408 for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
409 lookup_constraint_t * constraint =
410 &g_array_index(constraints, lookup_constraint_t, i);
411 constraint->m_type = NO_CONSTRAINT;
414 context->m_pinyin_lookup->validate_constraint
415 (constraints, pinyin_keys);
/* Computes the best sentence for the instance's parsed keys, assuming the
 * input starts a sentence: the prefix list is reset to just sentence_start,
 * constraints are synchronised, and the result is written to
 * instance->m_match_results. */
421 bool pinyin_guess_sentence(pinyin_instance_t * instance){
422 pinyin_context_t * & context = instance->m_context;
424 g_array_set_size(instance->m_prefixes, 0);
425 g_array_append_val(instance->m_prefixes, sentence_start);
427 pinyin_update_constraints(instance);
428 bool retval = context->m_pinyin_lookup->get_best_match
429 (instance->m_prefixes,
430 instance->m_pinyin_keys,
431 instance->m_constraints,
432 instance->m_match_results);
/* Like pinyin_guess_sentence, but seeds the prefix list from `prefix` (UTF-8
 * text). Besides sentence_start, every trailing substring of the prefix of
 * length 1..MAX_PHRASE_LENGTH characters that is found in the phrase table
 * contributes its token to instance->m_prefixes. */
437 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
438 const char * prefix){
439 pinyin_context_t * & context = instance->m_context;
441 g_array_set_size(instance->m_prefixes, 0);
442 g_array_append_val(instance->m_prefixes, sentence_start);
/* `written` receives the number of UCS-4 characters converted. */
445 ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &written, NULL);
447 if (ucs4_str && written) {
449 for (ssize_t i = 1; i <= written; ++i) {
450 if (i > MAX_PHRASE_LENGTH)
453 phrase_token_t token = null_token;
/* start points at the last i characters of the prefix. */
454 ucs4_t * start = ucs4_str + written - i;
455 int result = context->m_phrase_table->search(i, start, token);
456 if (result & SEARCH_OK)
457 g_array_append_val(instance->m_prefixes, token);
462 pinyin_update_constraints(instance);
463 bool retval = context->m_pinyin_lookup->get_best_match
464 (instance->m_prefixes,
465 instance->m_pinyin_keys,
466 instance->m_constraints,
467 instance->m_match_results);
/* Segments a UTF-8 `sentence` into phrases: converts it to UCS-4 and asks the
 * phrase lookup for the best match, stored in instance->m_match_results. */
472 bool pinyin_phrase_segment(pinyin_instance_t * instance,
473 const char * sentence){
474 pinyin_context_t * & context = instance->m_context;
476 const glong num_of_chars = g_utf8_strlen(sentence, -1);
478 ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
/* NOTE(review): this early return happens after ucs4_str was allocated; no
 * release is visible on that path - confirm it is not leaked. */
480 g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
482 bool retval = context->m_phrase_lookup->get_best_match
483 (ucs4_len, ucs4_str, instance->m_match_results);
/* Converts instance->m_match_results into a UTF-8 sentence using the
 * context's phrase index (via pinyin::convert_to_utf8). */
489 /* the returned sentence should be freed by g_free(). */
490 bool pinyin_get_sentence(pinyin_instance_t * instance,
492 pinyin_context_t * & context = instance->m_context;
494 bool retval = pinyin::convert_to_utf8
495 (context->m_phrase_index, instance->m_match_results,
/* Parses a single full-pinyin syllable into *onekey using the context's
 * options. Returns true only when the parser consumed the whole string. */
501 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
502 const char * onepinyin,
503 ChewingKey * onekey){
504 pinyin_context_t * & context = instance->m_context;
506 int pinyin_len = strlen(onepinyin);
507 int parse_len = context->m_full_pinyin_parser->parse_one_key
508 ( context->m_options, *onekey, onepinyin, pinyin_len);
509 return pinyin_len == parse_len;
/* Parses a full-pinyin string into instance->m_pinyin_keys and
 * m_pinyin_key_rests. The instance also keeps its own copy of the raw string
 * in m_raw_full_pinyin (the previous copy is freed first); the divided and
 * re-split table lookups below read that copy. */
512 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
513 const char * pinyins){
514 pinyin_context_t * & context = instance->m_context;
516 g_free(instance->m_raw_full_pinyin);
517 instance->m_raw_full_pinyin = g_strdup(pinyins);
518 int pinyin_len = strlen(pinyins);
520 int parse_len = context->m_full_pinyin_parser->parse
521 ( context->m_options, instance->m_pinyin_keys,
522 instance->m_pinyin_key_rests, pinyins, pinyin_len);
/* Parses a single double-pinyin syllable into *onekey. Returns true only
 * when the parser consumed the whole string. */
527 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
528 const char * onepinyin,
529 ChewingKey * onekey){
530 pinyin_context_t * & context = instance->m_context;
532 int pinyin_len = strlen(onepinyin);
533 int parse_len = context->m_double_pinyin_parser->parse_one_key
534 ( context->m_options, *onekey, onepinyin, pinyin_len);
535 return pinyin_len == parse_len;
/* Parses a double-pinyin string into instance->m_pinyin_keys and
 * m_pinyin_key_rests. Unlike the full-pinyin variant, the visible lines do
 * not update m_raw_full_pinyin. */
538 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
539 const char * pinyins){
540 pinyin_context_t * & context = instance->m_context;
541 int pinyin_len = strlen(pinyins);
543 int parse_len = context->m_double_pinyin_parser->parse
544 ( context->m_options, instance->m_pinyin_keys,
545 instance->m_pinyin_key_rests, pinyins, pinyin_len);
/* Parses a single chewing syllable into *onekey. Returns true only when the
 * parser consumed the whole string. */
550 bool pinyin_parse_chewing(pinyin_instance_t * instance,
551 const char * onechewing,
552 ChewingKey * onekey){
553 pinyin_context_t * & context = instance->m_context;
555 int chewing_len = strlen(onechewing);
556 int parse_len = context->m_chewing_parser->parse_one_key
557 ( context->m_options, *onekey, onechewing, chewing_len );
558 return chewing_len == parse_len;
/* Parses a chewing string into instance->m_pinyin_keys and
 * m_pinyin_key_rests. The visible lines do not update m_raw_full_pinyin. */
561 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
562 const char * chewings){
563 pinyin_context_t * & context = instance->m_context;
564 int chewing_len = strlen(chewings);
566 int parse_len = context->m_chewing_parser->parse
567 ( context->m_options, instance->m_pinyin_keys,
568 instance->m_pinyin_key_rests, chewings, chewing_len);
/* Forwards to the chewing parser's in_chewing_scheme for `key`, passing the
 * context options and the `symbol` out-parameter through unchanged. */
573 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
574 const char key, const char ** symbol) {
575 pinyin_context_t * & context = instance->m_context;
576 return context->m_chewing_parser->in_chewing_scheme
577 (context->m_options, key, symbol);
/* GCompareFunc for g_array_sort: orders lookup_candidate_t items by m_token,
 * ascending.
 * NOTE(review): the result is a difference of two phrase_token_t values
 * converted to gint; if the token type is an unsigned 32-bit integer, large
 * differences can wrap and give the wrong sign - confirm the token range or
 * use explicit comparisons. */
581 static gint compare_item_with_token(gconstpointer lhs,
583 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
584 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
586 phrase_token_t token_lhs = item_lhs->m_token;
587 phrase_token_t token_rhs = item_rhs->m_token;
589 return (token_lhs - token_rhs);
/* GCompareFunc for g_array_sort: orders lookup_candidate_t items by m_freq,
 * highest first.
 * NOTE(review): the subtraction is done in guint32 and then converted to
 * gint; a difference of 2^31 or more wraps and yields the wrong sign -
 * confirm m_freq stays small enough or use explicit comparisons. */
592 static gint compare_item_with_frequency(gconstpointer lhs,
594 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
595 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
597 guint32 freq_lhs = item_lhs->m_freq;
598 guint32 freq_rhs = item_rhs->m_freq;
600 return -(freq_lhs - freq_rhs); /* in descending order */
/* Determines the token that precedes position `offset`, for bigram scoring.
 * Two sources are used:
 *  - the instance prefixes: starts from sentence_start and prefers the prefix
 *    token whose phrase is longest;
 *  - the match results: scans backwards from offset - 1 for the nearest
 *    non-null token, but only if the entry at `offset` itself is non-null.
 * NOTE(review): the condition choosing between the two sources is not
 * visible in this listing (presumably offset == 0 selects the prefixes). */
603 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
605 phrase_token_t prev_token = null_token;
609 /* get previous token from prefixes. */
610 prev_token = sentence_start;
611 size_t prev_token_len = 0;
613 pinyin_context_t * context = instance->m_context;
614 TokenVector prefixes = instance->m_prefixes;
617 for (size_t i = 0; i < prefixes->len; ++i) {
618 phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
619 if (sentence_start == token)
622 int retval = context->m_phrase_index->get_phrase_item(token, item);
623 if (ERROR_OK == retval) {
624 size_t token_len = item.get_phrase_length();
625 if (token_len > prev_token_len) {
626 /* found longer match, and save it. */
628 prev_token_len = token_len;
633 /* get previous token from match results. */
636 phrase_token_t cur_token = g_array_index
637 (instance->m_match_results, phrase_token_t, offset);
638 if (null_token != cur_token) {
/* walk left until a position that starts a phrase is found. */
639 for (i = offset - 1; i >= 0; --i) {
640 cur_token = g_array_index
641 (instance->m_match_results, phrase_token_t, i);
642 if (null_token != cur_token) {
643 prev_token = cur_token;
/* Flattens the per-library range lists in `ranges` into `items`: one
 * lookup_candidate_t is appended for every value in every
 * [m_range_begin, m_range_end) range. Candidate type, original key rest and
 * frequency are copied from `template_item`; m_new_pinyins is duplicated with
 * g_strdup, so each appended item owns its own string.
 * NOTE(review): the assignment of item.m_token is not visible in this
 * listing. */
653 static void _append_items(pinyin_context_t * context,
654 PhraseIndexRanges ranges,
655 lookup_candidate_t * template_item,
656 CandidateVector items) {
657 /* reduce and append to a single GArray. */
658 for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
659 if (NULL == ranges[m])
662 for (size_t n = 0; n < ranges[m]->len; ++n) {
663 PhraseIndexRange * range =
664 &g_array_index(ranges[m], PhraseIndexRange, n);
665 for (size_t k = range->m_range_begin;
666 k < range->m_range_end; ++k) {
667 lookup_candidate_t item;
668 item.m_candidate_type = template_item->m_candidate_type;
670 item.m_orig_rest = template_item->m_orig_rest;
671 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
672 item.m_freq = template_item->m_freq;
673 g_array_append_val(items, item);
/* Drops items whose m_token equals that of the preceding item. Only adjacent
 * duplicates are detected, so `items` must already be sorted by token (the
 * callers sort with compare_item_with_token first).
 * NOTE(review): how the index is adjusted after a removal, and whether the
 * removed item's m_new_pinyins is freed, is not visible in this listing. */
679 static void _remove_duplicated_items(CandidateVector items) {
680 /* remove the duplicated items. */
681 phrase_token_t last_token = null_token, saved_token;
682 for (size_t n = 0; n < items->len; ++n) {
683 lookup_candidate_t * item = &g_array_index
684 (items, lookup_candidate_t, n);
686 saved_token = item->m_token;
687 if (last_token == saved_token) {
688 g_array_remove_index(items, n);
691 last_token = saved_token;
/* Computes a score for every item in `items`, interpolating between the
 * bigram probability of the item's token after `prev_token` (only with
 * DYNAMIC_ADJUST and a non-null prev_token) and its unigram probability:
 *
 *   LAMBDA_PARAMETER * bigram_poss
 *     + (1 - LAMBDA_PARAMETER) * unigram_freq / total_freq
 *
 * scaled by 256^3 and truncated to guint32.
 * NOTE(review): the store of `freq` into the item is not visible in this
 * listing; nor is any guard against the merged gram's total_freq being 0
 * before the division. */
695 static void _compute_frequency_of_items(pinyin_context_t * context,
696 phrase_token_t prev_token,
697 SingleGram * merged_gram,
698 CandidateVector items) {
699 pinyin_option_t & options = context->m_options;
702 PhraseItem cached_item;
703 /* compute all freqs. */
704 for (i = 0; i < items->len; ++i) {
705 lookup_candidate_t * item = &g_array_index
706 (items, lookup_candidate_t, i);
707 phrase_token_t & token = item->m_token;
709 gfloat bigram_poss = 0; guint32 total_freq = 0;
710 if (options & DYNAMIC_ADJUST) {
711 if (null_token != prev_token) {
712 guint32 bigram_freq = 0;
713 merged_gram->get_total_freq(total_freq);
714 merged_gram->get_freq(token, bigram_freq);
716 bigram_poss = bigram_freq / (gfloat)total_freq;
720 /* compute the m_freq. */
721 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
722 phrase_index->get_phrase_item(token, cached_item);
/* total_freq is reused here for the phrase index total. */
723 total_freq = phrase_index->get_phrase_index_total_freq();
724 assert (0 < total_freq);
726 /* Note: possibility value <= 1.0. */
727 guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
728 (1 - LAMBDA_PARAMETER) *
729 cached_item.get_unigram_frequency() /
730 (gfloat) total_freq) * 256 * 256 * 256;
/* Fills `candidates` (cleared first) with phrase tokens matching the keys
 * starting at `offset`. Candidate lengths are tried from longest to shortest;
 * within one length, candidates are de-duplicated and ordered by the score
 * from _compute_frequency_of_items (highest first). With DYNAMIC_ADJUST the
 * score takes the previous token's merged system+user bigram into account. */
735 bool pinyin_get_candidates(pinyin_instance_t * instance,
737 TokenVector candidates) {
739 pinyin_context_t * & context = instance->m_context;
740 pinyin_option_t & options = context->m_options;
741 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
742 g_array_set_size(candidates, 0);
/* NOTE(review): unsigned subtraction; assumes offset <= pinyin_keys->len. */
744 size_t pinyin_len = pinyin_keys->len - offset;
747 /* lookup the previous token here. */
748 phrase_token_t prev_token = null_token;
750 if (options & DYNAMIC_ADJUST) {
751 prev_token = _get_previous_token(instance, offset);
754 SingleGram merged_gram;
755 SingleGram * system_gram = NULL, * user_gram = NULL;
757 if (options & DYNAMIC_ADJUST) {
758 if (null_token != prev_token) {
759 context->m_system_bigram->load(prev_token, system_gram);
760 context->m_user_bigram->load(prev_token, user_gram);
761 merge_single_gram(&merged_gram, system_gram, user_gram);
765 PhraseIndexRanges ranges;
766 memset(ranges, 0, sizeof(ranges));
767 context->m_phrase_index->prepare_ranges(ranges);
/* scratch array, reused for each candidate length. */
769 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
771 for (i = pinyin_len; i >= 1; --i) {
772 g_array_set_size(items, 0);
774 ChewingKey * keys = &g_array_index
775 (pinyin_keys, ChewingKey, offset);
777 /* do pinyin search. */
778 int retval = context->m_pinyin_table->search
781 if ( !(retval & SEARCH_OK) )
/* NOTE(review): template_item is passed on without any visible
 * initialisation; relies on lookup_candidate_t's default construction -
 * TODO confirm it has a constructor. */
784 lookup_candidate_t template_item;
785 _append_items(context, ranges, &template_item, items);
/* sort by token so that duplicates become adjacent. */
787 g_array_sort(items, compare_item_with_token);
789 _remove_duplicated_items(items);
791 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
793 /* sort the candidates of the same length by frequency. */
794 g_array_sort(items, compare_item_with_frequency);
796 /* transfer back items to tokens, and save it into candidates */
797 for (ssize_t k = 0; k < items->len; ++k) {
798 lookup_candidate_t * item = &g_array_index
799 (items, lookup_candidate_t, k);
800 g_array_append_val(candidates, item->m_token);
803 if (!(retval & SEARCH_CONTINUED))
807 g_array_free(items, TRUE);
809 context->m_phrase_index->destroy_ranges(ranges);
/* Tries to reinterpret the single key at `offset` as two keys using the full
 * pinyin parser's divided table (e.g. "xian" -> "xi'an"), and appends the
 * resulting two-key matches to `items` as DIVIDED_CANDIDATEs.
 *
 * With USE_TONE, a tone on the key is cleared for the table lookup and
 * propagated to the second divided key and to the "a'b<tone>" replacement
 * string stored in the candidates.
 * NOTE(review): the code that reads the tone from the key and restores it at
 * the end is only partly visible in this listing. */
819 static bool _try_divided_table(pinyin_instance_t * instance,
820 PhraseIndexRanges ranges,
822 CandidateVector items){
825 pinyin_context_t * & context = instance->m_context;
826 pinyin_option_t & options = context->m_options;
827 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
828 ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
830 assert(pinyin_keys->len == pinyin_key_rests->len);
831 gint num_keys = pinyin_keys->len;
832 assert(offset < num_keys);
834 /* handle "^xian$" -> "xi'an" here */
835 ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
836 ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
837 ChewingKeyRest, offset);
/* keep the original raw span; candidates record it as m_orig_rest. */
838 ChewingKeyRest orig_rest = *rest;
839 guint16 tone = CHEWING_ZERO_TONE;
841 const divided_table_item_t * item = NULL;
844 if (options & USE_TONE) {
846 if (CHEWING_ZERO_TONE != tone) {
847 key->m_tone = CHEWING_ZERO_TONE;
/* reads instance->m_raw_full_pinyin, which is only set by the full-pinyin
 * parsing entry point. */
852 item = context->m_full_pinyin_parser->retrieve_divided_item
853 (options, key, rest, instance->m_raw_full_pinyin,
854 strlen(instance->m_raw_full_pinyin));
858 assert(item->m_new_freq > 0);
860 ChewingKey divided_keys[2];
861 const char * pinyin = item->m_new_keys[0];
/* NOTE(review): parse_one_key is called inside assert(); with NDEBUG the
 * call is compiled out and divided_keys stays unset. */
862 assert(context->m_full_pinyin_parser->
863 parse_one_key(options, divided_keys[0],
864 pinyin, strlen(pinyin)));
865 pinyin = item->m_new_keys[1];
866 assert(context->m_full_pinyin_parser->
867 parse_one_key(options, divided_keys[1],
868 pinyin, strlen(pinyin)));
870 gchar * new_pinyins = g_strdup_printf
871 ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
873 /* propagate the tone */
874 if (options & USE_TONE) {
875 if (CHEWING_ZERO_TONE != tone) {
876 assert(0 < tone && tone <= 5);
877 divided_keys[1].m_tone = tone;
879 gchar * tmp_str = g_strdup_printf
880 ("%s%d", new_pinyins, tone);
882 new_pinyins = tmp_str;
886 /* do pinyin search. */
887 int retval = context->m_pinyin_table->search
888 (2, divided_keys, ranges);
890 if (retval & SEARCH_OK) {
891 lookup_candidate_t template_item;
892 template_item.m_candidate_type = DIVIDED_CANDIDATE;
893 template_item.m_orig_rest = orig_rest;
894 template_item.m_new_pinyins = new_pinyins;
896 _append_items(context, ranges, &template_item, items);
903 if (options & USE_TONE) {
904 if (CHEWING_ZERO_TONE != tone) {
/* Tries to re-split the two adjacent keys at `offset` and `offset + 1` using
 * the full pinyin parser's re-split table (e.g. "fa'nan" <-> "fan'an"), and
 * appends the resulting two-key matches to `items` as RESPLIT_CANDIDATEs.
 *
 * Only applies when the two keys are contiguous in the raw input and the
 * first key carries no tone. With USE_TONE, a tone on the second key is
 * temporarily removed (key tone cleared, raw end moved back by one) for the
 * table lookups, propagated to the second re-split key and the replacement
 * string, and restored at the end. */
913 static bool _try_resplit_table(pinyin_instance_t * instance,
914 PhraseIndexRanges ranges,
916 CandidateVector items){
919 pinyin_context_t * & context = instance->m_context;
920 pinyin_option_t & options = context->m_options;
921 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
922 ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
924 assert(pinyin_keys->len == pinyin_key_rests->len);
925 gint num_keys = pinyin_keys->len;
926 assert(offset + 1 < num_keys);
928 guint16 next_tone = CHEWING_ZERO_TONE;
930 /* handle "^fa'nan$" -> "fan'an" here */
931 ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
932 ChewingKeyRest, offset);
933 ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
934 ChewingKeyRest, offset + 1);
/* the two keys must be adjacent in the raw input. */
936 if (cur_rest->m_raw_end != next_rest->m_raw_begin)
939 ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
940 ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
944 if (CHEWING_ZERO_TONE != cur_key->m_tone)
/* orig_rest spans both keys in the raw input. */
947 ChewingKeyRest orig_rest;
948 orig_rest.m_raw_begin = cur_rest->m_raw_begin;
949 orig_rest.m_raw_end = next_rest->m_raw_end;
952 if (options & USE_TONE) {
953 next_tone = next_key->m_tone;
954 if (CHEWING_ZERO_TONE != next_tone) {
/* temporarily strip the tone; undone at the end of the function. */
955 next_key->m_tone = CHEWING_ZERO_TONE;
956 next_rest->m_raw_end --;
960 /* lookup re-split table */
961 const char * str = instance->m_raw_full_pinyin;
962 const resplit_table_item_t * item_by_orig =
963 context->m_full_pinyin_parser->
964 retrieve_resplit_item_by_original_pinyins
965 (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
967 const resplit_table_item_t * item_by_new =
968 context->m_full_pinyin_parser->
969 retrieve_resplit_item_by_resplit_pinyins
970 (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
972 /* there are no same couple of pinyins in re-split table. */
973 assert(!(item_by_orig && item_by_new));
975 ChewingKey resplit_keys[2];
976 const char * pinyins[2];
978 bool tosearch = false;
/* input matches the table's original form: offer the re-split form. */
979 if (item_by_orig && item_by_orig->m_new_freq) {
980 pinyins[0] = item_by_orig->m_new_keys[0];
981 pinyins[1] = item_by_orig->m_new_keys[1];
/* NOTE(review): parse_one_key is called inside assert(); with NDEBUG the
 * call is compiled out and resplit_keys stays unset. */
983 assert(context->m_full_pinyin_parser->
984 parse_one_key(options, resplit_keys[0],
985 pinyins[0], strlen(pinyins[0])));
987 assert(context->m_full_pinyin_parser->
988 parse_one_key(options, resplit_keys[1],
989 pinyins[1], strlen(pinyins[1])));
/* input matches the table's re-split form: offer the original form. */
993 if (item_by_new && item_by_new->m_orig_freq) {
994 pinyins[0] = item_by_new->m_orig_keys[0];
995 pinyins[1] = item_by_new->m_orig_keys[1];
997 assert(context->m_full_pinyin_parser->
998 parse_one_key(options, resplit_keys[0],
999 pinyins[0], strlen(pinyins[0])));
1001 assert(context->m_full_pinyin_parser->
1002 parse_one_key(options, resplit_keys[1],
1003 pinyins[1], strlen(pinyins[1])));
1008 gchar * new_pinyins = g_strdup_printf
1009 ("%s'%s", pinyins[0], pinyins[1]);
1011 /* propagate the tone */
1012 if (options & USE_TONE) {
1013 if (CHEWING_ZERO_TONE != next_tone) {
1014 assert(0 < next_tone && next_tone <= 5);
1015 resplit_keys[1].m_tone = next_tone;
1017 gchar * tmp_str = g_strdup_printf
1018 ("%s%d", new_pinyins, next_tone);
1019 g_free(new_pinyins);
1020 new_pinyins = tmp_str;
1024 /* do pinyin search. */
1025 int retval = context->m_pinyin_table->search
1026 (2, resplit_keys, ranges);
1028 if (retval & SEARCH_OK) {
1029 lookup_candidate_t template_item;
1030 template_item.m_candidate_type = RESPLIT_CANDIDATE;
1031 template_item.m_orig_rest = orig_rest;
1032 template_item.m_new_pinyins = new_pinyins;
1034 _append_items(context, ranges, &template_item, items);
/* _append_items duplicated the string for each item, so the local copy can
 * be released. */
1037 g_free(new_pinyins);
/* restore the tone and raw end stripped above. */
1041 if (options & USE_TONE) {
1042 if (CHEWING_ZERO_TONE != next_tone) {
1043 next_key->m_tone = next_tone;
1044 next_rest->m_raw_end ++;
/* Full-pinyin variant of pinyin_get_candidates: fills `candidates` with
 * complete lookup_candidate_t records (not just tokens) for the keys starting
 * at `offset`, including candidates produced by the divided and re-split
 * tables when USE_DIVIDED_TABLE / USE_RESPLIT_TABLE are set.
 *
 * Existing entries in `candidates` have their m_new_pinyins freed before the
 * array is cleared. The number of keys considered is capped at
 * MAX_PHRASE_LENGTH. Lengths are tried from longest to shortest; within one
 * length, candidates are de-duplicated and ordered by score, highest first. */
1051 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1053 CandidateVector candidates){
1055 pinyin_context_t * & context = instance->m_context;
1056 pinyin_option_t & options = context->m_options;
1057 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
/* release strings owned by the previous candidate list. */
1060 for (size_t i = 0; i < candidates->len; ++i) {
1061 lookup_candidate_t * candidate = &g_array_index
1062 (candidates, lookup_candidate_t, i);
1063 g_free(candidate->m_new_pinyins);
1065 g_array_set_size(candidates, 0);
1067 size_t pinyin_len = pinyin_keys->len - offset;
1068 pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1071 /* lookup the previous token here. */
1072 phrase_token_t prev_token = null_token;
1074 if (options & DYNAMIC_ADJUST) {
1075 prev_token = _get_previous_token(instance, offset);
1078 SingleGram merged_gram;
1079 SingleGram * system_gram = NULL, * user_gram = NULL;
1081 if (options & DYNAMIC_ADJUST) {
1082 if (null_token != prev_token) {
1083 context->m_system_bigram->load(prev_token, system_gram);
1084 context->m_user_bigram->load(prev_token, user_gram);
1085 merge_single_gram(&merged_gram, system_gram, user_gram);
1089 PhraseIndexRanges ranges;
1090 memset(ranges, 0, sizeof(ranges));
1091 context->m_phrase_index->prepare_ranges(ranges);
1093 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1095 if (1 == pinyin_len) {
1096 /* because there is only one pinyin left,
1097 * the following for-loop will not produce 2 character candidates.
1098 * the if-branch will fill the candidate list with
1099 * 2 character candidates.
1102 if (options & USE_DIVIDED_TABLE) {
1103 g_array_set_size(items, 0);
1105 if (_try_divided_table(instance, ranges, offset, items)) {
1107 g_array_sort(items, compare_item_with_token);
1109 _remove_duplicated_items(items);
1111 _compute_frequency_of_items(context, prev_token,
1112 &merged_gram, items);
1114 /* sort the candidates of the same length by frequency. */
1115 g_array_sort(items, compare_item_with_frequency);
1117 /* transfer back items to tokens, and save it into candidates */
1118 for (i = 0; i < items->len; ++i) {
1119 lookup_candidate_t * item = &g_array_index
1120 (items, lookup_candidate_t, i);
1121 g_array_append_val(candidates, *item);
1127 for (i = pinyin_len; i >= 1; --i) {
1129 g_array_set_size(items, 0);
1132 /* handle fuzzy pinyin segment here. */
1133 if (options & USE_DIVIDED_TABLE) {
1134 found = _try_divided_table(instance, ranges, offset, items) ||
1137 if (options & USE_RESPLIT_TABLE) {
1138 found = _try_resplit_table(instance, ranges, offset, items) ||
1143 ChewingKey * keys = &g_array_index
1144 (pinyin_keys, ChewingKey, offset);
1146 /* do pinyin search. */
1147 int retval = context->m_pinyin_table->search
1150 found = (retval & SEARCH_OK) || found;
/* NOTE(review): template_item is passed on without any visible
 * initialisation; relies on lookup_candidate_t's default construction -
 * TODO confirm it has a constructor. */
1155 lookup_candidate_t template_item;
1156 _append_items(context, ranges, &template_item, items);
1158 g_array_sort(items, compare_item_with_token);
1160 _remove_duplicated_items(items);
1162 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1164 g_array_sort(items, compare_item_with_frequency);
/* items are copied by value; the m_new_pinyins pointers move to candidates. */
1166 for (size_t k = 0; k < items->len; ++k) {
1167 lookup_candidate_t * item = &g_array_index
1168 (items, lookup_candidate_t, k);
1169 g_array_append_val(candidates, *item);
1172 if (!(retval & SEARCH_CONTINUED))
1176 g_array_free(items, TRUE);
1178 context->m_phrase_index->destroy_ranges(ranges);
/* Records the user's choice of `token` at `offset` as a constraint, then
 * re-validates the constraints against the keys.
 * Returns offset + len, where len is the value returned by add_constraint.
 * NOTE(review): retval is computed but the visible return statement does not
 * use it. */
1188 int pinyin_choose_candidate(pinyin_instance_t * instance,
1190 phrase_token_t token){
1191 pinyin_context_t * & context = instance->m_context;
1193 guint8 len = context->m_pinyin_lookup->add_constraint
1194 (instance->m_constraints, offset, token);
1196 bool retval = context->m_pinyin_lookup->validate_constraint
1197 (instance->m_constraints, instance->m_pinyin_keys) && len;
1199 return offset + len;
/* Full-pinyin variant of pinyin_choose_candidate.
 *
 * For DIVIDED/RESPLIT candidates the raw pinyin string is rewritten first:
 * the span recorded in candidate->m_orig_rest is replaced by
 * candidate->m_new_pinyins, and the new string is re-parsed into keys and key
 * rests. Then the constraints are synchronised, the candidate's token is
 * added as a constraint at `offset`, and the constraints are validated again.
 * Returns offset + len, where len is the value returned by add_constraint. */
1202 int pinyin_choose_full_pinyin_candidate(pinyin_instance_t * instance,
1204 lookup_candidate_t * candidate){
1205 pinyin_context_t * & context = instance->m_context;
1207 if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1208 RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1209 /* update full pinyin. */
1210 gchar * oldpinyins = instance->m_raw_full_pinyin;
1211 const ChewingKeyRest rest = candidate->m_orig_rest;
/* cut the old string in place so left_part ends where the span begins. */
1212 oldpinyins[rest.m_raw_begin] = '\0';
1213 const gchar * left_part = oldpinyins;
1214 const gchar * right_part = oldpinyins + rest.m_raw_end;
1215 gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1218 instance->m_raw_full_pinyin = newpinyins;
1220 /* re-parse the full pinyin. */
1221 const gchar * pinyins = instance->m_raw_full_pinyin;
1222 int pinyin_len = strlen(pinyins);
1223 int parse_len = context->m_full_pinyin_parser->parse
1224 (context->m_options, instance->m_pinyin_keys,
1225 instance->m_pinyin_key_rests, pinyins, pinyin_len);
1227 /* Note: there may be some un-parsable input here. */
1230 /* sync m_constraints to the length of m_pinyin_keys. */
1231 bool retval = context->m_pinyin_lookup->validate_constraint
1232 (instance->m_constraints, instance->m_pinyin_keys);
1234 phrase_token_t token = candidate->m_token;
1235 guint8 len = context->m_pinyin_lookup->add_constraint
1236 (instance->m_constraints, offset, token);
1238 /* safe guard: validate the m_constraints again. */
1239 retval = context->m_pinyin_lookup->validate_constraint
1240 (instance->m_constraints, instance->m_pinyin_keys) && len;
1242 return offset + len;
/* Clears the constraint at `offset` in instance->m_constraints via the
 * pinyin lookup. */
1246 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1248 pinyin_context_t * & context = instance->m_context;
1250 bool retval = context->m_pinyin_lookup->clear_constraint
1251 (instance->m_constraints, offset);
/* Clears every constraint in instance->m_constraints. clear_constraint is
 * placed on the left of &&, so it is called for every index even after an
 * earlier call has failed; retval accumulates the combined result. */
1256 bool pinyin_clear_constraints(pinyin_instance_t * instance){
1257 pinyin_context_t * & context = instance->m_context;
1260 for ( size_t i = 0; i < instance->m_constraints->len; ++i ) {
1261 retval = context->m_pinyin_lookup->clear_constraint
1262 (instance->m_constraints, i) && retval;
/* Looks up `token` in the phrase index and returns its phrase text as a newly
 * allocated UTF-8 string in *word. Returns true when the lookup reported
 * ERROR_OK.
 * NOTE(review): the phrase item is read and converted before retval is
 * checked, so *word is written even when the lookup failed - confirm that
 * this is safe for an unfilled item. */
1268 /* the returned word should be freed by g_free. */
1269 bool pinyin_translate_token(pinyin_instance_t * instance,
1270 phrase_token_t token, char ** word){
1271 pinyin_context_t * & context = instance->m_context;
1273 ucs4_t buffer[MAX_PHRASE_LENGTH];
1275 int retval = context->m_phrase_index->get_phrase_item(token, item);
1276 item.get_phrase_string(buffer);
1277 guint8 length = item.get_phrase_length();
1278 *word = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1279 return ERROR_OK == retval;
/* Feeds the current keys, constraints and match results to the pinyin
 * lookup's train_result2 and marks the context as modified so that
 * pinyin_save will write the user data. Guarded by the context having a user
 * directory. */
1282 bool pinyin_train(pinyin_instance_t * instance){
1283 if (!instance->m_context->m_user_dir)
1286 pinyin_context_t * & context = instance->m_context;
1287 context->m_modified = true;
1289 bool retval = context->m_pinyin_lookup->train_result2
1290 (instance->m_pinyin_keys, instance->m_constraints,
1291 instance->m_match_results);
/* Resets the per-input state of an instance: empties the keys, key rests,
 * constraints and match results. In the visible lines, m_prefixes and
 * m_raw_full_pinyin are left as they are. */
1296 bool pinyin_reset(pinyin_instance_t * instance){
1297 g_array_set_size(instance->m_pinyin_keys, 0);
1298 g_array_set_size(instance->m_pinyin_key_rests, 0);
1299 g_array_set_size(instance->m_constraints, 0);
1300 g_array_set_size(instance->m_match_results, 0);
1306 * Note: prefix is the text before the pre-edit string.