3 * Library to deal with pinyin.
5 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 #include <glib/gstdio.h>
27 #include "pinyin_internal.h"
29 /* a glue layer for input method integration. */
/* Engine-wide state shared by every pinyin_instance_t created from it:
 * the option flags, the input parsers, the lookup tables, the bigram
 * models and the two lookup engines built on top of them.
 * NOTE(review): other functions in this file also access
 * context->m_system_dir, context->m_user_dir and context->m_modified;
 * their declarations and the closing brace are not visible in this
 * excerpt. */
31 struct _pinyin_context_t{
32 pinyin_option_t m_options;
/* one parser per supported input scheme. */
34 FullPinyinParser2 * m_full_pinyin_parser;
35 DoublePinyinParser2 * m_double_pinyin_parser;
36 ChewingParser2 * m_chewing_parser;
/* lookup tables, phrase index and the system/user bigrams. */
38 FacadeChewingTable * m_pinyin_table;
39 FacadePhraseTable * m_phrase_table;
40 FacadePhraseIndex * m_phrase_index;
41 Bigram * m_system_bigram;
42 Bigram * m_user_bigram;
/* lookup engines constructed from the members above in pinyin_init(). */
44 PinyinLookup * m_pinyin_lookup;
45 PhraseLookup * m_phrase_lookup;
/* State of an import iterator.
 * NOTE(review): only the m_phrase_index member is visible in this
 * excerpt; presumably it names the phrase library being imported
 * into -- confirm against the full definition. */
52 struct _import_iterator_t{
53 guint8 m_phrase_index;
/* Compares the "version" file stored in userdir with
 * LIBPINYIN_FORMAT_VERSION (including its terminating NUL) and, when
 * they do not match, cleans up the user data files: each library's
 * user file and "user.db".
 * NOTE(review): the declaration of `chunk`, the comparison call, the
 * file removal calls, the g_free() calls and the return statements
 * are on lines missing from this excerpt. */
56 static bool check_format(const char * userdir){
57 gchar * filename = g_build_filename
58 (userdir, "version", NULL);
61 bool exists = chunk.load(filename);
65 (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
66 strlen(LIBPINYIN_FORMAT_VERSION) + 1));
73 /* clean up files, if version mis-matches. */
/* index 0 is skipped: the loop starts at 1. */
74 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
75 const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
/* entries that are unused or have no user file are tested here;
 * the statements they guard are not visible (presumably continue). */
77 if (NOT_USED == table_info->m_file_type)
80 if (NULL == table_info->m_user_filename)
83 const char * userfilename = table_info->m_user_filename;
85 /* remove dbin file. */
86 filename = g_build_filename(userdir, userfilename, NULL);
/* the user bigram database lives next to the per-library files. */
91 filename = g_build_filename
92 (userdir, "user.db", NULL);
/* Writes LIBPINYIN_FORMAT_VERSION (with its terminating NUL, hence the
 * strlen + 1) at offset 0 of a chunk and saves it as "version" in
 * userdir.  `retval` holds the result of the save.
 * NOTE(review): the declaration of `chunk`, the release of `filename`
 * and the return statement are not visible in this excerpt. */
99 static bool mark_version(const char * userdir){
100 gchar * filename = g_build_filename
101 (userdir, "version", NULL);
103 chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
104 strlen(LIBPINYIN_FORMAT_VERSION) + 1);
105 bool retval = chunk.save(filename);
/* Creates a new context.
 * systemdir: directory holding "pinyin_index.bin", "phrase_index.bin"
 *            and "bigram.db".
 * userdir:   directory holding the user's "version" file and "user.db".
 * Both paths are copied with g_strdup(), so the caller keeps ownership
 * of its own strings.
 * NOTE(review): what happens after a failed chunk->load() (beyond the
 * message printed to stderr), the release of `filename`, and the
 * return statement are on lines missing from this excerpt. */
110 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
111 pinyin_context_t * context = new pinyin_context_t;
/* the default option set is USE_TONE only. */
113 context->m_options = USE_TONE;
115 context->m_system_dir = g_strdup(systemdir);
116 context->m_user_dir = g_strdup(userdir);
117 context->m_modified = false;
/* the result of check_format() is not used here. */
119 check_format(context->m_user_dir);
/* load the pinyin table from the system directory. */
121 context->m_pinyin_table = new FacadeChewingTable;
122 MemoryChunk * chunk = new MemoryChunk;
123 gchar * filename = g_build_filename
124 (context->m_system_dir, "pinyin_index.bin", NULL);
125 if (!chunk->load(filename)) {
126 fprintf(stderr, "open %s failed!\n", filename);
131 context->m_pinyin_table->load(context->m_options, chunk, NULL);
133 context->m_full_pinyin_parser = new FullPinyinParser2;
134 context->m_double_pinyin_parser = new DoublePinyinParser2;
135 context->m_chewing_parser = new ChewingParser2;
/* load the phrase table from the system directory. */
137 context->m_phrase_table = new FacadePhraseTable;
138 chunk = new MemoryChunk;
139 filename = g_build_filename(context->m_system_dir, "phrase_index.bin", NULL);
140 if (!chunk->load(filename)) {
141 fprintf(stderr, "open %s failed!\n", filename);
145 context->m_phrase_table->load(chunk, NULL);
147 context->m_phrase_index = new FacadePhraseIndex;
149 /* hack here: directly call load phrase library. */
150 pinyin_load_phrase_library(context, 1);
/* the system bigram is attached read-only; the user bigram is loaded
 * from the user directory. */
152 context->m_system_bigram = new Bigram;
153 filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
154 context->m_system_bigram->attach(filename, ATTACH_READONLY);
157 context->m_user_bigram = new Bigram;
158 filename = g_build_filename(context->m_user_dir, "user.db", NULL);
159 context->m_user_bigram->load_db(filename);
/* both lookup engines share the phrase index and the two bigrams. */
162 context->m_pinyin_lookup = new PinyinLookup
163 ( context->m_options, context->m_pinyin_table,
164 context->m_phrase_index, context->m_system_bigram,
165 context->m_user_bigram);
167 context->m_phrase_lookup = new PhraseLookup
168 (context->m_phrase_table, context->m_phrase_index,
169 context->m_system_bigram, context->m_user_bigram);
/* Loads the phrase library numbered `index` into the phrase index.
 *  - SYSTEM_FILE: the library is loaded from the system directory and
 *    a log chunk loaded from the user directory is merged into it.
 *  - USER_FILE: the library is loaded from the user directory; per the
 *    comment below, a new sub phrase index is created when that file
 *    cannot be loaded.
 * `index` must be below PHRASE_INDEX_LIBRARY_COUNT (asserted).
 * NOTE(review): the second parameter line, the file-name arguments of
 * two g_build_filename() calls, the else branch and the return
 * statements are on lines missing from this excerpt.  The results of
 * chunk->load() and log->load() in the SYSTEM_FILE branch are not
 * checked in the visible lines. */
174 bool pinyin_load_phrase_library(pinyin_context_t * context,
176 assert(index < PHRASE_INDEX_LIBRARY_COUNT);
177 const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
179 if (SYSTEM_FILE == table_info->m_file_type) {
180 /* system phrase library */
181 MemoryChunk * chunk = new MemoryChunk;
183 const char * systemfilename = table_info->m_system_filename;
184 /* check bin file in system dir. */
185 gchar * chunkfilename = g_build_filename(context->m_system_dir,
186 systemfilename, NULL);
187 chunk->load(chunkfilename);
188 g_free(chunkfilename);
190 context->m_phrase_index->load(index, chunk);
192 const char * userfilename = table_info->m_user_filename;
194 chunkfilename = g_build_filename(context->m_user_dir,
197 MemoryChunk * log = new MemoryChunk;
198 log->load(chunkfilename);
199 g_free(chunkfilename);
201 /* merge the chunk log. */
202 context->m_phrase_index->merge(index, log);
206 if (USER_FILE == table_info->m_file_type) {
207 /* user phrase library */
208 MemoryChunk * chunk = new MemoryChunk;
209 const char * userfilename = table_info->m_user_filename;
211 gchar * chunkfilename = g_build_filename(context->m_user_dir,
214 /* check bin file exists. if not, create a new one. */
215 if (chunk->load(chunkfilename)) {
216 context->m_phrase_index->load(index, chunk);
219 context->m_phrase_index->create_sub_phrase(index);
222 g_free(chunkfilename);
/* Unloads the phrase library numbered `index` from the phrase index.
 * `index` must be below PHRASE_INDEX_LIBRARY_COUNT (asserted).
 * NOTE(review): the second parameter line, the check that implements
 * the comment below, and the return statement are not visible in this
 * excerpt. */
229 bool pinyin_unload_phrase_library(pinyin_context_t * context,
231 /* gb_char.bin can't be unloaded. */
235 assert(index < PHRASE_INDEX_LIBRARY_COUNT);
237 context->m_phrase_index->unload(index);
/* Persists user data into the user directory.
 * For every loaded phrase library (index 0 is skipped):
 *  - SYSTEM_FILE: the difference between the in-memory library and
 *    the system file is written as a log chunk;
 *  - USER_FILE: the whole library is stored.
 * Each file is first written under a ".tmp" name and then renamed to
 * its final name.  Finally the user bigram is saved as "user.db", the
 * format version is marked and m_modified is reset to false.
 * The function tests m_user_dir and m_modified first.
 * NOTE(review): the statements guarded by the early tests, the
 * remaining g_build_filename() arguments, several g_free() calls and
 * the return statement are on lines missing from this excerpt.  In the
 * visible lines the results of save(), save_db() and rename() are not
 * checked, so a failed write would go unreported -- confirm. */
242 bool pinyin_save(pinyin_context_t * context){
243 if (!context->m_user_dir)
246 if (!context->m_modified)
249 context->m_phrase_index->compact();
251 /* skip the reserved zero phrase library. */
252 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
253 PhraseIndexRange range;
254 int retval = context->m_phrase_index->get_range(i, range);
/* libraries that are not loaded, not used, or have no user file are
 * tested below; the guarded statements are not visible. */
256 if (ERROR_NO_SUB_PHRASE_INDEX == retval)
259 const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
261 if (NOT_USED == table_info->m_file_type)
264 const char * userfilename = table_info->m_user_filename;
266 if (NULL == userfilename)
269 if (SYSTEM_FILE == table_info->m_file_type) {
270 /* system phrase library */
271 MemoryChunk * chunk = new MemoryChunk;
272 MemoryChunk * log = new MemoryChunk;
273 const char * systemfilename = table_info->m_system_filename;
275 /* check bin file in system dir. */
276 gchar * chunkfilename = g_build_filename(context->m_system_dir,
277 systemfilename, NULL);
278 chunk->load(chunkfilename);
279 g_free(chunkfilename);
/* `log` receives the difference against the system chunk. */
280 context->m_phrase_index->diff(i, chunk, log);
282 const char * userfilename = table_info->m_user_filename;
283 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
285 gchar * tmppathname = g_build_filename(context->m_user_dir,
289 gchar * chunkpathname = g_build_filename(context->m_user_dir,
/* write to the temporary path, then rename over the final path. */
291 log->save(tmppathname);
292 rename(tmppathname, chunkpathname);
293 g_free(chunkpathname);
298 if (USER_FILE == table_info->m_file_type) {
299 /* user phrase library */
300 MemoryChunk * chunk = new MemoryChunk;
301 context->m_phrase_index->store(i, chunk);
303 const char * userfilename = table_info->m_user_filename;
304 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
305 gchar * tmppathname = g_build_filename(context->m_user_dir,
309 gchar * chunkpathname = g_build_filename(context->m_user_dir,
312 chunk->save(tmppathname);
313 rename(tmppathname, chunkpathname);
314 g_free(chunkpathname);
/* save the user bigram with the same write-then-rename scheme. */
320 gchar * tmpfilename = g_build_filename(context->m_user_dir,
321 "user.db.tmp", NULL);
323 gchar * filename = g_build_filename(context->m_user_dir, "user.db", NULL);
324 context->m_user_bigram->save_db(tmpfilename);
325 rename(tmpfilename, filename);
329 mark_version(context->m_user_dir);
331 context->m_modified = false;
/* Forwards `scheme` to the context's double pinyin parser.
 * NOTE(review): the return statement is not visible in this excerpt. */
335 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
336 DoublePinyinScheme scheme){
337 context->m_double_pinyin_parser->set_scheme(scheme);
/* Forwards `scheme` to the context's chewing parser.
 * NOTE(review): the return statement is not visible in this excerpt. */
341 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
342 ChewingScheme scheme){
343 context->m_chewing_parser->set_scheme(scheme);
/* Tears down a context: deletes every object created by pinyin_init()
 * and frees the copied directory strings.
 * NOTE(review): the release of `context` itself is not visible in this
 * excerpt; user data is not saved by the visible lines, so callers
 * presumably call pinyin_save() first -- confirm. */
348 void pinyin_fini(pinyin_context_t * context){
349 delete context->m_full_pinyin_parser;
350 delete context->m_double_pinyin_parser;
351 delete context->m_chewing_parser;
352 delete context->m_pinyin_table;
353 delete context->m_phrase_table;
354 delete context->m_phrase_index;
355 delete context->m_system_bigram;
356 delete context->m_user_bigram;
357 delete context->m_pinyin_lookup;
358 delete context->m_phrase_lookup;
360 g_free(context->m_system_dir);
361 g_free(context->m_user_dir);
362 context->m_modified = false;
365 /* copy from options to context->m_options. */
/* The new value is also pushed to the pinyin table and to the pinyin
 * lookup so that all three stay in agreement.
 * NOTE(review): the return statement is not visible in this excerpt. */
366 bool pinyin_set_options(pinyin_context_t * context,
367 pinyin_option_t options){
368 context->m_options = options;
369 context->m_pinyin_table->set_options(context->m_options);
370 context->m_pinyin_lookup->set_options(context->m_options);
/* Creates an instance bound to `context`, with no raw full pinyin
 * string and with empty arrays for prefixes, pinyin keys, key rests,
 * constraints and match results.  The instance keeps a pointer to the
 * context; it does not copy it.
 * NOTE(review): the return statement is not visible in this excerpt. */
375 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
376 pinyin_instance_t * instance = new pinyin_instance_t;
377 instance->m_context = context;
379 instance->m_raw_full_pinyin = NULL;
/* all arrays are created non-zero-terminated and without clearing
 * (the two FALSE arguments of g_array_new). */
381 instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
382 instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
383 instance->m_pinyin_key_rests =
384 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
385 instance->m_constraints = g_array_new
386 (FALSE, FALSE, sizeof(lookup_constraint_t));
387 instance->m_match_results =
388 g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
/* Releases the raw full pinyin string and the five arrays owned by
 * the instance (array storage included, hence TRUE).  The context the
 * instance points to is not touched by the visible lines.
 * NOTE(review): the release of `instance` itself is not visible in
 * this excerpt. */
393 void pinyin_free_instance(pinyin_instance_t * instance){
394 g_free(instance->m_raw_full_pinyin);
395 g_array_free(instance->m_prefixes, TRUE);
396 g_array_free(instance->m_pinyin_keys, TRUE);
397 g_array_free(instance->m_pinyin_key_rests, TRUE);
398 g_array_free(instance->m_constraints, TRUE);
399 g_array_free(instance->m_match_results, TRUE);
/* Resizes the constraint array to the number of pinyin keys, marks
 * every newly added slot as NO_CONSTRAINT, then asks the pinyin lookup
 * to validate the constraints against the keys.
 * NOTE(review): the return statement is not visible in this excerpt. */
405 static bool pinyin_update_constraints(pinyin_instance_t * instance){
406 pinyin_context_t * & context = instance->m_context;
407 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
408 CandidateConstraints & constraints = instance->m_constraints;
/* remember the old length: only slots from there on are new. */
410 size_t key_len = constraints->len;
411 g_array_set_size(constraints, pinyin_keys->len);
412 for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
413 lookup_constraint_t * constraint =
414 &g_array_index(constraints, lookup_constraint_t, i);
415 constraint->m_type = NO_CONSTRAINT;
418 context->m_pinyin_lookup->validate_constraint
419 (constraints, pinyin_keys);
/* Computes the best match for the instance's pinyin keys, using
 * sentence_start as the only prefix token.  The result is written to
 * instance->m_match_results; `retval` is the lookup's result.
 * NOTE(review): the return statement is not visible in this excerpt. */
425 bool pinyin_guess_sentence(pinyin_instance_t * instance){
426 pinyin_context_t * & context = instance->m_context;
/* reset the prefixes to exactly { sentence_start }. */
428 g_array_set_size(instance->m_prefixes, 0);
429 g_array_append_val(instance->m_prefixes, sentence_start);
431 pinyin_update_constraints(instance);
432 bool retval = context->m_pinyin_lookup->get_best_match
433 (instance->m_prefixes,
434 instance->m_pinyin_keys,
435 instance->m_constraints,
436 instance->m_match_results);
/* Like pinyin_guess_sentence(), but also seeds the prefixes from
 * `prefix` (UTF-8).  After sentence_start, every trailing substring of
 * the prefix with length 1..written that is found in the phrase table
 * contributes its token.  The loop also tests the length against
 * MAX_PHRASE_LENGTH.
 * NOTE(review): the declaration of `written`, the statement guarded by
 * the MAX_PHRASE_LENGTH test (presumably break), the release of
 * `ucs4_str` and the return statement are not visible in this
 * excerpt. */
441 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
442 const char * prefix){
443 pinyin_context_t * & context = instance->m_context;
445 g_array_set_size(instance->m_prefixes, 0);
446 g_array_append_val(instance->m_prefixes, sentence_start);
449 ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &written, NULL);
451 if (ucs4_str && written) {
/* i is the length of the trailing substring being looked up. */
453 for (ssize_t i = 1; i <= written; ++i) {
454 if (i > MAX_PHRASE_LENGTH)
457 phrase_token_t token = null_token;
/* `start` points at the last i characters of the prefix. */
458 ucs4_t * start = ucs4_str + written - i;
459 int result = context->m_phrase_table->search(i, start, token);
460 if (result & SEARCH_OK)
461 g_array_append_val(instance->m_prefixes, token);
466 pinyin_update_constraints(instance);
467 bool retval = context->m_pinyin_lookup->get_best_match
468 (instance->m_prefixes,
469 instance->m_pinyin_keys,
470 instance->m_constraints,
471 instance->m_match_results);
/* Segments `sentence` (UTF-8) into phrases: the text is converted to
 * UCS-4 and handed to the phrase lookup, which writes the tokens to
 * instance->m_match_results.
 * NOTE(review): the declaration of `ucs4_len`, the release of
 * `ucs4_str` and the return statement are not visible in this excerpt.
 * g_return_val_if_fail() returns FALSE right after the conversion has
 * allocated `ucs4_str`, so that path looks like it leaks the buffer --
 * confirm against the full source. */
476 bool pinyin_phrase_segment(pinyin_instance_t * instance,
477 const char * sentence){
478 pinyin_context_t * & context = instance->m_context;
480 const glong num_of_chars = g_utf8_strlen(sentence, -1);
482 ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
/* sanity check: both GLib calls must agree on the character count. */
484 g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
486 bool retval = context->m_phrase_lookup->get_best_match
487 (ucs4_len, ucs4_str, instance->m_match_results);
493 /* the returned sentence should be freed by g_free(). */
/* Converts instance->m_match_results to a UTF-8 string by way of the
 * context's phrase index.
 * NOTE(review): the output parameter line, the remaining arguments of
 * convert_to_utf8() and the return statement are not visible in this
 * excerpt. */
494 bool pinyin_get_sentence(pinyin_instance_t * instance,
496 pinyin_context_t * & context = instance->m_context;
498 bool retval = pinyin::convert_to_utf8
499 (context->m_phrase_index, instance->m_match_results,
/* Parses `onepinyin` as a single full pinyin syllable into *onekey.
 * Returns true only when the parser reports a parsed length equal to
 * the length of the input string. */
505 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
506 const char * onepinyin,
507 ChewingKey * onekey){
508 pinyin_context_t * & context = instance->m_context;
510 int pinyin_len = strlen(onepinyin);
511 int parse_len = context->m_full_pinyin_parser->parse_one_key
512 ( context->m_options, *onekey, onepinyin, pinyin_len);
513 return pinyin_len == parse_len;
/* Parses `pinyins` as a sequence of full pinyin syllables into the
 * instance's key and key-rest arrays.  A private copy of the raw input
 * replaces the previous instance->m_raw_full_pinyin.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably parse_len is returned -- confirm. */
516 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
517 const char * pinyins){
518 pinyin_context_t * & context = instance->m_context;
/* drop the previous raw string before storing the new copy. */
520 g_free(instance->m_raw_full_pinyin);
521 instance->m_raw_full_pinyin = g_strdup(pinyins);
522 int pinyin_len = strlen(pinyins);
524 int parse_len = context->m_full_pinyin_parser->parse
525 ( context->m_options, instance->m_pinyin_keys,
526 instance->m_pinyin_key_rests, pinyins, pinyin_len);
/* Parses `onepinyin` as a single double pinyin syllable into *onekey.
 * Returns true only when the parser reports a parsed length equal to
 * the length of the input string. */
531 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
532 const char * onepinyin,
533 ChewingKey * onekey){
534 pinyin_context_t * & context = instance->m_context;
536 int pinyin_len = strlen(onepinyin);
537 int parse_len = context->m_double_pinyin_parser->parse_one_key
538 ( context->m_options, *onekey, onepinyin, pinyin_len);
539 return pinyin_len == parse_len;
/* Parses `pinyins` as a sequence of double pinyin syllables into the
 * instance's key and key-rest arrays.  In the visible lines the raw
 * input is not stored in the instance.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably parse_len is returned -- confirm. */
542 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
543 const char * pinyins){
544 pinyin_context_t * & context = instance->m_context;
545 int pinyin_len = strlen(pinyins);
547 int parse_len = context->m_double_pinyin_parser->parse
548 ( context->m_options, instance->m_pinyin_keys,
549 instance->m_pinyin_key_rests, pinyins, pinyin_len);
/* Parses `onechewing` as a single chewing syllable into *onekey.
 * Returns true only when the parser reports a parsed length equal to
 * the length of the input string. */
554 bool pinyin_parse_chewing(pinyin_instance_t * instance,
555 const char * onechewing,
556 ChewingKey * onekey){
557 pinyin_context_t * & context = instance->m_context;
559 int chewing_len = strlen(onechewing);
560 int parse_len = context->m_chewing_parser->parse_one_key
561 ( context->m_options, *onekey, onechewing, chewing_len );
562 return chewing_len == parse_len;
/* Parses `chewings` as a sequence of chewing syllables into the
 * instance's key and key-rest arrays.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably parse_len is returned -- confirm. */
565 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
566 const char * chewings){
567 pinyin_context_t * & context = instance->m_context;
568 int chewing_len = strlen(chewings);
570 int parse_len = context->m_chewing_parser->parse
571 ( context->m_options, instance->m_pinyin_keys,
572 instance->m_pinyin_key_rests, chewings, chewing_len);
/* Asks the chewing parser whether `key` belongs to the current
 * chewing scheme; `symbol` is passed through to the parser as its
 * output argument.  Returns the parser's answer unchanged. */
577 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
578 const char key, const char ** symbol) {
579 pinyin_context_t * & context = instance->m_context;
580 return context->m_chewing_parser->in_chewing_scheme
581 (context->m_options, key, symbol);
585 static gint compare_item_with_token(gconstpointer lhs,
587 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
588 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
590 phrase_token_t token_lhs = item_lhs->m_token;
591 phrase_token_t token_rhs = item_rhs->m_token;
593 return (token_lhs - token_rhs);
596 static gint compare_item_with_frequency(gconstpointer lhs,
598 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
599 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
601 guint32 freq_lhs = item_lhs->m_freq;
602 guint32 freq_rhs = item_rhs->m_freq;
604 return -(freq_lhs - freq_rhs); /* in descendant order */
/* Finds the token that precedes position `offset`, for use as the
 * left context of a bigram lookup.  Two sources are visible:
 *  - the instance's prefixes: starting from sentence_start, the token
 *    whose phrase is longest is preferred;
 *  - the match results: the nearest non-null token before `offset`.
 * NOTE(review): the second parameter line, the condition that selects
 * between the two sources, the declarations of `item` and `i`, the
 * assignment that records the longer prefix token, and the return
 * statement are not visible in this excerpt.  The backward loop tests
 * `i >= 0`, so `i` must be a signed type -- confirm. */
607 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
609 phrase_token_t prev_token = null_token;
613 /* get previous token from prefixes. */
614 prev_token = sentence_start;
615 size_t prev_token_len = 0;
617 pinyin_context_t * context = instance->m_context;
618 TokenVector prefixes = instance->m_prefixes;
621 for (size_t i = 0; i < prefixes->len; ++i) {
622 phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
623 if (sentence_start == token)
626 int retval = context->m_phrase_index->get_phrase_item(token, item);
627 if (ERROR_OK == retval) {
628 size_t token_len = item.get_phrase_length();
629 if (token_len > prev_token_len) {
630 /* found longer match, and save it. */
632 prev_token_len = token_len;
637 /* get previous token from match results. */
640 phrase_token_t cur_token = g_array_index
641 (instance->m_match_results, phrase_token_t, offset);
/* only when a token starts at `offset` is the search backwards made. */
642 if (null_token != cur_token) {
643 for (i = offset - 1; i >= 0; --i) {
644 cur_token = g_array_index
645 (instance->m_match_results, phrase_token_t, i);
646 if (null_token != cur_token) {
647 prev_token = cur_token;
/* Flattens `ranges` into `items`: for every value k in every range of
 * every phrase library, one candidate is appended that copies the
 * candidate type, original key rest and frequency of `template_item`.
 * Each appended item gets its own g_strdup() copy of m_new_pinyins.
 * NOTE(review): the statement that skips empty libraries and the
 * assignment of the item's token are on lines missing from this
 * excerpt.  `context` is not used in the visible lines. */
657 static void _append_items(pinyin_context_t * context,
658 PhraseIndexRanges ranges,
659 lookup_candidate_t * template_item,
660 CandidateVector items) {
661 /* reduce and append to a single GArray. */
662 for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
663 if (NULL == ranges[m])
666 for (size_t n = 0; n < ranges[m]->len; ++n) {
667 PhraseIndexRange * range =
668 &g_array_index(ranges[m], PhraseIndexRange, n);
/* the range is half-open: m_range_end itself is not visited. */
669 for (size_t k = range->m_range_begin;
670 k < range->m_range_end; ++k) {
671 lookup_candidate_t item;
672 item.m_candidate_type = template_item->m_candidate_type;
674 item.m_orig_rest = template_item->m_orig_rest;
675 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
676 item.m_freq = template_item->m_freq;
677 g_array_append_val(items, item);
/* Drops every item whose token equals the token of the item before
 * it.  Only adjacent duplicates are detected, so `items` is expected
 * to be sorted by token beforehand.
 * NOTE(review): g_array_remove_index() shifts the following elements
 * down; the index adjustment that must follow it is on a line missing
 * from this excerpt.  The m_new_pinyins string of a removed item is
 * not freed in the visible lines -- confirm whether that leaks. */
683 static void _remove_duplicated_items(CandidateVector items) {
684 /* remove the duplicated items. */
685 phrase_token_t last_token = null_token, saved_token;
686 for (size_t n = 0; n < items->len; ++n) {
687 lookup_candidate_t * item = &g_array_index
688 (items, lookup_candidate_t, n);
/* copy the token first: `item` is stale once the element is removed. */
690 saved_token = item->m_token;
691 if (last_token == saved_token) {
692 g_array_remove_index(items, n);
695 last_token = saved_token;
/* Computes a score for every item from two parts, interpolated with
 * LAMBDA_PARAMETER and scaled by 256 * 256 * 256:
 *  - the bigram possibility of the item's token following
 *    `prev_token`, read from `merged_gram` (only with DYNAMIC_ADJUST
 *    and a non-null prev_token, otherwise 0);
 *  - the unigram frequency of the token divided by the phrase index's
 *    total frequency.
 * NOTE(review): the declaration of `i`, any guard around the division
 * by the bigram total, and the store of `freq` into the item are on
 * lines missing from this excerpt. */
699 static void _compute_frequency_of_items(pinyin_context_t * context,
700 phrase_token_t prev_token,
701 SingleGram * merged_gram,
702 CandidateVector items) {
703 pinyin_option_t & options = context->m_options;
706 PhraseItem cached_item;
707 /* compute all freqs. */
708 for (i = 0; i < items->len; ++i) {
709 lookup_candidate_t * item = &g_array_index
710 (items, lookup_candidate_t, i);
711 phrase_token_t & token = item->m_token;
713 gfloat bigram_poss = 0; guint32 total_freq = 0;
714 if (options & DYNAMIC_ADJUST) {
715 if (null_token != prev_token) {
716 guint32 bigram_freq = 0;
717 merged_gram->get_total_freq(total_freq);
718 merged_gram->get_freq(token, bigram_freq);
720 bigram_poss = bigram_freq / (gfloat)total_freq;
724 /* compute the m_freq. */
725 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
726 phrase_index->get_phrase_item(token, cached_item);
/* total_freq is reused here for the unigram total. */
727 total_freq = phrase_index->get_phrase_index_total_freq();
728 assert (0 < total_freq);
730 /* Note: possibility value <= 1.0. */
731 guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
732 (1 - LAMBDA_PARAMETER) *
733 cached_item.get_unigram_frequency() /
734 (gfloat) total_freq) * 256 * 256 * 256;
/* Fills `candidates` with the tokens that match the pinyin keys
 * starting at `offset`.  Key counts are tried from longest to
 * shortest; within one length the items are de-duplicated by token and
 * ordered by descending computed frequency before their tokens are
 * appended.  With DYNAMIC_ADJUST set, the previous token and the
 * merged system/user single gram feed the frequency computation.
 * `candidates` is emptied first.
 * NOTE(review): the `offset` parameter line, the declaration of `i`,
 * the arguments of the table search, the statements guarded by the two
 * retval tests, the release of system_gram/user_gram and the return
 * statement are on lines missing from this excerpt.  template_item is
 * passed to _append_items() without an initialisation that is visible
 * here -- confirm it is set up on the missing lines or by a
 * constructor. */
739 bool pinyin_get_candidates(pinyin_instance_t * instance,
741 TokenVector candidates) {
743 pinyin_context_t * & context = instance->m_context;
744 pinyin_option_t & options = context->m_options;
745 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
746 g_array_set_size(candidates, 0);
/* number of keys from offset to the end of the input. */
748 size_t pinyin_len = pinyin_keys->len - offset;
751 /* lookup the previous token here. */
752 phrase_token_t prev_token = null_token;
754 if (options & DYNAMIC_ADJUST) {
755 prev_token = _get_previous_token(instance, offset);
758 SingleGram merged_gram;
759 SingleGram * system_gram = NULL, * user_gram = NULL;
761 if (options & DYNAMIC_ADJUST) {
762 if (null_token != prev_token) {
763 context->m_system_bigram->load(prev_token, system_gram);
764 context->m_user_bigram->load(prev_token, user_gram);
765 merge_single_gram(&merged_gram, system_gram, user_gram);
769 PhraseIndexRanges ranges;
770 memset(ranges, 0, sizeof(ranges));
771 context->m_phrase_index->prepare_ranges(ranges);
773 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
/* longest key sequence first. */
775 for (i = pinyin_len; i >= 1; --i) {
776 g_array_set_size(items, 0);
778 ChewingKey * keys = &g_array_index
779 (pinyin_keys, ChewingKey, offset);
781 /* do pinyin search. */
782 int retval = context->m_pinyin_table->search
785 if ( !(retval & SEARCH_OK) )
788 lookup_candidate_t template_item;
789 _append_items(context, ranges, &template_item, items);
/* sorting by token makes duplicates adjacent for the next step. */
791 g_array_sort(items, compare_item_with_token);
793 _remove_duplicated_items(items);
795 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
797 /* sort the candidates of the same length by frequency. */
798 g_array_sort(items, compare_item_with_frequency);
800 /* transfer back items to tokens, and save it into candidates */
801 for (ssize_t k = 0; k < items->len; ++k) {
802 lookup_candidate_t * item = &g_array_index
803 (items, lookup_candidate_t, k);
804 g_array_append_val(candidates, item->m_token);
807 if (!(retval & SEARCH_CONTINUED))
811 g_array_free(items, TRUE);
813 context->m_phrase_index->destroy_ranges(ranges);
/* Tries to split the single key at `offset` into two keys by way of
 * the parser's divided table (the existing comment gives "xian" ->
 * "xi'an" as the example) and, when the two-key search succeeds,
 * appends DIVIDED_CANDIDATE items carrying the original key rest and
 * the new pinyin string "first'second" (plus the tone digit when a
 * tone was present).
 * With USE_TONE the key's tone is cleared before the table lookup and
 * propagated to the second divided key afterwards.
 * NOTE(review): the `offset` parameter line, the statements that read
 * and later restore the tone, the handling of a missing table item,
 * the g_free() calls and the return statements are on lines missing
 * from this excerpt.  The parse_one_key() calls sit inside assert(),
 * so they are not executed when assertions are compiled out (NDEBUG)
 * and divided_keys would then stay unset -- confirm how the library is
 * built. */
823 static bool _try_divided_table(pinyin_instance_t * instance,
824 PhraseIndexRanges ranges,
826 CandidateVector items){
829 pinyin_context_t * & context = instance->m_context;
830 pinyin_option_t & options = context->m_options;
831 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
832 ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
834 assert(pinyin_keys->len == pinyin_key_rests->len);
835 gint num_keys = pinyin_keys->len;
836 assert(offset < num_keys);
838 /* handle "^xian$" -> "xi'an" here */
839 ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
840 ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
841 ChewingKeyRest, offset);
/* keep a copy of the rest: the candidates record the original span. */
842 ChewingKeyRest orig_rest = *rest;
843 guint16 tone = CHEWING_ZERO_TONE;
845 const divided_table_item_t * item = NULL;
848 if (options & USE_TONE) {
850 if (CHEWING_ZERO_TONE != tone) {
851 key->m_tone = CHEWING_ZERO_TONE;
856 item = context->m_full_pinyin_parser->retrieve_divided_item
857 (options, key, rest, instance->m_raw_full_pinyin,
858 strlen(instance->m_raw_full_pinyin));
862 assert(item->m_new_freq > 0);
864 ChewingKey divided_keys[2];
865 const char * pinyin = item->m_new_keys[0];
866 assert(context->m_full_pinyin_parser->
867 parse_one_key(options, divided_keys[0],
868 pinyin, strlen(pinyin)));
869 pinyin = item->m_new_keys[1];
870 assert(context->m_full_pinyin_parser->
871 parse_one_key(options, divided_keys[1],
872 pinyin, strlen(pinyin)));
874 gchar * new_pinyins = g_strdup_printf
875 ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
877 /* propagate the tone */
878 if (options & USE_TONE) {
879 if (CHEWING_ZERO_TONE != tone) {
880 assert(0 < tone && tone <= 5);
881 divided_keys[1].m_tone = tone;
883 gchar * tmp_str = g_strdup_printf
884 ("%s%d", new_pinyins, tone);
886 new_pinyins = tmp_str;
890 /* do pinyin search. */
891 int retval = context->m_pinyin_table->search
892 (2, divided_keys, ranges);
894 if (retval & SEARCH_OK) {
895 lookup_candidate_t template_item;
896 template_item.m_candidate_type = DIVIDED_CANDIDATE;
897 template_item.m_orig_rest = orig_rest;
898 template_item.m_new_pinyins = new_pinyins;
900 _append_items(context, ranges, &template_item, items);
907 if (options & USE_TONE) {
908 if (CHEWING_ZERO_TONE != tone) {
/* Tries to re-split the two adjacent keys at `offset` and `offset + 1`
 * by way of the parser's re-split table (the existing comment gives
 * "fa'nan" -> "fan'an" as the example).  The table is consulted in
 * both directions; whichever entry is found, and has a non-zero
 * frequency for the other spelling, supplies the two replacement
 * pinyins.  When the two-key search succeeds, RESPLIT_CANDIDATE items
 * are appended carrying the span of both original keys and the new
 * pinyin string "first'second" (plus the tone digit when present).
 * With USE_TONE the second key's tone is temporarily cleared, with its
 * raw end moved back by one, and both are restored at the end.
 * NOTE(review): the `offset` parameter line, the statements guarded by
 * the adjacency and tone tests, the assignments to `tosearch`, the
 * test that uses it and the return statements are on lines missing
 * from this excerpt.  The parse_one_key() calls sit inside assert(),
 * so they are not executed when assertions are compiled out (NDEBUG)
 * -- confirm how the library is built. */
917 static bool _try_resplit_table(pinyin_instance_t * instance,
918 PhraseIndexRanges ranges,
920 CandidateVector items){
923 pinyin_context_t * & context = instance->m_context;
924 pinyin_option_t & options = context->m_options;
925 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
926 ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
928 assert(pinyin_keys->len == pinyin_key_rests->len);
929 gint num_keys = pinyin_keys->len;
/* two keys are needed, so offset + 1 must still be a valid index. */
930 assert(offset + 1 < num_keys);
932 guint16 next_tone = CHEWING_ZERO_TONE;
934 /* handle "^fa'nan$" -> "fan'an" here */
935 ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
936 ChewingKeyRest, offset);
937 ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
938 ChewingKeyRest, offset + 1);
/* the two keys must be directly adjacent in the raw input. */
940 if (cur_rest->m_raw_end != next_rest->m_raw_begin)
943 ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
944 ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
948 if (CHEWING_ZERO_TONE != cur_key->m_tone)
/* the candidates record the raw span covered by both keys. */
951 ChewingKeyRest orig_rest;
952 orig_rest.m_raw_begin = cur_rest->m_raw_begin;
953 orig_rest.m_raw_end = next_rest->m_raw_end;
956 if (options & USE_TONE) {
957 next_tone = next_key->m_tone;
958 if (CHEWING_ZERO_TONE != next_tone) {
959 next_key->m_tone = CHEWING_ZERO_TONE;
960 next_rest->m_raw_end --;
964 /* lookup re-split table */
965 const char * str = instance->m_raw_full_pinyin;
966 const resplit_table_item_t * item_by_orig =
967 context->m_full_pinyin_parser->
968 retrieve_resplit_item_by_original_pinyins
969 (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
971 const resplit_table_item_t * item_by_new =
972 context->m_full_pinyin_parser->
973 retrieve_resplit_item_by_resplit_pinyins
974 (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
976 /* there are no same couple of pinyins in re-split table. */
977 assert(!(item_by_orig && item_by_new));
979 ChewingKey resplit_keys[2];
980 const char * pinyins[2];
982 bool tosearch = false;
/* input matches the original spelling: offer the re-split one. */
983 if (item_by_orig && item_by_orig->m_new_freq) {
984 pinyins[0] = item_by_orig->m_new_keys[0];
985 pinyins[1] = item_by_orig->m_new_keys[1];
987 assert(context->m_full_pinyin_parser->
988 parse_one_key(options, resplit_keys[0],
989 pinyins[0], strlen(pinyins[0])));
991 assert(context->m_full_pinyin_parser->
992 parse_one_key(options, resplit_keys[1],
993 pinyins[1], strlen(pinyins[1])));
/* input matches the re-split spelling: offer the original one. */
997 if (item_by_new && item_by_new->m_orig_freq) {
998 pinyins[0] = item_by_new->m_orig_keys[0];
999 pinyins[1] = item_by_new->m_orig_keys[1];
1001 assert(context->m_full_pinyin_parser->
1002 parse_one_key(options, resplit_keys[0],
1003 pinyins[0], strlen(pinyins[0])));
1005 assert(context->m_full_pinyin_parser->
1006 parse_one_key(options, resplit_keys[1],
1007 pinyins[1], strlen(pinyins[1])));
1012 gchar * new_pinyins = g_strdup_printf
1013 ("%s'%s", pinyins[0], pinyins[1]);
1015 /* propagate the tone */
1016 if (options & USE_TONE) {
1017 if (CHEWING_ZERO_TONE != next_tone) {
1018 assert(0 < next_tone && next_tone <= 5);
1019 resplit_keys[1].m_tone = next_tone;
1021 gchar * tmp_str = g_strdup_printf
1022 ("%s%d", new_pinyins, next_tone);
1023 g_free(new_pinyins);
1024 new_pinyins = tmp_str;
1028 /* do pinyin search. */
1029 int retval = context->m_pinyin_table->search
1030 (2, resplit_keys, ranges);
1032 if (retval & SEARCH_OK) {
1033 lookup_candidate_t template_item;
1034 template_item.m_candidate_type = RESPLIT_CANDIDATE;
1035 template_item.m_orig_rest = orig_rest;
1036 template_item.m_new_pinyins = new_pinyins;
1038 _append_items(context, ranges, &template_item, items);
/* _append_items() copied the string for each item. */
1041 g_free(new_pinyins);
/* undo the temporary tone removal done before the table lookup. */
1045 if (options & USE_TONE) {
1046 if (CHEWING_ZERO_TONE != next_tone) {
1047 next_key->m_tone = next_tone;
1048 next_rest->m_raw_end ++;
/* Fills `candidates` with lookup_candidate_t items (not bare tokens)
 * for the pinyin keys starting at `offset`, optionally adding
 * candidates from the divided and re-split tables when
 * USE_DIVIDED_TABLE / USE_RESPLIT_TABLE are set.
 * The m_new_pinyins strings of the items already in `candidates` are
 * freed before the array is emptied.  The number of keys considered is
 * capped at MAX_PHRASE_LENGTH.  Within one length the items are
 * de-duplicated by token and ordered by descending computed frequency.
 * NOTE(review): the `offset` parameter line, the declarations of `i`
 * and `found`, the second operands of the two `found = ... ||`
 * expressions, the arguments of the table search, the statements
 * guarded by the retval tests, the release of system_gram/user_gram
 * and the return statement are on lines missing from this excerpt.
 * template_item is passed to _append_items() without an initialisation
 * that is visible here -- confirm. */
1055 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1057 CandidateVector candidates){
1059 pinyin_context_t * & context = instance->m_context;
1060 pinyin_option_t & options = context->m_options;
1061 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
/* the array owns the strings of its items: release them first. */
1064 for (size_t i = 0; i < candidates->len; ++i) {
1065 lookup_candidate_t * candidate = &g_array_index
1066 (candidates, lookup_candidate_t, i);
1067 g_free(candidate->m_new_pinyins);
1069 g_array_set_size(candidates, 0);
1071 size_t pinyin_len = pinyin_keys->len - offset;
1072 pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1075 /* lookup the previous token here. */
1076 phrase_token_t prev_token = null_token;
1078 if (options & DYNAMIC_ADJUST) {
1079 prev_token = _get_previous_token(instance, offset);
1082 SingleGram merged_gram;
1083 SingleGram * system_gram = NULL, * user_gram = NULL;
1085 if (options & DYNAMIC_ADJUST) {
1086 if (null_token != prev_token) {
1087 context->m_system_bigram->load(prev_token, system_gram);
1088 context->m_user_bigram->load(prev_token, user_gram);
1089 merge_single_gram(&merged_gram, system_gram, user_gram);
1093 PhraseIndexRanges ranges;
1094 memset(ranges, 0, sizeof(ranges));
1095 context->m_phrase_index->prepare_ranges(ranges);
1097 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1099 if (1 == pinyin_len) {
1100 /* because there is only one pinyin left,
1101 * the following for-loop will not produce 2 character candidates.
1102 * the if-branch will fill the candidate list with
1103 * 2 character candidates.
1106 if (options & USE_DIVIDED_TABLE) {
1107 g_array_set_size(items, 0);
1109 if (_try_divided_table(instance, ranges, offset, items)) {
1111 g_array_sort(items, compare_item_with_token);
1113 _remove_duplicated_items(items);
1115 _compute_frequency_of_items(context, prev_token,
1116 &merged_gram, items);
1118 /* sort the candidates of the same length by frequency. */
1119 g_array_sort(items, compare_item_with_frequency);
1121 /* transfer back items to tokens, and save it into candidates */
1122 for (i = 0; i < items->len; ++i) {
1123 lookup_candidate_t * item = &g_array_index
1124 (items, lookup_candidate_t, i);
1125 g_array_append_val(candidates, *item);
/* longest key sequence first. */
1131 for (i = pinyin_len; i >= 1; --i) {
1133 g_array_set_size(items, 0);
1136 /* handle fuzzy pinyin segment here. */
1137 if (options & USE_DIVIDED_TABLE) {
1138 found = _try_divided_table(instance, ranges, offset, items) ||
1141 if (options & USE_RESPLIT_TABLE) {
1142 found = _try_resplit_table(instance, ranges, offset, items) ||
1147 ChewingKey * keys = &g_array_index
1148 (pinyin_keys, ChewingKey, offset);
1150 /* do pinyin search. */
1151 int retval = context->m_pinyin_table->search
1154 found = (retval & SEARCH_OK) || found;
1159 lookup_candidate_t template_item;
1160 _append_items(context, ranges, &template_item, items);
/* sorting by token makes duplicates adjacent for the next step. */
1162 g_array_sort(items, compare_item_with_token);
1164 _remove_duplicated_items(items);
1166 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1168 g_array_sort(items, compare_item_with_frequency);
/* items are copied by value, so `candidates` takes over their
 * m_new_pinyins strings. */
1170 for (size_t k = 0; k < items->len; ++k) {
1171 lookup_candidate_t * item = &g_array_index
1172 (items, lookup_candidate_t, k);
1173 g_array_append_val(candidates, *item);
1176 if (!(retval & SEARCH_CONTINUED))
1180 g_array_free(items, TRUE);
1182 context->m_phrase_index->destroy_ranges(ranges);
/* Records the user's choice of `token` at position `offset` as a
 * lookup constraint, re-validates the constraints against the pinyin
 * keys, and returns the position just after the chosen phrase
 * (offset + the length reported by add_constraint()).
 * NOTE(review): the `offset` parameter line is not visible in this
 * excerpt.  `retval` is computed but not used in the visible lines; a
 * failed add_constraint() (len == 0) makes the function return
 * `offset` itself. */
1192 int pinyin_choose_candidate(pinyin_instance_t * instance,
1194 phrase_token_t token){
1195 pinyin_context_t * & context = instance->m_context;
1197 guint8 len = context->m_pinyin_lookup->add_constraint
1198 (instance->m_constraints, offset, token);
1200 bool retval = context->m_pinyin_lookup->validate_constraint
1201 (instance->m_constraints, instance->m_pinyin_keys) && len;
1203 return offset + len;
/* Records the user's choice of `candidate` at position `offset`.
 * For DIVIDED_CANDIDATE and RESPLIT_CANDIDATE items the raw full
 * pinyin string is first rewritten: the span named by the candidate's
 * m_orig_rest is replaced with its m_new_pinyins, and the result is
 * parsed again into the instance's key arrays.  The constraints are
 * then validated, the candidate's token is added as a constraint, and
 * the constraints are validated once more.
 * Returns offset + the length reported by add_constraint().
 * NOTE(review): the `offset` parameter line, the remaining arguments
 * of g_strconcat() and the release of the old raw string are on lines
 * missing from this excerpt.  `parse_len` and `retval` are computed
 * but not used in the visible lines. */
1206 int pinyin_choose_full_pinyin_candidate(pinyin_instance_t * instance,
1208 lookup_candidate_t * candidate){
1209 pinyin_context_t * & context = instance->m_context;
1211 if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1212 RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1213 /* update full pinyin. */
1214 gchar * oldpinyins = instance->m_raw_full_pinyin;
1215 const ChewingKeyRest rest = candidate->m_orig_rest;
/* terminate the old string at the span start so that it can serve
 * as the left part; the right part starts at the span end. */
1216 oldpinyins[rest.m_raw_begin] = '\0';
1217 const gchar * left_part = oldpinyins;
1218 const gchar * right_part = oldpinyins + rest.m_raw_end;
1219 gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1222 instance->m_raw_full_pinyin = newpinyins;
1224 /* re-parse the full pinyin. */
1225 const gchar * pinyins = instance->m_raw_full_pinyin;
1226 int pinyin_len = strlen(pinyins);
1227 int parse_len = context->m_full_pinyin_parser->parse
1228 (context->m_options, instance->m_pinyin_keys,
1229 instance->m_pinyin_key_rests, pinyins, pinyin_len);
1231 /* Note: there may be some un-parsable input here. */
1234 /* sync m_constraints to the length of m_pinyin_keys. */
1235 bool retval = context->m_pinyin_lookup->validate_constraint
1236 (instance->m_constraints, instance->m_pinyin_keys);
1238 phrase_token_t token = candidate->m_token;
1239 guint8 len = context->m_pinyin_lookup->add_constraint
1240 (instance->m_constraints, offset, token);
1242 /* safe guard: validate the m_constraints again. */
1243 retval = context->m_pinyin_lookup->validate_constraint
1244 (instance->m_constraints, instance->m_pinyin_keys) && len;
1246 return offset + len;
/* Clears the constraint at position `offset` through the pinyin
 * lookup; `retval` holds the lookup's result.
 * NOTE(review): the `offset` parameter line and the return statement
 * are not visible in this excerpt. */
1250 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1252 pinyin_context_t * & context = instance->m_context;
1254 bool retval = context->m_pinyin_lookup->clear_constraint
1255 (instance->m_constraints, offset);
/* Clears every constraint of the instance, one position at a time.
 * The call is the left operand of &&, so every position is attempted
 * even after an earlier one has failed; `retval` accumulates the
 * results.
 * NOTE(review): the declaration of `retval` and the return statement
 * are not visible in this excerpt. */
1260 bool pinyin_clear_constraints(pinyin_instance_t * instance){
1261 pinyin_context_t * & context = instance->m_context;
1264 for ( size_t i = 0; i < instance->m_constraints->len; ++i ) {
1265 retval = context->m_pinyin_lookup->clear_constraint
1266 (instance->m_constraints, i) && retval;
1272 /* the returned word should be freed by g_free. */
/* Looks up `token` in the phrase index and stores its phrase, as a
 * newly allocated UTF-8 string, in *word.  Returns true when the
 * lookup reported ERROR_OK.
 * NOTE(review): the declaration of `item` is not visible in this
 * excerpt.  The phrase string and length are read from `item` before
 * the lookup result is examined, so *word is written even when the
 * lookup failed -- confirm what `item` holds in that case. */
1273 bool pinyin_translate_token(pinyin_instance_t * instance,
1274 phrase_token_t token, char ** word){
1275 pinyin_context_t * & context = instance->m_context;
1277 ucs4_t buffer[MAX_PHRASE_LENGTH];
1279 int retval = context->m_phrase_index->get_phrase_item(token, item);
1280 item.get_phrase_string(buffer);
1281 guint8 length = item.get_phrase_length();
1282 *word = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1283 return ERROR_OK == retval;
/* Trains the pinyin lookup with the instance's keys, constraints and
 * match results, and marks the context as modified so that a later
 * pinyin_save() writes the user data.  The user directory is tested
 * first.
 * NOTE(review): the statement guarded by the m_user_dir test and the
 * return statement are not visible in this excerpt. */
1286 bool pinyin_train(pinyin_instance_t * instance){
1287 if (!instance->m_context->m_user_dir)
1290 pinyin_context_t * & context = instance->m_context;
1291 context->m_modified = true;
1293 bool retval = context->m_pinyin_lookup->train_result2
1294 (instance->m_pinyin_keys, instance->m_constraints,
1295 instance->m_match_results);
/* Empties the instance's pinyin keys, key rests, constraints and
 * match results so that a new input can be parsed.  The arrays
 * themselves are kept.
 * NOTE(review): the return statement is not visible in this excerpt. */
1300 bool pinyin_reset(pinyin_instance_t * instance){
1301 g_array_set_size(instance->m_pinyin_keys, 0);
1302 g_array_set_size(instance->m_pinyin_key_rests, 0);
1303 g_array_set_size(instance->m_constraints, 0);
1304 g_array_set_size(instance->m_match_results, 0);
1310 * Note: prefix is the text before the pre-edit string.