3 * Library to deal with pinyin.
5 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 #include <glib/gstdio.h>
27 #include "pinyin_internal.h"
29 using namespace pinyin;
31 /* a glue layer for input method integration. */
/* Shared engine state: option flags, the three parsers, the pinyin and
 * phrase tables, the phrase index, the system and user bigrams, and the two
 * lookup engines.  Every member listed here is assigned in pinyin_init().
 * NOTE(review): the listing jumps from 47 to 54 here; pinyin_init() also
 * writes m_system_dir, m_user_dir and m_modified, so those declarations and
 * the closing brace are presumably on lines that are not shown. */
33 struct _pinyin_context_t{
34 pinyin_option_t m_options;
/* one parser per supported input scheme */
36 FullPinyinParser2 * m_full_pinyin_parser;
37 DoublePinyinParser2 * m_double_pinyin_parser;
38 ChewingParser2 * m_chewing_parser;
/* tables and indexes, each loaded from a system part and a user part */
40 FacadeChewingTable * m_pinyin_table;
41 FacadePhraseTable2 * m_phrase_table;
42 FacadePhraseIndex * m_phrase_index;
43 Bigram * m_system_bigram;
44 Bigram * m_user_bigram;
/* lookup engines built on top of the tables above */
46 PinyinLookup2 * m_pinyin_lookup;
47 PhraseLookup * m_phrase_lookup;
/* Per-instance state layered on a shared context: the raw full-pinyin
 * string, the prefix tokens, the parsed keys and their rests, the lookup
 * constraints, the match results and a candidate vector.
 * pinyin_alloc_instance() creates the arrays and pinyin_free_instance()
 * releases them.
 * NOTE(review): the closing brace is not visible in this listing. */
54 struct _pinyin_instance_t{
55 pinyin_context_t * m_context;
56 gchar * m_raw_full_pinyin;
57 TokenVector m_prefixes;
58 ChewingKeyVector m_pinyin_keys;
59 ChewingKeyRestVector m_pinyin_key_rests;
60 CandidateConstraints m_constraints;
61 MatchResults m_match_results;
62 CandidateVector m_candidates;
/* One entry of a candidate list: its type, the phrase text, the phrase
 * token, the original key rest, an optional replacement pinyin string and
 * a frequency used for ordering.  m_phrase_string and m_new_pinyins are
 * released with g_free() by _free_candidates().
 * NOTE(review): the constructor is only partly visible (lines after 75 are
 * missing), so the initial values of the remaining members cannot be
 * confirmed from this listing. */
65 struct _lookup_candidate_t{
66 enum lookup_candidate_type_t m_candidate_type;
67 gchar * m_phrase_string;
68 phrase_token_t m_token;
69 ChewingKeyRest m_orig_rest;
70 gchar * m_new_pinyins;
71 guint32 m_freq; /* the amplified gfloat numerical value. */
73 _lookup_candidate_t() {
74 m_candidate_type = NORMAL_CANDIDATE;
75 m_phrase_string = NULL;
/* Handle used while importing phrases: the owning context plus the number
 * of the sub phrase index that receives the phrases.  Both members are set
 * in pinyin_begin_add_phrases().
 * NOTE(review): the closing brace is not visible in this listing. */
82 struct _import_iterator_t{
83 pinyin_context_t * m_context;
84 guint8 m_phrase_index;
/* Loads the "version" file under userdir, involves LIBPINYIN_FORMAT_VERSION
 * (including its terminating NUL) in a comparison with the loaded content
 * and, according to the comment below, cleans up the user data files when
 * the version does not match.
 * NOTE(review): many lines are missing from this listing (91-92, 94-96,
 * 99-104 and the lines after each g_build_filename call).  The comparison
 * call, the early return, the removal calls and the g_free of each filename
 * are therefore not visible; confirm them against the full file. */
88 static bool check_format(const char * userdir){
89 gchar * filename = g_build_filename
90 (userdir, "version", NULL);
/* result of loading the version file; chunk is declared on a missing line */
93 bool exists = chunk.load(filename);
97 (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
98 strlen(LIBPINYIN_FORMAT_VERSION) + 1));
105 /* clean up files, if version mis-matches. */
/* starts at 1, so entry 0 of pinyin_phrase_files is never visited */
106 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
107 const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
/* entries without a file type or without a user file are filtered here;
   the statement taken for them is on a missing line */
109 if (NOT_USED == table_info->m_file_type)
112 if (NULL == table_info->m_user_filename)
115 const char * userfilename = table_info->m_user_filename;
117 /* remove dbin file. */
118 filename = g_build_filename(userdir, userfilename, NULL);
/* the three fixed user files are handled after the per-library files */
123 filename = g_build_filename
124 (userdir, "user_pinyin_index.bin", NULL);
128 filename = g_build_filename
129 (userdir, "user_phrase_index.bin", NULL);
133 filename = g_build_filename
134 (userdir, "user.db", NULL);
/* Writes LIBPINYIN_FORMAT_VERSION, including its terminating NUL, to the
 * "version" file under userdir and keeps the result of the save in retval.
 * NOTE(review): the declaration of chunk (line 144), the release of
 * filename and the return statement are on lines missing from this
 * listing. */
141 static bool mark_version(const char * userdir){
142 gchar * filename = g_build_filename
143 (userdir, "version", NULL);
145 chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
146 strlen(LIBPINYIN_FORMAT_VERSION) + 1);
147 bool retval = chunk.save(filename);
/* Creates a context for the given data directories.
 *
 * systemdir and userdir are copied with g_strdup().  The function then runs
 * check_format() on the user directory, creates the three parsers, loads
 * the system and user chewing tables and phrase tables, loads the
 * GB_DICTIONARY and MERGED_DICTIONARY phrase libraries, attaches the system
 * bigram read-only, loads the user bigram, and finally builds the pinyin
 * and phrase lookup objects.
 *
 * NOTE(review): lines are missing from this listing after each failed-load
 * message (176-178, 202-204), after each user-table fallback, and at the
 * end of the function.  The failure handling, the g_free of each filename
 * and the return statement therefore cannot be confirmed here. */
152 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
153 pinyin_context_t * context = new pinyin_context_t;
155 context->m_options = USE_TONE;
157 context->m_system_dir = g_strdup(systemdir);
158 context->m_user_dir = g_strdup(userdir);
159 context->m_modified = false;
/* must run before any user file is loaded below */
161 check_format(context->m_user_dir);
163 context->m_full_pinyin_parser = new FullPinyinParser2;
164 context->m_double_pinyin_parser = new DoublePinyinParser2;
165 context->m_chewing_parser = new ChewingParser2;
167 /* load chewing table. */
168 context->m_pinyin_table = new FacadeChewingTable;
170 /* load system chewing table. */
171 MemoryChunk * chunk = new MemoryChunk;
172 gchar * filename = g_build_filename
173 (context->m_system_dir, "pinyin_index.bin", NULL);
174 if (!chunk->load(filename)) {
175 fprintf(stderr, "open %s failed!\n", filename);
180 /* load user chewing table */
181 MemoryChunk * userchunk = new MemoryChunk;
182 filename = g_build_filename
183 (context->m_user_dir, "user_pinyin_index.bin", NULL);
184 if (!userchunk->load(filename)) {
185 /* hack here: use local Chewing Table to create empty memory chunk. */
186 ChewingLargeTable table(context->m_options);
187 table.store(userchunk);
/* both chunks are handed to the facade table */
191 context->m_pinyin_table->load(context->m_options, chunk, userchunk);
193 /* load phrase table */
194 context->m_phrase_table = new FacadePhraseTable2;
196 /* load system phrase table */
197 chunk = new MemoryChunk;
198 filename = g_build_filename
199 (context->m_system_dir, "phrase_index.bin", NULL);
200 if (!chunk->load(filename)) {
201 fprintf(stderr, "open %s failed!\n", filename);
206 /* load user phrase table */
207 userchunk = new MemoryChunk;
208 filename = g_build_filename
209 (context->m_user_dir, "user_phrase_index.bin", NULL);
210 if (!userchunk->load(filename)) {
211 /* hack here: use local Phrase Table to create empty memory chunk. */
212 PhraseLargeTable2 table;
213 table.store(userchunk);
217 context->m_phrase_table->load(chunk, userchunk);
219 context->m_phrase_index = new FacadePhraseIndex;
221 /* hack here: directly call load phrase library. */
222 pinyin_load_phrase_library(context, GB_DICTIONARY);
223 pinyin_load_phrase_library(context, MERGED_DICTIONARY);
225 context->m_system_bigram = new Bigram;
226 filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
227 context->m_system_bigram->attach(filename, ATTACH_READONLY);
230 context->m_user_bigram = new Bigram;
231 filename = g_build_filename(context->m_user_dir, "user.db", NULL);
232 context->m_user_bigram->load_db(filename);
/* the lookup engines keep pointers to the tables created above */
235 context->m_pinyin_lookup = new PinyinLookup2
236 ( context->m_options, context->m_pinyin_table,
237 context->m_phrase_index, context->m_system_bigram,
238 context->m_user_bigram);
240 context->m_phrase_lookup = new PhraseLookup
241 (context->m_phrase_table, context->m_phrase_index,
242 context->m_system_bigram, context->m_user_bigram);
/* Loads one sub phrase index, selected by index, into the context.
 *
 * The index is range-checked and get_range() is used to detect a library
 * that is already loaded.  For SYSTEM_FILE and DICTIONARY entries the
 * system file is loaded and the user-side log file is merged on top; for
 * USER_FILE entries the user file is loaded if that succeeds, otherwise an
 * empty sub phrase index is created.
 *
 * NOTE(review): the second parameter (line 248), the return statements and
 * the filename arguments on lines 277-278 and 294-295 are missing from this
 * listing.  What happens when the range check fails or the library is
 * already loaded is not visible. */
247 bool pinyin_load_phrase_library(pinyin_context_t * context,
249 if (!(index < PHRASE_INDEX_LIBRARY_COUNT))
252 /* check whether the sub phrase index is already loaded. */
253 PhraseIndexRange range;
254 int retval = context->m_phrase_index->get_range(index, range);
255 if (ERROR_OK == retval)
258 const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
260 if (SYSTEM_FILE == table_info->m_file_type ||
261 DICTIONARY == table_info->m_file_type) {
262 /* system phrase library */
263 MemoryChunk * chunk = new MemoryChunk;
265 const char * systemfilename = table_info->m_system_filename;
266 /* check bin file in system dir. */
267 gchar * chunkfilename = g_build_filename(context->m_system_dir,
268 systemfilename, NULL);
/* the result of this load is not checked on the visible lines */
269 chunk->load(chunkfilename);
270 g_free(chunkfilename);
272 context->m_phrase_index->load(index, chunk);
274 const char * userfilename = table_info->m_user_filename;
276 chunkfilename = g_build_filename(context->m_user_dir,
279 MemoryChunk * log = new MemoryChunk;
280 log->load(chunkfilename);
281 g_free(chunkfilename);
283 /* merge the chunk log. */
284 context->m_phrase_index->merge(index, log);
288 if (USER_FILE == table_info->m_file_type) {
289 /* user phrase library */
290 MemoryChunk * chunk = new MemoryChunk;
291 const char * userfilename = table_info->m_user_filename;
293 gchar * chunkfilename = g_build_filename(context->m_user_dir,
296 /* check bin file exists. if not, create a new one. */
297 if (chunk->load(chunkfilename)) {
298 context->m_phrase_index->load(index, chunk);
301 context->m_phrase_index->create_sub_phrase(index);
304 g_free(chunkfilename);
/* Unloads the sub phrase index selected by index.  GB_DICTIONARY and
 * MERGED_DICTIONARY are treated specially (the comment says they cannot be
 * unloaded), and any other index is asserted to be below
 * PHRASE_INDEX_LIBRARY_COUNT before unload() is called.
 * NOTE(review): the second parameter (line 312) and the return statements
 * are missing from this listing. */
311 bool pinyin_unload_phrase_library(pinyin_context_t * context,
313 /* gb_char.bin and merged.bin can't be unloaded. */
314 if (GB_DICTIONARY == index || MERGED_DICTIONARY == index)
317 assert(index < PHRASE_INDEX_LIBRARY_COUNT);
319 context->m_phrase_index->unload(index);
/* Starts a phrase import: allocates an import_iterator_t with new and
 * records the context and the target sub phrase index in it.
 * NOTE(review): the second parameter (line 324) and the return statement
 * are missing from this listing; the iterator is presumably returned to
 * the caller. */
323 import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context,
325 import_iterator_t * iter = new import_iterator_t;
326 iter->m_context = context;
327 iter->m_phrase_index = index;
/* Adds one phrase with its pinyin to the sub phrase index chosen by iter.
 *
 * Visible steps:
 *  - the phrase is converted to UCS-4 and the pinyin is parsed with a local
 *    FullPinyinParser2 using PINYIN_CORRECT_ALL | USE_TONE;
 *  - the phrase length is compared with the number of parsed keys and with
 *    MAX_PHRASE_LENGTH;
 *  - the phrase table is searched and one token is picked from the hits;
 *  - if that token belongs to the target sub phrase index, the item is
 *    removed, given an extra pronunciation and added back;
 *  - otherwise a token is taken from the end of the index range, and the
 *    phrase is added to the phrase table, the pinyin table and the phrase
 *    index, with a unigram frequency of count * unigram_factor.
 *
 * NOTE(review): the parameters phrase, pinyin and count (lines 332-334),
 * several declarations (tokens, item), the bodies of the early checks and
 * the return statements are missing from this listing.
 * NOTE(review): the comment opened on the line numbered 400 has no closing
 * delimiter on the visible lines (line 401 is missing).  As listed, the
 * comment therefore extends to the end of the line numbered 412. */
331 bool pinyin_iterator_add_phrase(import_iterator_t * iter,
335 /* if -1 == count, use the default value. */
336 const gint default_count = 5;
337 const guint32 unigram_factor = 3;
339 count = default_count;
341 pinyin_context_t * & context = iter->m_context;
342 FacadePhraseTable2 * & phrase_table = context->m_phrase_table;
343 FacadeChewingTable * & pinyin_table = context->m_pinyin_table;
344 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
346 /* check whether the phrase exists in phrase table */
347 glong len_phrase = 0;
348 ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL);
352 pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
353 FullPinyinParser2 parser;
354 ChewingKeyVector keys =
355 g_array_new(FALSE, FALSE, sizeof(ChewingKey));
356 ChewingKeyRestVector key_rests =
357 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
359 /* parse the pinyin. */
360 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
/* one key is expected per character of the phrase */
362 if (len_phrase != keys->len)
365 if (len_phrase >= MAX_PHRASE_LENGTH)
368 phrase_token_t token = null_token;
369 GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
371 /* do phrase table search. */
373 memset(tokens, 0, sizeof(PhraseTokens));
374 phrase_index->prepare_tokens(tokens);
375 int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens);
376 int num = reduce_tokens(tokens, tokenarray);
377 phrase_index->destroy_tokens(tokens);
379 /* find the best token candidate. */
380 for (size_t i = 0; i < tokenarray->len; ++i) {
381 phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
382 if (null_token == token) {
387 if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) {
388 /* only one phrase string per sub phrase index. */
389 assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index);
394 g_array_free(tokenarray, TRUE);
397 /* check whether it exists in the same sub phrase index; */
398 if (null_token != token &&
399 PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) {
400 /* if so, remove the phrase, add the pinyin for the phrase item,
402 phrase_index->get_phrase_item(token, item);
403 assert(len_phrase == item.get_phrase_length());
404 ucs4_t tmp_phrase[MAX_PHRASE_LENGTH];
405 item.get_phrase_string(tmp_phrase);
407 (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase));
409 PhraseItem * removed_item = NULL;
410 retval = phrase_index->remove_phrase_item(token, removed_item);
411 if (ERROR_OK == retval) {
412 /* maybe check whether there are duplicated pronunciations here. */
413 removed_item->append_pronunciation((ChewingKey *)keys->data,
415 phrase_index->add_phrase_item(token, removed_item);
420 /* if not exists in the same sub phrase index,
421 get the maximum token,
422 then add it directly with maximum token + 1; */
423 PhraseIndexRange range;
424 retval = phrase_index->get_range(iter->m_phrase_index, range);
426 if (ERROR_OK == retval) {
/* the new token starts from the end of the current range */
427 token = range.m_range_end;
428 if (0x00000000 == (token & PHRASE_MASK))
431 if (len_phrase == keys->len) { /* valid pinyin */
432 phrase_table->add_index(len_phrase, ucs4_phrase, token);
433 pinyin_table->add_index
434 (keys->len, (ChewingKey *)(keys->data), token);
436 item.set_phrase_string(len_phrase, ucs4_phrase);
437 item.append_pronunciation((ChewingKey *)(keys->data), count);
438 phrase_index->add_phrase_item(token, &item);
439 phrase_index->add_unigram_frequency(token,
440 count * unigram_factor);
446 g_array_free(key_rests, TRUE);
447 g_array_free(keys, TRUE);
/* Finishes a phrase import by compacting the phrase index of the
 * iterator's context.
 * NOTE(review): the rest of the body is missing from this listing, so
 * whether iter is released here cannot be confirmed. */
452 void pinyin_end_add_phrases(import_iterator_t * iter){
453 /* compact the content memory chunk of phrase index. */
454 iter->m_context->m_phrase_index->compact();
/* Writes the user-side data of the context to its user directory.
 *
 * The user directory and the modified flag are checked first.  The phrase
 * index is then compacted and every loaded library is written:
 *  - SYSTEM_FILE / DICTIONARY: the system file is loaded again, diff()
 *    produces a log against it, and the log is saved;
 *  - USER_FILE: the sub phrase index is stored and saved.
 * After that the user chewing table, the user phrase table and the user
 * bigram are saved.  Each file is written under a ".tmp" name and then
 * moved over the final name with rename().  Finally the version file is
 * written and m_modified is cleared.
 *
 * NOTE(review): the results of the two early checks, the statements taken
 * by the in-loop filters, the trailing arguments of several
 * g_build_filename calls, the g_free calls for the temporary names and the
 * return statement are on lines missing from this listing.  The results of
 * save() and rename() are not checked on the visible lines. */
458 bool pinyin_save(pinyin_context_t * context){
459 if (!context->m_user_dir)
462 if (!context->m_modified)
465 context->m_phrase_index->compact();
467 /* skip the reserved zero phrase library. */
468 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
469 PhraseIndexRange range;
470 int retval = context->m_phrase_index->get_range(i, range);
/* filters for libraries that are not loaded, not used or have no user file */
472 if (ERROR_NO_SUB_PHRASE_INDEX == retval)
475 const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
477 if (NOT_USED == table_info->m_file_type)
480 const char * userfilename = table_info->m_user_filename;
482 if (NULL == userfilename)
485 if (SYSTEM_FILE == table_info->m_file_type ||
486 DICTIONARY == table_info->m_file_type) {
487 /* system phrase library */
488 MemoryChunk * chunk = new MemoryChunk;
489 MemoryChunk * log = new MemoryChunk;
490 const char * systemfilename = table_info->m_system_filename;
492 /* check bin file in system dir. */
493 gchar * chunkfilename = g_build_filename(context->m_system_dir,
494 systemfilename, NULL);
495 chunk->load(chunkfilename);
496 g_free(chunkfilename);
/* log receives the difference between library i and the system chunk */
497 context->m_phrase_index->diff(i, chunk, log);
499 const char * userfilename = table_info->m_user_filename;
500 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
502 gchar * tmppathname = g_build_filename(context->m_user_dir,
506 gchar * chunkpathname = g_build_filename(context->m_user_dir,
/* write to the temporary path first, then move it over the final path */
508 log->save(tmppathname);
509 rename(tmppathname, chunkpathname);
510 g_free(chunkpathname);
515 if (USER_FILE == table_info->m_file_type) {
516 /* user phrase library */
517 MemoryChunk * chunk = new MemoryChunk;
518 context->m_phrase_index->store(i, chunk);
520 const char * userfilename = table_info->m_user_filename;
521 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
522 gchar * tmppathname = g_build_filename(context->m_user_dir,
526 gchar * chunkpathname = g_build_filename(context->m_user_dir,
529 chunk->save(tmppathname);
530 rename(tmppathname, chunkpathname);
531 g_free(chunkpathname);
537 /* save user chewing table */
538 gchar * tmpfilename = g_build_filename
539 (context->m_user_dir, "user_pinyin_index.bin.tmp", NULL);
541 gchar * filename = g_build_filename
542 (context->m_user_dir, "user_pinyin_index.bin", NULL);
544 MemoryChunk * chunk = new MemoryChunk;
545 context->m_pinyin_table->store(chunk);
546 chunk->save(tmpfilename);
548 rename(tmpfilename, filename);
552 /* save user phrase table */
553 tmpfilename = g_build_filename
554 (context->m_user_dir, "user_phrase_index.bin.tmp", NULL);
556 filename = g_build_filename
557 (context->m_user_dir, "user_phrase_index.bin", NULL);
559 chunk = new MemoryChunk;
560 context->m_phrase_table->store(chunk);
561 chunk->save(tmpfilename);
563 rename(tmpfilename, filename);
567 /* save user bi-gram */
568 tmpfilename = g_build_filename
569 (context->m_user_dir, "user.db.tmp", NULL);
571 filename = g_build_filename(context->m_user_dir, "user.db", NULL);
572 context->m_user_bigram->save_db(tmpfilename);
573 rename(tmpfilename, filename);
/* record the on-disk format version, then clear the dirty flag */
577 mark_version(context->m_user_dir);
579 context->m_modified = false;
/* Forwards scheme to the context's double pinyin parser.
 * NOTE(review): the return statement is missing from this listing. */
583 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
584 DoublePinyinScheme scheme){
585 context->m_double_pinyin_parser->set_scheme(scheme);
/* Forwards scheme to the context's chewing parser.
 * NOTE(review): the return statement is missing from this listing. */
589 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
590 ChewingScheme scheme){
591 context->m_chewing_parser->set_scheme(scheme);
/* Tears down a context created by pinyin_init(): deletes the parsers,
 * tables, phrase index, bigrams and lookup objects, frees the two directory
 * strings and clears the modified flag.  Nothing is saved here.
 * NOTE(review): the lines after 609 are missing from this listing, so
 * whether the context object itself is deleted cannot be confirmed. */
595 void pinyin_fini(pinyin_context_t * context){
596 delete context->m_full_pinyin_parser;
597 delete context->m_double_pinyin_parser;
598 delete context->m_chewing_parser;
599 delete context->m_pinyin_table;
600 delete context->m_phrase_table;
601 delete context->m_phrase_index;
602 delete context->m_system_bigram;
603 delete context->m_user_bigram;
604 delete context->m_pinyin_lookup;
605 delete context->m_phrase_lookup;
/* the directory strings were created with g_strdup() in pinyin_init() */
607 g_free(context->m_system_dir);
608 g_free(context->m_user_dir);
609 context->m_modified = false;
/* Applies mask_out(mask, value) to the pinyin table, the phrase table, the
 * user bigram and every loaded sub phrase index, then compacts the phrase
 * index.
 *
 * For SYSTEM_FILE and DICTIONARY libraries the system file is loaded again
 * and the user log is merged with merge_with_mask(); for USER_FILE
 * libraries mask_out() is called on the sub phrase index directly.
 *
 * NOTE(review): the mask parameter (line 615), the statements taken by the
 * in-loop filters, the trailing arguments of the user-path
 * g_build_filename call and the return statement are missing from this
 * listing.  The exact matching rule applied by mask_out() is defined
 * elsewhere. */
614 bool pinyin_mask_out(pinyin_context_t * context,
616 phrase_token_t value) {
618 context->m_pinyin_table->mask_out(mask, value);
619 context->m_phrase_table->mask_out(mask, value);
620 context->m_user_bigram->mask_out(mask, value);
622 /* mask out the phrase index. */
623 for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
624 PhraseIndexRange range;
625 int retval = context->m_phrase_index->get_range(index, range);
/* filters for libraries that are not loaded, not used or have no user file */
627 if (ERROR_NO_SUB_PHRASE_INDEX == retval)
630 const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
632 if (NOT_USED == table_info->m_file_type)
635 const char * userfilename = table_info->m_user_filename;
637 if (NULL == userfilename)
640 if (SYSTEM_FILE == table_info->m_file_type ||
641 DICTIONARY == table_info->m_file_type) {
642 /* system phrase library */
643 MemoryChunk * chunk = new MemoryChunk;
645 const char * systemfilename = table_info->m_system_filename;
646 /* check bin file in system dir. */
647 gchar * chunkfilename = g_build_filename(context->m_system_dir,
648 systemfilename, NULL);
649 chunk->load(chunkfilename);
650 g_free(chunkfilename);
/* the library is loaded again from the system file before the log is merged */
652 context->m_phrase_index->load(index, chunk);
654 const char * userfilename = table_info->m_user_filename;
656 chunkfilename = g_build_filename(context->m_user_dir,
659 MemoryChunk * log = new MemoryChunk;
660 log->load(chunkfilename);
661 g_free(chunkfilename);
663 /* merge the chunk log with mask. */
664 context->m_phrase_index->merge_with_mask(index, log, mask, value);
667 if (USER_FILE == table_info->m_file_type) {
668 /* user phrase library */
669 context->m_phrase_index->mask_out(index, mask, value);
673 context->m_phrase_index->compact();
/* Stores options in the context and pushes the new value to the pinyin
 * table and the pinyin lookup object, which keep their own option state.
 * NOTE(review): the return statement is missing from this listing. */
677 /* copy from options to context->m_options. */
678 bool pinyin_set_options(pinyin_context_t * context,
679 pinyin_option_t options){
680 context->m_options = options;
681 context->m_pinyin_table->set_options(context->m_options);
682 context->m_pinyin_lookup->set_options(context->m_options);
/* Allocates an instance bound to context.  The raw pinyin string starts as
 * NULL and the prefix, key, key-rest, constraint and match-result arrays
 * are created empty.  Only the constraints array is created with the
 * zero-terminated flag set.
 * NOTE(review): the lines after 700 are missing from this listing, so the
 * setup of m_candidates and the return statement are not visible. */
687 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
688 pinyin_instance_t * instance = new pinyin_instance_t;
689 instance->m_context = context;
691 instance->m_raw_full_pinyin = NULL;
693 instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
694 instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
695 instance->m_pinyin_key_rests =
696 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
697 instance->m_constraints = g_array_new
698 (TRUE, FALSE, sizeof(lookup_constraint_t));
699 instance->m_match_results =
700 g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
/* Releases what pinyin_alloc_instance() created: the raw pinyin string and
 * the five arrays, each freed together with its element storage.
 * NOTE(review): the lines after 711 are missing from this listing, so the
 * handling of m_candidates and of the instance object is not visible. */
705 void pinyin_free_instance(pinyin_instance_t * instance){
706 g_free(instance->m_raw_full_pinyin);
707 g_array_free(instance->m_prefixes, TRUE);
708 g_array_free(instance->m_pinyin_keys, TRUE);
709 g_array_free(instance->m_pinyin_key_rests, TRUE);
710 g_array_free(instance->m_constraints, TRUE);
711 g_array_free(instance->m_match_results, TRUE);
/* Resizes the constraint array to the current number of pinyin keys, marks
 * every newly added slot as NO_CONSTRAINT and asks the pinyin lookup to
 * validate the constraints against the keys.
 * NOTE(review): the return statement is missing from this listing. */
717 static bool pinyin_update_constraints(pinyin_instance_t * instance){
718 pinyin_context_t * & context = instance->m_context;
719 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
720 CandidateConstraints & constraints = instance->m_constraints;
/* remember the old length so that only the new tail is initialised */
722 size_t key_len = constraints->len;
723 g_array_set_size(constraints, pinyin_keys->len);
724 for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
725 lookup_constraint_t * constraint =
726 &g_array_index(constraints, lookup_constraint_t, i);
727 constraint->m_type = NO_CONSTRAINT;
730 context->m_pinyin_lookup->validate_constraint
731 (constraints, pinyin_keys);
/* Computes the best match for the instance's parsed keys.  The prefix list
 * is reset to the single token sentence_start, the constraints are
 * refreshed, and get_best_match() fills m_match_results.
 * NOTE(review): the return statement is missing from this listing;
 * presumably retval is returned. */
737 bool pinyin_guess_sentence(pinyin_instance_t * instance){
738 pinyin_context_t * & context = instance->m_context;
740 g_array_set_size(instance->m_prefixes, 0);
741 g_array_append_val(instance->m_prefixes, sentence_start);
743 pinyin_update_constraints(instance);
744 bool retval = context->m_pinyin_lookup->get_best_match
745 (instance->m_prefixes,
746 instance->m_pinyin_keys,
747 instance->m_constraints,
748 instance->m_match_results);
/* Like pinyin_guess_sentence(), but seeds the prefix list from the text
 * that precedes the input.
 *
 * prefix is converted to UCS-4.  For each length i from 1 up to the prefix
 * length (with a check against MAX_PHRASE_LENGTH) the last i characters are
 * looked up in the phrase table, and on SEARCH_OK the resulting tokens are
 * appended after sentence_start.  The constraints are then refreshed and
 * get_best_match() fills m_match_results.
 *
 * NOTE(review): the declarations of len_str and tokens, the statement taken
 * when i exceeds MAX_PHRASE_LENGTH, any release of ucs4_str and the return
 * statement are on lines missing from this listing.
 * NOTE(review): on the visible lines tokenarray is not emptied between
 * iterations; check whether reduce_tokens() resets it. */
753 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
754 const char * prefix){
755 pinyin_context_t * & context = instance->m_context;
757 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
759 g_array_set_size(instance->m_prefixes, 0);
760 g_array_append_val(instance->m_prefixes, sentence_start);
763 ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
764 GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
766 if (ucs4_str && len_str) {
768 for (ssize_t i = 1; i <= len_str; ++i) {
769 if (i > MAX_PHRASE_LENGTH)
/* start points at the last i characters of the prefix */
772 ucs4_t * start = ucs4_str + len_str - i;
775 memset(tokens, 0, sizeof(tokens));
776 phrase_index->prepare_tokens(tokens);
777 int result = context->m_phrase_table->search(i, start, tokens);
778 int num = reduce_tokens(tokens, tokenarray);
779 phrase_index->destroy_tokens(tokens);
781 if (result & SEARCH_OK)
782 g_array_append_vals(instance->m_prefixes,
783 tokenarray->data, tokenarray->len);
786 g_array_free(tokenarray, TRUE);
789 pinyin_update_constraints(instance);
790 bool retval = context->m_pinyin_lookup->get_best_match
791 (instance->m_prefixes,
792 instance->m_pinyin_keys,
793 instance->m_constraints,
794 instance->m_match_results);
/* Segments sentence into phrases.  The UTF-8 input is converted to UCS-4,
 * the converted length is required to equal g_utf8_strlen() of the input
 * (otherwise FALSE is returned), and the phrase lookup's get_best_match()
 * fills m_match_results.
 * NOTE(review): the declaration of ucs4_len, any release of ucs4_str and
 * the final return statement are on lines missing from this listing. */
799 bool pinyin_phrase_segment(pinyin_instance_t * instance,
800 const char * sentence){
801 pinyin_context_t * & context = instance->m_context;
803 const glong num_of_chars = g_utf8_strlen(sentence, -1);
805 ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
807 g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
809 bool retval = context->m_phrase_lookup->get_best_match
810 (ucs4_len, ucs4_str, instance->m_match_results);
/* Converts the instance's match results to a UTF-8 string through
 * pinyin::convert_to_utf8() and stores it in *sentence.
 * NOTE(review): the sentence parameter (line 818) and the return statement
 * are missing from this listing. */
816 /* the returned sentence should be freed by g_free(). */
817 bool pinyin_get_sentence(pinyin_instance_t * instance,
819 pinyin_context_t * & context = instance->m_context;
821 bool retval = pinyin::convert_to_utf8
822 (context->m_phrase_index, instance->m_match_results,
823 NULL, false, *sentence);
/* Parses onepinyin as a single full-pinyin syllable into *onekey, using
 * the context's options.  Returns true only when the parser reports having
 * consumed the whole string. */
828 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
829 const char * onepinyin,
830 ChewingKey * onekey){
831 pinyin_context_t * & context = instance->m_context;
833 int pinyin_len = strlen(onepinyin);
834 int parse_len = context->m_full_pinyin_parser->parse_one_key
835 ( context->m_options, *onekey, onepinyin, pinyin_len);
836 return pinyin_len == parse_len;
/* Parses a run of full-pinyin syllables into the instance's key and
 * key-rest arrays.  The previous raw string is freed and replaced by a copy
 * of pinyins before parsing.
 * NOTE(review): the return statement is missing from this listing;
 * presumably parse_len is returned. */
839 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
840 const char * pinyins){
841 pinyin_context_t * & context = instance->m_context;
843 g_free(instance->m_raw_full_pinyin);
844 instance->m_raw_full_pinyin = g_strdup(pinyins);
845 int pinyin_len = strlen(pinyins);
847 int parse_len = context->m_full_pinyin_parser->parse
848 ( context->m_options, instance->m_pinyin_keys,
849 instance->m_pinyin_key_rests, pinyins, pinyin_len);
/* Parses onepinyin as a single double-pinyin syllable into *onekey, using
 * the context's options.  Returns true only when the parser reports having
 * consumed the whole string. */
854 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
855 const char * onepinyin,
856 ChewingKey * onekey){
857 pinyin_context_t * & context = instance->m_context;
859 int pinyin_len = strlen(onepinyin);
860 int parse_len = context->m_double_pinyin_parser->parse_one_key
861 ( context->m_options, *onekey, onepinyin, pinyin_len);
862 return pinyin_len == parse_len;
/* Parses a run of double-pinyin input into the instance's key and key-rest
 * arrays.  Unlike the full-pinyin variant, the visible lines do not touch
 * m_raw_full_pinyin.
 * NOTE(review): the return statement is missing from this listing;
 * presumably parse_len is returned. */
865 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
866 const char * pinyins){
867 pinyin_context_t * & context = instance->m_context;
868 int pinyin_len = strlen(pinyins);
870 int parse_len = context->m_double_pinyin_parser->parse
871 ( context->m_options, instance->m_pinyin_keys,
872 instance->m_pinyin_key_rests, pinyins, pinyin_len);
/* Parses onechewing as a single chewing syllable into *onekey, using the
 * context's options.  Returns true only when the parser reports having
 * consumed the whole string. */
877 bool pinyin_parse_chewing(pinyin_instance_t * instance,
878 const char * onechewing,
879 ChewingKey * onekey){
880 pinyin_context_t * & context = instance->m_context;
882 int chewing_len = strlen(onechewing);
883 int parse_len = context->m_chewing_parser->parse_one_key
884 ( context->m_options, *onekey, onechewing, chewing_len );
885 return chewing_len == parse_len;
/* Parses a run of chewing input into the instance's key and key-rest
 * arrays.
 * NOTE(review): the return statement is missing from this listing;
 * presumably parse_len is returned. */
888 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
889 const char * chewings){
890 pinyin_context_t * & context = instance->m_context;
891 int chewing_len = strlen(chewings);
893 int parse_len = context->m_chewing_parser->parse
894 ( context->m_options, instance->m_pinyin_keys,
895 instance->m_pinyin_key_rests, chewings, chewing_len);
/* Thin wrapper: returns the chewing parser's in_chewing_scheme() result
 * for key under the context's options; symbol is passed through to the
 * parser as an output argument. */
900 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
901 const char key, const char ** symbol) {
902 pinyin_context_t * & context = instance->m_context;
903 return context->m_chewing_parser->in_chewing_scheme
904 (context->m_options, key, symbol);
908 static gint compare_item_with_token(gconstpointer lhs,
910 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
911 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
913 phrase_token_t token_lhs = item_lhs->m_token;
914 phrase_token_t token_rhs = item_rhs->m_token;
916 return (token_lhs - token_rhs);
920 static gint compare_item_with_frequency(gconstpointer lhs,
922 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
923 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
925 guint32 freq_lhs = item_lhs->m_freq;
926 guint32 freq_rhs = item_rhs->m_freq;
928 return -(freq_lhs - freq_rhs); /* in descendant order */
/* Finds the token that precedes a position in the instance's input, for
 * use as bigram context.
 *
 * Two sources are visible:
 *  - the prefix list: starting from sentence_start, the prefix token with
 *    the longest phrase length is chosen;
 *  - the match results: if the token at offset is not null_token, the scan
 *    goes backwards from offset - 1 and takes the first non-null token.
 *
 * NOTE(review): the offset parameter (line 932), the condition that selects
 * between the two sources, the assignment on line 955, the declarations of
 * item and i, and the return statement are on lines missing from this
 * listing. */
931 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
933 phrase_token_t prev_token = null_token;
937 /* get previous token from prefixes. */
938 prev_token = sentence_start;
939 size_t prev_token_len = 0;
941 pinyin_context_t * context = instance->m_context;
942 TokenVector prefixes = instance->m_prefixes;
945 for (size_t i = 0; i < prefixes->len; ++i) {
946 phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
947 if (sentence_start == token)
950 int retval = context->m_phrase_index->get_phrase_item(token, item);
951 if (ERROR_OK == retval) {
952 size_t token_len = item.get_phrase_length();
953 if (token_len > prev_token_len) {
954 /* found longer match, and save it. */
956 prev_token_len = token_len;
961 /* get previous token from match results. */
964 phrase_token_t cur_token = g_array_index
965 (instance->m_match_results, phrase_token_t, offset);
966 if (null_token != cur_token) {
/* i must be a signed type for this loop condition to terminate; its
   declaration is on a line not shown here */
967 for (i = offset - 1; i >= 0; --i) {
968 cur_token = g_array_index
969 (instance->m_match_results, phrase_token_t, i);
970 if (null_token != cur_token) {
971 prev_token = cur_token;
/* Expands token ranges into individual candidates.  For every library slot
 * with a non-NULL range array, and every token value k in each
 * [m_range_begin, m_range_end) range, a candidate is built from
 * template_item and appended to items.  m_new_pinyins is duplicated for
 * each candidate, so every appended item owns its own copy.
 * NOTE(review): line 997 is missing from this listing; presumably it
 * stores k in item.m_token.  The statement taken for a NULL range (line
 * 988) is also not visible. */
981 static void _append_items(pinyin_context_t * context,
982 PhraseIndexRanges ranges,
983 lookup_candidate_t * template_item,
984 CandidateVector items) {
985 /* reduce and append to a single GArray. */
986 for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
987 if (NULL == ranges[m])
990 for (size_t n = 0; n < ranges[m]->len; ++n) {
991 PhraseIndexRange * range =
992 &g_array_index(ranges[m], PhraseIndexRange, n);
993 for (size_t k = range->m_range_begin;
994 k < range->m_range_end; ++k) {
995 lookup_candidate_t item;
996 item.m_candidate_type = template_item->m_candidate_type;
998 item.m_orig_rest = template_item->m_orig_rest;
999 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
1000 item.m_freq = template_item->m_freq;
1001 g_array_append_val(items, item);
/* Drops items whose token equals the token of the item before them.  Only
 * adjacent duplicates are detected, so the caller presumably sorts items by
 * token first (see compare_item_with_token).
 * NOTE(review): lines 1018-1019 are missing from this listing.  After
 * g_array_remove_index() the following element moves into slot n, so the
 * missing lines need to step n back (or skip the increment) for that
 * element to be examined; confirm against the full file.  On the visible
 * lines the strings owned by a removed item are not freed. */
1008 static void _remove_duplicated_items(CandidateVector items) {
1009 /* remove the duplicated items. */
1010 phrase_token_t last_token = null_token, saved_token;
1011 for (size_t n = 0; n < items->len; ++n) {
1012 lookup_candidate_t * item = &g_array_index
1013 (items, lookup_candidate_t, n);
/* read the token before the element may be removed */
1015 saved_token = item->m_token;
1016 if (last_token == saved_token) {
1017 g_array_remove_index(items, n);
1020 last_token = saved_token;
/* Fills m_freq for every candidate in items.
 *
 * With DYNAMIC_ADJUST set and a non-null prev_token, the bigram share of
 * the token is taken from merged_gram (bigram_freq / total_freq, left at 0
 * when total_freq is 0).  The stored value is
 *   (LAMBDA_PARAMETER * bigram_poss
 *    + (1 - LAMBDA_PARAMETER) * unigram_frequency / index_total_freq)
 *   * 256 * 256 * 256
 * truncated to guint32.  The phrase index total frequency is asserted to
 * be positive.
 *
 * NOTE(review): the declaration of i (lines 1030-1031) is missing from
 * this listing.  The result of get_phrase_item() is not checked on the
 * visible lines, so a failed lookup would presumably reuse whatever
 * cached_item held before. */
1025 static void _compute_frequency_of_items(pinyin_context_t * context,
1026 phrase_token_t prev_token,
1027 SingleGram * merged_gram,
1028 CandidateVector items) {
1029 pinyin_option_t & options = context->m_options;
1032 PhraseItem cached_item;
1033 /* compute all freqs. */
1034 for (i = 0; i < items->len; ++i) {
1035 lookup_candidate_t * item = &g_array_index
1036 (items, lookup_candidate_t, i);
1037 phrase_token_t & token = item->m_token;
1039 gfloat bigram_poss = 0; guint32 total_freq = 0;
1040 if (options & DYNAMIC_ADJUST) {
1041 if (null_token != prev_token) {
1042 guint32 bigram_freq = 0;
1043 merged_gram->get_total_freq(total_freq);
1044 merged_gram->get_freq(token, bigram_freq);
1045 if (0 != total_freq)
1046 bigram_poss = bigram_freq / (gfloat)total_freq;
1050 /* compute the m_freq. */
1051 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1052 phrase_index->get_phrase_item(token, cached_item);
/* total_freq is reused: from here on it is the phrase index total */
1053 total_freq = phrase_index->get_phrase_index_total_freq();
1054 assert (0 < total_freq);
1056 /* Note: possibility value <= 1.0. */
1057 guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
1058 (1 - LAMBDA_PARAMETER) *
1059 cached_item.get_unigram_frequency() /
1060 (gfloat) total_freq) * 256 * 256 * 256;
1061 item->m_freq = freq;
/* Puts a BEST_MATCH_CANDIDATE entry at the front of candidates when
 * pinyin_get_sentence() yields a sentence for the instance.
 * NOTE(review): the statement taken when sentence is NULL, any release of
 * sentence and the return statement are on lines missing from this
 * listing.  On the visible lines the candidate does not store the sentence
 * text; _compute_phrase_strings_of_items() fills m_phrase_string for
 * BEST_MATCH_CANDIDATE entries. */
1065 static bool _prepend_sentence_candidate(pinyin_instance_t * instance,
1066 CandidateVector candidates) {
1067 /* check whether the best match candidate exists. */
1068 gchar * sentence = NULL;
1069 pinyin_get_sentence(instance, &sentence);
1070 if (NULL == sentence)
1074 /* prepend best match candidate to candidates. */
1075 lookup_candidate_t candidate;
1076 candidate.m_candidate_type = BEST_MATCH_CANDIDATE;
1077 g_array_prepend_val(candidates, candidate);
/* Fills m_phrase_string for every candidate.
 *  - BEST_MATCH_CANDIDATE: a copy of the best-match sentence starting at
 *    character position offset;
 *  - NORMAL / DIVIDED / RESPLIT candidates: the phrase text of m_token,
 *    obtained through pinyin_token_get_phrase();
 *  - ZOMBIE_CANDIDATE: has its own case; its handling is not visible.
 * NOTE(review): the offset parameter (line 1083), the break statements,
 * any release of the temporary sentence and the return statement are on
 * lines missing from this listing.  The result of pinyin_get_sentence() is
 * not checked before g_utf8_offset_to_pointer() on the visible lines. */
1082 static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
1084 CandidateVector candidates) {
1085 /* populate m_phrase_string in lookup_candidate_t. */
1087 for(size_t i = 0; i < candidates->len; ++i) {
1088 lookup_candidate_t * candidate = &g_array_index
1089 (candidates, lookup_candidate_t, i);
1091 switch(candidate->m_candidate_type) {
1092 case BEST_MATCH_CANDIDATE: {
1093 gchar * sentence = NULL;
1094 pinyin_get_sentence(instance, &sentence);
1095 candidate->m_phrase_string = g_strdup
1096 (g_utf8_offset_to_pointer(sentence, offset));
1100 case NORMAL_CANDIDATE:
1101 case DIVIDED_CANDIDATE:
1102 case RESPLIT_CANDIDATE:
1103 pinyin_token_get_phrase
1104 (instance, candidate->m_token, NULL,
1105 &(candidate->m_phrase_string));
1107 case ZOMBIE_CANDIDATE:
/* Comparison callback for an array of size_t indices into a candidate
 * vector passed as userdata.  The two indexed candidates are compared by
 * m_phrase_string with strcmp() and the result is negated, giving
 * descending string order.  Both phrase strings must be non-NULL.
 * NOTE(review): the rhs parameter (line 1116) is missing from this
 * listing. */
1115 static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
1117 gpointer userdata) {
1118 size_t index_lhs = *((size_t *) lhs);
1119 size_t index_rhs = *((size_t *) rhs);
1120 CandidateVector candidates = (CandidateVector) userdata;
1122 lookup_candidate_t * candidate_lhs =
1123 &g_array_index(candidates, lookup_candidate_t, index_lhs);
1124 lookup_candidate_t * candidate_rhs =
1125 &g_array_index(candidates, lookup_candidate_t, index_rhs);
1127 return -strcmp(candidate_lhs->m_phrase_string,
1128 candidate_rhs->m_phrase_string); /* in descending order */
/* Removes candidates that share a phrase string with another candidate,
 * leaving the surviving candidates in their existing relative order.
 *
 * An array of indices is sorted by phrase string so that equal strings
 * become adjacent.  Within each run of equal strings one candidate is kept
 * and the others are marked ZOMBIE_CANDIDATE: a BEST_MATCH_CANDIDATE always
 * wins, otherwise the one with the higher m_freq.  A second pass over
 * candidates frees the strings of the zombies and removes them.
 *
 * NOTE(review): the declaration of i (line 1135), the continue statements
 * after each decision, the index adjustment after g_array_remove_index()
 * and the return statement are on lines missing from this listing; confirm
 * them against the full file. */
1132 static bool _remove_duplicated_items_by_phrase_string
1133 (pinyin_instance_t * instance,
1134 CandidateVector candidates) {
1136 /* create the GArray of indexed item */
1137 GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
1138 for (i = 0; i < candidates->len; ++i)
1139 g_array_append_val(indices, i);
1141 /* sort the indices array by phrase array */
1142 g_array_sort_with_data
1143 (indices, compare_indexed_item_with_phrase_string, candidates);
1145 /* mark duplicated items as zombie candidate */
1146 lookup_candidate_t * cur_item, * saved_item = NULL;
1147 for (i = 0; i < indices->len; ++i) {
1148 size_t cur_index = g_array_index(indices, size_t, i);
1149 cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index);
1151 /* handle the first candidate */
1152 if (NULL == saved_item) {
1153 saved_item = cur_item;
1157 if (0 == strcmp(saved_item->m_phrase_string,
1158 cur_item->m_phrase_string)) {
1159 /* found duplicated candidates */
1161 /* keep best match candidate */
1162 if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
1163 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1167 if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
1168 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1169 saved_item = cur_item;
1173 /* keep the higher possibility one
1174 to quickly move the word forward in the candidate list */
1175 if (cur_item->m_freq > saved_item->m_freq) {
1176 /* find better candidate */
1177 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1178 saved_item = cur_item;
1181 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1185 /* keep the current candidate */
1186 saved_item = cur_item;
1190 g_array_free(indices, TRUE);
1192 /* remove zombie candidate from the returned candidates */
1193 for (i = 0; i < candidates->len; ++i) {
1194 lookup_candidate_t * candidate = &g_array_index
1195 (candidates, lookup_candidate_t, i);
1197 if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
/* the strings are owned by the candidate and must go before the slot does */
1198 g_free(candidate->m_phrase_string);
1199 g_free(candidate->m_new_pinyins);
1200 g_array_remove_index(candidates, i);
/*
 * Release the strings owned by every entry of candidates (m_phrase_string
 * and m_new_pinyins) and truncate the array to zero length. The array
 * object itself is kept.
 * NOTE(review): the return statement is not visible in this chunk.
 */
1208 static bool _free_candidates(CandidateVector candidates) {
1209 /* free candidates */
1210 for (size_t i = 0; i < candidates->len; ++i) {
1211 lookup_candidate_t * candidate = &g_array_index
1212 (candidates, lookup_candidate_t, i);
1213 g_free(candidate->m_phrase_string);
1214 g_free(candidate->m_new_pinyins);
/* empty the array only after the per-entry strings are released */
1216 g_array_set_size(candidates, 0);
/*
 * Fill candidates with the lookup candidates for the pinyin keys of
 * instance that start at offset.
 *
 * Any previous content of candidates is freed first. Key counts are tried
 * from pinyin_len down to 1; for each count the matching items are sorted
 * by token, de-duplicated, given a frequency, sorted by frequency and then
 * appended to candidates. Afterwards the sentence candidate is prepended,
 * phrase strings are computed and candidates with duplicate phrase strings
 * are removed.
 *
 * NOTE(review): the declaration of offset, the arguments of the table
 * search, the loop exits and the return statement are not visible in this
 * chunk.
 */
1221 bool pinyin_get_candidates(pinyin_instance_t * instance,
1223 CandidateVector candidates) {
1225 pinyin_context_t * & context = instance->m_context;
1226 pinyin_option_t & options = context->m_options;
1227 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1229 _free_candidates(candidates);
/* NOTE(review): unsigned subtraction; assumes offset <= pinyin_keys->len */
1231 size_t pinyin_len = pinyin_keys->len - offset;
1234 /* lookup the previous token here. */
1235 phrase_token_t prev_token = null_token;
1237 if (options & DYNAMIC_ADJUST) {
1238 prev_token = _get_previous_token(instance, offset);
/* merged_gram combines the system and user single grams of prev_token */
1241 SingleGram merged_gram;
1242 SingleGram * system_gram = NULL, * user_gram = NULL;
1244 if (options & DYNAMIC_ADJUST) {
1245 if (null_token != prev_token) {
1246 context->m_system_bigram->load(prev_token, system_gram);
1247 context->m_user_bigram->load(prev_token, user_gram);
1248 merge_single_gram(&merged_gram, system_gram, user_gram);
/* ranges receives the table search results; zeroed before being prepared */
1252 PhraseIndexRanges ranges;
1253 memset(ranges, 0, sizeof(ranges));
1254 context->m_phrase_index->prepare_ranges(ranges);
/* items is a scratch array reused for every key count */
1256 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
/* longest key count first, so longer matches are appended first */
1258 for (i = pinyin_len; i >= 1; --i) {
1259 g_array_set_size(items, 0);
1261 ChewingKey * keys = &g_array_index
1262 (pinyin_keys, ChewingKey, offset);
1264 /* do pinyin search. */
1265 int retval = context->m_pinyin_table->search
1268 if ( !(retval & SEARCH_OK) )
/* a default-constructed template item is used for plain matches */
1271 lookup_candidate_t template_item;
1272 _append_items(context, ranges, &template_item, items);
1275 g_array_sort(items, compare_item_with_token);
1277 _remove_duplicated_items(items);
1280 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1282 /* sort the candidates of the same length by frequency. */
1283 g_array_sort(items, compare_item_with_frequency);
1285 /* transfer back items to tokens, and save it into candidates */
1286 for (size_t k = 0; k < items->len; ++k) {
1287 lookup_candidate_t * item = &g_array_index
1288 (items, lookup_candidate_t, k);
1289 g_array_append_val(candidates, *item);
1293 if (!(retval & SEARCH_CONTINUED))
1298 g_array_free(items, TRUE);
1299 context->m_phrase_index->destroy_ranges(ranges);
1305 /* post process to remove duplicated candidates */
1307 _prepend_sentence_candidate(instance, candidates);
1309 _compute_phrase_strings_of_items(instance, offset, candidates);
1311 _remove_duplicated_items_by_phrase_string(instance, candidates);
/*
 * Try to reinterpret the single pinyin key at offset as two keys taken from
 * the divided table, and append the two-key matches to items as
 * DIVIDED_CANDIDATE entries.
 *
 * Each appended entry is built from a template that records the original
 * raw span (m_orig_rest) and the replacement pinyin text (m_new_pinyins).
 * With USE_TONE set, a tone on the key is cleared before the table lookup
 * and re-applied to the second divided key.
 *
 * NOTE(review): the declaration of offset, the handling of a missing table
 * item, the code that restores the tone and the return statements are not
 * visible in this chunk.
 */
1317 static bool _try_divided_table(pinyin_instance_t * instance,
1318 PhraseIndexRanges ranges,
1320 CandidateVector items){
1323 pinyin_context_t * & context = instance->m_context;
1324 pinyin_option_t & options = context->m_options;
1325 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1326 ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
/* keys and key rests are parallel arrays */
1328 assert(pinyin_keys->len == pinyin_key_rests->len);
1329 guint num_keys = pinyin_keys->len;
1330 assert(offset < num_keys);
1332 /* handle "^xian$" -> "xi'an" here */
1333 ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
1334 ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
1335 ChewingKeyRest, offset);
/* copy of the raw span, taken before key or rest are modified */
1336 ChewingKeyRest orig_rest = *rest;
1337 guint16 tone = CHEWING_ZERO_TONE;
1339 const divided_table_item_t * item = NULL;
/* key and rest point into the instance arrays, so this edits shared state */
1342 if (options & USE_TONE) {
1344 if (CHEWING_ZERO_TONE != tone) {
1345 key->m_tone = CHEWING_ZERO_TONE;
1350 item = context->m_full_pinyin_parser->retrieve_divided_item
1351 (options, key, rest, instance->m_raw_full_pinyin,
1352 strlen(instance->m_raw_full_pinyin));
1356 assert(item->m_new_freq > 0);
/* NOTE(review): parse_one_key() is called inside assert(); when NDEBUG is
   defined the call is compiled out and divided_keys is never filled. */
1358 ChewingKey divided_keys[2];
1359 const char * pinyin = item->m_new_keys[0];
1360 assert(context->m_full_pinyin_parser->
1361 parse_one_key(options, divided_keys[0],
1362 pinyin, strlen(pinyin)));
1363 pinyin = item->m_new_keys[1];
1364 assert(context->m_full_pinyin_parser->
1365 parse_one_key(options, divided_keys[1],
1366 pinyin, strlen(pinyin)));
/* replacement text: the two new keys joined by an apostrophe */
1368 gchar * new_pinyins = g_strdup_printf
1369 ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
1371 /* propagate the tone */
1372 if (options & USE_TONE) {
1373 if (CHEWING_ZERO_TONE != tone) {
1374 assert(0 < tone && tone <= 5);
1375 divided_keys[1].m_tone = tone;
/* the tone digit is appended to the replacement text as well */
1377 gchar * tmp_str = g_strdup_printf
1378 ("%s%d", new_pinyins, tone);
1379 g_free(new_pinyins);
1380 new_pinyins = tmp_str;
1384 /* do pinyin search. */
1385 int retval = context->m_pinyin_table->search
1386 (2, divided_keys, ranges);
1388 if (retval & SEARCH_OK) {
1389 lookup_candidate_t template_item;
1390 template_item.m_candidate_type = DIVIDED_CANDIDATE;
1391 template_item.m_orig_rest = orig_rest;
1392 template_item.m_new_pinyins = new_pinyins;
1394 _append_items(context, ranges, &template_item, items);
/* presumably _append_items copies m_new_pinyins for each item, since the
   buffer is released here - confirm */
1397 g_free(new_pinyins);
1401 if (options & USE_TONE) {
1402 if (CHEWING_ZERO_TONE != tone) {
/*
 * Try to re-split the two adjacent pinyin keys at offset and offset + 1
 * using the re-split table, and append the two-key matches to items as
 * RESPLIT_CANDIDATE entries.
 *
 * The pair is looked up both as an original spelling and as a re-split
 * spelling; whichever table item is found (with a non-zero frequency for
 * the other spelling) supplies the alternative pair of keys. With USE_TONE
 * set, the tone of the second key is removed for the lookup, carried over
 * to the second alternative key, and restored on the instance afterwards.
 *
 * NOTE(review): the declaration of offset, the early exits, the lines that
 * set tosearch and the return statements are not visible in this chunk.
 */
1411 static bool _try_resplit_table(pinyin_instance_t * instance,
1412 PhraseIndexRanges ranges,
1414 CandidateVector items){
1417 pinyin_context_t * & context = instance->m_context;
1418 pinyin_option_t & options = context->m_options;
1419 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1420 ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
1422 assert(pinyin_keys->len == pinyin_key_rests->len);
1423 guint num_keys = pinyin_keys->len;
/* two keys are needed, so offset + 1 must be a valid index too */
1424 assert(offset + 1 < num_keys);
1426 guint16 next_tone = CHEWING_ZERO_TONE;
1428 /* handle "^fa'nan$" -> "fan'an" here */
1429 ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
1430 ChewingKeyRest, offset);
1431 ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
1432 ChewingKeyRest, offset + 1);
/* the two raw spans have to touch each other */
1434 if (cur_rest->m_raw_end != next_rest->m_raw_begin)
1437 ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
1438 ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
1441 /* some tone here */
1442 if (CHEWING_ZERO_TONE != cur_key->m_tone)
/* combined raw span of both keys, recorded on every appended candidate */
1445 ChewingKeyRest orig_rest;
1446 orig_rest.m_raw_begin = cur_rest->m_raw_begin;
1447 orig_rest.m_raw_end = next_rest->m_raw_end;
/* temporarily strip the tone of the second key and shrink its raw span by
   one; both are restored at the end of the function */
1450 if (options & USE_TONE) {
1451 next_tone = next_key->m_tone;
1452 if (CHEWING_ZERO_TONE != next_tone) {
1453 next_key->m_tone = CHEWING_ZERO_TONE;
1454 next_rest->m_raw_end --;
1458 /* lookup re-split table */
1459 const char * str = instance->m_raw_full_pinyin;
1460 const resplit_table_item_t * item_by_orig =
1461 context->m_full_pinyin_parser->
1462 retrieve_resplit_item_by_original_pinyins
1463 (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
1465 const resplit_table_item_t * item_by_new =
1466 context->m_full_pinyin_parser->
1467 retrieve_resplit_item_by_resplit_pinyins
1468 (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
1470 /* there are no same couple of pinyins in re-split table. */
1471 assert(!(item_by_orig && item_by_new));
1473 ChewingKey resplit_keys[2];
1474 const char * pinyins[2];
/* NOTE(review): in both branches below parse_one_key() is called inside
   assert(); when NDEBUG is defined the calls are compiled out and
   resplit_keys is never filled. */
1476 bool tosearch = false;
/* input matches an original spelling: offer the re-split spelling */
1477 if (item_by_orig && item_by_orig->m_new_freq) {
1478 pinyins[0] = item_by_orig->m_new_keys[0];
1479 pinyins[1] = item_by_orig->m_new_keys[1];
1481 assert(context->m_full_pinyin_parser->
1482 parse_one_key(options, resplit_keys[0],
1483 pinyins[0], strlen(pinyins[0])));
1485 assert(context->m_full_pinyin_parser->
1486 parse_one_key(options, resplit_keys[1],
1487 pinyins[1], strlen(pinyins[1])));
/* input matches a re-split spelling: offer the original spelling */
1491 if (item_by_new && item_by_new->m_orig_freq) {
1492 pinyins[0] = item_by_new->m_orig_keys[0];
1493 pinyins[1] = item_by_new->m_orig_keys[1];
1495 assert(context->m_full_pinyin_parser->
1496 parse_one_key(options, resplit_keys[0],
1497 pinyins[0], strlen(pinyins[0])));
1499 assert(context->m_full_pinyin_parser->
1500 parse_one_key(options, resplit_keys[1],
1501 pinyins[1], strlen(pinyins[1])));
/* replacement text: the two alternative keys joined by an apostrophe */
1506 gchar * new_pinyins = g_strdup_printf
1507 ("%s'%s", pinyins[0], pinyins[1]);
1509 /* propagate the tone */
1510 if (options & USE_TONE) {
1511 if (CHEWING_ZERO_TONE != next_tone) {
1512 assert(0 < next_tone && next_tone <= 5);
1513 resplit_keys[1].m_tone = next_tone;
1515 gchar * tmp_str = g_strdup_printf
1516 ("%s%d", new_pinyins, next_tone);
1517 g_free(new_pinyins);
1518 new_pinyins = tmp_str;
1522 /* do pinyin search. */
1523 int retval = context->m_pinyin_table->search
1524 (2, resplit_keys, ranges);
1526 if (retval & SEARCH_OK) {
1527 lookup_candidate_t template_item;
1528 template_item.m_candidate_type = RESPLIT_CANDIDATE;
1529 template_item.m_orig_rest = orig_rest;
1530 template_item.m_new_pinyins = new_pinyins;
1532 _append_items(context, ranges, &template_item, items);
/* presumably _append_items copies m_new_pinyins for each item, since the
   buffer is released here - confirm */
1535 g_free(new_pinyins);
/* undo the temporary tone removal done before the table lookup */
1539 if (options & USE_TONE) {
1540 if (CHEWING_ZERO_TONE != next_tone) {
1541 next_key->m_tone = next_tone;
1542 next_rest->m_raw_end ++;
/*
 * Full-pinyin variant of the candidate lookup: fill candidates with the
 * lookup candidates for the keys of instance that start at offset, also
 * consulting the divided and re-split tables when the USE_DIVIDED_TABLE and
 * USE_RESPLIT_TABLE options are set.
 *
 * Any previous content of candidates is freed first and the number of keys
 * considered is capped at MAX_PHRASE_LENGTH. Each batch of items is sorted
 * by token, de-duplicated, given a frequency, sorted by frequency and
 * appended to candidates. Afterwards the sentence candidate is prepended,
 * phrase strings are computed and candidates with duplicate phrase strings
 * are removed.
 *
 * NOTE(review): the declarations of offset, i and found, the conditions
 * guarding the fuzzy lookups, the arguments of the table search, the loop
 * exits and the return statement are not visible in this chunk.
 */
1549 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1551 CandidateVector candidates){
1553 pinyin_context_t * & context = instance->m_context;
1554 pinyin_option_t & options = context->m_options;
1555 ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1557 _free_candidates(candidates);
/* NOTE(review): unsigned subtraction; assumes offset <= pinyin_keys->len */
1559 size_t pinyin_len = pinyin_keys->len - offset;
1560 pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1563 /* lookup the previous token here. */
1564 phrase_token_t prev_token = null_token;
1566 if (options & DYNAMIC_ADJUST) {
1567 prev_token = _get_previous_token(instance, offset);
/* merged_gram combines the system and user single grams of prev_token */
1570 SingleGram merged_gram;
1571 SingleGram * system_gram = NULL, * user_gram = NULL;
1573 if (options & DYNAMIC_ADJUST) {
1574 if (null_token != prev_token) {
1575 context->m_system_bigram->load(prev_token, system_gram);
1576 context->m_user_bigram->load(prev_token, user_gram);
1577 merge_single_gram(&merged_gram, system_gram, user_gram);
/* ranges receives the table search results; zeroed before being prepared */
1581 PhraseIndexRanges ranges;
1582 memset(ranges, 0, sizeof(ranges));
1583 context->m_phrase_index->prepare_ranges(ranges);
/* items is a scratch array reused for every batch */
1585 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1587 if (1 == pinyin_len) {
1588 /* because there is only one pinyin left,
1589 * the following for-loop will not produce 2 character candidates.
1590 * the if-branch will fill the candidate list with
1591 * 2 character candidates.
1594 if (options & USE_DIVIDED_TABLE) {
1595 g_array_set_size(items, 0);
1597 if (_try_divided_table(instance, ranges, offset, items)) {
1600 g_array_sort(items, compare_item_with_token);
1602 _remove_duplicated_items(items);
1605 _compute_frequency_of_items(context, prev_token,
1606 &merged_gram, items);
1608 /* sort the candidates of the same length by frequency. */
1609 g_array_sort(items, compare_item_with_frequency);
1611 /* transfer back items to tokens, and save it into candidates */
1612 for (i = 0; i < items->len; ++i) {
1613 lookup_candidate_t * item = &g_array_index
1614 (items, lookup_candidate_t, i);
1615 g_array_append_val(candidates, *item);
/* main pass: longest key count first */
1621 for (i = pinyin_len; i >= 1; --i) {
1623 g_array_set_size(items, 0);
1626 /* handle fuzzy pinyin segment here. */
1627 if (options & USE_DIVIDED_TABLE) {
1628 found = _try_divided_table(instance, ranges, offset, items) ||
1631 if (options & USE_RESPLIT_TABLE) {
1632 found = _try_resplit_table(instance, ranges, offset, items) ||
1637 ChewingKey * keys = &g_array_index
1638 (pinyin_keys, ChewingKey, offset);
1640 /* do pinyin search. */
1641 int retval = context->m_pinyin_table->search
/* found accumulates the fuzzy lookups and the regular table search */
1644 found = (retval & SEARCH_OK) || found;
/* a default-constructed template item is used for plain matches */
1649 lookup_candidate_t template_item;
1650 _append_items(context, ranges, &template_item, items);
1653 g_array_sort(items, compare_item_with_token);
1655 _remove_duplicated_items(items);
1658 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
/* order the candidates of this key count by frequency */
1660 g_array_sort(items, compare_item_with_frequency);
/* copy the finished batch into the caller-visible array */
1662 for (size_t k = 0; k < items->len; ++k) {
1663 lookup_candidate_t * item = &g_array_index
1664 (items, lookup_candidate_t, k);
1665 g_array_append_val(candidates, *item);
1669 if (!(retval & SEARCH_CONTINUED))
1674 g_array_free(items, TRUE);
1675 context->m_phrase_index->destroy_ranges(ranges);
1681 /* post process to remove duplicated candidates */
1683 _prepend_sentence_candidate(instance, candidates);
1685 _compute_phrase_strings_of_items(instance, offset, candidates);
1687 _remove_duplicated_items_by_phrase_string(instance, candidates);
/*
 * Commit candidate at offset: record its token as a constraint on the
 * instance and return offset plus the length reported by add_constraint().
 *
 * For DIVIDED_CANDIDATE and RESPLIT_CANDIDATE entries the raw full pinyin
 * string of the instance is rewritten first (the span m_orig_rest is
 * replaced by m_new_pinyins) and re-parsed, so that the keys match the
 * chosen spelling before the constraint is added.
 *
 * NOTE(review): the declaration of offset, the tail of the g_strconcat()
 * call and the release of the old pinyin buffer are not visible in this
 * chunk - confirm the old buffer is freed.
 */
1693 int pinyin_choose_candidate(pinyin_instance_t * instance,
1695 lookup_candidate_t * candidate){
1696 pinyin_context_t * & context = instance->m_context;
1698 if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1699 RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1700 /* update full pinyin. */
1701 gchar * oldpinyins = instance->m_raw_full_pinyin;
1702 const ChewingKeyRest rest = candidate->m_orig_rest;
/* cut the old string at the start of the replaced span, so the buffer
   itself serves as the left part; the right part starts at m_raw_end */
1703 oldpinyins[rest.m_raw_begin] = '\0';
1704 const gchar * left_part = oldpinyins;
1705 const gchar * right_part = oldpinyins + rest.m_raw_end;
1706 gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1709 instance->m_raw_full_pinyin = newpinyins;
1711 /* re-parse the full pinyin. */
1712 const gchar * pinyins = instance->m_raw_full_pinyin;
1713 int pinyin_len = strlen(pinyins);
1714 int parse_len = context->m_full_pinyin_parser->parse
1715 (context->m_options, instance->m_pinyin_keys,
1716 instance->m_pinyin_key_rests, pinyins, pinyin_len);
1718 /* Note: there may be some un-parsable input here. */
1721 /* sync m_constraints to the length of m_pinyin_keys. */
1722 bool retval = context->m_pinyin_lookup->validate_constraint
1723 (instance->m_constraints, instance->m_pinyin_keys);
1725 phrase_token_t token = candidate->m_token;
1726 guint8 len = context->m_pinyin_lookup->add_constraint
1727 (instance->m_constraints, offset, token);
1729 /* safe guard: validate the m_constraints again. */
1730 retval = context->m_pinyin_lookup->validate_constraint
1731 (instance->m_constraints, instance->m_pinyin_keys) && len;
/* retval is computed above but does not influence the returned position */
1733 return offset + len;
/*
 * Public wrapper around _free_candidates(): releases the strings owned by
 * the entries of candidates and empties the array. instance is not used by
 * the lines visible here.
 * NOTE(review): the return statement is not visible in this chunk.
 */
1737 bool pinyin_free_candidates(pinyin_instance_t * instance,
1738 CandidateVector candidates) {
1739 _free_candidates(candidates);
/*
 * Clear the constraint stored for offset in the constraints of instance by
 * delegating to clear_constraint() of the pinyin lookup of the context.
 * NOTE(review): the declaration of offset and the return statement are not
 * visible in this chunk.
 */
1743 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1745 pinyin_context_t * & context = instance->m_context;
1747 bool retval = context->m_pinyin_lookup->clear_constraint
1748 (instance->m_constraints, offset);
/*
 * Look up the tokens of the UTF-8 string phrase in the phrase table of the
 * context and reduce the matches into tokenarray.
 *
 * Returns the SEARCH_OK bit of the table search result converted to bool.
 *
 * NOTE(review): the declaration of ucs4_len is not visible in this chunk.
 * g_utf8_to_ucs4() returns a newly allocated buffer and no release of
 * ucs4_phrase is visible here - confirm it is freed. The conversion result
 * is not checked for NULL on the visible lines.
 */
1753 bool pinyin_lookup_tokens(pinyin_instance_t * instance,
1754 const char * phrase, GArray * tokenarray){
1755 pinyin_context_t * & context = instance->m_context;
1756 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
/* a length of -1 means phrase is treated as NUL-terminated */
1759 ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL);
1761 PhraseTokens tokens;
1762 memset(tokens, 0, sizeof(PhraseTokens));
1763 phrase_index->prepare_tokens(tokens);
1764 int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens);
/* num is not used by the lines visible here */
1765 int num = reduce_tokens(tokens, tokenarray);
1766 phrase_index->destroy_tokens(tokens);
1768 return SEARCH_OK & retval;
/*
 * Train the pinyin lookup of the context with the current keys, constraints
 * and match results of instance, and mark the context as modified.
 * The m_user_dir check comes first, before the context is touched.
 * NOTE(review): the statement taken when m_user_dir is unset and the final
 * return are not visible in this chunk.
 */
1771 bool pinyin_train(pinyin_instance_t * instance){
1772 if (!instance->m_context->m_user_dir)
1775 pinyin_context_t * & context = instance->m_context;
1776 context->m_modified = true;
1778 bool retval = context->m_pinyin_lookup->train_result2
1779 (instance->m_pinyin_keys, instance->m_constraints,
1780 instance->m_match_results);
/*
 * Reset the per-input state of instance: free the raw full pinyin string
 * and truncate the prefix, key, key-rest, constraint and match-result
 * arrays to zero length. The arrays themselves are kept for reuse.
 * NOTE(review): the return statement is not visible in this chunk.
 */
1785 bool pinyin_reset(pinyin_instance_t * instance){
1786 g_free(instance->m_raw_full_pinyin);
/* cleared so the freed pointer is not left dangling */
1787 instance->m_raw_full_pinyin = NULL;
1789 g_array_set_size(instance->m_prefixes, 0);
1790 g_array_set_size(instance->m_pinyin_keys, 0);
1791 g_array_set_size(instance->m_pinyin_key_rests, 0);
1792 g_array_set_size(instance->m_constraints, 0);
1793 g_array_set_size(instance->m_match_results, 0);
/*
 * Store the result of key->get_chewing_string() in *utf8_str.
 * A key whose table index is 0 is checked for first.
 * NOTE(review): the declaration of key, the statements on the index-0 path
 * and the return statements are not visible in this chunk. Ownership of the
 * stored string cannot be determined from here.
 */
1798 bool pinyin_get_chewing_string(pinyin_instance_t * instance,
1800 gchar ** utf8_str) {
1802 if (0 == key->get_table_index())
1805 *utf8_str = key->get_chewing_string();
/*
 * Store the result of key->get_pinyin_string() in *utf8_str.
 * A key whose table index is 0 is checked for first.
 * NOTE(review): the declaration of key, the statements on the index-0 path
 * and the return statements are not visible in this chunk. Ownership of the
 * stored string cannot be determined from here.
 */
1809 bool pinyin_get_pinyin_string(pinyin_instance_t * instance,
1811 gchar ** utf8_str) {
1813 if (0 == key->get_table_index())
1816 *utf8_str = key->get_pinyin_string();
/*
 * Store the shengmu and yunmu strings of key in *shengmu and *yunmu.
 * Both outputs are set to NULL before anything else, so they hold a defined
 * value on the path taken for a key whose table index is 0.
 * NOTE(review): the declarations of key, shengmu and yunmu, the statement
 * on the index-0 path and the return statements are not visible in this
 * chunk.
 */
1820 bool pinyin_get_pinyin_strings(pinyin_instance_t * instance,
1824 *shengmu = NULL; *yunmu = NULL;
1825 if (0 == key->get_table_index())
1828 *shengmu = key->get_shengmu_string();
1829 *yunmu = key->get_yunmu_string();
/*
 * Fetch the phrase item of token from the phrase index of the context and
 * store its text, converted from UCS-4 to UTF-8, in *utf8_str.
 * g_ucs4_to_utf8() returns a newly allocated string.
 * NOTE(review): the declaration of item, one parameter, the statement taken
 * when get_phrase_item() fails, the lines between reading the length and
 * the conversion, and the return statements are not visible in this chunk.
 */
1833 bool pinyin_token_get_phrase(pinyin_instance_t * instance,
1834 phrase_token_t token,
1836 gchar ** utf8_str) {
1837 pinyin_context_t * & context = instance->m_context;
/* scratch space for the UCS-4 text of one phrase */
1839 ucs4_t buffer[MAX_PHRASE_LENGTH];
1841 int retval = context->m_phrase_index->get_phrase_item(token, item);
1842 if (ERROR_OK != retval)
1845 item.get_phrase_string(buffer);
1846 guint length = item.get_phrase_length();
1850 *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
/*
 * Fetch the phrase item of token from the phrase index of the context and
 * store its number of pronunciations in *num.
 * NOTE(review): the declarations of num and item, the statement taken when
 * get_phrase_item() fails and the return statements are not visible in this
 * chunk.
 */
1854 bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
1855 phrase_token_t token,
1858 pinyin_context_t * & context = instance->m_context;
1861 int retval = context->m_phrase_index->get_phrase_item(token, item);
1862 if (ERROR_OK != retval)
1865 *num = item.get_n_pronunciation();
/*
 * Replace the content of keys with the nth pronunciation of token.
 * keys is emptied first; on success one ChewingKey per character of the
 * phrase (get_phrase_length() entries) is appended.
 * NOTE(review): the declarations of nth, freq and item, the statement taken
 * when get_phrase_item() fails and the return statements are not visible in
 * this chunk.
 */
1869 bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance,
1870 phrase_token_t token,
1872 ChewingKeyVector keys){
/* emptied up front, before the phrase item lookup */
1873 g_array_set_size(keys, 0);
1874 pinyin_context_t * & context = instance->m_context;
/* scratch space for the keys of one pronunciation */
1876 ChewingKey buffer[MAX_PHRASE_LENGTH];
1879 int retval = context->m_phrase_index->get_phrase_item(token, item);
1880 if (ERROR_OK != retval)
1883 item.get_nth_pronunciation(nth, buffer, freq);
1884 guint8 len = item.get_phrase_length();
1885 g_array_append_vals(keys, buffer, len);
/*
 * Fetch the phrase item of token from the phrase index of the context and
 * store its unigram frequency in *freq.
 * NOTE(review): the declarations of freq and item, the statement taken when
 * get_phrase_item() fails and the return statements are not visible in this
 * chunk.
 */
1889 bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance,
1890 phrase_token_t token,
1893 pinyin_context_t * & context = instance->m_context;
1896 int retval = context->m_phrase_index->get_phrase_item(token, item);
1897 if (ERROR_OK != retval)
1900 *freq = item.get_unigram_frequency();
/*
 * Adjust the unigram frequency of token through add_unigram_frequency() of
 * the phrase index of the context. Returns true when that call reports
 * ERROR_OK.
 * NOTE(review): the remaining parameter and the arguments passed to
 * add_unigram_frequency() are not visible in this chunk.
 */
1904 bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance,
1905 phrase_token_t token,
1907 pinyin_context_t * & context = instance->m_context;
1908 int retval = context->m_phrase_index->add_unigram_frequency
1910 return ERROR_OK == retval;
1916 * Note: prefix is the text before the pre-edit string.