3b5eaa941a5d39e1756978c75172b36b5ec2e082
[platform/upstream/libpinyin.git] / src / pinyin.cpp
1 /* 
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *  
5  *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
6  *  
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  *  
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20  */
21
22
23 #include "pinyin.h"
24 #include <stdio.h>
25 #include <unistd.h>
26 #include <glib/gstdio.h>
27 #include "pinyin_internal.h"
28
29 /* a glue layer for input method integration. */
30
31 struct _pinyin_context_t{
32     pinyin_option_t m_options;
33
34     FullPinyinParser2 * m_full_pinyin_parser;
35     DoublePinyinParser2 * m_double_pinyin_parser;
36     ChewingParser2 * m_chewing_parser;
37
38     FacadeChewingTable * m_pinyin_table;
39     FacadePhraseTable * m_phrase_table;
40     FacadePhraseIndex * m_phrase_index;
41     Bigram * m_system_bigram;
42     Bigram * m_user_bigram;
43
44     PinyinLookup * m_pinyin_lookup;
45     PhraseLookup * m_phrase_lookup;
46
47     char * m_system_dir;
48     char * m_user_dir;
49     bool m_modified;
50 };
51
52 static bool check_format(const char * userdir){
53     gchar * filename = g_build_filename
54         (userdir, "version", NULL);
55
56     MemoryChunk chunk;
57     bool exists = chunk.load(filename);
58
59     if (exists) {
60         exists = (0 == memcmp
61                   (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
62                    strlen(LIBPINYIN_FORMAT_VERSION) + 1));
63     }
64     g_free(filename);
65
66     if (exists)
67         return exists;
68
69     /* clean up files, if version mis-matches. */
70     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
71         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
72
73         if (NOT_USED == table_info->m_file_type)
74             continue;
75
76         if (NULL == table_info->m_user_filename)
77             continue;
78
79         const char * userfilename = table_info->m_user_filename;
80
81         /* remove dbin file. */
82         filename = g_build_filename(userdir, userfilename, NULL);
83         unlink(filename);
84         g_free(filename);
85     }
86
87     filename = g_build_filename
88         (userdir, "user.db", NULL);
89     unlink(filename);
90     g_free(filename);
91
92     return exists;
93 }
94
95 static bool mark_version(const char * userdir){
96     gchar * filename = g_build_filename
97         (userdir, "version", NULL);
98     MemoryChunk chunk;
99     chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
100                       strlen(LIBPINYIN_FORMAT_VERSION) + 1);
101     bool retval = chunk.save(filename);
102     g_free(filename);
103     return retval;
104 }
105
106 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
107     pinyin_context_t * context = new pinyin_context_t;
108
109     context->m_options = USE_TONE;
110
111     context->m_system_dir = g_strdup(systemdir);
112     context->m_user_dir = g_strdup(userdir);
113     context->m_modified = false;
114
115     check_format(context->m_user_dir);
116
117     context->m_pinyin_table = new FacadeChewingTable;
118     MemoryChunk * chunk = new MemoryChunk;
119     gchar * filename = g_build_filename
120         (context->m_system_dir, "pinyin_index.bin", NULL);
121     if (!chunk->load(filename)) {
122         fprintf(stderr, "open %s failed!\n", filename);
123         return NULL;
124     }
125     g_free(filename);
126
127     context->m_pinyin_table->load(context->m_options, chunk, NULL);
128
129     context->m_full_pinyin_parser = new FullPinyinParser2;
130     context->m_double_pinyin_parser = new DoublePinyinParser2;
131     context->m_chewing_parser = new ChewingParser2;
132
133     context->m_phrase_table = new FacadePhraseTable;
134     chunk = new MemoryChunk;
135     filename = g_build_filename(context->m_system_dir, "phrase_index.bin", NULL);
136     if (!chunk->load(filename)) {
137         fprintf(stderr, "open %s failed!\n", filename);
138         return NULL;
139     }
140     g_free(filename);
141     context->m_phrase_table->load(chunk, NULL);
142
143     context->m_phrase_index = new FacadePhraseIndex;
144
145     /* hack here: directly call load phrase library. */
146     pinyin_load_phrase_library(context, 1);
147
148     context->m_system_bigram = new Bigram;
149     filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
150     context->m_system_bigram->attach(filename, ATTACH_READONLY);
151     g_free(filename);
152
153     context->m_user_bigram = new Bigram;
154     filename = g_build_filename(context->m_user_dir, "user.db", NULL);
155     context->m_user_bigram->load_db(filename);
156     g_free(filename);
157
158     context->m_pinyin_lookup = new PinyinLookup
159         ( context->m_options, context->m_pinyin_table,
160           context->m_phrase_index, context->m_system_bigram,
161           context->m_user_bigram);
162
163     context->m_phrase_lookup = new PhraseLookup
164         (context->m_phrase_table, context->m_phrase_index,
165          context->m_system_bigram, context->m_user_bigram);
166
167     return context;
168 }
169
170 bool pinyin_load_phrase_library(pinyin_context_t * context,
171                                 guint8 index){
172     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
173     const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
174
175     if (SYSTEM_FILE == table_info->m_file_type) {
176         /* system phrase library */
177         MemoryChunk * chunk = new MemoryChunk;
178
179         const char * systemfilename = table_info->m_system_filename;
180         /* check bin file in system dir. */
181         gchar * chunkfilename = g_build_filename(context->m_system_dir,
182                                                  systemfilename, NULL);
183         chunk->load(chunkfilename);
184         g_free(chunkfilename);
185
186         context->m_phrase_index->load(index, chunk);
187
188         const char * userfilename = table_info->m_user_filename;
189
190         chunkfilename = g_build_filename(context->m_user_dir,
191                                          userfilename, NULL);
192
193         MemoryChunk * log = new MemoryChunk;
194         log->load(chunkfilename);
195         g_free(chunkfilename);
196
197         /* merge the chunk log. */
198         context->m_phrase_index->merge(index, log);
199         return true;
200     }
201
202     if (USER_FILE == table_info->m_file_type) {
203         /* user phrase library */
204         MemoryChunk * chunk = new MemoryChunk;
205         const char * userfilename = table_info->m_user_filename;
206
207         gchar * chunkfilename = g_build_filename(context->m_user_dir,
208                                                  userfilename, NULL);
209
210         /* check bin file exists. if not, create a new one. */
211         if (chunk->load(chunkfilename)) {
212             context->m_phrase_index->load(index, chunk);
213         } else {
214             delete chunk;
215             context->m_phrase_index->create_sub_phrase(index);
216         }
217
218         g_free(chunkfilename);
219         return true;
220     }
221
222     return false;
223 }
224
225 bool pinyin_unload_phrase_library(pinyin_context_t * context,
226                                   guint8 index){
227     /* gb_char.bin can't be unloaded. */
228     if (1 == index)
229         return false;
230
231     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
232
233     context->m_phrase_index->unload(index);
234     return true;
235 }
236
237
238 bool pinyin_save(pinyin_context_t * context){
239     if (!context->m_user_dir)
240         return false;
241
242     if (!context->m_modified)
243         return false;
244
245     context->m_phrase_index->compact();
246
247     /* skip the reserved zero phrase library. */
248     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
249         PhraseIndexRange range;
250         int retval = context->m_phrase_index->get_range(i, range);
251
252         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
253             continue;
254
255         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
256
257         if (NOT_USED == table_info->m_file_type)
258             continue;
259
260         const char * userfilename = table_info->m_user_filename;
261
262         if (NULL == userfilename)
263             continue;
264
265         if (SYSTEM_FILE == table_info->m_file_type) {
266             /* system phrase library */
267             MemoryChunk * chunk = new MemoryChunk;
268             MemoryChunk * log = new MemoryChunk;
269             const char * systemfilename = table_info->m_system_filename;
270
271             /* check bin file in system dir. */
272             gchar * chunkfilename = g_build_filename(context->m_system_dir,
273                                                      systemfilename, NULL);
274             chunk->load(chunkfilename);
275             g_free(chunkfilename);
276             context->m_phrase_index->diff(i, chunk, log);
277
278             const char * userfilename = table_info->m_user_filename;
279             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
280
281             gchar * tmppathname = g_build_filename(context->m_user_dir,
282                                                    tmpfilename, NULL);
283             g_free(tmpfilename);
284
285             gchar * chunkpathname = g_build_filename(context->m_user_dir,
286                                                      userfilename, NULL);
287             log->save(tmppathname);
288             rename(tmppathname, chunkpathname);
289             g_free(chunkpathname);
290             g_free(tmppathname);
291             delete log;
292         }
293
294         if (USER_FILE == table_info->m_file_type) {
295             /* user phrase library */
296             MemoryChunk * chunk = new MemoryChunk;
297             context->m_phrase_index->store(i, chunk);
298
299             const char * userfilename = table_info->m_user_filename;
300             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
301             gchar * tmppathname = g_build_filename(context->m_user_dir,
302                                                    tmpfilename, NULL);
303             g_free(tmpfilename);
304
305             gchar * chunkpathname = g_build_filename(context->m_user_dir,
306                                                      userfilename, NULL);
307
308             chunk->save(tmppathname);
309             rename(tmppathname, chunkpathname);
310             g_free(chunkpathname);
311             g_free(tmppathname);
312             delete chunk;
313         }
314     }
315
316     gchar * tmpfilename = g_build_filename(context->m_user_dir,
317                                    "user.db.tmp", NULL);
318     unlink(tmpfilename);
319     gchar * filename = g_build_filename(context->m_user_dir, "user.db", NULL);
320     context->m_user_bigram->save_db(tmpfilename);
321     rename(tmpfilename, filename);
322     g_free(tmpfilename);
323     g_free(filename);
324
325     mark_version(context->m_user_dir);
326
327     context->m_modified = false;
328     return true;
329 }
330
331 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
332                                      DoublePinyinScheme scheme){
333     context->m_double_pinyin_parser->set_scheme(scheme);
334     return true;
335 }
336
337 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
338                                ChewingScheme scheme){
339     context->m_chewing_parser->set_scheme(scheme);
340     return true;
341 }
342
343
344 void pinyin_fini(pinyin_context_t * context){
345     delete context->m_full_pinyin_parser;
346     delete context->m_double_pinyin_parser;
347     delete context->m_chewing_parser;
348     delete context->m_pinyin_table;
349     delete context->m_phrase_table;
350     delete context->m_phrase_index;
351     delete context->m_system_bigram;
352     delete context->m_user_bigram;
353     delete context->m_pinyin_lookup;
354     delete context->m_phrase_lookup;
355
356     g_free(context->m_system_dir);
357     g_free(context->m_user_dir);
358     context->m_modified = false;
359 }
360
361 /* copy from options to context->m_options. */
362 bool pinyin_set_options(pinyin_context_t * context,
363                         pinyin_option_t options){
364     context->m_options = options;
365     context->m_pinyin_table->set_options(context->m_options);
366     context->m_pinyin_lookup->set_options(context->m_options);
367     return true;
368 }
369
370
371 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
372     pinyin_instance_t * instance = new pinyin_instance_t;
373     instance->m_context = context;
374
375     instance->m_raw_full_pinyin = NULL;
376
377     instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
378     instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
379     instance->m_pinyin_key_rests =
380         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
381     instance->m_constraints = g_array_new
382         (FALSE, FALSE, sizeof(lookup_constraint_t));
383     instance->m_match_results =
384         g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
385
386     return instance;
387 }
388
389 void pinyin_free_instance(pinyin_instance_t * instance){
390     g_free(instance->m_raw_full_pinyin);
391     g_array_free(instance->m_prefixes, TRUE);
392     g_array_free(instance->m_pinyin_keys, TRUE);
393     g_array_free(instance->m_pinyin_key_rests, TRUE);
394     g_array_free(instance->m_constraints, TRUE);
395     g_array_free(instance->m_match_results, TRUE);
396
397     delete instance;
398 }
399
400
401 static bool pinyin_update_constraints(pinyin_instance_t * instance){
402     pinyin_context_t * & context = instance->m_context;
403     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
404     CandidateConstraints & constraints = instance->m_constraints;
405
406     size_t key_len = constraints->len;
407     g_array_set_size(constraints, pinyin_keys->len);
408     for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
409         lookup_constraint_t * constraint =
410             &g_array_index(constraints, lookup_constraint_t, i);
411         constraint->m_type = NO_CONSTRAINT;
412     }
413
414     context->m_pinyin_lookup->validate_constraint
415         (constraints, pinyin_keys);
416
417     return true;
418 }
419
420
421 bool pinyin_guess_sentence(pinyin_instance_t * instance){
422     pinyin_context_t * & context = instance->m_context;
423
424     g_array_set_size(instance->m_prefixes, 0);
425     g_array_append_val(instance->m_prefixes, sentence_start);
426
427     pinyin_update_constraints(instance);
428     bool retval = context->m_pinyin_lookup->get_best_match
429         (instance->m_prefixes,
430          instance->m_pinyin_keys,
431          instance->m_constraints,
432          instance->m_match_results);
433
434     return retval;
435 }
436
437 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
438                                        const char * prefix){
439     pinyin_context_t * & context = instance->m_context;
440
441     g_array_set_size(instance->m_prefixes, 0);
442     g_array_append_val(instance->m_prefixes, sentence_start);
443
444     glong written = 0;
445     ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &written, NULL);
446
447     if (ucs4_str && written) {
448         /* add prefixes. */
449         for (ssize_t i = 1; i <= written; ++i) {
450             if (i > MAX_PHRASE_LENGTH)
451                 break;
452
453             phrase_token_t token = null_token;
454             ucs4_t * start = ucs4_str + written - i;
455             int result = context->m_phrase_table->search(i, start, token);
456             if (result & SEARCH_OK)
457                 g_array_append_val(instance->m_prefixes, token);
458         }
459     }
460     g_free(ucs4_str);
461
462     pinyin_update_constraints(instance);
463     bool retval = context->m_pinyin_lookup->get_best_match
464         (instance->m_prefixes,
465          instance->m_pinyin_keys,
466          instance->m_constraints,
467          instance->m_match_results);
468
469     return retval;
470 }
471
472 bool pinyin_phrase_segment(pinyin_instance_t * instance,
473                            const char * sentence){
474     pinyin_context_t * & context = instance->m_context;
475
476     const glong num_of_chars = g_utf8_strlen(sentence, -1);
477     glong ucs4_len = 0;
478     ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
479
480     g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
481
482     bool retval = context->m_phrase_lookup->get_best_match
483         (ucs4_len, ucs4_str, instance->m_match_results);
484
485     g_free(ucs4_str);
486     return retval;
487 }
488
489 /* the returned sentence should be freed by g_free(). */
490 bool pinyin_get_sentence(pinyin_instance_t * instance,
491                          char ** sentence){
492     pinyin_context_t * & context = instance->m_context;
493
494     bool retval = pinyin::convert_to_utf8
495         (context->m_phrase_index, instance->m_match_results,
496          NULL, *sentence);
497
498     return retval;
499 }
500
501 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
502                               const char * onepinyin,
503                               ChewingKey * onekey){
504     pinyin_context_t * & context = instance->m_context;
505
506     int pinyin_len = strlen(onepinyin);
507     int parse_len = context->m_full_pinyin_parser->parse_one_key
508         ( context->m_options, *onekey, onepinyin, pinyin_len);
509     return pinyin_len == parse_len;
510 }
511
512 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
513                                       const char * pinyins){
514     pinyin_context_t * & context = instance->m_context;
515
516     g_free(instance->m_raw_full_pinyin);
517     instance->m_raw_full_pinyin = g_strdup(pinyins);
518     int pinyin_len = strlen(pinyins);
519
520     int parse_len = context->m_full_pinyin_parser->parse
521         ( context->m_options, instance->m_pinyin_keys,
522           instance->m_pinyin_key_rests, pinyins, pinyin_len);
523
524     return parse_len;
525 }
526
527 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
528                                 const char * onepinyin,
529                                 ChewingKey * onekey){
530     pinyin_context_t * & context = instance->m_context;
531
532     int pinyin_len = strlen(onepinyin);
533     int parse_len = context->m_double_pinyin_parser->parse_one_key
534         ( context->m_options, *onekey, onepinyin, pinyin_len);
535     return pinyin_len == parse_len;
536 }
537
538 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
539                                         const char * pinyins){
540     pinyin_context_t * & context = instance->m_context;
541     int pinyin_len = strlen(pinyins);
542
543     int parse_len = context->m_double_pinyin_parser->parse
544         ( context->m_options, instance->m_pinyin_keys,
545           instance->m_pinyin_key_rests, pinyins, pinyin_len);
546
547     return parse_len;
548 }
549
550 bool pinyin_parse_chewing(pinyin_instance_t * instance,
551                           const char * onechewing,
552                           ChewingKey * onekey){
553     pinyin_context_t * & context = instance->m_context;
554
555     int chewing_len = strlen(onechewing);
556     int parse_len = context->m_chewing_parser->parse_one_key
557         ( context->m_options, *onekey, onechewing, chewing_len );
558     return chewing_len == parse_len;
559 }
560
561 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
562                                   const char * chewings){
563     pinyin_context_t * & context = instance->m_context;
564     int chewing_len = strlen(chewings);
565
566     int parse_len = context->m_chewing_parser->parse
567         ( context->m_options, instance->m_pinyin_keys,
568           instance->m_pinyin_key_rests, chewings, chewing_len);
569
570     return parse_len;
571 }
572
573 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
574                                 const char key, const char ** symbol) {
575     pinyin_context_t * & context = instance->m_context;
576     return context->m_chewing_parser->in_chewing_scheme
577         (context->m_options, key, symbol);
578 }
579
580
581 static gint compare_item_with_token(gconstpointer lhs,
582                                     gconstpointer rhs) {
583     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
584     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
585
586     phrase_token_t token_lhs = item_lhs->m_token;
587     phrase_token_t token_rhs = item_rhs->m_token;
588
589     return (token_lhs - token_rhs);
590 }
591
592 static gint compare_item_with_frequency(gconstpointer lhs,
593                                         gconstpointer rhs) {
594     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
595     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
596
597     guint32 freq_lhs = item_lhs->m_freq;
598     guint32 freq_rhs = item_rhs->m_freq;
599
600     return -(freq_lhs - freq_rhs); /* in descendant order */
601 }
602
603 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
604                                           size_t offset) {
605     phrase_token_t prev_token = null_token;
606     ssize_t i;
607
608     if (0 == offset) {
609         /* get previous token from prefixes. */
610         prev_token = sentence_start;
611         size_t prev_token_len = 0;
612
613         pinyin_context_t * context = instance->m_context;
614         TokenVector prefixes = instance->m_prefixes;
615         PhraseItem item;
616
617         for (size_t i = 0; i < prefixes->len; ++i) {
618             phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
619             if (sentence_start == token)
620                 continue;
621
622             int retval = context->m_phrase_index->get_phrase_item(token, item);
623             if (ERROR_OK == retval) {
624                 size_t token_len = item.get_phrase_length();
625                 if (token_len > prev_token_len) {
626                     /* found longer match, and save it. */
627                     prev_token = token;
628                     prev_token_len = token_len;
629                 }
630             }
631         }
632     } else {
633         /* get previous token from match results. */
634         assert (0 < offset);
635
636         phrase_token_t cur_token = g_array_index
637             (instance->m_match_results, phrase_token_t, offset);
638         if (null_token != cur_token) {
639             for (i = offset - 1; i >= 0; --i) {
640                 cur_token = g_array_index
641                     (instance->m_match_results, phrase_token_t, i);
642                 if (null_token != cur_token) {
643                     prev_token = cur_token;
644                     break;
645                 }
646             }
647         }
648     }
649
650     return prev_token;
651 }
652
653 static void _append_items(pinyin_context_t * context,
654                           PhraseIndexRanges ranges,
655                           lookup_candidate_t * template_item,
656                           CandidateVector items) {
657     /* reduce and append to a single GArray. */
658     for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
659         if (NULL == ranges[m])
660             continue;
661
662         for (size_t n = 0; n < ranges[m]->len; ++n) {
663             PhraseIndexRange * range =
664                 &g_array_index(ranges[m], PhraseIndexRange, n);
665             for (size_t k = range->m_range_begin;
666                  k < range->m_range_end; ++k) {
667                 lookup_candidate_t item;
668                 item.m_candidate_type = template_item->m_candidate_type;
669                 item.m_token = k;
670                 item.m_orig_rest = template_item->m_orig_rest;
671                 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
672                 item.m_freq = template_item->m_freq;
673                 g_array_append_val(items, item);
674             }
675         }
676     }
677 }
678
679 static void _remove_duplicated_items(CandidateVector items) {
680     /* remove the duplicated items. */
681     phrase_token_t last_token = null_token, saved_token;
682     for (size_t n = 0; n < items->len; ++n) {
683         lookup_candidate_t * item = &g_array_index
684             (items, lookup_candidate_t, n);
685
686         saved_token = item->m_token;
687         if (last_token == saved_token) {
688             g_array_remove_index(items, n);
689             n--;
690         }
691         last_token = saved_token;
692     }
693 }
694
695 static void _compute_frequency_of_items(pinyin_context_t * context,
696                                         phrase_token_t prev_token,
697                                         SingleGram * merged_gram,
698                                         CandidateVector items) {
699     pinyin_option_t & options = context->m_options;
700     ssize_t i;
701
702     PhraseItem cached_item;
703     /* compute all freqs. */
704     for (i = 0; i < items->len; ++i) {
705         lookup_candidate_t * item = &g_array_index
706             (items, lookup_candidate_t, i);
707         phrase_token_t & token = item->m_token;
708
709         gfloat bigram_poss = 0; guint32 total_freq = 0;
710         if (options & DYNAMIC_ADJUST) {
711             if (null_token != prev_token) {
712                 guint32 bigram_freq = 0;
713                 merged_gram->get_total_freq(total_freq);
714                 merged_gram->get_freq(token, bigram_freq);
715                 if (0 != total_freq)
716                     bigram_poss = bigram_freq / (gfloat)total_freq;
717             }
718         }
719
720         /* compute the m_freq. */
721         FacadePhraseIndex * & phrase_index = context->m_phrase_index;
722         phrase_index->get_phrase_item(token, cached_item);
723         total_freq = phrase_index->get_phrase_index_total_freq();
724         assert (0 < total_freq);
725
726         /* Note: possibility value <= 1.0. */
727         guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
728                         (1 - LAMBDA_PARAMETER) *
729                         cached_item.get_unigram_frequency() /
730                         (gfloat) total_freq) * 256 * 256 * 256;
731         item->m_freq = freq;
732     }
733 }
734
735 bool pinyin_get_candidates(pinyin_instance_t * instance,
736                            size_t offset,
737                            TokenVector candidates) {
738
739     pinyin_context_t * & context = instance->m_context;
740     pinyin_option_t & options = context->m_options;
741     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
742     g_array_set_size(candidates, 0);
743
744     size_t pinyin_len = pinyin_keys->len - offset;
745     ssize_t i;
746
747     /* lookup the previous token here. */
748     phrase_token_t prev_token = null_token;
749
750     if (options & DYNAMIC_ADJUST) {
751         prev_token = _get_previous_token(instance, offset);
752     }
753
754     SingleGram merged_gram;
755     SingleGram * system_gram = NULL, * user_gram = NULL;
756
757     if (options & DYNAMIC_ADJUST) {
758         if (null_token != prev_token) {
759             context->m_system_bigram->load(prev_token, system_gram);
760             context->m_user_bigram->load(prev_token, user_gram);
761             merge_single_gram(&merged_gram, system_gram, user_gram);
762         }
763     }
764
765     PhraseIndexRanges ranges;
766     memset(ranges, 0, sizeof(ranges));
767     context->m_phrase_index->prepare_ranges(ranges);
768
769     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
770
771     for (i = pinyin_len; i >= 1; --i) {
772         g_array_set_size(items, 0);
773
774         ChewingKey * keys = &g_array_index
775             (pinyin_keys, ChewingKey, offset);
776
777         /* do pinyin search. */
778         int retval = context->m_pinyin_table->search
779             (i, keys, ranges);
780
781         if ( !(retval & SEARCH_OK) )
782             continue;
783
784         lookup_candidate_t template_item;
785         _append_items(context, ranges, &template_item, items);
786
787         g_array_sort(items, compare_item_with_token);
788
789         _remove_duplicated_items(items);
790
791         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
792
793         /* sort the candidates of the same length by frequency. */
794         g_array_sort(items, compare_item_with_frequency);
795
796         /* transfer back items to tokens, and save it into candidates */
797         for (ssize_t k = 0; k < items->len; ++k) {
798             lookup_candidate_t * item = &g_array_index
799                 (items, lookup_candidate_t, k);
800             g_array_append_val(candidates, item->m_token);
801         }
802
803         if (!(retval & SEARCH_CONTINUED))
804             break;
805     }
806
807     g_array_free(items, TRUE);
808
809     context->m_phrase_index->destroy_ranges(ranges);
810
811     if (system_gram)
812         delete system_gram;
813     if (user_gram)
814         delete user_gram;
815     return true;
816 }
817
818
819 static bool _try_divided_table(pinyin_instance_t * instance,
820                                PhraseIndexRanges ranges,
821                                size_t offset,
822                                CandidateVector items){
823     bool found = false;
824
825     pinyin_context_t * & context = instance->m_context;
826     pinyin_option_t & options = context->m_options;
827     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
828     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
829
830     assert(pinyin_keys->len == pinyin_key_rests->len);
831     gint num_keys = pinyin_keys->len;
832     assert(offset < num_keys);
833
834     /* handle "^xian$" -> "xi'an" here */
835     ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
836     ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
837                                            ChewingKeyRest, offset);
838     ChewingKeyRest orig_rest = *rest;
839     guint16 tone = CHEWING_ZERO_TONE;
840
841     const divided_table_item_t * item = NULL;
842
843     /* back up tone */
844     if (options & USE_TONE) {
845         tone = key->m_tone;
846         if (CHEWING_ZERO_TONE != tone) {
847             key->m_tone = CHEWING_ZERO_TONE;
848             rest->m_raw_end --;
849         }
850     }
851
852     item = context->m_full_pinyin_parser->retrieve_divided_item
853         (options, key, rest, instance->m_raw_full_pinyin,
854          strlen(instance->m_raw_full_pinyin));
855
856     if (item) {
857         /* no ops */
858         assert(item->m_new_freq > 0);
859
860         ChewingKey divided_keys[2];
861         const char * pinyin = item->m_new_keys[0];
862         assert(context->m_full_pinyin_parser->
863                parse_one_key(options, divided_keys[0],
864                              pinyin, strlen(pinyin)));
865         pinyin = item->m_new_keys[1];
866         assert(context->m_full_pinyin_parser->
867                parse_one_key(options, divided_keys[1],
868                              pinyin, strlen(pinyin)));
869
870         gchar * new_pinyins = g_strdup_printf
871             ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
872
873         /* propagate the tone */
874         if (options & USE_TONE) {
875             if (CHEWING_ZERO_TONE != tone) {
876                 assert(0 < tone && tone <= 5);
877                 divided_keys[1].m_tone = tone;
878
879                 gchar * tmp_str = g_strdup_printf
880                     ("%s%d", new_pinyins, tone);
881                 g_free(new_pinyins);
882                 new_pinyins = tmp_str;
883             }
884         }
885
886         /* do pinyin search. */
887         int retval = context->m_pinyin_table->search
888             (2, divided_keys, ranges);
889
890         if (retval & SEARCH_OK) {
891             lookup_candidate_t template_item;
892             template_item.m_candidate_type = DIVIDED_CANDIDATE;
893             template_item.m_orig_rest = orig_rest;
894             template_item.m_new_pinyins = new_pinyins;
895
896             _append_items(context, ranges, &template_item, items);
897             found = true;
898         }
899         g_free(new_pinyins);
900     }
901
902     /* restore tones */
903     if (options & USE_TONE) {
904         if (CHEWING_ZERO_TONE != tone) {
905             key->m_tone = tone;
906             rest->m_raw_end ++;
907         }
908     }
909
910     return found;
911 }
912
913 static bool _try_resplit_table(pinyin_instance_t * instance,
914                                PhraseIndexRanges ranges,
915                                size_t offset,
916                                CandidateVector items){
917     bool found = false;
918
919     pinyin_context_t * & context = instance->m_context;
920     pinyin_option_t & options = context->m_options;
921     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
922     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
923
924     assert(pinyin_keys->len == pinyin_key_rests->len);
925     gint num_keys = pinyin_keys->len;
926     assert(offset + 1 < num_keys);
927
928     guint16 next_tone = CHEWING_ZERO_TONE;
929
930     /* handle "^fa'nan$" -> "fan'an" here */
931     ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
932                                                ChewingKeyRest, offset);
933     ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
934                                                 ChewingKeyRest, offset + 1);
935     /* some "'" here */
936     if (cur_rest->m_raw_end != next_rest->m_raw_begin)
937         return found;
938
939     ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
940     ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
941                                            offset + 1);
942
943     /* some tone here */
944     if (CHEWING_ZERO_TONE != cur_key->m_tone)
945         return found;
946
947     ChewingKeyRest orig_rest;
948     orig_rest.m_raw_begin = cur_rest->m_raw_begin;
949     orig_rest.m_raw_end = next_rest->m_raw_end;
950
951     /* backup tone */
952     if (options & USE_TONE) {
953         next_tone = next_key->m_tone;
954         if (CHEWING_ZERO_TONE != next_tone) {
955             next_key->m_tone = CHEWING_ZERO_TONE;
956             next_rest->m_raw_end --;
957         }
958     }
959
960     /* lookup re-split table */
961     const char * str = instance->m_raw_full_pinyin;
962     const resplit_table_item_t * item_by_orig =
963         context->m_full_pinyin_parser->
964         retrieve_resplit_item_by_original_pinyins
965         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
966
967     const resplit_table_item_t * item_by_new =
968         context->m_full_pinyin_parser->
969         retrieve_resplit_item_by_resplit_pinyins
970         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
971
972     /* there are no same couple of pinyins in re-split table. */
973     assert(!(item_by_orig && item_by_new));
974
975     ChewingKey resplit_keys[2];
976     const char * pinyins[2];
977
978     bool tosearch = false;
979     if (item_by_orig && item_by_orig->m_new_freq) {
980         pinyins[0] = item_by_orig->m_new_keys[0];
981         pinyins[1] = item_by_orig->m_new_keys[1];
982
983         assert(context->m_full_pinyin_parser->
984                parse_one_key(options, resplit_keys[0],
985                              pinyins[0], strlen(pinyins[0])));
986
987         assert(context->m_full_pinyin_parser->
988                parse_one_key(options, resplit_keys[1],
989                              pinyins[1], strlen(pinyins[1])));
990         tosearch = true;
991     }
992
993     if (item_by_new && item_by_new->m_orig_freq) {
994         pinyins[0] = item_by_new->m_orig_keys[0];
995         pinyins[1] = item_by_new->m_orig_keys[1];
996
997         assert(context->m_full_pinyin_parser->
998                parse_one_key(options, resplit_keys[0],
999                              pinyins[0], strlen(pinyins[0])));
1000
1001         assert(context->m_full_pinyin_parser->
1002                parse_one_key(options, resplit_keys[1],
1003                              pinyins[1], strlen(pinyins[1])));
1004         tosearch = true;
1005     }
1006
1007     if (tosearch) {
1008         gchar * new_pinyins = g_strdup_printf
1009             ("%s'%s", pinyins[0], pinyins[1]);
1010
1011         /* propagate the tone */
1012         if (options & USE_TONE) {
1013             if (CHEWING_ZERO_TONE != next_tone) {
1014                 assert(0 < next_tone && next_tone <= 5);
1015                 resplit_keys[1].m_tone = next_tone;
1016
1017                 gchar * tmp_str = g_strdup_printf
1018                     ("%s%d", new_pinyins, next_tone);
1019                 g_free(new_pinyins);
1020                 new_pinyins = tmp_str;
1021             }
1022         }
1023
1024         /* do pinyin search. */
1025         int retval = context->m_pinyin_table->search
1026             (2, resplit_keys, ranges);
1027
1028         if (retval & SEARCH_OK) {
1029             lookup_candidate_t template_item;
1030             template_item.m_candidate_type = RESPLIT_CANDIDATE;
1031             template_item.m_orig_rest = orig_rest;
1032             template_item.m_new_pinyins = new_pinyins;
1033
1034             _append_items(context, ranges, &template_item, items);
1035             found = true;
1036         }
1037         g_free(new_pinyins);
1038     }
1039
1040     /* restore tones */
1041     if (options & USE_TONE) {
1042         if (CHEWING_ZERO_TONE != next_tone) {
1043             next_key->m_tone = next_tone;
1044             next_rest->m_raw_end ++;
1045         }
1046     }
1047
1048     return found;
1049 }
1050
1051 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1052                                        size_t offset,
1053                                        CandidateVector candidates){
1054
1055     pinyin_context_t * & context = instance->m_context;
1056     pinyin_option_t & options = context->m_options;
1057     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1058
1059     /* free memory */
1060     for (size_t i = 0; i < candidates->len; ++i) {
1061         lookup_candidate_t * candidate = &g_array_index
1062             (candidates, lookup_candidate_t, i);
1063         g_free(candidate->m_new_pinyins);
1064     }
1065     g_array_set_size(candidates, 0);
1066
1067     size_t pinyin_len = pinyin_keys->len - offset;
1068     pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1069     ssize_t i;
1070
1071     /* lookup the previous token here. */
1072     phrase_token_t prev_token = null_token;
1073
1074     if (options & DYNAMIC_ADJUST) {
1075         prev_token = _get_previous_token(instance, offset);
1076     }
1077
1078     SingleGram merged_gram;
1079     SingleGram * system_gram = NULL, * user_gram = NULL;
1080
1081     if (options & DYNAMIC_ADJUST) {
1082         if (null_token != prev_token) {
1083             context->m_system_bigram->load(prev_token, system_gram);
1084             context->m_user_bigram->load(prev_token, user_gram);
1085             merge_single_gram(&merged_gram, system_gram, user_gram);
1086         }
1087     }
1088
1089     PhraseIndexRanges ranges;
1090     memset(ranges, 0, sizeof(ranges));
1091     context->m_phrase_index->prepare_ranges(ranges);
1092
1093     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1094
1095     if (1 == pinyin_len) {
1096         /* because there is only one pinyin left,
1097          *  the following for-loop will not produce 2 character candidates.
1098          * the if-branch will fill the candidate list with
1099          *  2 character candidates.
1100          */
1101
1102         if (options & USE_DIVIDED_TABLE) {
1103             g_array_set_size(items, 0);
1104
1105             if (_try_divided_table(instance, ranges, offset, items)) {
1106
1107                 g_array_sort(items, compare_item_with_token);
1108
1109                 _remove_duplicated_items(items);
1110
1111                 _compute_frequency_of_items(context, prev_token,
1112                                             &merged_gram, items);
1113
1114                 /* sort the candidates of the same length by frequency. */
1115                 g_array_sort(items, compare_item_with_frequency);
1116
1117                 /* transfer back items to tokens, and save it into candidates */
1118                 for (i = 0; i < items->len; ++i) {
1119                     lookup_candidate_t * item = &g_array_index
1120                         (items, lookup_candidate_t, i);
1121                     g_array_append_val(candidates, *item);
1122                 }
1123             }
1124         }
1125     }
1126
1127     for (i = pinyin_len; i >= 1; --i) {
1128         bool found = false;
1129         g_array_set_size(items, 0);
1130
1131         if (2 == i) {
1132             /* handle fuzzy pinyin segment here. */
1133             if (options & USE_DIVIDED_TABLE) {
1134                 found = _try_divided_table(instance, ranges, offset, items) ||
1135                     found;
1136             }
1137             if (options & USE_RESPLIT_TABLE) {
1138                 found = _try_resplit_table(instance, ranges, offset, items) ||
1139                     found;
1140             }
1141         }
1142
1143         ChewingKey * keys = &g_array_index
1144             (pinyin_keys, ChewingKey, offset);
1145
1146         /* do pinyin search. */
1147         int retval = context->m_pinyin_table->search
1148             (i, keys, ranges);
1149
1150         found = (retval & SEARCH_OK) || found;
1151
1152         if ( !found )
1153             continue;
1154
1155         lookup_candidate_t template_item;
1156         _append_items(context, ranges, &template_item, items);
1157
1158         g_array_sort(items, compare_item_with_token);
1159
1160         _remove_duplicated_items(items);
1161
1162         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1163
1164         g_array_sort(items, compare_item_with_frequency);
1165
1166         for (size_t k = 0; k < items->len; ++k) {
1167             lookup_candidate_t * item = &g_array_index
1168                 (items, lookup_candidate_t, k);
1169             g_array_append_val(candidates, *item);
1170         }
1171
1172         if (!(retval & SEARCH_CONTINUED))
1173             break;
1174     }
1175
1176     g_array_free(items, TRUE);
1177
1178     context->m_phrase_index->destroy_ranges(ranges);
1179
1180     if (system_gram)
1181         delete system_gram;
1182     if (user_gram)
1183         delete user_gram;
1184     return true;
1185 }
1186
1187
1188 int pinyin_choose_candidate(pinyin_instance_t * instance,
1189                             size_t offset,
1190                             phrase_token_t token){
1191     pinyin_context_t * & context = instance->m_context;
1192
1193     guint8 len = context->m_pinyin_lookup->add_constraint
1194         (instance->m_constraints, offset, token);
1195
1196     bool retval = context->m_pinyin_lookup->validate_constraint
1197         (instance->m_constraints, instance->m_pinyin_keys) && len;
1198
1199     return offset + len;
1200 }
1201
1202 int pinyin_choose_full_pinyin_candidate(pinyin_instance_t * instance,
1203                                         size_t offset,
1204                                         lookup_candidate_t * candidate){
1205     pinyin_context_t * & context = instance->m_context;
1206
1207     if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1208         RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1209         /* update full pinyin. */
1210         gchar * oldpinyins = instance->m_raw_full_pinyin;
1211         const ChewingKeyRest rest = candidate->m_orig_rest;
1212         oldpinyins[rest.m_raw_begin] = '\0';
1213         const gchar * left_part = oldpinyins;
1214         const gchar * right_part = oldpinyins + rest.m_raw_end;
1215         gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1216                                          right_part, NULL);
1217         g_free(oldpinyins);
1218         instance->m_raw_full_pinyin = newpinyins;
1219
1220         /* re-parse the full pinyin.  */
1221         const gchar * pinyins = instance->m_raw_full_pinyin;
1222         int pinyin_len = strlen(pinyins);
1223         int parse_len = context->m_full_pinyin_parser->parse
1224             (context->m_options, instance->m_pinyin_keys,
1225              instance->m_pinyin_key_rests, pinyins, pinyin_len);
1226
1227         /* Note: there may be some un-parsable input here. */
1228     }
1229
1230     /* sync m_constraints to the length of m_pinyin_keys. */
1231     bool retval = context->m_pinyin_lookup->validate_constraint
1232         (instance->m_constraints, instance->m_pinyin_keys);
1233
1234     phrase_token_t token = candidate->m_token;
1235     guint8 len = context->m_pinyin_lookup->add_constraint
1236         (instance->m_constraints, offset, token);
1237
1238     /* safe guard: validate the m_constraints again. */
1239     retval = context->m_pinyin_lookup->validate_constraint
1240         (instance->m_constraints, instance->m_pinyin_keys) && len;
1241
1242     return offset + len;
1243 }
1244
1245
1246 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1247                              size_t offset){
1248     pinyin_context_t * & context = instance->m_context;
1249
1250     bool retval = context->m_pinyin_lookup->clear_constraint
1251         (instance->m_constraints, offset);
1252
1253     return retval;
1254 }
1255
1256 bool pinyin_clear_constraints(pinyin_instance_t * instance){
1257     pinyin_context_t * & context = instance->m_context;
1258     bool retval = true;
1259
1260     for ( size_t i = 0; i < instance->m_constraints->len; ++i ) {
1261         retval = context->m_pinyin_lookup->clear_constraint
1262             (instance->m_constraints, i) && retval;
1263     }
1264
1265     return retval;
1266 }
1267
1268 /* the returned word should be freed by g_free. */
1269 bool pinyin_translate_token(pinyin_instance_t * instance,
1270                             phrase_token_t token, char ** word){
1271     pinyin_context_t * & context = instance->m_context;
1272     PhraseItem item;
1273     ucs4_t buffer[MAX_PHRASE_LENGTH];
1274
1275     int retval = context->m_phrase_index->get_phrase_item(token, item);
1276     item.get_phrase_string(buffer);
1277     guint8 length = item.get_phrase_length();
1278     *word = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1279     return ERROR_OK == retval;
1280 }
1281
1282 bool pinyin_train(pinyin_instance_t * instance){
1283     if (!instance->m_context->m_user_dir)
1284         return false;
1285
1286     pinyin_context_t * & context = instance->m_context;
1287     context->m_modified = true;
1288
1289     bool retval = context->m_pinyin_lookup->train_result2
1290         (instance->m_pinyin_keys, instance->m_constraints,
1291          instance->m_match_results);
1292
1293     return retval;
1294 }
1295
1296 bool pinyin_reset(pinyin_instance_t * instance){
1297     g_array_set_size(instance->m_pinyin_keys, 0);
1298     g_array_set_size(instance->m_pinyin_key_rests, 0);
1299     g_array_set_size(instance->m_constraints, 0);
1300     g_array_set_size(instance->m_match_results, 0);
1301
1302     return true;
1303 }
1304
1305 /**
1306  *  Note: prefix is the text before the pre-edit string.
1307  */