begin to write import iterator
[platform/upstream/libpinyin.git] / src / pinyin.cpp
1 /* 
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *  
5  *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
6  *  
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  *  
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20  */
21
22
23 #include "pinyin.h"
24 #include <stdio.h>
25 #include <unistd.h>
26 #include <glib/gstdio.h>
27 #include "pinyin_internal.h"
28
29 /* a glue layer for input method integration. */
30
31 struct _pinyin_context_t{
32     pinyin_option_t m_options;
33
34     FullPinyinParser2 * m_full_pinyin_parser;
35     DoublePinyinParser2 * m_double_pinyin_parser;
36     ChewingParser2 * m_chewing_parser;
37
38     FacadeChewingTable * m_pinyin_table;
39     FacadePhraseTable * m_phrase_table;
40     FacadePhraseIndex * m_phrase_index;
41     Bigram * m_system_bigram;
42     Bigram * m_user_bigram;
43
44     PinyinLookup * m_pinyin_lookup;
45     PhraseLookup * m_phrase_lookup;
46
47     char * m_system_dir;
48     char * m_user_dir;
49     bool m_modified;
50 };
51
52 struct _import_iterator_t{
53     guint8 m_phrase_index;
54 };
55
56 static bool check_format(const char * userdir){
57     gchar * filename = g_build_filename
58         (userdir, "version", NULL);
59
60     MemoryChunk chunk;
61     bool exists = chunk.load(filename);
62
63     if (exists) {
64         exists = (0 == memcmp
65                   (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
66                    strlen(LIBPINYIN_FORMAT_VERSION) + 1));
67     }
68     g_free(filename);
69
70     if (exists)
71         return exists;
72
73     /* clean up files, if version mis-matches. */
74     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
75         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
76
77         if (NOT_USED == table_info->m_file_type)
78             continue;
79
80         if (NULL == table_info->m_user_filename)
81             continue;
82
83         const char * userfilename = table_info->m_user_filename;
84
85         /* remove dbin file. */
86         filename = g_build_filename(userdir, userfilename, NULL);
87         unlink(filename);
88         g_free(filename);
89     }
90
91     filename = g_build_filename
92         (userdir, "user.db", NULL);
93     unlink(filename);
94     g_free(filename);
95
96     return exists;
97 }
98
99 static bool mark_version(const char * userdir){
100     gchar * filename = g_build_filename
101         (userdir, "version", NULL);
102     MemoryChunk chunk;
103     chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
104                       strlen(LIBPINYIN_FORMAT_VERSION) + 1);
105     bool retval = chunk.save(filename);
106     g_free(filename);
107     return retval;
108 }
109
110 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
111     pinyin_context_t * context = new pinyin_context_t;
112
113     context->m_options = USE_TONE;
114
115     context->m_system_dir = g_strdup(systemdir);
116     context->m_user_dir = g_strdup(userdir);
117     context->m_modified = false;
118
119     check_format(context->m_user_dir);
120
121     context->m_pinyin_table = new FacadeChewingTable;
122     MemoryChunk * chunk = new MemoryChunk;
123     gchar * filename = g_build_filename
124         (context->m_system_dir, "pinyin_index.bin", NULL);
125     if (!chunk->load(filename)) {
126         fprintf(stderr, "open %s failed!\n", filename);
127         return NULL;
128     }
129     g_free(filename);
130
131     context->m_pinyin_table->load(context->m_options, chunk, NULL);
132
133     context->m_full_pinyin_parser = new FullPinyinParser2;
134     context->m_double_pinyin_parser = new DoublePinyinParser2;
135     context->m_chewing_parser = new ChewingParser2;
136
137     context->m_phrase_table = new FacadePhraseTable;
138     chunk = new MemoryChunk;
139     filename = g_build_filename(context->m_system_dir, "phrase_index.bin", NULL);
140     if (!chunk->load(filename)) {
141         fprintf(stderr, "open %s failed!\n", filename);
142         return NULL;
143     }
144     g_free(filename);
145     context->m_phrase_table->load(chunk, NULL);
146
147     context->m_phrase_index = new FacadePhraseIndex;
148
149     /* hack here: directly call load phrase library. */
150     pinyin_load_phrase_library(context, 1);
151
152     context->m_system_bigram = new Bigram;
153     filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
154     context->m_system_bigram->attach(filename, ATTACH_READONLY);
155     g_free(filename);
156
157     context->m_user_bigram = new Bigram;
158     filename = g_build_filename(context->m_user_dir, "user.db", NULL);
159     context->m_user_bigram->load_db(filename);
160     g_free(filename);
161
162     context->m_pinyin_lookup = new PinyinLookup
163         ( context->m_options, context->m_pinyin_table,
164           context->m_phrase_index, context->m_system_bigram,
165           context->m_user_bigram);
166
167     context->m_phrase_lookup = new PhraseLookup
168         (context->m_phrase_table, context->m_phrase_index,
169          context->m_system_bigram, context->m_user_bigram);
170
171     return context;
172 }
173
174 bool pinyin_load_phrase_library(pinyin_context_t * context,
175                                 guint8 index){
176     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
177     const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
178
179     if (SYSTEM_FILE == table_info->m_file_type) {
180         /* system phrase library */
181         MemoryChunk * chunk = new MemoryChunk;
182
183         const char * systemfilename = table_info->m_system_filename;
184         /* check bin file in system dir. */
185         gchar * chunkfilename = g_build_filename(context->m_system_dir,
186                                                  systemfilename, NULL);
187         chunk->load(chunkfilename);
188         g_free(chunkfilename);
189
190         context->m_phrase_index->load(index, chunk);
191
192         const char * userfilename = table_info->m_user_filename;
193
194         chunkfilename = g_build_filename(context->m_user_dir,
195                                          userfilename, NULL);
196
197         MemoryChunk * log = new MemoryChunk;
198         log->load(chunkfilename);
199         g_free(chunkfilename);
200
201         /* merge the chunk log. */
202         context->m_phrase_index->merge(index, log);
203         return true;
204     }
205
206     if (USER_FILE == table_info->m_file_type) {
207         /* user phrase library */
208         MemoryChunk * chunk = new MemoryChunk;
209         const char * userfilename = table_info->m_user_filename;
210
211         gchar * chunkfilename = g_build_filename(context->m_user_dir,
212                                                  userfilename, NULL);
213
214         /* check bin file exists. if not, create a new one. */
215         if (chunk->load(chunkfilename)) {
216             context->m_phrase_index->load(index, chunk);
217         } else {
218             delete chunk;
219             context->m_phrase_index->create_sub_phrase(index);
220         }
221
222         g_free(chunkfilename);
223         return true;
224     }
225
226     return false;
227 }
228
229 bool pinyin_unload_phrase_library(pinyin_context_t * context,
230                                   guint8 index){
231     /* gb_char.bin can't be unloaded. */
232     if (1 == index)
233         return false;
234
235     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
236
237     context->m_phrase_index->unload(index);
238     return true;
239 }
240
241
242 bool pinyin_save(pinyin_context_t * context){
243     if (!context->m_user_dir)
244         return false;
245
246     if (!context->m_modified)
247         return false;
248
249     context->m_phrase_index->compact();
250
251     /* skip the reserved zero phrase library. */
252     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
253         PhraseIndexRange range;
254         int retval = context->m_phrase_index->get_range(i, range);
255
256         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
257             continue;
258
259         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
260
261         if (NOT_USED == table_info->m_file_type)
262             continue;
263
264         const char * userfilename = table_info->m_user_filename;
265
266         if (NULL == userfilename)
267             continue;
268
269         if (SYSTEM_FILE == table_info->m_file_type) {
270             /* system phrase library */
271             MemoryChunk * chunk = new MemoryChunk;
272             MemoryChunk * log = new MemoryChunk;
273             const char * systemfilename = table_info->m_system_filename;
274
275             /* check bin file in system dir. */
276             gchar * chunkfilename = g_build_filename(context->m_system_dir,
277                                                      systemfilename, NULL);
278             chunk->load(chunkfilename);
279             g_free(chunkfilename);
280             context->m_phrase_index->diff(i, chunk, log);
281
282             const char * userfilename = table_info->m_user_filename;
283             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
284
285             gchar * tmppathname = g_build_filename(context->m_user_dir,
286                                                    tmpfilename, NULL);
287             g_free(tmpfilename);
288
289             gchar * chunkpathname = g_build_filename(context->m_user_dir,
290                                                      userfilename, NULL);
291             log->save(tmppathname);
292             rename(tmppathname, chunkpathname);
293             g_free(chunkpathname);
294             g_free(tmppathname);
295             delete log;
296         }
297
298         if (USER_FILE == table_info->m_file_type) {
299             /* user phrase library */
300             MemoryChunk * chunk = new MemoryChunk;
301             context->m_phrase_index->store(i, chunk);
302
303             const char * userfilename = table_info->m_user_filename;
304             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
305             gchar * tmppathname = g_build_filename(context->m_user_dir,
306                                                    tmpfilename, NULL);
307             g_free(tmpfilename);
308
309             gchar * chunkpathname = g_build_filename(context->m_user_dir,
310                                                      userfilename, NULL);
311
312             chunk->save(tmppathname);
313             rename(tmppathname, chunkpathname);
314             g_free(chunkpathname);
315             g_free(tmppathname);
316             delete chunk;
317         }
318     }
319
320     gchar * tmpfilename = g_build_filename(context->m_user_dir,
321                                    "user.db.tmp", NULL);
322     unlink(tmpfilename);
323     gchar * filename = g_build_filename(context->m_user_dir, "user.db", NULL);
324     context->m_user_bigram->save_db(tmpfilename);
325     rename(tmpfilename, filename);
326     g_free(tmpfilename);
327     g_free(filename);
328
329     mark_version(context->m_user_dir);
330
331     context->m_modified = false;
332     return true;
333 }
334
335 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
336                                      DoublePinyinScheme scheme){
337     context->m_double_pinyin_parser->set_scheme(scheme);
338     return true;
339 }
340
341 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
342                                ChewingScheme scheme){
343     context->m_chewing_parser->set_scheme(scheme);
344     return true;
345 }
346
347
348 void pinyin_fini(pinyin_context_t * context){
349     delete context->m_full_pinyin_parser;
350     delete context->m_double_pinyin_parser;
351     delete context->m_chewing_parser;
352     delete context->m_pinyin_table;
353     delete context->m_phrase_table;
354     delete context->m_phrase_index;
355     delete context->m_system_bigram;
356     delete context->m_user_bigram;
357     delete context->m_pinyin_lookup;
358     delete context->m_phrase_lookup;
359
360     g_free(context->m_system_dir);
361     g_free(context->m_user_dir);
362     context->m_modified = false;
363 }
364
365 /* copy from options to context->m_options. */
366 bool pinyin_set_options(pinyin_context_t * context,
367                         pinyin_option_t options){
368     context->m_options = options;
369     context->m_pinyin_table->set_options(context->m_options);
370     context->m_pinyin_lookup->set_options(context->m_options);
371     return true;
372 }
373
374
375 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
376     pinyin_instance_t * instance = new pinyin_instance_t;
377     instance->m_context = context;
378
379     instance->m_raw_full_pinyin = NULL;
380
381     instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
382     instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
383     instance->m_pinyin_key_rests =
384         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
385     instance->m_constraints = g_array_new
386         (FALSE, FALSE, sizeof(lookup_constraint_t));
387     instance->m_match_results =
388         g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
389
390     return instance;
391 }
392
393 void pinyin_free_instance(pinyin_instance_t * instance){
394     g_free(instance->m_raw_full_pinyin);
395     g_array_free(instance->m_prefixes, TRUE);
396     g_array_free(instance->m_pinyin_keys, TRUE);
397     g_array_free(instance->m_pinyin_key_rests, TRUE);
398     g_array_free(instance->m_constraints, TRUE);
399     g_array_free(instance->m_match_results, TRUE);
400
401     delete instance;
402 }
403
404
405 static bool pinyin_update_constraints(pinyin_instance_t * instance){
406     pinyin_context_t * & context = instance->m_context;
407     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
408     CandidateConstraints & constraints = instance->m_constraints;
409
410     size_t key_len = constraints->len;
411     g_array_set_size(constraints, pinyin_keys->len);
412     for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
413         lookup_constraint_t * constraint =
414             &g_array_index(constraints, lookup_constraint_t, i);
415         constraint->m_type = NO_CONSTRAINT;
416     }
417
418     context->m_pinyin_lookup->validate_constraint
419         (constraints, pinyin_keys);
420
421     return true;
422 }
423
424
425 bool pinyin_guess_sentence(pinyin_instance_t * instance){
426     pinyin_context_t * & context = instance->m_context;
427
428     g_array_set_size(instance->m_prefixes, 0);
429     g_array_append_val(instance->m_prefixes, sentence_start);
430
431     pinyin_update_constraints(instance);
432     bool retval = context->m_pinyin_lookup->get_best_match
433         (instance->m_prefixes,
434          instance->m_pinyin_keys,
435          instance->m_constraints,
436          instance->m_match_results);
437
438     return retval;
439 }
440
441 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
442                                        const char * prefix){
443     pinyin_context_t * & context = instance->m_context;
444
445     g_array_set_size(instance->m_prefixes, 0);
446     g_array_append_val(instance->m_prefixes, sentence_start);
447
448     glong written = 0;
449     ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &written, NULL);
450
451     if (ucs4_str && written) {
452         /* add prefixes. */
453         for (ssize_t i = 1; i <= written; ++i) {
454             if (i > MAX_PHRASE_LENGTH)
455                 break;
456
457             phrase_token_t token = null_token;
458             ucs4_t * start = ucs4_str + written - i;
459             int result = context->m_phrase_table->search(i, start, token);
460             if (result & SEARCH_OK)
461                 g_array_append_val(instance->m_prefixes, token);
462         }
463     }
464     g_free(ucs4_str);
465
466     pinyin_update_constraints(instance);
467     bool retval = context->m_pinyin_lookup->get_best_match
468         (instance->m_prefixes,
469          instance->m_pinyin_keys,
470          instance->m_constraints,
471          instance->m_match_results);
472
473     return retval;
474 }
475
476 bool pinyin_phrase_segment(pinyin_instance_t * instance,
477                            const char * sentence){
478     pinyin_context_t * & context = instance->m_context;
479
480     const glong num_of_chars = g_utf8_strlen(sentence, -1);
481     glong ucs4_len = 0;
482     ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
483
484     g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
485
486     bool retval = context->m_phrase_lookup->get_best_match
487         (ucs4_len, ucs4_str, instance->m_match_results);
488
489     g_free(ucs4_str);
490     return retval;
491 }
492
493 /* the returned sentence should be freed by g_free(). */
494 bool pinyin_get_sentence(pinyin_instance_t * instance,
495                          char ** sentence){
496     pinyin_context_t * & context = instance->m_context;
497
498     bool retval = pinyin::convert_to_utf8
499         (context->m_phrase_index, instance->m_match_results,
500          NULL, *sentence);
501
502     return retval;
503 }
504
505 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
506                               const char * onepinyin,
507                               ChewingKey * onekey){
508     pinyin_context_t * & context = instance->m_context;
509
510     int pinyin_len = strlen(onepinyin);
511     int parse_len = context->m_full_pinyin_parser->parse_one_key
512         ( context->m_options, *onekey, onepinyin, pinyin_len);
513     return pinyin_len == parse_len;
514 }
515
516 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
517                                       const char * pinyins){
518     pinyin_context_t * & context = instance->m_context;
519
520     g_free(instance->m_raw_full_pinyin);
521     instance->m_raw_full_pinyin = g_strdup(pinyins);
522     int pinyin_len = strlen(pinyins);
523
524     int parse_len = context->m_full_pinyin_parser->parse
525         ( context->m_options, instance->m_pinyin_keys,
526           instance->m_pinyin_key_rests, pinyins, pinyin_len);
527
528     return parse_len;
529 }
530
531 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
532                                 const char * onepinyin,
533                                 ChewingKey * onekey){
534     pinyin_context_t * & context = instance->m_context;
535
536     int pinyin_len = strlen(onepinyin);
537     int parse_len = context->m_double_pinyin_parser->parse_one_key
538         ( context->m_options, *onekey, onepinyin, pinyin_len);
539     return pinyin_len == parse_len;
540 }
541
542 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
543                                         const char * pinyins){
544     pinyin_context_t * & context = instance->m_context;
545     int pinyin_len = strlen(pinyins);
546
547     int parse_len = context->m_double_pinyin_parser->parse
548         ( context->m_options, instance->m_pinyin_keys,
549           instance->m_pinyin_key_rests, pinyins, pinyin_len);
550
551     return parse_len;
552 }
553
554 bool pinyin_parse_chewing(pinyin_instance_t * instance,
555                           const char * onechewing,
556                           ChewingKey * onekey){
557     pinyin_context_t * & context = instance->m_context;
558
559     int chewing_len = strlen(onechewing);
560     int parse_len = context->m_chewing_parser->parse_one_key
561         ( context->m_options, *onekey, onechewing, chewing_len );
562     return chewing_len == parse_len;
563 }
564
565 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
566                                   const char * chewings){
567     pinyin_context_t * & context = instance->m_context;
568     int chewing_len = strlen(chewings);
569
570     int parse_len = context->m_chewing_parser->parse
571         ( context->m_options, instance->m_pinyin_keys,
572           instance->m_pinyin_key_rests, chewings, chewing_len);
573
574     return parse_len;
575 }
576
577 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
578                                 const char key, const char ** symbol) {
579     pinyin_context_t * & context = instance->m_context;
580     return context->m_chewing_parser->in_chewing_scheme
581         (context->m_options, key, symbol);
582 }
583
584
585 static gint compare_item_with_token(gconstpointer lhs,
586                                     gconstpointer rhs) {
587     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
588     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
589
590     phrase_token_t token_lhs = item_lhs->m_token;
591     phrase_token_t token_rhs = item_rhs->m_token;
592
593     return (token_lhs - token_rhs);
594 }
595
596 static gint compare_item_with_frequency(gconstpointer lhs,
597                                         gconstpointer rhs) {
598     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
599     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
600
601     guint32 freq_lhs = item_lhs->m_freq;
602     guint32 freq_rhs = item_rhs->m_freq;
603
604     return -(freq_lhs - freq_rhs); /* in descendant order */
605 }
606
607 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
608                                           size_t offset) {
609     phrase_token_t prev_token = null_token;
610     ssize_t i;
611
612     if (0 == offset) {
613         /* get previous token from prefixes. */
614         prev_token = sentence_start;
615         size_t prev_token_len = 0;
616
617         pinyin_context_t * context = instance->m_context;
618         TokenVector prefixes = instance->m_prefixes;
619         PhraseItem item;
620
621         for (size_t i = 0; i < prefixes->len; ++i) {
622             phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
623             if (sentence_start == token)
624                 continue;
625
626             int retval = context->m_phrase_index->get_phrase_item(token, item);
627             if (ERROR_OK == retval) {
628                 size_t token_len = item.get_phrase_length();
629                 if (token_len > prev_token_len) {
630                     /* found longer match, and save it. */
631                     prev_token = token;
632                     prev_token_len = token_len;
633                 }
634             }
635         }
636     } else {
637         /* get previous token from match results. */
638         assert (0 < offset);
639
640         phrase_token_t cur_token = g_array_index
641             (instance->m_match_results, phrase_token_t, offset);
642         if (null_token != cur_token) {
643             for (i = offset - 1; i >= 0; --i) {
644                 cur_token = g_array_index
645                     (instance->m_match_results, phrase_token_t, i);
646                 if (null_token != cur_token) {
647                     prev_token = cur_token;
648                     break;
649                 }
650             }
651         }
652     }
653
654     return prev_token;
655 }
656
657 static void _append_items(pinyin_context_t * context,
658                           PhraseIndexRanges ranges,
659                           lookup_candidate_t * template_item,
660                           CandidateVector items) {
661     /* reduce and append to a single GArray. */
662     for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
663         if (NULL == ranges[m])
664             continue;
665
666         for (size_t n = 0; n < ranges[m]->len; ++n) {
667             PhraseIndexRange * range =
668                 &g_array_index(ranges[m], PhraseIndexRange, n);
669             for (size_t k = range->m_range_begin;
670                  k < range->m_range_end; ++k) {
671                 lookup_candidate_t item;
672                 item.m_candidate_type = template_item->m_candidate_type;
673                 item.m_token = k;
674                 item.m_orig_rest = template_item->m_orig_rest;
675                 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
676                 item.m_freq = template_item->m_freq;
677                 g_array_append_val(items, item);
678             }
679         }
680     }
681 }
682
683 static void _remove_duplicated_items(CandidateVector items) {
684     /* remove the duplicated items. */
685     phrase_token_t last_token = null_token, saved_token;
686     for (size_t n = 0; n < items->len; ++n) {
687         lookup_candidate_t * item = &g_array_index
688             (items, lookup_candidate_t, n);
689
690         saved_token = item->m_token;
691         if (last_token == saved_token) {
692             g_array_remove_index(items, n);
693             n--;
694         }
695         last_token = saved_token;
696     }
697 }
698
699 static void _compute_frequency_of_items(pinyin_context_t * context,
700                                         phrase_token_t prev_token,
701                                         SingleGram * merged_gram,
702                                         CandidateVector items) {
703     pinyin_option_t & options = context->m_options;
704     ssize_t i;
705
706     PhraseItem cached_item;
707     /* compute all freqs. */
708     for (i = 0; i < items->len; ++i) {
709         lookup_candidate_t * item = &g_array_index
710             (items, lookup_candidate_t, i);
711         phrase_token_t & token = item->m_token;
712
713         gfloat bigram_poss = 0; guint32 total_freq = 0;
714         if (options & DYNAMIC_ADJUST) {
715             if (null_token != prev_token) {
716                 guint32 bigram_freq = 0;
717                 merged_gram->get_total_freq(total_freq);
718                 merged_gram->get_freq(token, bigram_freq);
719                 if (0 != total_freq)
720                     bigram_poss = bigram_freq / (gfloat)total_freq;
721             }
722         }
723
724         /* compute the m_freq. */
725         FacadePhraseIndex * & phrase_index = context->m_phrase_index;
726         phrase_index->get_phrase_item(token, cached_item);
727         total_freq = phrase_index->get_phrase_index_total_freq();
728         assert (0 < total_freq);
729
730         /* Note: possibility value <= 1.0. */
731         guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
732                         (1 - LAMBDA_PARAMETER) *
733                         cached_item.get_unigram_frequency() /
734                         (gfloat) total_freq) * 256 * 256 * 256;
735         item->m_freq = freq;
736     }
737 }
738
739 bool pinyin_get_candidates(pinyin_instance_t * instance,
740                            size_t offset,
741                            TokenVector candidates) {
742
743     pinyin_context_t * & context = instance->m_context;
744     pinyin_option_t & options = context->m_options;
745     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
746     g_array_set_size(candidates, 0);
747
748     size_t pinyin_len = pinyin_keys->len - offset;
749     ssize_t i;
750
751     /* lookup the previous token here. */
752     phrase_token_t prev_token = null_token;
753
754     if (options & DYNAMIC_ADJUST) {
755         prev_token = _get_previous_token(instance, offset);
756     }
757
758     SingleGram merged_gram;
759     SingleGram * system_gram = NULL, * user_gram = NULL;
760
761     if (options & DYNAMIC_ADJUST) {
762         if (null_token != prev_token) {
763             context->m_system_bigram->load(prev_token, system_gram);
764             context->m_user_bigram->load(prev_token, user_gram);
765             merge_single_gram(&merged_gram, system_gram, user_gram);
766         }
767     }
768
769     PhraseIndexRanges ranges;
770     memset(ranges, 0, sizeof(ranges));
771     context->m_phrase_index->prepare_ranges(ranges);
772
773     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
774
775     for (i = pinyin_len; i >= 1; --i) {
776         g_array_set_size(items, 0);
777
778         ChewingKey * keys = &g_array_index
779             (pinyin_keys, ChewingKey, offset);
780
781         /* do pinyin search. */
782         int retval = context->m_pinyin_table->search
783             (i, keys, ranges);
784
785         if ( !(retval & SEARCH_OK) )
786             continue;
787
788         lookup_candidate_t template_item;
789         _append_items(context, ranges, &template_item, items);
790
791         g_array_sort(items, compare_item_with_token);
792
793         _remove_duplicated_items(items);
794
795         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
796
797         /* sort the candidates of the same length by frequency. */
798         g_array_sort(items, compare_item_with_frequency);
799
800         /* transfer back items to tokens, and save it into candidates */
801         for (ssize_t k = 0; k < items->len; ++k) {
802             lookup_candidate_t * item = &g_array_index
803                 (items, lookup_candidate_t, k);
804             g_array_append_val(candidates, item->m_token);
805         }
806
807         if (!(retval & SEARCH_CONTINUED))
808             break;
809     }
810
811     g_array_free(items, TRUE);
812
813     context->m_phrase_index->destroy_ranges(ranges);
814
815     if (system_gram)
816         delete system_gram;
817     if (user_gram)
818         delete user_gram;
819     return true;
820 }
821
822
823 static bool _try_divided_table(pinyin_instance_t * instance,
824                                PhraseIndexRanges ranges,
825                                size_t offset,
826                                CandidateVector items){
827     bool found = false;
828
829     pinyin_context_t * & context = instance->m_context;
830     pinyin_option_t & options = context->m_options;
831     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
832     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
833
834     assert(pinyin_keys->len == pinyin_key_rests->len);
835     gint num_keys = pinyin_keys->len;
836     assert(offset < num_keys);
837
838     /* handle "^xian$" -> "xi'an" here */
839     ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
840     ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
841                                            ChewingKeyRest, offset);
842     ChewingKeyRest orig_rest = *rest;
843     guint16 tone = CHEWING_ZERO_TONE;
844
845     const divided_table_item_t * item = NULL;
846
847     /* back up tone */
848     if (options & USE_TONE) {
849         tone = key->m_tone;
850         if (CHEWING_ZERO_TONE != tone) {
851             key->m_tone = CHEWING_ZERO_TONE;
852             rest->m_raw_end --;
853         }
854     }
855
856     item = context->m_full_pinyin_parser->retrieve_divided_item
857         (options, key, rest, instance->m_raw_full_pinyin,
858          strlen(instance->m_raw_full_pinyin));
859
860     if (item) {
861         /* no ops */
862         assert(item->m_new_freq > 0);
863
864         ChewingKey divided_keys[2];
865         const char * pinyin = item->m_new_keys[0];
866         assert(context->m_full_pinyin_parser->
867                parse_one_key(options, divided_keys[0],
868                              pinyin, strlen(pinyin)));
869         pinyin = item->m_new_keys[1];
870         assert(context->m_full_pinyin_parser->
871                parse_one_key(options, divided_keys[1],
872                              pinyin, strlen(pinyin)));
873
874         gchar * new_pinyins = g_strdup_printf
875             ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
876
877         /* propagate the tone */
878         if (options & USE_TONE) {
879             if (CHEWING_ZERO_TONE != tone) {
880                 assert(0 < tone && tone <= 5);
881                 divided_keys[1].m_tone = tone;
882
883                 gchar * tmp_str = g_strdup_printf
884                     ("%s%d", new_pinyins, tone);
885                 g_free(new_pinyins);
886                 new_pinyins = tmp_str;
887             }
888         }
889
890         /* do pinyin search. */
891         int retval = context->m_pinyin_table->search
892             (2, divided_keys, ranges);
893
894         if (retval & SEARCH_OK) {
895             lookup_candidate_t template_item;
896             template_item.m_candidate_type = DIVIDED_CANDIDATE;
897             template_item.m_orig_rest = orig_rest;
898             template_item.m_new_pinyins = new_pinyins;
899
900             _append_items(context, ranges, &template_item, items);
901             found = true;
902         }
903         g_free(new_pinyins);
904     }
905
906     /* restore tones */
907     if (options & USE_TONE) {
908         if (CHEWING_ZERO_TONE != tone) {
909             key->m_tone = tone;
910             rest->m_raw_end ++;
911         }
912     }
913
914     return found;
915 }
916
917 static bool _try_resplit_table(pinyin_instance_t * instance,
918                                PhraseIndexRanges ranges,
919                                size_t offset,
920                                CandidateVector items){
921     bool found = false;
922
923     pinyin_context_t * & context = instance->m_context;
924     pinyin_option_t & options = context->m_options;
925     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
926     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
927
928     assert(pinyin_keys->len == pinyin_key_rests->len);
929     gint num_keys = pinyin_keys->len;
930     assert(offset + 1 < num_keys);
931
932     guint16 next_tone = CHEWING_ZERO_TONE;
933
934     /* handle "^fa'nan$" -> "fan'an" here */
935     ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
936                                                ChewingKeyRest, offset);
937     ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
938                                                 ChewingKeyRest, offset + 1);
939     /* some "'" here */
940     if (cur_rest->m_raw_end != next_rest->m_raw_begin)
941         return found;
942
943     ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
944     ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
945                                            offset + 1);
946
947     /* some tone here */
948     if (CHEWING_ZERO_TONE != cur_key->m_tone)
949         return found;
950
951     ChewingKeyRest orig_rest;
952     orig_rest.m_raw_begin = cur_rest->m_raw_begin;
953     orig_rest.m_raw_end = next_rest->m_raw_end;
954
955     /* backup tone */
956     if (options & USE_TONE) {
957         next_tone = next_key->m_tone;
958         if (CHEWING_ZERO_TONE != next_tone) {
959             next_key->m_tone = CHEWING_ZERO_TONE;
960             next_rest->m_raw_end --;
961         }
962     }
963
964     /* lookup re-split table */
965     const char * str = instance->m_raw_full_pinyin;
966     const resplit_table_item_t * item_by_orig =
967         context->m_full_pinyin_parser->
968         retrieve_resplit_item_by_original_pinyins
969         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
970
971     const resplit_table_item_t * item_by_new =
972         context->m_full_pinyin_parser->
973         retrieve_resplit_item_by_resplit_pinyins
974         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
975
976     /* there are no same couple of pinyins in re-split table. */
977     assert(!(item_by_orig && item_by_new));
978
979     ChewingKey resplit_keys[2];
980     const char * pinyins[2];
981
982     bool tosearch = false;
983     if (item_by_orig && item_by_orig->m_new_freq) {
984         pinyins[0] = item_by_orig->m_new_keys[0];
985         pinyins[1] = item_by_orig->m_new_keys[1];
986
987         assert(context->m_full_pinyin_parser->
988                parse_one_key(options, resplit_keys[0],
989                              pinyins[0], strlen(pinyins[0])));
990
991         assert(context->m_full_pinyin_parser->
992                parse_one_key(options, resplit_keys[1],
993                              pinyins[1], strlen(pinyins[1])));
994         tosearch = true;
995     }
996
997     if (item_by_new && item_by_new->m_orig_freq) {
998         pinyins[0] = item_by_new->m_orig_keys[0];
999         pinyins[1] = item_by_new->m_orig_keys[1];
1000
1001         assert(context->m_full_pinyin_parser->
1002                parse_one_key(options, resplit_keys[0],
1003                              pinyins[0], strlen(pinyins[0])));
1004
1005         assert(context->m_full_pinyin_parser->
1006                parse_one_key(options, resplit_keys[1],
1007                              pinyins[1], strlen(pinyins[1])));
1008         tosearch = true;
1009     }
1010
1011     if (tosearch) {
1012         gchar * new_pinyins = g_strdup_printf
1013             ("%s'%s", pinyins[0], pinyins[1]);
1014
1015         /* propagate the tone */
1016         if (options & USE_TONE) {
1017             if (CHEWING_ZERO_TONE != next_tone) {
1018                 assert(0 < next_tone && next_tone <= 5);
1019                 resplit_keys[1].m_tone = next_tone;
1020
1021                 gchar * tmp_str = g_strdup_printf
1022                     ("%s%d", new_pinyins, next_tone);
1023                 g_free(new_pinyins);
1024                 new_pinyins = tmp_str;
1025             }
1026         }
1027
1028         /* do pinyin search. */
1029         int retval = context->m_pinyin_table->search
1030             (2, resplit_keys, ranges);
1031
1032         if (retval & SEARCH_OK) {
1033             lookup_candidate_t template_item;
1034             template_item.m_candidate_type = RESPLIT_CANDIDATE;
1035             template_item.m_orig_rest = orig_rest;
1036             template_item.m_new_pinyins = new_pinyins;
1037
1038             _append_items(context, ranges, &template_item, items);
1039             found = true;
1040         }
1041         g_free(new_pinyins);
1042     }
1043
1044     /* restore tones */
1045     if (options & USE_TONE) {
1046         if (CHEWING_ZERO_TONE != next_tone) {
1047             next_key->m_tone = next_tone;
1048             next_rest->m_raw_end ++;
1049         }
1050     }
1051
1052     return found;
1053 }
1054
1055 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1056                                        size_t offset,
1057                                        CandidateVector candidates){
1058
1059     pinyin_context_t * & context = instance->m_context;
1060     pinyin_option_t & options = context->m_options;
1061     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1062
1063     /* free memory */
1064     for (size_t i = 0; i < candidates->len; ++i) {
1065         lookup_candidate_t * candidate = &g_array_index
1066             (candidates, lookup_candidate_t, i);
1067         g_free(candidate->m_new_pinyins);
1068     }
1069     g_array_set_size(candidates, 0);
1070
1071     size_t pinyin_len = pinyin_keys->len - offset;
1072     pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1073     ssize_t i;
1074
1075     /* lookup the previous token here. */
1076     phrase_token_t prev_token = null_token;
1077
1078     if (options & DYNAMIC_ADJUST) {
1079         prev_token = _get_previous_token(instance, offset);
1080     }
1081
1082     SingleGram merged_gram;
1083     SingleGram * system_gram = NULL, * user_gram = NULL;
1084
1085     if (options & DYNAMIC_ADJUST) {
1086         if (null_token != prev_token) {
1087             context->m_system_bigram->load(prev_token, system_gram);
1088             context->m_user_bigram->load(prev_token, user_gram);
1089             merge_single_gram(&merged_gram, system_gram, user_gram);
1090         }
1091     }
1092
1093     PhraseIndexRanges ranges;
1094     memset(ranges, 0, sizeof(ranges));
1095     context->m_phrase_index->prepare_ranges(ranges);
1096
1097     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1098
1099     if (1 == pinyin_len) {
1100         /* because there is only one pinyin left,
1101          *  the following for-loop will not produce 2 character candidates.
1102          * the if-branch will fill the candidate list with
1103          *  2 character candidates.
1104          */
1105
1106         if (options & USE_DIVIDED_TABLE) {
1107             g_array_set_size(items, 0);
1108
1109             if (_try_divided_table(instance, ranges, offset, items)) {
1110
1111                 g_array_sort(items, compare_item_with_token);
1112
1113                 _remove_duplicated_items(items);
1114
1115                 _compute_frequency_of_items(context, prev_token,
1116                                             &merged_gram, items);
1117
1118                 /* sort the candidates of the same length by frequency. */
1119                 g_array_sort(items, compare_item_with_frequency);
1120
1121                 /* transfer back items to tokens, and save it into candidates */
1122                 for (i = 0; i < items->len; ++i) {
1123                     lookup_candidate_t * item = &g_array_index
1124                         (items, lookup_candidate_t, i);
1125                     g_array_append_val(candidates, *item);
1126                 }
1127             }
1128         }
1129     }
1130
1131     for (i = pinyin_len; i >= 1; --i) {
1132         bool found = false;
1133         g_array_set_size(items, 0);
1134
1135         if (2 == i) {
1136             /* handle fuzzy pinyin segment here. */
1137             if (options & USE_DIVIDED_TABLE) {
1138                 found = _try_divided_table(instance, ranges, offset, items) ||
1139                     found;
1140             }
1141             if (options & USE_RESPLIT_TABLE) {
1142                 found = _try_resplit_table(instance, ranges, offset, items) ||
1143                     found;
1144             }
1145         }
1146
1147         ChewingKey * keys = &g_array_index
1148             (pinyin_keys, ChewingKey, offset);
1149
1150         /* do pinyin search. */
1151         int retval = context->m_pinyin_table->search
1152             (i, keys, ranges);
1153
1154         found = (retval & SEARCH_OK) || found;
1155
1156         if ( !found )
1157             continue;
1158
1159         lookup_candidate_t template_item;
1160         _append_items(context, ranges, &template_item, items);
1161
1162         g_array_sort(items, compare_item_with_token);
1163
1164         _remove_duplicated_items(items);
1165
1166         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1167
1168         g_array_sort(items, compare_item_with_frequency);
1169
1170         for (size_t k = 0; k < items->len; ++k) {
1171             lookup_candidate_t * item = &g_array_index
1172                 (items, lookup_candidate_t, k);
1173             g_array_append_val(candidates, *item);
1174         }
1175
1176         if (!(retval & SEARCH_CONTINUED))
1177             break;
1178     }
1179
1180     g_array_free(items, TRUE);
1181
1182     context->m_phrase_index->destroy_ranges(ranges);
1183
1184     if (system_gram)
1185         delete system_gram;
1186     if (user_gram)
1187         delete user_gram;
1188     return true;
1189 }
1190
1191
1192 int pinyin_choose_candidate(pinyin_instance_t * instance,
1193                             size_t offset,
1194                             phrase_token_t token){
1195     pinyin_context_t * & context = instance->m_context;
1196
1197     guint8 len = context->m_pinyin_lookup->add_constraint
1198         (instance->m_constraints, offset, token);
1199
1200     bool retval = context->m_pinyin_lookup->validate_constraint
1201         (instance->m_constraints, instance->m_pinyin_keys) && len;
1202
1203     return offset + len;
1204 }
1205
1206 int pinyin_choose_full_pinyin_candidate(pinyin_instance_t * instance,
1207                                         size_t offset,
1208                                         lookup_candidate_t * candidate){
1209     pinyin_context_t * & context = instance->m_context;
1210
1211     if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1212         RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1213         /* update full pinyin. */
1214         gchar * oldpinyins = instance->m_raw_full_pinyin;
1215         const ChewingKeyRest rest = candidate->m_orig_rest;
1216         oldpinyins[rest.m_raw_begin] = '\0';
1217         const gchar * left_part = oldpinyins;
1218         const gchar * right_part = oldpinyins + rest.m_raw_end;
1219         gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1220                                          right_part, NULL);
1221         g_free(oldpinyins);
1222         instance->m_raw_full_pinyin = newpinyins;
1223
1224         /* re-parse the full pinyin.  */
1225         const gchar * pinyins = instance->m_raw_full_pinyin;
1226         int pinyin_len = strlen(pinyins);
1227         int parse_len = context->m_full_pinyin_parser->parse
1228             (context->m_options, instance->m_pinyin_keys,
1229              instance->m_pinyin_key_rests, pinyins, pinyin_len);
1230
1231         /* Note: there may be some un-parsable input here. */
1232     }
1233
1234     /* sync m_constraints to the length of m_pinyin_keys. */
1235     bool retval = context->m_pinyin_lookup->validate_constraint
1236         (instance->m_constraints, instance->m_pinyin_keys);
1237
1238     phrase_token_t token = candidate->m_token;
1239     guint8 len = context->m_pinyin_lookup->add_constraint
1240         (instance->m_constraints, offset, token);
1241
1242     /* safe guard: validate the m_constraints again. */
1243     retval = context->m_pinyin_lookup->validate_constraint
1244         (instance->m_constraints, instance->m_pinyin_keys) && len;
1245
1246     return offset + len;
1247 }
1248
1249
1250 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1251                              size_t offset){
1252     pinyin_context_t * & context = instance->m_context;
1253
1254     bool retval = context->m_pinyin_lookup->clear_constraint
1255         (instance->m_constraints, offset);
1256
1257     return retval;
1258 }
1259
1260 bool pinyin_clear_constraints(pinyin_instance_t * instance){
1261     pinyin_context_t * & context = instance->m_context;
1262     bool retval = true;
1263
1264     for ( size_t i = 0; i < instance->m_constraints->len; ++i ) {
1265         retval = context->m_pinyin_lookup->clear_constraint
1266             (instance->m_constraints, i) && retval;
1267     }
1268
1269     return retval;
1270 }
1271
1272 /* the returned word should be freed by g_free. */
1273 bool pinyin_translate_token(pinyin_instance_t * instance,
1274                             phrase_token_t token, char ** word){
1275     pinyin_context_t * & context = instance->m_context;
1276     PhraseItem item;
1277     ucs4_t buffer[MAX_PHRASE_LENGTH];
1278
1279     int retval = context->m_phrase_index->get_phrase_item(token, item);
1280     item.get_phrase_string(buffer);
1281     guint8 length = item.get_phrase_length();
1282     *word = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1283     return ERROR_OK == retval;
1284 }
1285
1286 bool pinyin_train(pinyin_instance_t * instance){
1287     if (!instance->m_context->m_user_dir)
1288         return false;
1289
1290     pinyin_context_t * & context = instance->m_context;
1291     context->m_modified = true;
1292
1293     bool retval = context->m_pinyin_lookup->train_result2
1294         (instance->m_pinyin_keys, instance->m_constraints,
1295          instance->m_match_results);
1296
1297     return retval;
1298 }
1299
1300 bool pinyin_reset(pinyin_instance_t * instance){
1301     g_array_set_size(instance->m_pinyin_keys, 0);
1302     g_array_set_size(instance->m_pinyin_key_rests, 0);
1303     g_array_set_size(instance->m_constraints, 0);
1304     g_array_set_size(instance->m_match_results, 0);
1305
1306     return true;
1307 }
1308
1309 /**
1310  *  Note: prefix is the text before the pre-edit string.
1311  */