6d6ff8430a03842b2bb7bbd1aeb4ae2e8b7ebacf
[platform/upstream/libpinyin.git] / src / pinyin.cpp
1 /* 
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *  
5  *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
6  *  
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  *  
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20  */
21
22
23 #include "pinyin.h"
24 #include <stdio.h>
25 #include <unistd.h>
26 #include <glib/gstdio.h>
27 #include "pinyin_internal.h"
28
29 using namespace pinyin;
30
31 /* a glue layer for input method integration. */
32
/* shared state created by pinyin_init() and released by pinyin_fini(). */
struct _pinyin_context_t{
    /* parsing/lookup options; starts as USE_TONE,
       replaced by pinyin_set_options(). */
    pinyin_option_t m_options;

    /* one parser per supported input scheme. */
    FullPinyinParser2 * m_full_pinyin_parser;
    DoublePinyinParser2 * m_double_pinyin_parser;
    ChewingParser2 * m_chewing_parser;

    /* system + user tables and the phrase index loaded in pinyin_init(). */
    FacadeChewingTable * m_pinyin_table;
    FacadePhraseTable2 * m_phrase_table;
    FacadePhraseIndex * m_phrase_index;
    /* attached read-only to "bigram.db" in the system dir. */
    Bigram * m_system_bigram;
    /* loaded from "user.db" in the user dir. */
    Bigram * m_user_bigram;

    /* lookups built on top of the tables and bi-grams above. */
    PinyinLookup2 * m_pinyin_lookup;
    PhraseLookup * m_phrase_lookup;

    /* g_strdup()ed copies of the directories passed to pinyin_init(). */
    char * m_system_dir;
    char * m_user_dir;
    /* pinyin_save() does nothing while this is false. */
    bool m_modified;
};
53
/* per-conversion state created by pinyin_alloc_instance(). */
struct _pinyin_instance_t{
    /* the shared context this instance works against (not owned). */
    pinyin_context_t * m_context;
    /* copy of the last string given to pinyin_parse_more_full_pinyins(). */
    gchar * m_raw_full_pinyin;
    /* phrase tokens preceding the sentence; always begins
       with sentence_start. */
    TokenVector m_prefixes;
    /* keys and their rests filled in by the pinyin_parse_more_* calls. */
    ChewingKeyVector m_pinyin_keys;
    ChewingKeyRestVector m_pinyin_key_rests;
    /* one lookup_constraint_t per pinyin key. */
    CandidateConstraints m_constraints;
    /* phrase tokens produced by the get_best_match() calls. */
    MatchResults m_match_results;
    /* NOTE(review): usage is not visible in this part of the file. */
    CandidateVector m_candidates;
};
64
/* one entry of a candidate list. */
struct _lookup_candidate_t{
    enum lookup_candidate_type_t m_candidate_type;
    gchar * m_phrase_string;
    phrase_token_t m_token;
    /* not touched by the constructor below;
       presumably ChewingKeyRest initializes itself -- TODO confirm. */
    ChewingKeyRest m_orig_rest;
    gchar * m_new_pinyins;
    guint32 m_freq; /* the amplified gfloat numerical value. */
public:
    /* start as an empty NORMAL_CANDIDATE with no strings and zero freq. */
    _lookup_candidate_t() {
        m_candidate_type = NORMAL_CANDIDATE;
        m_phrase_string = NULL;
        m_token = null_token;
        m_new_pinyins = NULL;
        m_freq = 0;
    }
};
81
/* handle returned by pinyin_begin_add_phrases() and consumed by
   pinyin_iterator_add_phrase() / pinyin_end_add_phrases(). */
struct _import_iterator_t{
    /* the context whose tables receive the phrases (not owned). */
    pinyin_context_t * m_context;
    /* the sub phrase index the phrases are added to. */
    guint8 m_phrase_index;
};
86
87
88 static bool check_format(const char * userdir){
89     gchar * filename = g_build_filename
90         (userdir, "version", NULL);
91
92     MemoryChunk chunk;
93     bool exists = chunk.load(filename);
94
95     if (exists) {
96         exists = (0 == memcmp
97                   (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
98                    strlen(LIBPINYIN_FORMAT_VERSION) + 1));
99     }
100     g_free(filename);
101
102     if (exists)
103         return exists;
104
105     /* clean up files, if version mis-matches. */
106     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
107         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
108
109         if (NOT_USED == table_info->m_file_type)
110             continue;
111
112         if (NULL == table_info->m_user_filename)
113             continue;
114
115         const char * userfilename = table_info->m_user_filename;
116
117         /* remove dbin file. */
118         filename = g_build_filename(userdir, userfilename, NULL);
119         unlink(filename);
120         g_free(filename);
121     }
122
123     filename = g_build_filename
124         (userdir, "user_pinyin_index.bin", NULL);
125     unlink(filename);
126     g_free(filename);
127
128     filename = g_build_filename
129         (userdir, "user_phrase_index.bin", NULL);
130     unlink(filename);
131     g_free(filename);
132
133     filename = g_build_filename
134         (userdir, "user.db", NULL);
135     unlink(filename);
136     g_free(filename);
137
138     return exists;
139 }
140
141 static bool mark_version(const char * userdir){
142     gchar * filename = g_build_filename
143         (userdir, "version", NULL);
144     MemoryChunk chunk;
145     chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
146                       strlen(LIBPINYIN_FORMAT_VERSION) + 1);
147     bool retval = chunk.save(filename);
148     g_free(filename);
149     return retval;
150 }
151
152 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
153     pinyin_context_t * context = new pinyin_context_t;
154
155     context->m_options = USE_TONE;
156
157     context->m_system_dir = g_strdup(systemdir);
158     context->m_user_dir = g_strdup(userdir);
159     context->m_modified = false;
160
161     check_format(context->m_user_dir);
162
163     context->m_full_pinyin_parser = new FullPinyinParser2;
164     context->m_double_pinyin_parser = new DoublePinyinParser2;
165     context->m_chewing_parser = new ChewingParser2;
166
167     /* load chewing table. */
168     context->m_pinyin_table = new FacadeChewingTable;
169
170     /* load system chewing table. */
171     MemoryChunk * chunk = new MemoryChunk;
172     gchar * filename = g_build_filename
173         (context->m_system_dir, "pinyin_index.bin", NULL);
174     if (!chunk->load(filename)) {
175         fprintf(stderr, "open %s failed!\n", filename);
176         return NULL;
177     }
178     g_free(filename);
179
180     /* load user chewing table */
181     MemoryChunk * userchunk = new MemoryChunk;
182     filename = g_build_filename
183         (context->m_user_dir, "user_pinyin_index.bin", NULL);
184     if (!userchunk->load(filename)) {
185         /* hack here: use local Chewing Table to create empty memory chunk. */
186         ChewingLargeTable table(context->m_options);
187         table.store(userchunk);
188     }
189     g_free(filename);
190
191     context->m_pinyin_table->load(context->m_options, chunk, userchunk);
192
193     /* load phrase table */
194     context->m_phrase_table = new FacadePhraseTable2;
195
196     /* load system phrase table */
197     chunk = new MemoryChunk;
198     filename = g_build_filename
199         (context->m_system_dir, "phrase_index.bin", NULL);
200     if (!chunk->load(filename)) {
201         fprintf(stderr, "open %s failed!\n", filename);
202         return NULL;
203     }
204     g_free(filename);
205
206     /* load user phrase table */
207     userchunk = new MemoryChunk;
208     filename = g_build_filename
209         (context->m_user_dir, "user_phrase_index.bin", NULL);
210     if (!userchunk->load(filename)) {
211         /* hack here: use local Phrase Table to create empty memory chunk. */
212         PhraseLargeTable2 table;
213         table.store(userchunk);
214     }
215     g_free(filename);
216
217     context->m_phrase_table->load(chunk, userchunk);
218
219     context->m_phrase_index = new FacadePhraseIndex;
220
221     /* hack here: directly call load phrase library. */
222     pinyin_load_phrase_library(context, GB_DICTIONARY);
223     pinyin_load_phrase_library(context, MERGED_DICTIONARY);
224
225     context->m_system_bigram = new Bigram;
226     filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
227     context->m_system_bigram->attach(filename, ATTACH_READONLY);
228     g_free(filename);
229
230     context->m_user_bigram = new Bigram;
231     filename = g_build_filename(context->m_user_dir, "user.db", NULL);
232     context->m_user_bigram->load_db(filename);
233     g_free(filename);
234
235     context->m_pinyin_lookup = new PinyinLookup2
236         ( context->m_options, context->m_pinyin_table,
237           context->m_phrase_index, context->m_system_bigram,
238           context->m_user_bigram);
239
240     context->m_phrase_lookup = new PhraseLookup
241         (context->m_phrase_table, context->m_phrase_index,
242          context->m_system_bigram, context->m_user_bigram);
243
244     return context;
245 }
246
247 bool pinyin_load_phrase_library(pinyin_context_t * context,
248                                 guint8 index){
249     if (!(index < PHRASE_INDEX_LIBRARY_COUNT))
250         return false;
251
252     /* check whether the sub phrase index is already loaded. */
253     PhraseIndexRange range;
254     int retval = context->m_phrase_index->get_range(index, range);
255     if (ERROR_OK == retval)
256         return false;
257
258     const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
259
260     if (SYSTEM_FILE == table_info->m_file_type ||
261         DICTIONARY == table_info->m_file_type) {
262         /* system phrase library */
263         MemoryChunk * chunk = new MemoryChunk;
264
265         const char * systemfilename = table_info->m_system_filename;
266         /* check bin file in system dir. */
267         gchar * chunkfilename = g_build_filename(context->m_system_dir,
268                                                  systemfilename, NULL);
269         chunk->load(chunkfilename);
270         g_free(chunkfilename);
271
272         context->m_phrase_index->load(index, chunk);
273
274         const char * userfilename = table_info->m_user_filename;
275
276         chunkfilename = g_build_filename(context->m_user_dir,
277                                          userfilename, NULL);
278
279         MemoryChunk * log = new MemoryChunk;
280         log->load(chunkfilename);
281         g_free(chunkfilename);
282
283         /* merge the chunk log. */
284         context->m_phrase_index->merge(index, log);
285         return true;
286     }
287
288     if (USER_FILE == table_info->m_file_type) {
289         /* user phrase library */
290         MemoryChunk * chunk = new MemoryChunk;
291         const char * userfilename = table_info->m_user_filename;
292
293         gchar * chunkfilename = g_build_filename(context->m_user_dir,
294                                                  userfilename, NULL);
295
296         /* check bin file exists. if not, create a new one. */
297         if (chunk->load(chunkfilename)) {
298             context->m_phrase_index->load(index, chunk);
299         } else {
300             delete chunk;
301             context->m_phrase_index->create_sub_phrase(index);
302         }
303
304         g_free(chunkfilename);
305         return true;
306     }
307
308     return false;
309 }
310
311 bool pinyin_unload_phrase_library(pinyin_context_t * context,
312                                   guint8 index){
313     /* gb_char.bin and merged.bin can't be unloaded. */
314     if (GB_DICTIONARY == index || MERGED_DICTIONARY == index)
315         return false;
316
317     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
318
319     context->m_phrase_index->unload(index);
320     return true;
321 }
322
323 import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context,
324                                              guint8 index){
325     import_iterator_t * iter = new import_iterator_t;
326     iter->m_context = context;
327     iter->m_phrase_index = index;
328     return iter;
329 }
330
331 bool pinyin_iterator_add_phrase(import_iterator_t * iter,
332                                 const char * phrase,
333                                 const char * pinyin,
334                                 gint count){
335     /* if -1 == count, use the default value. */
336     const gint default_count = 5;
337     const guint32 unigram_factor = 3;
338     if (-1 == count)
339         count = default_count;
340
341     pinyin_context_t * & context = iter->m_context;
342     FacadePhraseTable2 * & phrase_table = context->m_phrase_table;
343     FacadeChewingTable * & pinyin_table = context->m_pinyin_table;
344     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
345
346     /* check whether the phrase exists in phrase table */
347     glong len_phrase = 0;
348     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL);
349
350     bool result = false;
351
352     pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
353     FullPinyinParser2 parser;
354     ChewingKeyVector keys =
355         g_array_new(FALSE, FALSE, sizeof(ChewingKey));
356     ChewingKeyRestVector key_rests =
357         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
358
359     /* parse the pinyin. */
360     parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
361
362     if (len_phrase != keys->len)
363         return result;
364
365     if (len_phrase >= MAX_PHRASE_LENGTH)
366         return result;
367
368     phrase_token_t token = null_token;
369     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
370
371     /* do phrase table search. */
372     PhraseTokens tokens;
373     memset(tokens, 0, sizeof(PhraseTokens));
374     phrase_index->prepare_tokens(tokens);
375     int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens);
376     int num = reduce_tokens(tokens, tokenarray);
377     phrase_index->destroy_tokens(tokens);
378
379     /* find the best token candidate. */
380     for (size_t i = 0; i < tokenarray->len; ++i) {
381         phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
382         if (null_token == token) {
383             token = candidate;
384             continue;
385         }
386
387         if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) {
388             /* only one phrase string per sub phrase index. */
389             assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index);
390             token = candidate;
391             continue;
392         }
393     }
394     g_array_free(tokenarray, TRUE);
395
396     PhraseItem item;
397     /* check whether it exists in the same sub phrase index; */
398     if (null_token != token &&
399         PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) {
400         /* if so, remove the phrase, add the pinyin for the phrase item,
401            then add it back;*/
402         phrase_index->get_phrase_item(token, item);
403         assert(len_phrase == item.get_phrase_length());
404         ucs4_t tmp_phrase[MAX_PHRASE_LENGTH];
405         item.get_phrase_string(tmp_phrase);
406         assert(0 == memcmp
407                (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase));
408
409         PhraseItem * removed_item = NULL;
410         retval = phrase_index->remove_phrase_item(token, removed_item);
411         if (ERROR_OK == retval) {
412             /* maybe check whether there are duplicated pronunciations here. */
413             removed_item->append_pronunciation((ChewingKey *)keys->data,
414                                                count);
415             phrase_index->add_phrase_item(token, removed_item);
416             delete removed_item;
417             result = true;
418         }
419     } else {
420         /* if not exists in the same sub phrase index,
421            get the maximum token,
422            then add it directly with maximum token + 1; */
423         PhraseIndexRange range;
424         retval = phrase_index->get_range(iter->m_phrase_index, range);
425
426         if (ERROR_OK == retval) {
427             token = range.m_range_end;
428             if (0x00000000 == (token & PHRASE_MASK))
429                 token++;
430
431             if (len_phrase == keys->len) { /* valid pinyin */
432                 phrase_table->add_index(len_phrase, ucs4_phrase, token);
433                 pinyin_table->add_index
434                     (keys->len, (ChewingKey *)(keys->data), token);
435
436                 item.set_phrase_string(len_phrase, ucs4_phrase);
437                 item.append_pronunciation((ChewingKey *)(keys->data), count);
438                 phrase_index->add_phrase_item(token, &item);
439                 phrase_index->add_unigram_frequency(token,
440                                                     count * unigram_factor);
441                 result = true;
442             }
443         }
444     }
445
446     g_array_free(key_rests, TRUE);
447     g_array_free(keys, TRUE);
448     g_free(ucs4_phrase);
449     return result;
450 }
451
452 void pinyin_end_add_phrases(import_iterator_t * iter){
453     /* compact the content memory chunk of phrase index. */
454     iter->m_context->m_phrase_index->compact();
455     delete iter;
456 }
457
458 bool pinyin_save(pinyin_context_t * context){
459     if (!context->m_user_dir)
460         return false;
461
462     if (!context->m_modified)
463         return false;
464
465     context->m_phrase_index->compact();
466
467     /* skip the reserved zero phrase library. */
468     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
469         PhraseIndexRange range;
470         int retval = context->m_phrase_index->get_range(i, range);
471
472         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
473             continue;
474
475         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
476
477         if (NOT_USED == table_info->m_file_type)
478             continue;
479
480         const char * userfilename = table_info->m_user_filename;
481
482         if (NULL == userfilename)
483             continue;
484
485         if (SYSTEM_FILE == table_info->m_file_type ||
486             DICTIONARY == table_info->m_file_type) {
487             /* system phrase library */
488             MemoryChunk * chunk = new MemoryChunk;
489             MemoryChunk * log = new MemoryChunk;
490             const char * systemfilename = table_info->m_system_filename;
491
492             /* check bin file in system dir. */
493             gchar * chunkfilename = g_build_filename(context->m_system_dir,
494                                                      systemfilename, NULL);
495             chunk->load(chunkfilename);
496             g_free(chunkfilename);
497             context->m_phrase_index->diff(i, chunk, log);
498
499             const char * userfilename = table_info->m_user_filename;
500             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
501
502             gchar * tmppathname = g_build_filename(context->m_user_dir,
503                                                    tmpfilename, NULL);
504             g_free(tmpfilename);
505
506             gchar * chunkpathname = g_build_filename(context->m_user_dir,
507                                                      userfilename, NULL);
508             log->save(tmppathname);
509             rename(tmppathname, chunkpathname);
510             g_free(chunkpathname);
511             g_free(tmppathname);
512             delete log;
513         }
514
515         if (USER_FILE == table_info->m_file_type) {
516             /* user phrase library */
517             MemoryChunk * chunk = new MemoryChunk;
518             context->m_phrase_index->store(i, chunk);
519
520             const char * userfilename = table_info->m_user_filename;
521             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
522             gchar * tmppathname = g_build_filename(context->m_user_dir,
523                                                    tmpfilename, NULL);
524             g_free(tmpfilename);
525
526             gchar * chunkpathname = g_build_filename(context->m_user_dir,
527                                                      userfilename, NULL);
528
529             chunk->save(tmppathname);
530             rename(tmppathname, chunkpathname);
531             g_free(chunkpathname);
532             g_free(tmppathname);
533             delete chunk;
534         }
535     }
536
537     /* save user chewing table */
538     gchar * tmpfilename = g_build_filename
539         (context->m_user_dir, "user_pinyin_index.bin.tmp", NULL);
540     unlink(tmpfilename);
541     gchar * filename = g_build_filename
542         (context->m_user_dir, "user_pinyin_index.bin", NULL);
543
544     MemoryChunk * chunk = new MemoryChunk;
545     context->m_pinyin_table->store(chunk);
546     chunk->save(tmpfilename);
547     delete chunk;
548     rename(tmpfilename, filename);
549     g_free(tmpfilename);
550     g_free(filename);
551
552     /* save user phrase table */
553     tmpfilename = g_build_filename
554         (context->m_user_dir, "user_phrase_index.bin.tmp", NULL);
555     unlink(tmpfilename);
556     filename = g_build_filename
557         (context->m_user_dir, "user_phrase_index.bin", NULL);
558
559     chunk = new MemoryChunk;
560     context->m_phrase_table->store(chunk);
561     chunk->save(tmpfilename);
562     delete chunk;
563     rename(tmpfilename, filename);
564     g_free(tmpfilename);
565     g_free(filename);
566
567     /* save user bi-gram */
568     tmpfilename = g_build_filename
569         (context->m_user_dir, "user.db.tmp", NULL);
570     unlink(tmpfilename);
571     filename = g_build_filename(context->m_user_dir, "user.db", NULL);
572     context->m_user_bigram->save_db(tmpfilename);
573     rename(tmpfilename, filename);
574     g_free(tmpfilename);
575     g_free(filename);
576
577     mark_version(context->m_user_dir);
578
579     context->m_modified = false;
580     return true;
581 }
582
583 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
584                                      DoublePinyinScheme scheme){
585     context->m_double_pinyin_parser->set_scheme(scheme);
586     return true;
587 }
588
589 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
590                                ChewingScheme scheme){
591     context->m_chewing_parser->set_scheme(scheme);
592     return true;
593 }
594
/* destroy a context created by pinyin_init(): delete every object the
   context points to, free the directory strings, then delete the context
   itself.  nothing is saved here. */
void pinyin_fini(pinyin_context_t * context){
    /* parsers. */
    delete context->m_full_pinyin_parser;
    delete context->m_double_pinyin_parser;
    delete context->m_chewing_parser;
    /* tables, phrase index and bi-grams. */
    delete context->m_pinyin_table;
    delete context->m_phrase_table;
    delete context->m_phrase_index;
    delete context->m_system_bigram;
    delete context->m_user_bigram;
    /* lookups. */
    delete context->m_pinyin_lookup;
    delete context->m_phrase_lookup;

    g_free(context->m_system_dir);
    g_free(context->m_user_dir);
    context->m_modified = false;

    delete context;
}
613
614 bool pinyin_mask_out(pinyin_context_t * context,
615                      phrase_token_t mask,
616                      phrase_token_t value) {
617
618     context->m_pinyin_table->mask_out(mask, value);
619     context->m_phrase_table->mask_out(mask, value);
620     context->m_user_bigram->mask_out(mask, value);
621
622     /* mask out the phrase index. */
623     for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
624         PhraseIndexRange range;
625         int retval = context->m_phrase_index->get_range(index, range);
626
627         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
628             continue;
629
630         const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
631
632         if (NOT_USED == table_info->m_file_type)
633             continue;
634
635         const char * userfilename = table_info->m_user_filename;
636
637         if (NULL == userfilename)
638             continue;
639
640         if (SYSTEM_FILE == table_info->m_file_type ||
641             DICTIONARY == table_info->m_file_type) {
642             /* system phrase library */
643             MemoryChunk * chunk = new MemoryChunk;
644
645             const char * systemfilename = table_info->m_system_filename;
646             /* check bin file in system dir. */
647             gchar * chunkfilename = g_build_filename(context->m_system_dir,
648                                                      systemfilename, NULL);
649             chunk->load(chunkfilename);
650             g_free(chunkfilename);
651
652             context->m_phrase_index->load(index, chunk);
653
654             const char * userfilename = table_info->m_user_filename;
655
656             chunkfilename = g_build_filename(context->m_user_dir,
657                                              userfilename, NULL);
658
659             MemoryChunk * log = new MemoryChunk;
660             log->load(chunkfilename);
661             g_free(chunkfilename);
662
663             /* merge the chunk log with mask. */
664             context->m_phrase_index->merge_with_mask(index, log, mask, value);
665         }
666
667         if (USER_FILE == table_info->m_file_type) {
668             /* user phrase library */
669             context->m_phrase_index->mask_out(index, mask, value);
670         }
671     }
672
673     context->m_phrase_index->compact();
674     return true;
675 }
676
677 /* copy from options to context->m_options. */
678 bool pinyin_set_options(pinyin_context_t * context,
679                         pinyin_option_t options){
680     context->m_options = options;
681     context->m_pinyin_table->set_options(context->m_options);
682     context->m_pinyin_lookup->set_options(context->m_options);
683     return true;
684 }
685
686
687 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
688     pinyin_instance_t * instance = new pinyin_instance_t;
689     instance->m_context = context;
690
691     instance->m_raw_full_pinyin = NULL;
692
693     instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
694     instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
695     instance->m_pinyin_key_rests =
696         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
697     instance->m_constraints = g_array_new
698         (TRUE, FALSE, sizeof(lookup_constraint_t));
699     instance->m_match_results =
700         g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
701
702     return instance;
703 }
704
705 void pinyin_free_instance(pinyin_instance_t * instance){
706     g_free(instance->m_raw_full_pinyin);
707     g_array_free(instance->m_prefixes, TRUE);
708     g_array_free(instance->m_pinyin_keys, TRUE);
709     g_array_free(instance->m_pinyin_key_rests, TRUE);
710     g_array_free(instance->m_constraints, TRUE);
711     g_array_free(instance->m_match_results, TRUE);
712
713     delete instance;
714 }
715
716
717 static bool pinyin_update_constraints(pinyin_instance_t * instance){
718     pinyin_context_t * & context = instance->m_context;
719     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
720     CandidateConstraints & constraints = instance->m_constraints;
721
722     size_t key_len = constraints->len;
723     g_array_set_size(constraints, pinyin_keys->len);
724     for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
725         lookup_constraint_t * constraint =
726             &g_array_index(constraints, lookup_constraint_t, i);
727         constraint->m_type = NO_CONSTRAINT;
728     }
729
730     context->m_pinyin_lookup->validate_constraint
731         (constraints, pinyin_keys);
732
733     return true;
734 }
735
736
737 bool pinyin_guess_sentence(pinyin_instance_t * instance){
738     pinyin_context_t * & context = instance->m_context;
739
740     g_array_set_size(instance->m_prefixes, 0);
741     g_array_append_val(instance->m_prefixes, sentence_start);
742
743     pinyin_update_constraints(instance);
744     bool retval = context->m_pinyin_lookup->get_best_match
745         (instance->m_prefixes,
746          instance->m_pinyin_keys,
747          instance->m_constraints,
748          instance->m_match_results);
749
750     return retval;
751 }
752
753 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
754                                        const char * prefix){
755     pinyin_context_t * & context = instance->m_context;
756
757     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
758
759     g_array_set_size(instance->m_prefixes, 0);
760     g_array_append_val(instance->m_prefixes, sentence_start);
761
762     glong len_str = 0;
763     ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
764     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
765
766     if (ucs4_str && len_str) {
767         /* add prefixes. */
768         for (ssize_t i = 1; i <= len_str; ++i) {
769             if (i > MAX_PHRASE_LENGTH)
770                 break;
771
772             ucs4_t * start = ucs4_str + len_str - i;
773
774             PhraseTokens tokens;
775             memset(tokens, 0, sizeof(tokens));
776             phrase_index->prepare_tokens(tokens);
777             int result = context->m_phrase_table->search(i, start, tokens);
778             int num = reduce_tokens(tokens, tokenarray);
779             phrase_index->destroy_tokens(tokens);
780
781             if (result & SEARCH_OK)
782                 g_array_append_vals(instance->m_prefixes,
783                                     tokenarray->data, tokenarray->len);
784         }
785     }
786     g_array_free(tokenarray, TRUE);
787     g_free(ucs4_str);
788
789     pinyin_update_constraints(instance);
790     bool retval = context->m_pinyin_lookup->get_best_match
791         (instance->m_prefixes,
792          instance->m_pinyin_keys,
793          instance->m_constraints,
794          instance->m_match_results);
795
796     return retval;
797 }
798
799 bool pinyin_phrase_segment(pinyin_instance_t * instance,
800                            const char * sentence){
801     pinyin_context_t * & context = instance->m_context;
802
803     const glong num_of_chars = g_utf8_strlen(sentence, -1);
804     glong ucs4_len = 0;
805     ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
806
807     g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
808
809     bool retval = context->m_phrase_lookup->get_best_match
810         (ucs4_len, ucs4_str, instance->m_match_results);
811
812     g_free(ucs4_str);
813     return retval;
814 }
815
816 /* the returned sentence should be freed by g_free(). */
817 bool pinyin_get_sentence(pinyin_instance_t * instance,
818                          char ** sentence){
819     pinyin_context_t * & context = instance->m_context;
820
821     bool retval = pinyin::convert_to_utf8
822         (context->m_phrase_index, instance->m_match_results,
823          NULL, false, *sentence);
824
825     return retval;
826 }
827
828 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
829                               const char * onepinyin,
830                               ChewingKey * onekey){
831     pinyin_context_t * & context = instance->m_context;
832
833     int pinyin_len = strlen(onepinyin);
834     int parse_len = context->m_full_pinyin_parser->parse_one_key
835         ( context->m_options, *onekey, onepinyin, pinyin_len);
836     return pinyin_len == parse_len;
837 }
838
839 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
840                                       const char * pinyins){
841     pinyin_context_t * & context = instance->m_context;
842
843     g_free(instance->m_raw_full_pinyin);
844     instance->m_raw_full_pinyin = g_strdup(pinyins);
845     int pinyin_len = strlen(pinyins);
846
847     int parse_len = context->m_full_pinyin_parser->parse
848         ( context->m_options, instance->m_pinyin_keys,
849           instance->m_pinyin_key_rests, pinyins, pinyin_len);
850
851     return parse_len;
852 }
853
854 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
855                                 const char * onepinyin,
856                                 ChewingKey * onekey){
857     pinyin_context_t * & context = instance->m_context;
858
859     int pinyin_len = strlen(onepinyin);
860     int parse_len = context->m_double_pinyin_parser->parse_one_key
861         ( context->m_options, *onekey, onepinyin, pinyin_len);
862     return pinyin_len == parse_len;
863 }
864
865 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
866                                         const char * pinyins){
867     pinyin_context_t * & context = instance->m_context;
868     int pinyin_len = strlen(pinyins);
869
870     int parse_len = context->m_double_pinyin_parser->parse
871         ( context->m_options, instance->m_pinyin_keys,
872           instance->m_pinyin_key_rests, pinyins, pinyin_len);
873
874     return parse_len;
875 }
876
877 bool pinyin_parse_chewing(pinyin_instance_t * instance,
878                           const char * onechewing,
879                           ChewingKey * onekey){
880     pinyin_context_t * & context = instance->m_context;
881
882     int chewing_len = strlen(onechewing);
883     int parse_len = context->m_chewing_parser->parse_one_key
884         ( context->m_options, *onekey, onechewing, chewing_len );
885     return chewing_len == parse_len;
886 }
887
888 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
889                                   const char * chewings){
890     pinyin_context_t * & context = instance->m_context;
891     int chewing_len = strlen(chewings);
892
893     int parse_len = context->m_chewing_parser->parse
894         ( context->m_options, instance->m_pinyin_keys,
895           instance->m_pinyin_key_rests, chewings, chewing_len);
896
897     return parse_len;
898 }
899
900 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
901                                 const char key, const char ** symbol) {
902     pinyin_context_t * & context = instance->m_context;
903     return context->m_chewing_parser->in_chewing_scheme
904         (context->m_options, key, symbol);
905 }
906
#if 0
/* GCompareFunc ordering lookup_candidate_t items by ascending m_token.
 * Uses explicit comparisons: the difference of two unsigned tokens
 * converted to gint does not reliably carry the sign of the ordering. */
static gint compare_item_with_token(gconstpointer lhs,
                                    gconstpointer rhs) {
    const lookup_candidate_t * item_lhs = (const lookup_candidate_t *)lhs;
    const lookup_candidate_t * item_rhs = (const lookup_candidate_t *)rhs;

    phrase_token_t token_lhs = item_lhs->m_token;
    phrase_token_t token_rhs = item_rhs->m_token;

    if (token_lhs < token_rhs)
        return -1;
    if (token_lhs > token_rhs)
        return 1;
    return 0;
}
#endif
919
920 static gint compare_item_with_frequency(gconstpointer lhs,
921                                         gconstpointer rhs) {
922     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
923     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
924
925     guint32 freq_lhs = item_lhs->m_freq;
926     guint32 freq_rhs = item_rhs->m_freq;
927
928     return -(freq_lhs - freq_rhs); /* in descendant order */
929 }
930
931 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
932                                           size_t offset) {
933     phrase_token_t prev_token = null_token;
934     ssize_t i;
935
936     if (0 == offset) {
937         /* get previous token from prefixes. */
938         prev_token = sentence_start;
939         size_t prev_token_len = 0;
940
941         pinyin_context_t * context = instance->m_context;
942         TokenVector prefixes = instance->m_prefixes;
943         PhraseItem item;
944
945         for (size_t i = 0; i < prefixes->len; ++i) {
946             phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
947             if (sentence_start == token)
948                 continue;
949
950             int retval = context->m_phrase_index->get_phrase_item(token, item);
951             if (ERROR_OK == retval) {
952                 size_t token_len = item.get_phrase_length();
953                 if (token_len > prev_token_len) {
954                     /* found longer match, and save it. */
955                     prev_token = token;
956                     prev_token_len = token_len;
957                 }
958             }
959         }
960     } else {
961         /* get previous token from match results. */
962         assert (0 < offset);
963
964         phrase_token_t cur_token = g_array_index
965             (instance->m_match_results, phrase_token_t, offset);
966         if (null_token != cur_token) {
967             for (i = offset - 1; i >= 0; --i) {
968                 cur_token = g_array_index
969                     (instance->m_match_results, phrase_token_t, i);
970                 if (null_token != cur_token) {
971                     prev_token = cur_token;
972                     break;
973                 }
974             }
975         }
976     }
977
978     return prev_token;
979 }
980
981 static void _append_items(pinyin_context_t * context,
982                           PhraseIndexRanges ranges,
983                           lookup_candidate_t * template_item,
984                           CandidateVector items) {
985     /* reduce and append to a single GArray. */
986     for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
987         if (NULL == ranges[m])
988             continue;
989
990         for (size_t n = 0; n < ranges[m]->len; ++n) {
991             PhraseIndexRange * range =
992                 &g_array_index(ranges[m], PhraseIndexRange, n);
993             for (size_t k = range->m_range_begin;
994                  k < range->m_range_end; ++k) {
995                 lookup_candidate_t item;
996                 item.m_candidate_type = template_item->m_candidate_type;
997                 item.m_token = k;
998                 item.m_orig_rest = template_item->m_orig_rest;
999                 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
1000                 item.m_freq = template_item->m_freq;
1001                 g_array_append_val(items, item);
1002             }
1003         }
1004     }
1005 }
1006
#if 0
/* Drop every item whose token equals the token of the item that
 * preceded it in the original order of @items. */
static void _remove_duplicated_items(CandidateVector items) {
    phrase_token_t previous = null_token;
    size_t pos = 0;

    while (pos < items->len) {
        lookup_candidate_t * entry = &g_array_index
            (items, lookup_candidate_t, pos);
        const phrase_token_t current = entry->m_token;

        if (previous == current) {
            /* the next item slides into this slot, so stay at pos. */
            g_array_remove_index(items, pos);
        } else {
            ++pos;
        }
        previous = current;
    }
}
#endif
1024
1025 static void _compute_frequency_of_items(pinyin_context_t * context,
1026                                         phrase_token_t prev_token,
1027                                         SingleGram * merged_gram,
1028                                         CandidateVector items) {
1029     pinyin_option_t & options = context->m_options;
1030     ssize_t i;
1031
1032     PhraseItem cached_item;
1033     /* compute all freqs. */
1034     for (i = 0; i < items->len; ++i) {
1035         lookup_candidate_t * item = &g_array_index
1036             (items, lookup_candidate_t, i);
1037         phrase_token_t & token = item->m_token;
1038
1039         gfloat bigram_poss = 0; guint32 total_freq = 0;
1040         if (options & DYNAMIC_ADJUST) {
1041             if (null_token != prev_token) {
1042                 guint32 bigram_freq = 0;
1043                 merged_gram->get_total_freq(total_freq);
1044                 merged_gram->get_freq(token, bigram_freq);
1045                 if (0 != total_freq)
1046                     bigram_poss = bigram_freq / (gfloat)total_freq;
1047             }
1048         }
1049
1050         /* compute the m_freq. */
1051         FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1052         phrase_index->get_phrase_item(token, cached_item);
1053         total_freq = phrase_index->get_phrase_index_total_freq();
1054         assert (0 < total_freq);
1055
1056         /* Note: possibility value <= 1.0. */
1057         guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
1058                         (1 - LAMBDA_PARAMETER) *
1059                         cached_item.get_unigram_frequency() /
1060                         (gfloat) total_freq) * 256 * 256 * 256;
1061         item->m_freq = freq;
1062     }
1063 }
1064
1065 static bool _prepend_sentence_candidate(pinyin_instance_t * instance,
1066                                         CandidateVector candidates) {
1067     /* check whether the best match candidate exists. */
1068     gchar * sentence = NULL;
1069     pinyin_get_sentence(instance, &sentence);
1070     if (NULL == sentence)
1071         return false;
1072     g_free(sentence);
1073
1074     /* prepend best match candidate to candidates. */
1075     lookup_candidate_t candidate;
1076     candidate.m_candidate_type = BEST_MATCH_CANDIDATE;
1077     g_array_prepend_val(candidates, candidate);
1078
1079     return true;
1080 }
1081
1082 static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
1083                                              size_t offset,
1084                                              CandidateVector candidates) {
1085     /* populate m_phrase_string in lookup_candidate_t. */
1086
1087     for(size_t i = 0; i < candidates->len; ++i) {
1088         lookup_candidate_t * candidate = &g_array_index
1089             (candidates, lookup_candidate_t, i);
1090
1091         switch(candidate->m_candidate_type) {
1092         case BEST_MATCH_CANDIDATE: {
1093             gchar * sentence = NULL;
1094             pinyin_get_sentence(instance, &sentence);
1095             candidate->m_phrase_string = g_strdup
1096                 (g_utf8_offset_to_pointer(sentence, offset));
1097             g_free(sentence);
1098             break;
1099         }
1100         case NORMAL_CANDIDATE:
1101         case DIVIDED_CANDIDATE:
1102         case RESPLIT_CANDIDATE:
1103             pinyin_token_get_phrase
1104                 (instance, candidate->m_token, NULL,
1105                  &(candidate->m_phrase_string));
1106             break;
1107         case ZOMBIE_CANDIDATE:
1108             break;
1109         }
1110     }
1111
1112     return true;
1113 }
1114
1115 static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
1116                                                     gconstpointer rhs,
1117                                                     gpointer userdata) {
1118     size_t index_lhs = *((size_t *) lhs);
1119     size_t index_rhs = *((size_t *) rhs);
1120     CandidateVector candidates = (CandidateVector) userdata;
1121
1122     lookup_candidate_t * candidate_lhs =
1123         &g_array_index(candidates, lookup_candidate_t, index_lhs);
1124     lookup_candidate_t * candidate_rhs =
1125         &g_array_index(candidates, lookup_candidate_t, index_rhs);
1126
1127     return -strcmp(candidate_lhs->m_phrase_string,
1128                    candidate_rhs->m_phrase_string); /* in descendant order */
1129 }
1130
1131
1132 static bool _remove_duplicated_items_by_phrase_string
1133 (pinyin_instance_t * instance,
1134  CandidateVector candidates) {
1135     size_t i;
1136     /* create the GArray of indexed item */
1137     GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
1138     for (i = 0; i < candidates->len; ++i)
1139         g_array_append_val(indices, i);
1140
1141     /* sort the indices array by phrase array */
1142     g_array_sort_with_data
1143         (indices, compare_indexed_item_with_phrase_string, candidates);
1144
1145     /* mark duplicated items as zombie candidate */
1146     lookup_candidate_t * cur_item, * saved_item = NULL;
1147     for (i = 0; i < indices->len; ++i) {
1148         size_t cur_index = g_array_index(indices, size_t, i);
1149         cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index);
1150
1151         /* handle the first candidate */
1152         if (NULL == saved_item) {
1153             saved_item = cur_item;
1154             continue;
1155         }
1156
1157         if (0 == strcmp(saved_item->m_phrase_string,
1158                         cur_item->m_phrase_string)) {
1159             /* found duplicated candidates */
1160
1161             /* keep best match candidate */
1162             if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
1163                 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1164                 continue;
1165             }
1166
1167             if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
1168                 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1169                 saved_item = cur_item;
1170                 continue;
1171             }
1172
1173             /* keep the higher possiblity one
1174                to quickly move the word forward in the candidate list */
1175             if (cur_item->m_freq > saved_item->m_freq) {
1176                 /* find better candidate */
1177                 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1178                 saved_item = cur_item;
1179                 continue;
1180             } else {
1181                 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1182                 continue;
1183             }
1184         } else {
1185             /* keep the current candidate */
1186             saved_item = cur_item;
1187         }
1188     }
1189
1190     g_array_free(indices, TRUE);
1191
1192     /* remove zombie candidate from the returned candidates */
1193     for (i = 0; i < candidates->len; ++i) {
1194         lookup_candidate_t * candidate = &g_array_index
1195             (candidates, lookup_candidate_t, i);
1196
1197         if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
1198             g_free(candidate->m_phrase_string);
1199             g_free(candidate->m_new_pinyins);
1200             g_array_remove_index(candidates, i);
1201             i--;
1202         }
1203     }
1204
1205     return true;
1206 }
1207
1208 static bool _free_candidates(CandidateVector candidates) {
1209     /* free candidates */
1210     for (size_t i = 0; i < candidates->len; ++i) {
1211         lookup_candidate_t * candidate = &g_array_index
1212             (candidates, lookup_candidate_t, i);
1213         g_free(candidate->m_phrase_string);
1214         g_free(candidate->m_new_pinyins);
1215     }
1216     g_array_set_size(candidates, 0);
1217
1218     return true;
1219 }
1220
1221 bool pinyin_get_candidates(pinyin_instance_t * instance,
1222                            size_t offset,
1223                            CandidateVector candidates) {
1224
1225     pinyin_context_t * & context = instance->m_context;
1226     pinyin_option_t & options = context->m_options;
1227     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1228
1229     _free_candidates(candidates);
1230
1231     size_t pinyin_len = pinyin_keys->len - offset;
1232     ssize_t i;
1233
1234     /* lookup the previous token here. */
1235     phrase_token_t prev_token = null_token;
1236
1237     if (options & DYNAMIC_ADJUST) {
1238         prev_token = _get_previous_token(instance, offset);
1239     }
1240
1241     SingleGram merged_gram;
1242     SingleGram * system_gram = NULL, * user_gram = NULL;
1243
1244     if (options & DYNAMIC_ADJUST) {
1245         if (null_token != prev_token) {
1246             context->m_system_bigram->load(prev_token, system_gram);
1247             context->m_user_bigram->load(prev_token, user_gram);
1248             merge_single_gram(&merged_gram, system_gram, user_gram);
1249         }
1250     }
1251
1252     PhraseIndexRanges ranges;
1253     memset(ranges, 0, sizeof(ranges));
1254     context->m_phrase_index->prepare_ranges(ranges);
1255
1256     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1257
1258     for (i = pinyin_len; i >= 1; --i) {
1259         g_array_set_size(items, 0);
1260
1261         ChewingKey * keys = &g_array_index
1262             (pinyin_keys, ChewingKey, offset);
1263
1264         /* do pinyin search. */
1265         int retval = context->m_pinyin_table->search
1266             (i, keys, ranges);
1267
1268         if ( !(retval & SEARCH_OK) )
1269             continue;
1270
1271         lookup_candidate_t template_item;
1272         _append_items(context, ranges, &template_item, items);
1273
1274 #if 0
1275         g_array_sort(items, compare_item_with_token);
1276
1277         _remove_duplicated_items(items);
1278 #endif
1279
1280         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1281
1282         /* sort the candidates of the same length by frequency. */
1283         g_array_sort(items, compare_item_with_frequency);
1284
1285         /* transfer back items to tokens, and save it into candidates */
1286         for (size_t k = 0; k < items->len; ++k) {
1287             lookup_candidate_t * item = &g_array_index
1288                 (items, lookup_candidate_t, k);
1289             g_array_append_val(candidates, *item);
1290         }
1291
1292 #if 0
1293         if (!(retval & SEARCH_CONTINUED))
1294             break;
1295 #endif
1296     }
1297
1298     g_array_free(items, TRUE);
1299     context->m_phrase_index->destroy_ranges(ranges);
1300     if (system_gram)
1301         delete system_gram;
1302     if (user_gram)
1303         delete user_gram;
1304
1305     /* post process to remove duplicated candidates */
1306
1307     _prepend_sentence_candidate(instance, candidates);
1308
1309     _compute_phrase_strings_of_items(instance, offset, candidates);
1310
1311     _remove_duplicated_items_by_phrase_string(instance, candidates);
1312
1313     return true;
1314 }
1315
1316
1317 static bool _try_divided_table(pinyin_instance_t * instance,
1318                                PhraseIndexRanges ranges,
1319                                size_t offset,
1320                                CandidateVector items){
1321     bool found = false;
1322
1323     pinyin_context_t * & context = instance->m_context;
1324     pinyin_option_t & options = context->m_options;
1325     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1326     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
1327
1328     assert(pinyin_keys->len == pinyin_key_rests->len);
1329     guint num_keys = pinyin_keys->len;
1330     assert(offset < num_keys);
1331
1332     /* handle "^xian$" -> "xi'an" here */
1333     ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
1334     ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
1335                                            ChewingKeyRest, offset);
1336     ChewingKeyRest orig_rest = *rest;
1337     guint16 tone = CHEWING_ZERO_TONE;
1338
1339     const divided_table_item_t * item = NULL;
1340
1341     /* back up tone */
1342     if (options & USE_TONE) {
1343         tone = key->m_tone;
1344         if (CHEWING_ZERO_TONE != tone) {
1345             key->m_tone = CHEWING_ZERO_TONE;
1346             rest->m_raw_end --;
1347         }
1348     }
1349
1350     item = context->m_full_pinyin_parser->retrieve_divided_item
1351         (options, key, rest, instance->m_raw_full_pinyin,
1352          strlen(instance->m_raw_full_pinyin));
1353
1354     if (item) {
1355         /* no ops */
1356         assert(item->m_new_freq > 0);
1357
1358         ChewingKey divided_keys[2];
1359         const char * pinyin = item->m_new_keys[0];
1360         assert(context->m_full_pinyin_parser->
1361                parse_one_key(options, divided_keys[0],
1362                              pinyin, strlen(pinyin)));
1363         pinyin = item->m_new_keys[1];
1364         assert(context->m_full_pinyin_parser->
1365                parse_one_key(options, divided_keys[1],
1366                              pinyin, strlen(pinyin)));
1367
1368         gchar * new_pinyins = g_strdup_printf
1369             ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
1370
1371         /* propagate the tone */
1372         if (options & USE_TONE) {
1373             if (CHEWING_ZERO_TONE != tone) {
1374                 assert(0 < tone && tone <= 5);
1375                 divided_keys[1].m_tone = tone;
1376
1377                 gchar * tmp_str = g_strdup_printf
1378                     ("%s%d", new_pinyins, tone);
1379                 g_free(new_pinyins);
1380                 new_pinyins = tmp_str;
1381             }
1382         }
1383
1384         /* do pinyin search. */
1385         int retval = context->m_pinyin_table->search
1386             (2, divided_keys, ranges);
1387
1388         if (retval & SEARCH_OK) {
1389             lookup_candidate_t template_item;
1390             template_item.m_candidate_type = DIVIDED_CANDIDATE;
1391             template_item.m_orig_rest = orig_rest;
1392             template_item.m_new_pinyins = new_pinyins;
1393
1394             _append_items(context, ranges, &template_item, items);
1395             found = true;
1396         }
1397         g_free(new_pinyins);
1398     }
1399
1400     /* restore tones */
1401     if (options & USE_TONE) {
1402         if (CHEWING_ZERO_TONE != tone) {
1403             key->m_tone = tone;
1404             rest->m_raw_end ++;
1405         }
1406     }
1407
1408     return found;
1409 }
1410
1411 static bool _try_resplit_table(pinyin_instance_t * instance,
1412                                PhraseIndexRanges ranges,
1413                                size_t offset,
1414                                CandidateVector items){
1415     bool found = false;
1416
1417     pinyin_context_t * & context = instance->m_context;
1418     pinyin_option_t & options = context->m_options;
1419     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1420     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
1421
1422     assert(pinyin_keys->len == pinyin_key_rests->len);
1423     guint num_keys = pinyin_keys->len;
1424     assert(offset + 1 < num_keys);
1425
1426     guint16 next_tone = CHEWING_ZERO_TONE;
1427
1428     /* handle "^fa'nan$" -> "fan'an" here */
1429     ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
1430                                                ChewingKeyRest, offset);
1431     ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
1432                                                 ChewingKeyRest, offset + 1);
1433     /* some "'" here */
1434     if (cur_rest->m_raw_end != next_rest->m_raw_begin)
1435         return found;
1436
1437     ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
1438     ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
1439                                            offset + 1);
1440
1441     /* some tone here */
1442     if (CHEWING_ZERO_TONE != cur_key->m_tone)
1443         return found;
1444
1445     ChewingKeyRest orig_rest;
1446     orig_rest.m_raw_begin = cur_rest->m_raw_begin;
1447     orig_rest.m_raw_end = next_rest->m_raw_end;
1448
1449     /* backup tone */
1450     if (options & USE_TONE) {
1451         next_tone = next_key->m_tone;
1452         if (CHEWING_ZERO_TONE != next_tone) {
1453             next_key->m_tone = CHEWING_ZERO_TONE;
1454             next_rest->m_raw_end --;
1455         }
1456     }
1457
1458     /* lookup re-split table */
1459     const char * str = instance->m_raw_full_pinyin;
1460     const resplit_table_item_t * item_by_orig =
1461         context->m_full_pinyin_parser->
1462         retrieve_resplit_item_by_original_pinyins
1463         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
1464
1465     const resplit_table_item_t * item_by_new =
1466         context->m_full_pinyin_parser->
1467         retrieve_resplit_item_by_resplit_pinyins
1468         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
1469
1470     /* there are no same couple of pinyins in re-split table. */
1471     assert(!(item_by_orig && item_by_new));
1472
1473     ChewingKey resplit_keys[2];
1474     const char * pinyins[2];
1475
1476     bool tosearch = false;
1477     if (item_by_orig && item_by_orig->m_new_freq) {
1478         pinyins[0] = item_by_orig->m_new_keys[0];
1479         pinyins[1] = item_by_orig->m_new_keys[1];
1480
1481         assert(context->m_full_pinyin_parser->
1482                parse_one_key(options, resplit_keys[0],
1483                              pinyins[0], strlen(pinyins[0])));
1484
1485         assert(context->m_full_pinyin_parser->
1486                parse_one_key(options, resplit_keys[1],
1487                              pinyins[1], strlen(pinyins[1])));
1488         tosearch = true;
1489     }
1490
1491     if (item_by_new && item_by_new->m_orig_freq) {
1492         pinyins[0] = item_by_new->m_orig_keys[0];
1493         pinyins[1] = item_by_new->m_orig_keys[1];
1494
1495         assert(context->m_full_pinyin_parser->
1496                parse_one_key(options, resplit_keys[0],
1497                              pinyins[0], strlen(pinyins[0])));
1498
1499         assert(context->m_full_pinyin_parser->
1500                parse_one_key(options, resplit_keys[1],
1501                              pinyins[1], strlen(pinyins[1])));
1502         tosearch = true;
1503     }
1504
1505     if (tosearch) {
1506         gchar * new_pinyins = g_strdup_printf
1507             ("%s'%s", pinyins[0], pinyins[1]);
1508
1509         /* propagate the tone */
1510         if (options & USE_TONE) {
1511             if (CHEWING_ZERO_TONE != next_tone) {
1512                 assert(0 < next_tone && next_tone <= 5);
1513                 resplit_keys[1].m_tone = next_tone;
1514
1515                 gchar * tmp_str = g_strdup_printf
1516                     ("%s%d", new_pinyins, next_tone);
1517                 g_free(new_pinyins);
1518                 new_pinyins = tmp_str;
1519             }
1520         }
1521
1522         /* do pinyin search. */
1523         int retval = context->m_pinyin_table->search
1524             (2, resplit_keys, ranges);
1525
1526         if (retval & SEARCH_OK) {
1527             lookup_candidate_t template_item;
1528             template_item.m_candidate_type = RESPLIT_CANDIDATE;
1529             template_item.m_orig_rest = orig_rest;
1530             template_item.m_new_pinyins = new_pinyins;
1531
1532             _append_items(context, ranges, &template_item, items);
1533             found = true;
1534         }
1535         g_free(new_pinyins);
1536     }
1537
1538     /* restore tones */
1539     if (options & USE_TONE) {
1540         if (CHEWING_ZERO_TONE != next_tone) {
1541             next_key->m_tone = next_tone;
1542             next_rest->m_raw_end ++;
1543         }
1544     }
1545
1546     return found;
1547 }
1548
1549 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1550                                        size_t offset,
1551                                        CandidateVector candidates){
1552
1553     pinyin_context_t * & context = instance->m_context;
1554     pinyin_option_t & options = context->m_options;
1555     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1556
1557     _free_candidates(candidates);
1558
1559     size_t pinyin_len = pinyin_keys->len - offset;
1560     pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1561     ssize_t i;
1562
1563     /* lookup the previous token here. */
1564     phrase_token_t prev_token = null_token;
1565
1566     if (options & DYNAMIC_ADJUST) {
1567         prev_token = _get_previous_token(instance, offset);
1568     }
1569
1570     SingleGram merged_gram;
1571     SingleGram * system_gram = NULL, * user_gram = NULL;
1572
1573     if (options & DYNAMIC_ADJUST) {
1574         if (null_token != prev_token) {
1575             context->m_system_bigram->load(prev_token, system_gram);
1576             context->m_user_bigram->load(prev_token, user_gram);
1577             merge_single_gram(&merged_gram, system_gram, user_gram);
1578         }
1579     }
1580
1581     PhraseIndexRanges ranges;
1582     memset(ranges, 0, sizeof(ranges));
1583     context->m_phrase_index->prepare_ranges(ranges);
1584
1585     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1586
1587     if (1 == pinyin_len) {
1588         /* because there is only one pinyin left,
1589          *  the following for-loop will not produce 2 character candidates.
1590          * the if-branch will fill the candidate list with
1591          *  2 character candidates.
1592          */
1593
1594         if (options & USE_DIVIDED_TABLE) {
1595             g_array_set_size(items, 0);
1596
1597             if (_try_divided_table(instance, ranges, offset, items)) {
1598
1599 #if 0
1600                 g_array_sort(items, compare_item_with_token);
1601
1602                 _remove_duplicated_items(items);
1603 #endif
1604
1605                 _compute_frequency_of_items(context, prev_token,
1606                                             &merged_gram, items);
1607
1608                 /* sort the candidates of the same length by frequency. */
1609                 g_array_sort(items, compare_item_with_frequency);
1610
1611                 /* transfer back items to tokens, and save it into candidates */
1612                 for (i = 0; i < items->len; ++i) {
1613                     lookup_candidate_t * item = &g_array_index
1614                         (items, lookup_candidate_t, i);
1615                     g_array_append_val(candidates, *item);
1616                 }
1617             }
1618         }
1619     }
1620
1621     for (i = pinyin_len; i >= 1; --i) {
1622         bool found = false;
1623         g_array_set_size(items, 0);
1624
1625         if (2 == i) {
1626             /* handle fuzzy pinyin segment here. */
1627             if (options & USE_DIVIDED_TABLE) {
1628                 found = _try_divided_table(instance, ranges, offset, items) ||
1629                     found;
1630             }
1631             if (options & USE_RESPLIT_TABLE) {
1632                 found = _try_resplit_table(instance, ranges, offset, items) ||
1633                     found;
1634             }
1635         }
1636
1637         ChewingKey * keys = &g_array_index
1638             (pinyin_keys, ChewingKey, offset);
1639
1640         /* do pinyin search. */
1641         int retval = context->m_pinyin_table->search
1642             (i, keys, ranges);
1643
1644         found = (retval & SEARCH_OK) || found;
1645
1646         if ( !found )
1647             continue;
1648
1649         lookup_candidate_t template_item;
1650         _append_items(context, ranges, &template_item, items);
1651
1652 #if 0
1653         g_array_sort(items, compare_item_with_token);
1654
1655         _remove_duplicated_items(items);
1656 #endif
1657
1658         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1659
1660         g_array_sort(items, compare_item_with_frequency);
1661
1662         for (size_t k = 0; k < items->len; ++k) {
1663             lookup_candidate_t * item = &g_array_index
1664                 (items, lookup_candidate_t, k);
1665             g_array_append_val(candidates, *item);
1666         }
1667
1668 #if 0
1669         if (!(retval & SEARCH_CONTINUED))
1670             break;
1671 #endif
1672     }
1673
1674     g_array_free(items, TRUE);
1675     context->m_phrase_index->destroy_ranges(ranges);
1676     if (system_gram)
1677         delete system_gram;
1678     if (user_gram)
1679         delete user_gram;
1680
1681     /* post process to remove duplicated candidates */
1682
1683     _prepend_sentence_candidate(instance, candidates);
1684
1685     _compute_phrase_strings_of_items(instance, offset, candidates);
1686
1687     _remove_duplicated_items_by_phrase_string(instance, candidates);
1688
1689     return true;
1690 }
1691
1692
1693 int pinyin_choose_candidate(pinyin_instance_t * instance,
1694                             size_t offset,
1695                             lookup_candidate_t * candidate){
1696     pinyin_context_t * & context = instance->m_context;
1697
1698     if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1699         RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1700         /* update full pinyin. */
1701         gchar * oldpinyins = instance->m_raw_full_pinyin;
1702         const ChewingKeyRest rest = candidate->m_orig_rest;
1703         oldpinyins[rest.m_raw_begin] = '\0';
1704         const gchar * left_part = oldpinyins;
1705         const gchar * right_part = oldpinyins + rest.m_raw_end;
1706         gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1707                                          right_part, NULL);
1708         g_free(oldpinyins);
1709         instance->m_raw_full_pinyin = newpinyins;
1710
1711         /* re-parse the full pinyin.  */
1712         const gchar * pinyins = instance->m_raw_full_pinyin;
1713         int pinyin_len = strlen(pinyins);
1714         int parse_len = context->m_full_pinyin_parser->parse
1715             (context->m_options, instance->m_pinyin_keys,
1716              instance->m_pinyin_key_rests, pinyins, pinyin_len);
1717
1718         /* Note: there may be some un-parsable input here. */
1719     }
1720
1721     /* sync m_constraints to the length of m_pinyin_keys. */
1722     bool retval = context->m_pinyin_lookup->validate_constraint
1723         (instance->m_constraints, instance->m_pinyin_keys);
1724
1725     phrase_token_t token = candidate->m_token;
1726     guint8 len = context->m_pinyin_lookup->add_constraint
1727         (instance->m_constraints, offset, token);
1728
1729     /* safe guard: validate the m_constraints again. */
1730     retval = context->m_pinyin_lookup->validate_constraint
1731         (instance->m_constraints, instance->m_pinyin_keys) && len;
1732
1733     return offset + len;
1734 }
1735
1736
1737 bool pinyin_free_candidates(pinyin_instance_t * instance,
1738                             CandidateVector candidates) {
1739     _free_candidates(candidates);
1740     return true;
1741 }
1742
1743 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1744                              size_t offset){
1745     pinyin_context_t * & context = instance->m_context;
1746
1747     bool retval = context->m_pinyin_lookup->clear_constraint
1748         (instance->m_constraints, offset);
1749
1750     return retval;
1751 }
1752
1753 bool pinyin_lookup_tokens(pinyin_instance_t * instance,
1754                           const char * phrase, GArray * tokenarray){
1755     pinyin_context_t * & context = instance->m_context;
1756     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1757
1758     glong ucs4_len = 0;
1759     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL);
1760
1761     PhraseTokens tokens;
1762     memset(tokens, 0, sizeof(PhraseTokens));
1763     phrase_index->prepare_tokens(tokens);
1764     int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens);
1765     int num = reduce_tokens(tokens, tokenarray);
1766     phrase_index->destroy_tokens(tokens);
1767
1768     return SEARCH_OK & retval;
1769 }
1770
1771 bool pinyin_train(pinyin_instance_t * instance){
1772     if (!instance->m_context->m_user_dir)
1773         return false;
1774
1775     pinyin_context_t * & context = instance->m_context;
1776     context->m_modified = true;
1777
1778     bool retval = context->m_pinyin_lookup->train_result2
1779         (instance->m_pinyin_keys, instance->m_constraints,
1780          instance->m_match_results);
1781
1782     return retval;
1783 }
1784
1785 bool pinyin_reset(pinyin_instance_t * instance){
1786     g_free(instance->m_raw_full_pinyin);
1787     instance->m_raw_full_pinyin = NULL;
1788
1789     g_array_set_size(instance->m_prefixes, 0);
1790     g_array_set_size(instance->m_pinyin_keys, 0);
1791     g_array_set_size(instance->m_pinyin_key_rests, 0);
1792     g_array_set_size(instance->m_constraints, 0);
1793     g_array_set_size(instance->m_match_results, 0);
1794
1795     return true;
1796 }
1797
1798 bool pinyin_get_chewing_string(pinyin_instance_t * instance,
1799                                ChewingKey * key,
1800                                gchar ** utf8_str) {
1801     *utf8_str = NULL;
1802     if (0 == key->get_table_index())
1803         return false;
1804
1805     *utf8_str = key->get_chewing_string();
1806     return true;
1807 }
1808
1809 bool pinyin_get_pinyin_string(pinyin_instance_t * instance,
1810                               ChewingKey * key,
1811                               gchar ** utf8_str) {
1812     *utf8_str = NULL;
1813     if (0 == key->get_table_index())
1814         return false;
1815
1816     *utf8_str = key->get_pinyin_string();
1817     return true;
1818 }
1819
1820 bool pinyin_get_pinyin_strings(pinyin_instance_t * instance,
1821                                ChewingKey * key,
1822                                gchar ** shengmu,
1823                                gchar ** yunmu) {
1824     *shengmu = NULL; *yunmu = NULL;
1825     if (0 == key->get_table_index())
1826         return false;
1827
1828     *shengmu = key->get_shengmu_string();
1829     *yunmu = key->get_yunmu_string();
1830     return true;
1831 }
1832
1833 bool pinyin_token_get_phrase(pinyin_instance_t * instance,
1834                              phrase_token_t token,
1835                              guint * len,
1836                              gchar ** utf8_str) {
1837     pinyin_context_t * & context = instance->m_context;
1838     PhraseItem item;
1839     ucs4_t buffer[MAX_PHRASE_LENGTH];
1840
1841     int retval = context->m_phrase_index->get_phrase_item(token, item);
1842     if (ERROR_OK != retval)
1843         return false;
1844
1845     item.get_phrase_string(buffer);
1846     guint length = item.get_phrase_length();
1847     if (len)
1848         *len = length;
1849     if (utf8_str)
1850         *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1851     return true;
1852 }
1853
1854 bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
1855                                       phrase_token_t token,
1856                                       guint * num){
1857     *num = 0;
1858     pinyin_context_t * & context = instance->m_context;
1859     PhraseItem item;
1860
1861     int retval = context->m_phrase_index->get_phrase_item(token, item);
1862     if (ERROR_OK != retval)
1863         return false;
1864
1865     *num = item.get_n_pronunciation();
1866     return true;
1867 }
1868
1869 bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance,
1870                                         phrase_token_t token,
1871                                         guint nth,
1872                                         ChewingKeyVector keys){
1873     g_array_set_size(keys, 0);
1874     pinyin_context_t * & context = instance->m_context;
1875     PhraseItem item;
1876     ChewingKey buffer[MAX_PHRASE_LENGTH];
1877     guint32 freq = 0;
1878
1879     int retval = context->m_phrase_index->get_phrase_item(token, item);
1880     if (ERROR_OK != retval)
1881         return false;
1882
1883     item.get_nth_pronunciation(nth, buffer, freq);
1884     guint8 len = item.get_phrase_length();
1885     g_array_append_vals(keys, buffer, len);
1886     return true;
1887 }
1888
1889 bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance,
1890                                         phrase_token_t token,
1891                                         guint * freq) {
1892     *freq = 0;
1893     pinyin_context_t * & context = instance->m_context;
1894     PhraseItem item;
1895
1896     int retval = context->m_phrase_index->get_phrase_item(token, item);
1897     if (ERROR_OK != retval)
1898         return false;
1899
1900     *freq = item.get_unigram_frequency();
1901     return true;
1902 }
1903
1904 bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance,
1905                                         phrase_token_t token,
1906                                         guint delta){
1907     pinyin_context_t * & context = instance->m_context;
1908     int retval = context->m_phrase_index->add_unigram_frequency
1909         (token, delta);
1910     return ERROR_OK == retval;
1911 }
1912
1913
1914
1915 /**
1916  *  Note: prefix is the text before the pre-edit string.
1917  */