src/pinyin.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22
  23 #include "pinyin.h"
  24 #include <stdio.h>
  25 #include <unistd.h>
  26 #include <glib/gstdio.h>
  27 #include "pinyin_internal.h"
  28
  29 using namespace pinyin;
  30
  31 /* a glue layer for input method integration. */
  32
/* Shared state behind the public API: the parsers, the lookup tables,
   the language models and the two directories they are loaded from.
   Built by pinyin_init() and torn down by pinyin_fini(). */
struct _pinyin_context_t{
    /* option flags handed to the parsers, tables and lookups. */
    pinyin_option_t m_options;

    /* one parser per supported input scheme. */
    FullPinyinParser2 * m_full_pinyin_parser;
    DoublePinyinParser2 * m_double_pinyin_parser;
    ChewingParser2 * m_chewing_parser;

    /* tables loaded from a system chunk plus a user chunk. */
    FacadeChewingTable * m_pinyin_table;
    FacadePhraseTable2 * m_phrase_table;
    /* phrase items, one sub index per phrase library. */
    FacadePhraseIndex * m_phrase_index;
    /* bigram.db from the system dir, attached read-only. */
    Bigram * m_system_bigram;
    /* user.db from the user dir. */
    Bigram * m_user_bigram;

    PinyinLookup2 * m_pinyin_lookup;
    PhraseLookup * m_phrase_lookup;

    /* g_strdup()ed copies of the directories given to pinyin_init(). */
    char * m_system_dir;
    char * m_user_dir;
    /* pinyin_save() writes nothing unless this flag is set. */
    bool m_modified;
};
  53
/* Per-input state working against one shared context.
   Created by pinyin_alloc_instance(), released by pinyin_free_instance(). */
struct _pinyin_instance_t{
    /* the context this instance looks things up in (not owned). */
    pinyin_context_t * m_context;
    /* copy of the string last given to pinyin_parse_more_full_pinyins(). */
    gchar * m_raw_full_pinyin;
    /* tokens preceding the sentence; always starts with sentence_start. */
    TokenVector m_prefixes;
    /* parser output: the keys and where each key sits in the input. */
    ChewingKeyVector m_pinyin_keys;
    ChewingKeyRestVector m_pinyin_key_rests;
    /* one lookup_constraint_t per pinyin key. */
    CandidateConstraints m_constraints;
    /* tokens of the best match found by the lookup. */
    MatchResults m_match_results;
    /* NOTE(review): presumably a list of lookup_candidate_t - confirm. */
    CandidateVector m_candidates;
};
  64
  65 struct _lookup_candidate_t{
  66     enum lookup_candidate_type_t m_candidate_type;
  67     gchar * m_phrase_string;
  68     phrase_token_t m_token;
  69     ChewingKeyRest m_orig_rest;
  70     gchar * m_new_pinyins;
  71     guint32 m_freq; /* the amplifed gfloat numerical value. */
  72 public:
  73     _lookup_candidate_t() {
  74         m_candidate_type = NORMAL_CANDIDATE;
  75         m_phrase_string = NULL;
  76         m_token = null_token;
  77         m_new_pinyins = NULL;
  78         m_freq = 0;
  79     }
  80 };
  81
/* Handle for a batch of pinyin_iterator_add_phrase() calls; created by
   pinyin_begin_add_phrases() and deleted by pinyin_end_add_phrases(). */
struct _import_iterator_t{
    /* the context whose tables receive the phrases (not owned). */
    pinyin_context_t * m_context;
    /* index of the sub phrase library the phrases are added to. */
    guint8 m_phrase_index;
};
  86
  87
  88 static bool check_format(const char * userdir){
  89     gchar * filename = g_build_filename
  90         (userdir, "version", NULL);
  91
  92     MemoryChunk chunk;
  93     bool exists = chunk.load(filename);
  94
  95     if (exists) {
  96         exists = (0 == memcmp
  97                   (LIBPINYIN_FORMAT_VERSION, chunk.begin(),
  98                    strlen(LIBPINYIN_FORMAT_VERSION) + 1));
  99     }
 100     g_free(filename);
 101
 102     if (exists)
 103         return exists;
 104
 105     /* clean up files, if version mis-matches. */
 106     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
 107         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
 108
 109         if (NOT_USED == table_info->m_file_type)
 110             continue;
 111
 112         if (NULL == table_info->m_user_filename)
 113             continue;
 114
 115         const char * userfilename = table_info->m_user_filename;
 116
 117         /* remove dbin file. */
 118         filename = g_build_filename(userdir, userfilename, NULL);
 119         unlink(filename);
 120         g_free(filename);
 121     }
 122
 123     filename = g_build_filename
 124         (userdir, "user_pinyin_index.bin", NULL);
 125     unlink(filename);
 126     g_free(filename);
 127
 128     filename = g_build_filename
 129         (userdir, "user_phrase_index.bin", NULL);
 130     unlink(filename);
 131     g_free(filename);
 132
 133     filename = g_build_filename
 134         (userdir, "user.db", NULL);
 135     unlink(filename);
 136     g_free(filename);
 137
 138     return exists;
 139 }
 140
 141 static bool mark_version(const char * userdir){
 142     gchar * filename = g_build_filename
 143         (userdir, "version", NULL);
 144     MemoryChunk chunk;
 145     chunk.set_content(0, LIBPINYIN_FORMAT_VERSION,
 146                       strlen(LIBPINYIN_FORMAT_VERSION) + 1);
 147     bool retval = chunk.save(filename);
 148     g_free(filename);
 149     return retval;
 150 }
 151
 152 pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
 153     pinyin_context_t * context = new pinyin_context_t;
 154
 155     context->m_options = USE_TONE;
 156
 157     context->m_system_dir = g_strdup(systemdir);
 158     context->m_user_dir = g_strdup(userdir);
 159     context->m_modified = false;
 160
 161     check_format(context->m_user_dir);
 162
 163     context->m_full_pinyin_parser = new FullPinyinParser2;
 164     context->m_double_pinyin_parser = new DoublePinyinParser2;
 165     context->m_chewing_parser = new ChewingParser2;
 166
 167     /* load chewing table. */
 168     context->m_pinyin_table = new FacadeChewingTable;
 169
 170     /* load system chewing table. */
 171     MemoryChunk * chunk = new MemoryChunk;
 172     gchar * filename = g_build_filename
 173         (context->m_system_dir, "pinyin_index.bin", NULL);
 174     if (!chunk->load(filename)) {
 175         fprintf(stderr, "open %s failed!\n", filename);
 176         return NULL;
 177     }
 178     g_free(filename);
 179
 180     /* load user chewing table */
 181     MemoryChunk * userchunk = new MemoryChunk;
 182     filename = g_build_filename
 183         (context->m_user_dir, "user_pinyin_index.bin", NULL);
 184     if (!userchunk->load(filename)) {
 185         /* hack here: use local Chewing Table to create empty memory chunk. */
 186         ChewingLargeTable table(context->m_options);
 187         table.store(userchunk);
 188     }
 189     g_free(filename);
 190
 191     context->m_pinyin_table->load(context->m_options, chunk, userchunk);
 192
 193     /* load phrase table */
 194     context->m_phrase_table = new FacadePhraseTable2;
 195
 196     /* load system phrase table */
 197     chunk = new MemoryChunk;
 198     filename = g_build_filename
 199         (context->m_system_dir, "phrase_index.bin", NULL);
 200     if (!chunk->load(filename)) {
 201         fprintf(stderr, "open %s failed!\n", filename);
 202         return NULL;
 203     }
 204     g_free(filename);
 205
 206     /* load user phrase table */
 207     userchunk = new MemoryChunk;
 208     filename = g_build_filename
 209         (context->m_user_dir, "user_phrase_index.bin", NULL);
 210     if (!userchunk->load(filename)) {
 211         /* hack here: use local Phrase Table to create empty memory chunk. */
 212         PhraseLargeTable2 table;
 213         table.store(userchunk);
 214     }
 215     g_free(filename);
 216
 217     context->m_phrase_table->load(chunk, userchunk);
 218
 219     context->m_phrase_index = new FacadePhraseIndex;
 220
 221     /* hack here: directly call load phrase library. */
 222     pinyin_load_phrase_library(context, GB_DICTIONARY);
 223     pinyin_load_phrase_library(context, MERGED_DICTIONARY);
 224
 225     context->m_system_bigram = new Bigram;
 226     filename = g_build_filename(context->m_system_dir, "bigram.db", NULL);
 227     context->m_system_bigram->attach(filename, ATTACH_READONLY);
 228     g_free(filename);
 229
 230     context->m_user_bigram = new Bigram;
 231     filename = g_build_filename(context->m_user_dir, "user.db", NULL);
 232     context->m_user_bigram->load_db(filename);
 233     g_free(filename);
 234
 235     context->m_pinyin_lookup = new PinyinLookup2
 236         ( context->m_options, context->m_pinyin_table,
 237           context->m_phrase_index, context->m_system_bigram,
 238           context->m_user_bigram);
 239
 240     context->m_phrase_lookup = new PhraseLookup
 241         (context->m_phrase_table, context->m_phrase_index,
 242          context->m_system_bigram, context->m_user_bigram);
 243
 244     return context;
 245 }
 246
 247 bool pinyin_load_phrase_library(pinyin_context_t * context,
 248                                 guint8 index){
 249     if (!(index < PHRASE_INDEX_LIBRARY_COUNT))
 250         return false;
 251
 252     /* check whether the sub phrase index is already loaded. */
 253     PhraseIndexRange range;
 254     int retval = context->m_phrase_index->get_range(index, range);
 255     if (ERROR_OK == retval)
 256         return false;
 257
 258     const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
 259
 260     if (SYSTEM_FILE == table_info->m_file_type ||
 261         DICTIONARY == table_info->m_file_type) {
 262         /* system phrase library */
 263         MemoryChunk * chunk = new MemoryChunk;
 264
 265         const char * systemfilename = table_info->m_system_filename;
 266         /* check bin file in system dir. */
 267         gchar * chunkfilename = g_build_filename(context->m_system_dir,
 268                                                  systemfilename, NULL);
 269         chunk->load(chunkfilename);
 270         g_free(chunkfilename);
 271
 272         context->m_phrase_index->load(index, chunk);
 273
 274         const char * userfilename = table_info->m_user_filename;
 275
 276         chunkfilename = g_build_filename(context->m_user_dir,
 277                                          userfilename, NULL);
 278
 279         MemoryChunk * log = new MemoryChunk;
 280         log->load(chunkfilename);
 281         g_free(chunkfilename);
 282
 283         /* merge the chunk log. */
 284         context->m_phrase_index->merge(index, log);
 285         return true;
 286     }
 287
 288     if (USER_FILE == table_info->m_file_type) {
 289         /* user phrase library */
 290         MemoryChunk * chunk = new MemoryChunk;
 291         const char * userfilename = table_info->m_user_filename;
 292
 293         gchar * chunkfilename = g_build_filename(context->m_user_dir,
 294                                                  userfilename, NULL);
 295
 296         /* check bin file exists. if not, create a new one. */
 297         if (chunk->load(chunkfilename)) {
 298             context->m_phrase_index->load(index, chunk);
 299         } else {
 300             delete chunk;
 301             context->m_phrase_index->create_sub_phrase(index);
 302         }
 303
 304         g_free(chunkfilename);
 305         return true;
 306     }
 307
 308     return false;
 309 }
 310
 311 bool pinyin_unload_phrase_library(pinyin_context_t * context,
 312                                   guint8 index){
 313     /* gb_char.bin and merged.bin can't be unloaded. */
 314     if (GB_DICTIONARY == index || MERGED_DICTIONARY == index)
 315         return false;
 316
 317     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
 318
 319     context->m_phrase_index->unload(index);
 320     return true;
 321 }
 322
 323 import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context,
 324                                              guint8 index){
 325     import_iterator_t * iter = new import_iterator_t;
 326     iter->m_context = context;
 327     iter->m_phrase_index = index;
 328     return iter;
 329 }
 330
 331 bool pinyin_iterator_add_phrase(import_iterator_t * iter,
 332                                 const char * phrase,
 333                                 const char * pinyin,
 334                                 gint count){
 335     /* if -1 == count, use the default value. */
 336     const gint default_count = 5;
 337     const guint32 unigram_factor = 3;
 338     if (-1 == count)
 339         count = default_count;
 340
 341     pinyin_context_t * & context = iter->m_context;
 342     FacadePhraseTable2 * & phrase_table = context->m_phrase_table;
 343     FacadeChewingTable * & pinyin_table = context->m_pinyin_table;
 344     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
 345
 346     /* check whether the phrase exists in phrase table */
 347     glong len_phrase = 0;
 348     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL);
 349
 350     bool result = false;
 351
 352     pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
 353     FullPinyinParser2 parser;
 354     ChewingKeyVector keys =
 355         g_array_new(FALSE, FALSE, sizeof(ChewingKey));
 356     ChewingKeyRestVector key_rests =
 357         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
 358
 359     /* parse the pinyin. */
 360     parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
 361
 362     if (len_phrase != keys->len)
 363         return result;
 364
 365     if (len_phrase >= MAX_PHRASE_LENGTH)
 366         return result;
 367
 368     phrase_token_t token = null_token;
 369     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 370
 371     /* do phrase table search. */
 372     PhraseTokens tokens;
 373     memset(tokens, 0, sizeof(PhraseTokens));
 374     phrase_index->prepare_tokens(tokens);
 375     int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens);
 376     int num = reduce_tokens(tokens, tokenarray);
 377     phrase_index->destroy_tokens(tokens);
 378
 379     /* find the best token candidate. */
 380     for (size_t i = 0; i < tokenarray->len; ++i) {
 381         phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
 382         if (null_token == token) {
 383             token = candidate;
 384             continue;
 385         }
 386
 387         if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) {
 388             /* only one phrase string per sub phrase index. */
 389             assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index);
 390             token = candidate;
 391             continue;
 392         }
 393     }
 394     g_array_free(tokenarray, TRUE);
 395
 396     PhraseItem item;
 397     /* check whether it exists in the same sub phrase index; */
 398     if (null_token != token &&
 399         PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) {
 400         /* if so, remove the phrase, add the pinyin for the phrase item,
 401            then add it back;*/
 402         phrase_index->get_phrase_item(token, item);
 403         assert(len_phrase == item.get_phrase_length());
 404         ucs4_t tmp_phrase[MAX_PHRASE_LENGTH];
 405         item.get_phrase_string(tmp_phrase);
 406         assert(0 == memcmp
 407                (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase));
 408
 409         PhraseItem * removed_item = NULL;
 410         retval = phrase_index->remove_phrase_item(token, removed_item);
 411         if (ERROR_OK == retval) {
 412             /* maybe check whether there are duplicated pronunciations here. */
 413             removed_item->append_pronunciation((ChewingKey *)keys->data,
 414                                                count);
 415             phrase_index->add_phrase_item(token, removed_item);
 416             delete removed_item;
 417             result = true;
 418         }
 419     } else {
 420         /* if not exists in the same sub phrase index,
 421            get the maximum token,
 422            then add it directly with maximum token + 1; */
 423         PhraseIndexRange range;
 424         retval = phrase_index->get_range(iter->m_phrase_index, range);
 425
 426         if (ERROR_OK == retval) {
 427             token = range.m_range_end;
 428             if (0x00000000 == (token & PHRASE_MASK))
 429                 token++;
 430
 431             if (len_phrase == keys->len) { /* valid pinyin */
 432                 phrase_table->add_index(len_phrase, ucs4_phrase, token);
 433                 pinyin_table->add_index
 434                     (keys->len, (ChewingKey *)(keys->data), token);
 435
 436                 item.set_phrase_string(len_phrase, ucs4_phrase);
 437                 item.append_pronunciation((ChewingKey *)(keys->data), count);
 438                 phrase_index->add_phrase_item(token, &item);
 439                 phrase_index->add_unigram_frequency(token,
 440                                                     count * unigram_factor);
 441                 result = true;
 442             }
 443         }
 444     }
 445
 446     g_array_free(key_rests, TRUE);
 447     g_array_free(keys, TRUE);
 448     g_free(ucs4_phrase);
 449     return result;
 450 }
 451
 452 void pinyin_end_add_phrases(import_iterator_t * iter){
 453     /* compact the content memory chunk of phrase index. */
 454     iter->m_context->m_phrase_index->compact();
 455     delete iter;
 456 }
 457
 458 bool pinyin_save(pinyin_context_t * context){
 459     if (!context->m_user_dir)
 460         return false;
 461
 462     if (!context->m_modified)
 463         return false;
 464
 465     context->m_phrase_index->compact();
 466
 467     /* skip the reserved zero phrase library. */
 468     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
 469         PhraseIndexRange range;
 470         int retval = context->m_phrase_index->get_range(i, range);
 471
 472         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
 473             continue;
 474
 475         const pinyin_table_info_t * table_info = pinyin_phrase_files + i;
 476
 477         if (NOT_USED == table_info->m_file_type)
 478             continue;
 479
 480         const char * userfilename = table_info->m_user_filename;
 481
 482         if (NULL == userfilename)
 483             continue;
 484
 485         if (SYSTEM_FILE == table_info->m_file_type ||
 486             DICTIONARY == table_info->m_file_type) {
 487             /* system phrase library */
 488             MemoryChunk * chunk = new MemoryChunk;
 489             MemoryChunk * log = new MemoryChunk;
 490             const char * systemfilename = table_info->m_system_filename;
 491
 492             /* check bin file in system dir. */
 493             gchar * chunkfilename = g_build_filename(context->m_system_dir,
 494                                                      systemfilename, NULL);
 495             chunk->load(chunkfilename);
 496             g_free(chunkfilename);
 497             context->m_phrase_index->diff(i, chunk, log);
 498
 499             const char * userfilename = table_info->m_user_filename;
 500             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
 501
 502             gchar * tmppathname = g_build_filename(context->m_user_dir,
 503                                                    tmpfilename, NULL);
 504             g_free(tmpfilename);
 505
 506             gchar * chunkpathname = g_build_filename(context->m_user_dir,
 507                                                      userfilename, NULL);
 508             log->save(tmppathname);
 509             rename(tmppathname, chunkpathname);
 510             g_free(chunkpathname);
 511             g_free(tmppathname);
 512             delete log;
 513         }
 514
 515         if (USER_FILE == table_info->m_file_type) {
 516             /* user phrase library */
 517             MemoryChunk * chunk = new MemoryChunk;
 518             context->m_phrase_index->store(i, chunk);
 519
 520             const char * userfilename = table_info->m_user_filename;
 521             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
 522             gchar * tmppathname = g_build_filename(context->m_user_dir,
 523                                                    tmpfilename, NULL);
 524             g_free(tmpfilename);
 525
 526             gchar * chunkpathname = g_build_filename(context->m_user_dir,
 527                                                      userfilename, NULL);
 528
 529             chunk->save(tmppathname);
 530             rename(tmppathname, chunkpathname);
 531             g_free(chunkpathname);
 532             g_free(tmppathname);
 533             delete chunk;
 534         }
 535     }
 536
 537     /* save user chewing table */
 538     gchar * tmpfilename = g_build_filename
 539         (context->m_user_dir, "user_pinyin_index.bin.tmp", NULL);
 540     unlink(tmpfilename);
 541     gchar * filename = g_build_filename
 542         (context->m_user_dir, "user_pinyin_index.bin", NULL);
 543
 544     MemoryChunk * chunk = new MemoryChunk;
 545     context->m_pinyin_table->store(chunk);
 546     chunk->save(tmpfilename);
 547     delete chunk;
 548     rename(tmpfilename, filename);
 549     g_free(tmpfilename);
 550     g_free(filename);
 551
 552     /* save user phrase table */
 553     tmpfilename = g_build_filename
 554         (context->m_user_dir, "user_phrase_index.bin.tmp", NULL);
 555     unlink(tmpfilename);
 556     filename = g_build_filename
 557         (context->m_user_dir, "user_phrase_index.bin", NULL);
 558
 559     chunk = new MemoryChunk;
 560     context->m_phrase_table->store(chunk);
 561     chunk->save(tmpfilename);
 562     delete chunk;
 563     rename(tmpfilename, filename);
 564     g_free(tmpfilename);
 565     g_free(filename);
 566
 567     /* save user bi-gram */
 568     tmpfilename = g_build_filename
 569         (context->m_user_dir, "user.db.tmp", NULL);
 570     unlink(tmpfilename);
 571     filename = g_build_filename(context->m_user_dir, "user.db", NULL);
 572     context->m_user_bigram->save_db(tmpfilename);
 573     rename(tmpfilename, filename);
 574     g_free(tmpfilename);
 575     g_free(filename);
 576
 577     mark_version(context->m_user_dir);
 578
 579     context->m_modified = false;
 580     return true;
 581 }
 582
 583 bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
 584                                      DoublePinyinScheme scheme){
 585     context->m_double_pinyin_parser->set_scheme(scheme);
 586     return true;
 587 }
 588
 589 bool pinyin_set_chewing_scheme(pinyin_context_t * context,
 590                                ChewingScheme scheme){
 591     context->m_chewing_parser->set_scheme(scheme);
 592     return true;
 593 }
 594
 595 void pinyin_fini(pinyin_context_t * context){
 596     delete context->m_full_pinyin_parser;
 597     delete context->m_double_pinyin_parser;
 598     delete context->m_chewing_parser;
 599     delete context->m_pinyin_table;
 600     delete context->m_phrase_table;
 601     delete context->m_phrase_index;
 602     delete context->m_system_bigram;
 603     delete context->m_user_bigram;
 604     delete context->m_pinyin_lookup;
 605     delete context->m_phrase_lookup;
 606
 607     g_free(context->m_system_dir);
 608     g_free(context->m_user_dir);
 609     context->m_modified = false;
 610
 611     delete context;
 612 }
 613
 614 bool pinyin_mask_out(pinyin_context_t * context,
 615                      phrase_token_t mask,
 616                      phrase_token_t value) {
 617
 618     context->m_pinyin_table->mask_out(mask, value);
 619     context->m_phrase_table->mask_out(mask, value);
 620     context->m_user_bigram->mask_out(mask, value);
 621
 622     /* mask out the phrase index. */
 623     for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
 624         PhraseIndexRange range;
 625         int retval = context->m_phrase_index->get_range(index, range);
 626
 627         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
 628             continue;
 629
 630         const pinyin_table_info_t * table_info = pinyin_phrase_files + index;
 631
 632         if (NOT_USED == table_info->m_file_type)
 633             continue;
 634
 635         const char * userfilename = table_info->m_user_filename;
 636
 637         if (NULL == userfilename)
 638             continue;
 639
 640         if (SYSTEM_FILE == table_info->m_file_type ||
 641             DICTIONARY == table_info->m_file_type) {
 642             /* system phrase library */
 643             MemoryChunk * chunk = new MemoryChunk;
 644
 645             const char * systemfilename = table_info->m_system_filename;
 646             /* check bin file in system dir. */
 647             gchar * chunkfilename = g_build_filename(context->m_system_dir,
 648                                                      systemfilename, NULL);
 649             chunk->load(chunkfilename);
 650             g_free(chunkfilename);
 651
 652             context->m_phrase_index->load(index, chunk);
 653
 654             const char * userfilename = table_info->m_user_filename;
 655
 656             chunkfilename = g_build_filename(context->m_user_dir,
 657                                              userfilename, NULL);
 658
 659             MemoryChunk * log = new MemoryChunk;
 660             log->load(chunkfilename);
 661             g_free(chunkfilename);
 662
 663             /* merge the chunk log with mask. */
 664             context->m_phrase_index->merge_with_mask(index, log, mask, value);
 665         }
 666
 667         if (USER_FILE == table_info->m_file_type) {
 668             /* user phrase library */
 669             context->m_phrase_index->mask_out(index, mask, value);
 670         }
 671     }
 672
 673     context->m_phrase_index->compact();
 674     return true;
 675 }
 676
 677 /* copy from options to context->m_options. */
 678 bool pinyin_set_options(pinyin_context_t * context,
 679                         pinyin_option_t options){
 680     context->m_options = options;
 681     context->m_pinyin_table->set_options(context->m_options);
 682     context->m_pinyin_lookup->set_options(context->m_options);
 683     return true;
 684 }
 685
 686
 687 pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
 688     pinyin_instance_t * instance = new pinyin_instance_t;
 689     instance->m_context = context;
 690
 691     instance->m_raw_full_pinyin = NULL;
 692
 693     instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 694     instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
 695     instance->m_pinyin_key_rests =
 696         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
 697     instance->m_constraints = g_array_new
 698         (TRUE, FALSE, sizeof(lookup_constraint_t));
 699     instance->m_match_results =
 700         g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 701
 702     return instance;
 703 }
 704
 705 void pinyin_free_instance(pinyin_instance_t * instance){
 706     g_free(instance->m_raw_full_pinyin);
 707     g_array_free(instance->m_prefixes, TRUE);
 708     g_array_free(instance->m_pinyin_keys, TRUE);
 709     g_array_free(instance->m_pinyin_key_rests, TRUE);
 710     g_array_free(instance->m_constraints, TRUE);
 711     g_array_free(instance->m_match_results, TRUE);
 712
 713     delete instance;
 714 }
 715
 716
 717 static bool pinyin_update_constraints(pinyin_instance_t * instance){
 718     pinyin_context_t * & context = instance->m_context;
 719     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
 720     CandidateConstraints & constraints = instance->m_constraints;
 721
 722     size_t key_len = constraints->len;
 723     g_array_set_size(constraints, pinyin_keys->len);
 724     for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
 725         lookup_constraint_t * constraint =
 726             &g_array_index(constraints, lookup_constraint_t, i);
 727         constraint->m_type = NO_CONSTRAINT;
 728     }
 729
 730     context->m_pinyin_lookup->validate_constraint
 731         (constraints, pinyin_keys);
 732
 733     return true;
 734 }
 735
 736
 737 bool pinyin_guess_sentence(pinyin_instance_t * instance){
 738     pinyin_context_t * & context = instance->m_context;
 739
 740     g_array_set_size(instance->m_prefixes, 0);
 741     g_array_append_val(instance->m_prefixes, sentence_start);
 742
 743     pinyin_update_constraints(instance);
 744     bool retval = context->m_pinyin_lookup->get_best_match
 745         (instance->m_prefixes,
 746          instance->m_pinyin_keys,
 747          instance->m_constraints,
 748          instance->m_match_results);
 749
 750     return retval;
 751 }
 752
 753 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
 754                                        const char * prefix){
 755     pinyin_context_t * & context = instance->m_context;
 756
 757     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
 758
 759     g_array_set_size(instance->m_prefixes, 0);
 760     g_array_append_val(instance->m_prefixes, sentence_start);
 761
 762     glong len_str = 0;
 763     ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
 764     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 765
 766     if (ucs4_str && len_str) {
 767         /* add prefixes. */
 768         for (ssize_t i = 1; i <= len_str; ++i) {
 769             if (i > MAX_PHRASE_LENGTH)
 770                 break;
 771
 772             ucs4_t * start = ucs4_str + len_str - i;
 773
 774             PhraseTokens tokens;
 775             memset(tokens, 0, sizeof(tokens));
 776             phrase_index->prepare_tokens(tokens);
 777             int result = context->m_phrase_table->search(i, start, tokens);
 778             int num = reduce_tokens(tokens, tokenarray);
 779             phrase_index->destroy_tokens(tokens);
 780
 781             if (result & SEARCH_OK)
 782                 g_array_append_vals(instance->m_prefixes,
 783                                     tokenarray->data, tokenarray->len);
 784         }
 785     }
 786     g_array_free(tokenarray, TRUE);
 787     g_free(ucs4_str);
 788
 789     pinyin_update_constraints(instance);
 790     bool retval = context->m_pinyin_lookup->get_best_match
 791         (instance->m_prefixes,
 792          instance->m_pinyin_keys,
 793          instance->m_constraints,
 794          instance->m_match_results);
 795
 796     return retval;
 797 }
 798
 799 bool pinyin_phrase_segment(pinyin_instance_t * instance,
 800                            const char * sentence){
 801     pinyin_context_t * & context = instance->m_context;
 802
 803     const glong num_of_chars = g_utf8_strlen(sentence, -1);
 804     glong ucs4_len = 0;
 805     ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
 806
 807     g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
 808
 809     bool retval = context->m_phrase_lookup->get_best_match
 810         (ucs4_len, ucs4_str, instance->m_match_results);
 811
 812     g_free(ucs4_str);
 813     return retval;
 814 }
 815
 816 /* the returned sentence should be freed by g_free(). */
 817 bool pinyin_get_sentence(pinyin_instance_t * instance,
 818                          char ** sentence){
 819     pinyin_context_t * & context = instance->m_context;
 820
 821     bool retval = pinyin::convert_to_utf8
 822         (context->m_phrase_index, instance->m_match_results,
 823          NULL, false, *sentence);
 824
 825     return retval;
 826 }
 827
 828 bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
 829                               const char * onepinyin,
 830                               ChewingKey * onekey){
 831     pinyin_context_t * & context = instance->m_context;
 832
 833     int pinyin_len = strlen(onepinyin);
 834     int parse_len = context->m_full_pinyin_parser->parse_one_key
 835         ( context->m_options, *onekey, onepinyin, pinyin_len);
 836     return pinyin_len == parse_len;
 837 }
 838
 839 size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
 840                                       const char * pinyins){
 841     pinyin_context_t * & context = instance->m_context;
 842
 843     g_free(instance->m_raw_full_pinyin);
 844     instance->m_raw_full_pinyin = g_strdup(pinyins);
 845     int pinyin_len = strlen(pinyins);
 846
 847     int parse_len = context->m_full_pinyin_parser->parse
 848         ( context->m_options, instance->m_pinyin_keys,
 849           instance->m_pinyin_key_rests, pinyins, pinyin_len);
 850
 851     return parse_len;
 852 }
 853
 854 bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
 855                                 const char * onepinyin,
 856                                 ChewingKey * onekey){
 857     pinyin_context_t * & context = instance->m_context;
 858
 859     int pinyin_len = strlen(onepinyin);
 860     int parse_len = context->m_double_pinyin_parser->parse_one_key
 861         ( context->m_options, *onekey, onepinyin, pinyin_len);
 862     return pinyin_len == parse_len;
 863 }
 864
 865 size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
 866                                         const char * pinyins){
 867     pinyin_context_t * & context = instance->m_context;
 868     int pinyin_len = strlen(pinyins);
 869
 870     int parse_len = context->m_double_pinyin_parser->parse
 871         ( context->m_options, instance->m_pinyin_keys,
 872           instance->m_pinyin_key_rests, pinyins, pinyin_len);
 873
 874     return parse_len;
 875 }
 876
 877 bool pinyin_parse_chewing(pinyin_instance_t * instance,
 878                           const char * onechewing,
 879                           ChewingKey * onekey){
 880     pinyin_context_t * & context = instance->m_context;
 881
 882     int chewing_len = strlen(onechewing);
 883     int parse_len = context->m_chewing_parser->parse_one_key
 884         ( context->m_options, *onekey, onechewing, chewing_len );
 885     return chewing_len == parse_len;
 886 }
 887
 888 size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
 889                                   const char * chewings){
 890     pinyin_context_t * & context = instance->m_context;
 891     int chewing_len = strlen(chewings);
 892
 893     int parse_len = context->m_chewing_parser->parse
 894         ( context->m_options, instance->m_pinyin_keys,
 895           instance->m_pinyin_key_rests, chewings, chewing_len);
 896
 897     return parse_len;
 898 }
 899
 900 bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
 901                                 const char key, const char ** symbol) {
 902     pinyin_context_t * & context = instance->m_context;
 903     return context->m_chewing_parser->in_chewing_scheme
 904         (context->m_options, key, symbol);
 905 }
 906
 907 #if 0
 908 static gint compare_item_with_token(gconstpointer lhs,
 909                                     gconstpointer rhs) {
 910     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
 911     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
 912
 913     phrase_token_t token_lhs = item_lhs->m_token;
 914     phrase_token_t token_rhs = item_rhs->m_token;
 915
 916     return (token_lhs - token_rhs);
 917 }
 918 #endif
 919
 920 static gint compare_item_with_frequency(gconstpointer lhs,
 921                                         gconstpointer rhs) {
 922     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
 923     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
 924
 925     guint32 freq_lhs = item_lhs->m_freq;
 926     guint32 freq_rhs = item_rhs->m_freq;
 927
 928     return -(freq_lhs - freq_rhs); /* in descendant order */
 929 }
 930
 931 static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
 932                                           size_t offset) {
 933     phrase_token_t prev_token = null_token;
 934     ssize_t i;
 935
 936     if (0 == offset) {
 937         /* get previous token from prefixes. */
 938         prev_token = sentence_start;
 939         size_t prev_token_len = 0;
 940
 941         pinyin_context_t * context = instance->m_context;
 942         TokenVector prefixes = instance->m_prefixes;
 943         PhraseItem item;
 944
 945         for (size_t i = 0; i < prefixes->len; ++i) {
 946             phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
 947             if (sentence_start == token)
 948                 continue;
 949
 950             int retval = context->m_phrase_index->get_phrase_item(token, item);
 951             if (ERROR_OK == retval) {
 952                 size_t token_len = item.get_phrase_length();
 953                 if (token_len > prev_token_len) {
 954                     /* found longer match, and save it. */
 955                     prev_token = token;
 956                     prev_token_len = token_len;
 957                 }
 958             }
 959         }
 960     } else {
 961         /* get previous token from match results. */
 962         assert (0 < offset);
 963
 964         phrase_token_t cur_token = g_array_index
 965             (instance->m_match_results, phrase_token_t, offset);
 966         if (null_token != cur_token) {
 967             for (i = offset - 1; i >= 0; --i) {
 968                 cur_token = g_array_index
 969                     (instance->m_match_results, phrase_token_t, i);
 970                 if (null_token != cur_token) {
 971                     prev_token = cur_token;
 972                     break;
 973                 }
 974             }
 975         }
 976     }
 977
 978     return prev_token;
 979 }
 980
 981 static void _append_items(pinyin_context_t * context,
 982                           PhraseIndexRanges ranges,
 983                           lookup_candidate_t * template_item,
 984                           CandidateVector items) {
 985     /* reduce and append to a single GArray. */
 986     for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
 987         if (NULL == ranges[m])
 988             continue;
 989
 990         for (size_t n = 0; n < ranges[m]->len; ++n) {
 991             PhraseIndexRange * range =
 992                 &g_array_index(ranges[m], PhraseIndexRange, n);
 993             for (size_t k = range->m_range_begin;
 994                  k < range->m_range_end; ++k) {
 995                 lookup_candidate_t item;
 996                 item.m_candidate_type = template_item->m_candidate_type;
 997                 item.m_token = k;
 998                 item.m_orig_rest = template_item->m_orig_rest;
 999                 item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
1000                 item.m_freq = template_item->m_freq;
1001                 g_array_append_val(items, item);
1002             }
1003         }
1004     }
1005 }
1006
#if 0
/* Disabled code: drop every item whose token equals the token of the
   item right before it. */
static void _remove_duplicated_items(CandidateVector items) {
    phrase_token_t previous_token = null_token;
    size_t n = 0;

    while (n < items->len) {
        phrase_token_t current_token =
            g_array_index(items, lookup_candidate_t, n).m_token;

        if (previous_token == current_token) {
            /* the next item slides into slot n, so stay on it. */
            g_array_remove_index(items, n);
        } else {
            ++n;
        }

        previous_token = current_token;
    }
}
#endif
1024
1025 static void _compute_frequency_of_items(pinyin_context_t * context,
1026                                         phrase_token_t prev_token,
1027                                         SingleGram * merged_gram,
1028                                         CandidateVector items) {
1029     pinyin_option_t & options = context->m_options;
1030     ssize_t i;
1031
1032     PhraseItem cached_item;
1033     /* compute all freqs. */
1034     for (i = 0; i < items->len; ++i) {
1035         lookup_candidate_t * item = &g_array_index
1036             (items, lookup_candidate_t, i);
1037         phrase_token_t & token = item->m_token;
1038
1039         gfloat bigram_poss = 0; guint32 total_freq = 0;
1040         if (options & DYNAMIC_ADJUST) {
1041             if (null_token != prev_token) {
1042                 guint32 bigram_freq = 0;
1043                 merged_gram->get_total_freq(total_freq);
1044                 merged_gram->get_freq(token, bigram_freq);
1045                 if (0 != total_freq)
1046                     bigram_poss = bigram_freq / (gfloat)total_freq;
1047             }
1048         }
1049
1050         /* compute the m_freq. */
1051         FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1052         phrase_index->get_phrase_item(token, cached_item);
1053         total_freq = phrase_index->get_phrase_index_total_freq();
1054         assert (0 < total_freq);
1055
1056         /* Note: possibility value <= 1.0. */
1057         guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
1058                         (1 - LAMBDA_PARAMETER) *
1059                         cached_item.get_unigram_frequency() /
1060                         (gfloat) total_freq) * 256 * 256 * 256;
1061         item->m_freq = freq;
1062     }
1063 }
1064
1065 static bool _prepend_sentence_candidate(pinyin_instance_t * instance,
1066                                         CandidateVector candidates) {
1067     /* check whether the best match candidate exists. */
1068     gchar * sentence = NULL;
1069     pinyin_get_sentence(instance, &sentence);
1070     if (NULL == sentence)
1071         return false;
1072     g_free(sentence);
1073
1074     /* prepend best match candidate to candidates. */
1075     lookup_candidate_t candidate;
1076     candidate.m_candidate_type = BEST_MATCH_CANDIDATE;
1077     g_array_prepend_val(candidates, candidate);
1078
1079     return true;
1080 }
1081
1082 static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
1083                                              size_t offset,
1084                                              CandidateVector candidates) {
1085     /* populate m_phrase_string in lookup_candidate_t. */
1086
1087     for(size_t i = 0; i < candidates->len; ++i) {
1088         lookup_candidate_t * candidate = &g_array_index
1089             (candidates, lookup_candidate_t, i);
1090
1091         switch(candidate->m_candidate_type) {
1092         case BEST_MATCH_CANDIDATE: {
1093             gchar * sentence = NULL;
1094             pinyin_get_sentence(instance, &sentence);
1095             candidate->m_phrase_string = g_strdup
1096                 (g_utf8_offset_to_pointer(sentence, offset));
1097             g_free(sentence);
1098             break;
1099         }
1100         case NORMAL_CANDIDATE:
1101         case DIVIDED_CANDIDATE:
1102         case RESPLIT_CANDIDATE:
1103             pinyin_token_get_phrase
1104                 (instance, candidate->m_token, NULL,
1105                  &(candidate->m_phrase_string));
1106             break;
1107         case ZOMBIE_CANDIDATE:
1108             break;
1109         }
1110     }
1111
1112     return true;
1113 }
1114
1115 static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
1116                                                     gconstpointer rhs,
1117                                                     gpointer userdata) {
1118     size_t index_lhs = *((size_t *) lhs);
1119     size_t index_rhs = *((size_t *) rhs);
1120     CandidateVector candidates = (CandidateVector) userdata;
1121
1122     lookup_candidate_t * candidate_lhs =
1123         &g_array_index(candidates, lookup_candidate_t, index_lhs);
1124     lookup_candidate_t * candidate_rhs =
1125         &g_array_index(candidates, lookup_candidate_t, index_rhs);
1126
1127     return -strcmp(candidate_lhs->m_phrase_string,
1128                    candidate_rhs->m_phrase_string); /* in descendant order */
1129 }
1130
1131
1132 static bool _remove_duplicated_items_by_phrase_string
1133 (pinyin_instance_t * instance,
1134  CandidateVector candidates) {
1135     size_t i;
1136     /* create the GArray of indexed item */
1137     GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
1138     for (i = 0; i < candidates->len; ++i)
1139         g_array_append_val(indices, i);
1140
1141     /* sort the indices array by phrase array */
1142     g_array_sort_with_data
1143         (indices, compare_indexed_item_with_phrase_string, candidates);
1144
1145     /* mark duplicated items as zombie candidate */
1146     lookup_candidate_t * cur_item, * saved_item = NULL;
1147     for (i = 0; i < indices->len; ++i) {
1148         size_t cur_index = g_array_index(indices, size_t, i);
1149         cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index);
1150
1151         /* handle the first candidate */
1152         if (NULL == saved_item) {
1153             saved_item = cur_item;
1154             continue;
1155         }
1156
1157         if (0 == strcmp(saved_item->m_phrase_string,
1158                         cur_item->m_phrase_string)) {
1159             /* found duplicated candidates */
1160
1161             /* keep best match candidate */
1162             if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
1163                 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1164                 continue;
1165             }
1166
1167             if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
1168                 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1169                 saved_item = cur_item;
1170                 continue;
1171             }
1172
1173             /* keep the higher possiblity one
1174                to quickly move the word forward in the candidate list */
1175             if (cur_item->m_freq > saved_item->m_freq) {
1176                 /* find better candidate */
1177                 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1178                 saved_item = cur_item;
1179                 continue;
1180             } else {
1181                 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1182                 continue;
1183             }
1184         } else {
1185             /* keep the current candidate */
1186             saved_item = cur_item;
1187         }
1188     }
1189
1190     g_array_free(indices, TRUE);
1191
1192     /* remove zombie candidate from the returned candidates */
1193     for (i = 0; i < candidates->len; ++i) {
1194         lookup_candidate_t * candidate = &g_array_index
1195             (candidates, lookup_candidate_t, i);
1196
1197         if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
1198             g_free(candidate->m_phrase_string);
1199             g_free(candidate->m_new_pinyins);
1200             g_array_remove_index(candidates, i);
1201             i--;
1202         }
1203     }
1204
1205     return true;
1206 }
1207
1208 static bool _free_candidates(CandidateVector candidates) {
1209     /* free candidates */
1210     for (size_t i = 0; i < candidates->len; ++i) {
1211         lookup_candidate_t * candidate = &g_array_index
1212             (candidates, lookup_candidate_t, i);
1213         g_free(candidate->m_phrase_string);
1214         g_free(candidate->m_new_pinyins);
1215     }
1216     g_array_set_size(candidates, 0);
1217
1218     return true;
1219 }
1220
1221 bool pinyin_get_candidates(pinyin_instance_t * instance,
1222                            size_t offset,
1223                            CandidateVector candidates) {
1224
1225     pinyin_context_t * & context = instance->m_context;
1226     pinyin_option_t & options = context->m_options;
1227     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1228
1229     _free_candidates(candidates);
1230
1231     size_t pinyin_len = pinyin_keys->len - offset;
1232     ssize_t i;
1233
1234     /* lookup the previous token here. */
1235     phrase_token_t prev_token = null_token;
1236
1237     if (options & DYNAMIC_ADJUST) {
1238         prev_token = _get_previous_token(instance, offset);
1239     }
1240
1241     SingleGram merged_gram;
1242     SingleGram * system_gram = NULL, * user_gram = NULL;
1243
1244     if (options & DYNAMIC_ADJUST) {
1245         if (null_token != prev_token) {
1246             context->m_system_bigram->load(prev_token, system_gram);
1247             context->m_user_bigram->load(prev_token, user_gram);
1248             merge_single_gram(&merged_gram, system_gram, user_gram);
1249         }
1250     }
1251
1252     PhraseIndexRanges ranges;
1253     memset(ranges, 0, sizeof(ranges));
1254     context->m_phrase_index->prepare_ranges(ranges);
1255
1256     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1257
1258     for (i = pinyin_len; i >= 1; --i) {
1259         g_array_set_size(items, 0);
1260
1261         ChewingKey * keys = &g_array_index
1262             (pinyin_keys, ChewingKey, offset);
1263
1264         /* do pinyin search. */
1265         int retval = context->m_pinyin_table->search
1266             (i, keys, ranges);
1267
1268         if ( !(retval & SEARCH_OK) )
1269             continue;
1270
1271         lookup_candidate_t template_item;
1272         _append_items(context, ranges, &template_item, items);
1273
1274 #if 0
1275         g_array_sort(items, compare_item_with_token);
1276
1277         _remove_duplicated_items(items);
1278 #endif
1279
1280         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1281
1282         /* sort the candidates of the same length by frequency. */
1283         g_array_sort(items, compare_item_with_frequency);
1284
1285         /* transfer back items to tokens, and save it into candidates */
1286         for (size_t k = 0; k < items->len; ++k) {
1287             lookup_candidate_t * item = &g_array_index
1288                 (items, lookup_candidate_t, k);
1289             g_array_append_val(candidates, *item);
1290         }
1291
1292 #if 0
1293         if (!(retval & SEARCH_CONTINUED))
1294             break;
1295 #endif
1296     }
1297
1298     g_array_free(items, TRUE);
1299     context->m_phrase_index->destroy_ranges(ranges);
1300     if (system_gram)
1301         delete system_gram;
1302     if (user_gram)
1303         delete user_gram;
1304
1305     /* post process to remove duplicated candidates */
1306
1307     _prepend_sentence_candidate(instance, candidates);
1308
1309     _compute_phrase_strings_of_items(instance, offset, candidates);
1310
1311     _remove_duplicated_items_by_phrase_string(instance, candidates);
1312
1313     return true;
1314 }
1315
1316
1317 static bool _try_divided_table(pinyin_instance_t * instance,
1318                                PhraseIndexRanges ranges,
1319                                size_t offset,
1320                                CandidateVector items){
1321     bool found = false;
1322
1323     pinyin_context_t * & context = instance->m_context;
1324     pinyin_option_t & options = context->m_options;
1325     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1326     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
1327
1328     assert(pinyin_keys->len == pinyin_key_rests->len);
1329     guint num_keys = pinyin_keys->len;
1330     assert(offset < num_keys);
1331
1332     /* handle "^xian$" -> "xi'an" here */
1333     ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
1334     ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
1335                                            ChewingKeyRest, offset);
1336     ChewingKeyRest orig_rest = *rest;
1337     guint16 tone = CHEWING_ZERO_TONE;
1338
1339     const divided_table_item_t * item = NULL;
1340
1341     /* back up tone */
1342     if (options & USE_TONE) {
1343         tone = key->m_tone;
1344         if (CHEWING_ZERO_TONE != tone) {
1345             key->m_tone = CHEWING_ZERO_TONE;
1346             rest->m_raw_end --;
1347         }
1348     }
1349
1350     item = context->m_full_pinyin_parser->retrieve_divided_item
1351         (options, key, rest, instance->m_raw_full_pinyin,
1352          strlen(instance->m_raw_full_pinyin));
1353
1354     if (item) {
1355         /* no ops */
1356         assert(item->m_new_freq > 0);
1357
1358         ChewingKey divided_keys[2];
1359         const char * pinyin = item->m_new_keys[0];
1360         assert(context->m_full_pinyin_parser->
1361                parse_one_key(options, divided_keys[0],
1362                              pinyin, strlen(pinyin)));
1363         pinyin = item->m_new_keys[1];
1364         assert(context->m_full_pinyin_parser->
1365                parse_one_key(options, divided_keys[1],
1366                              pinyin, strlen(pinyin)));
1367
1368         gchar * new_pinyins = g_strdup_printf
1369             ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
1370
1371         /* propagate the tone */
1372         if (options & USE_TONE) {
1373             if (CHEWING_ZERO_TONE != tone) {
1374                 assert(0 < tone && tone <= 5);
1375                 divided_keys[1].m_tone = tone;
1376
1377                 gchar * tmp_str = g_strdup_printf
1378                     ("%s%d", new_pinyins, tone);
1379                 g_free(new_pinyins);
1380                 new_pinyins = tmp_str;
1381             }
1382         }
1383
1384         /* do pinyin search. */
1385         int retval = context->m_pinyin_table->search
1386             (2, divided_keys, ranges);
1387
1388         if (retval & SEARCH_OK) {
1389             lookup_candidate_t template_item;
1390             template_item.m_candidate_type = DIVIDED_CANDIDATE;
1391             template_item.m_orig_rest = orig_rest;
1392             template_item.m_new_pinyins = new_pinyins;
1393
1394             _append_items(context, ranges, &template_item, items);
1395             found = true;
1396         }
1397         g_free(new_pinyins);
1398     }
1399
1400     /* restore tones */
1401     if (options & USE_TONE) {
1402         if (CHEWING_ZERO_TONE != tone) {
1403             key->m_tone = tone;
1404             rest->m_raw_end ++;
1405         }
1406     }
1407
1408     return found;
1409 }
1410
1411 static bool _try_resplit_table(pinyin_instance_t * instance,
1412                                PhraseIndexRanges ranges,
1413                                size_t offset,
1414                                CandidateVector items){
1415     bool found = false;
1416
1417     pinyin_context_t * & context = instance->m_context;
1418     pinyin_option_t & options = context->m_options;
1419     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1420     ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
1421
1422     assert(pinyin_keys->len == pinyin_key_rests->len);
1423     guint num_keys = pinyin_keys->len;
1424     assert(offset + 1 < num_keys);
1425
1426     guint16 next_tone = CHEWING_ZERO_TONE;
1427
1428     /* handle "^fa'nan$" -> "fan'an" here */
1429     ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
1430                                                ChewingKeyRest, offset);
1431     ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
1432                                                 ChewingKeyRest, offset + 1);
1433     /* some "'" here */
1434     if (cur_rest->m_raw_end != next_rest->m_raw_begin)
1435         return found;
1436
1437     ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
1438     ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
1439                                            offset + 1);
1440
1441     /* some tone here */
1442     if (CHEWING_ZERO_TONE != cur_key->m_tone)
1443         return found;
1444
1445     ChewingKeyRest orig_rest;
1446     orig_rest.m_raw_begin = cur_rest->m_raw_begin;
1447     orig_rest.m_raw_end = next_rest->m_raw_end;
1448
1449     /* backup tone */
1450     if (options & USE_TONE) {
1451         next_tone = next_key->m_tone;
1452         if (CHEWING_ZERO_TONE != next_tone) {
1453             next_key->m_tone = CHEWING_ZERO_TONE;
1454             next_rest->m_raw_end --;
1455         }
1456     }
1457
1458     /* lookup re-split table */
1459     const char * str = instance->m_raw_full_pinyin;
1460     const resplit_table_item_t * item_by_orig =
1461         context->m_full_pinyin_parser->
1462         retrieve_resplit_item_by_original_pinyins
1463         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
1464
1465     const resplit_table_item_t * item_by_new =
1466         context->m_full_pinyin_parser->
1467         retrieve_resplit_item_by_resplit_pinyins
1468         (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
1469
1470     /* there are no same couple of pinyins in re-split table. */
1471     assert(!(item_by_orig && item_by_new));
1472
1473     ChewingKey resplit_keys[2];
1474     const char * pinyins[2];
1475
1476     bool tosearch = false;
1477     if (item_by_orig && item_by_orig->m_new_freq) {
1478         pinyins[0] = item_by_orig->m_new_keys[0];
1479         pinyins[1] = item_by_orig->m_new_keys[1];
1480
1481         assert(context->m_full_pinyin_parser->
1482                parse_one_key(options, resplit_keys[0],
1483                              pinyins[0], strlen(pinyins[0])));
1484
1485         assert(context->m_full_pinyin_parser->
1486                parse_one_key(options, resplit_keys[1],
1487                              pinyins[1], strlen(pinyins[1])));
1488         tosearch = true;
1489     }
1490
1491     if (item_by_new && item_by_new->m_orig_freq) {
1492         pinyins[0] = item_by_new->m_orig_keys[0];
1493         pinyins[1] = item_by_new->m_orig_keys[1];
1494
1495         assert(context->m_full_pinyin_parser->
1496                parse_one_key(options, resplit_keys[0],
1497                              pinyins[0], strlen(pinyins[0])));
1498
1499         assert(context->m_full_pinyin_parser->
1500                parse_one_key(options, resplit_keys[1],
1501                              pinyins[1], strlen(pinyins[1])));
1502         tosearch = true;
1503     }
1504
1505     if (tosearch) {
1506         gchar * new_pinyins = g_strdup_printf
1507             ("%s'%s", pinyins[0], pinyins[1]);
1508
1509         /* propagate the tone */
1510         if (options & USE_TONE) {
1511             if (CHEWING_ZERO_TONE != next_tone) {
1512                 assert(0 < next_tone && next_tone <= 5);
1513                 resplit_keys[1].m_tone = next_tone;
1514
1515                 gchar * tmp_str = g_strdup_printf
1516                     ("%s%d", new_pinyins, next_tone);
1517                 g_free(new_pinyins);
1518                 new_pinyins = tmp_str;
1519             }
1520         }
1521
1522         /* do pinyin search. */
1523         int retval = context->m_pinyin_table->search
1524             (2, resplit_keys, ranges);
1525
1526         if (retval & SEARCH_OK) {
1527             lookup_candidate_t template_item;
1528             template_item.m_candidate_type = RESPLIT_CANDIDATE;
1529             template_item.m_orig_rest = orig_rest;
1530             template_item.m_new_pinyins = new_pinyins;
1531
1532             _append_items(context, ranges, &template_item, items);
1533             found = true;
1534         }
1535         g_free(new_pinyins);
1536     }
1537
1538     /* restore tones */
1539     if (options & USE_TONE) {
1540         if (CHEWING_ZERO_TONE != next_tone) {
1541             next_key->m_tone = next_tone;
1542             next_rest->m_raw_end ++;
1543         }
1544     }
1545
1546     return found;
1547 }
1548
1549 bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
1550                                        size_t offset,
1551                                        CandidateVector candidates){
1552
1553     pinyin_context_t * & context = instance->m_context;
1554     pinyin_option_t & options = context->m_options;
1555     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
1556
1557     _free_candidates(candidates);
1558
1559     size_t pinyin_len = pinyin_keys->len - offset;
1560     pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
1561     ssize_t i;
1562
1563     /* lookup the previous token here. */
1564     phrase_token_t prev_token = null_token;
1565
1566     if (options & DYNAMIC_ADJUST) {
1567         prev_token = _get_previous_token(instance, offset);
1568     }
1569
1570     SingleGram merged_gram;
1571     SingleGram * system_gram = NULL, * user_gram = NULL;
1572
1573     if (options & DYNAMIC_ADJUST) {
1574         if (null_token != prev_token) {
1575             context->m_system_bigram->load(prev_token, system_gram);
1576             context->m_user_bigram->load(prev_token, user_gram);
1577             merge_single_gram(&merged_gram, system_gram, user_gram);
1578         }
1579     }
1580
1581     PhraseIndexRanges ranges;
1582     memset(ranges, 0, sizeof(ranges));
1583     context->m_phrase_index->prepare_ranges(ranges);
1584
1585     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1586
1587     if (1 == pinyin_len) {
1588         /* because there is only one pinyin left,
1589          *  the following for-loop will not produce 2 character candidates.
1590          * the if-branch will fill the candidate list with
1591          *  2 character candidates.
1592          */
1593
1594         if (options & USE_DIVIDED_TABLE) {
1595             g_array_set_size(items, 0);
1596
1597             if (_try_divided_table(instance, ranges, offset, items)) {
1598
1599 #if 0
1600                 g_array_sort(items, compare_item_with_token);
1601
1602                 _remove_duplicated_items(items);
1603 #endif
1604
1605                 _compute_frequency_of_items(context, prev_token,
1606                                             &merged_gram, items);
1607
1608                 /* sort the candidates of the same length by frequency. */
1609                 g_array_sort(items, compare_item_with_frequency);
1610
1611                 /* transfer back items to tokens, and save it into candidates */
1612                 for (i = 0; i < items->len; ++i) {
1613                     lookup_candidate_t * item = &g_array_index
1614                         (items, lookup_candidate_t, i);
1615                     g_array_append_val(candidates, *item);
1616                 }
1617             }
1618         }
1619     }
1620
1621     for (i = pinyin_len; i >= 1; --i) {
1622         bool found = false;
1623         g_array_set_size(items, 0);
1624
1625         if (2 == i) {
1626             /* handle fuzzy pinyin segment here. */
1627             if (options & USE_DIVIDED_TABLE) {
1628                 found = _try_divided_table(instance, ranges, offset, items) ||
1629                     found;
1630             }
1631             if (options & USE_RESPLIT_TABLE) {
1632                 found = _try_resplit_table(instance, ranges, offset, items) ||
1633                     found;
1634             }
1635         }
1636
1637         ChewingKey * keys = &g_array_index
1638             (pinyin_keys, ChewingKey, offset);
1639
1640         /* do pinyin search. */
1641         int retval = context->m_pinyin_table->search
1642             (i, keys, ranges);
1643
1644         found = (retval & SEARCH_OK) || found;
1645
1646         if ( !found )
1647             continue;
1648
1649         lookup_candidate_t template_item;
1650         _append_items(context, ranges, &template_item, items);
1651
1652 #if 0
1653         g_array_sort(items, compare_item_with_token);
1654
1655         _remove_duplicated_items(items);
1656 #endif
1657
1658         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1659
1660         g_array_sort(items, compare_item_with_frequency);
1661
1662         for (size_t k = 0; k < items->len; ++k) {
1663             lookup_candidate_t * item = &g_array_index
1664                 (items, lookup_candidate_t, k);
1665             g_array_append_val(candidates, *item);
1666         }
1667
1668 #if 0
1669         if (!(retval & SEARCH_CONTINUED))
1670             break;
1671 #endif
1672     }
1673
1674     g_array_free(items, TRUE);
1675     context->m_phrase_index->destroy_ranges(ranges);
1676     if (system_gram)
1677         delete system_gram;
1678     if (user_gram)
1679         delete user_gram;
1680
1681     /* post process to remove duplicated candidates */
1682
1683     _prepend_sentence_candidate(instance, candidates);
1684
1685     _compute_phrase_strings_of_items(instance, offset, candidates);
1686
1687     _remove_duplicated_items_by_phrase_string(instance, candidates);
1688
1689     return true;
1690 }
1691
1692
1693 int pinyin_choose_candidate(pinyin_instance_t * instance,
1694                             size_t offset,
1695                             lookup_candidate_t * candidate){
1696     pinyin_context_t * & context = instance->m_context;
1697
1698     if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
1699         RESPLIT_CANDIDATE == candidate->m_candidate_type) {
1700         /* update full pinyin. */
1701         gchar * oldpinyins = instance->m_raw_full_pinyin;
1702         const ChewingKeyRest rest = candidate->m_orig_rest;
1703         oldpinyins[rest.m_raw_begin] = '\0';
1704         const gchar * left_part = oldpinyins;
1705         const gchar * right_part = oldpinyins + rest.m_raw_end;
1706         gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
1707                                          right_part, NULL);
1708         g_free(oldpinyins);
1709         instance->m_raw_full_pinyin = newpinyins;
1710
1711         /* re-parse the full pinyin.  */
1712         const gchar * pinyins = instance->m_raw_full_pinyin;
1713         int pinyin_len = strlen(pinyins);
1714         int parse_len = context->m_full_pinyin_parser->parse
1715             (context->m_options, instance->m_pinyin_keys,
1716              instance->m_pinyin_key_rests, pinyins, pinyin_len);
1717
1718         /* Note: there may be some un-parsable input here. */
1719     }
1720
1721     /* sync m_constraints to the length of m_pinyin_keys. */
1722     bool retval = context->m_pinyin_lookup->validate_constraint
1723         (instance->m_constraints, instance->m_pinyin_keys);
1724
1725     phrase_token_t token = candidate->m_token;
1726     guint8 len = context->m_pinyin_lookup->add_constraint
1727         (instance->m_constraints, offset, token);
1728
1729     /* safe guard: validate the m_constraints again. */
1730     retval = context->m_pinyin_lookup->validate_constraint
1731         (instance->m_constraints, instance->m_pinyin_keys) && len;
1732
1733     return offset + len;
1734 }
1735
1736
/**
 *  Hands @candidates to _free_candidates().
 *  @instance is not used by this function.  Always returns true.
 */
bool pinyin_free_candidates(pinyin_instance_t * instance,
                            CandidateVector candidates) {
    _free_candidates(candidates);
    return true;
}
1742
1743 bool pinyin_clear_constraint(pinyin_instance_t * instance,
1744                              size_t offset){
1745     pinyin_context_t * & context = instance->m_context;
1746
1747     bool retval = context->m_pinyin_lookup->clear_constraint
1748         (instance->m_constraints, offset);
1749
1750     return retval;
1751 }
1752
1753 bool pinyin_lookup_tokens(pinyin_instance_t * instance,
1754                           const char * phrase, GArray * tokenarray){
1755     pinyin_context_t * & context = instance->m_context;
1756     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1757
1758     glong ucs4_len = 0;
1759     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL);
1760
1761     PhraseTokens tokens;
1762     memset(tokens, 0, sizeof(PhraseTokens));
1763     phrase_index->prepare_tokens(tokens);
1764     int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens);
1765     int num = reduce_tokens(tokens, tokenarray);
1766     phrase_index->destroy_tokens(tokens);
1767
1768     return SEARCH_OK & retval;
1769 }
1770
1771 bool pinyin_train(pinyin_instance_t * instance){
1772     if (!instance->m_context->m_user_dir)
1773         return false;
1774
1775     pinyin_context_t * & context = instance->m_context;
1776     context->m_modified = true;
1777
1778     bool retval = context->m_pinyin_lookup->train_result2
1779         (instance->m_pinyin_keys, instance->m_constraints,
1780          instance->m_match_results);
1781
1782     return retval;
1783 }
1784
1785 bool pinyin_reset(pinyin_instance_t * instance){
1786     g_free(instance->m_raw_full_pinyin);
1787     instance->m_raw_full_pinyin = NULL;
1788
1789     g_array_set_size(instance->m_prefixes, 0);
1790     g_array_set_size(instance->m_pinyin_keys, 0);
1791     g_array_set_size(instance->m_pinyin_key_rests, 0);
1792     g_array_set_size(instance->m_constraints, 0);
1793     g_array_set_size(instance->m_match_results, 0);
1794
1795     return true;
1796 }
1797
1798 bool pinyin_get_chewing_string(pinyin_instance_t * instance,
1799                                ChewingKey * key,
1800                                gchar ** utf8_str) {
1801     *utf8_str = NULL;
1802     if (0 == key->get_table_index())
1803         return false;
1804
1805     *utf8_str = key->get_chewing_string();
1806     return true;
1807 }
1808
1809 bool pinyin_get_pinyin_string(pinyin_instance_t * instance,
1810                               ChewingKey * key,
1811                               gchar ** utf8_str) {
1812     *utf8_str = NULL;
1813     if (0 == key->get_table_index())
1814         return false;
1815
1816     *utf8_str = key->get_pinyin_string();
1817     return true;
1818 }
1819
1820 bool pinyin_get_pinyin_strings(pinyin_instance_t * instance,
1821                                ChewingKey * key,
1822                                gchar ** shengmu,
1823                                gchar ** yunmu) {
1824     *shengmu = NULL; *yunmu = NULL;
1825     if (0 == key->get_table_index())
1826         return false;
1827
1828     *shengmu = key->get_shengmu_string();
1829     *yunmu = key->get_yunmu_string();
1830     return true;
1831 }
1832
1833 bool pinyin_token_get_phrase(pinyin_instance_t * instance,
1834                              phrase_token_t token,
1835                              guint * len,
1836                              gchar ** utf8_str) {
1837     pinyin_context_t * & context = instance->m_context;
1838     PhraseItem item;
1839     ucs4_t buffer[MAX_PHRASE_LENGTH];
1840
1841     int retval = context->m_phrase_index->get_phrase_item(token, item);
1842     if (ERROR_OK != retval)
1843         return false;
1844
1845     item.get_phrase_string(buffer);
1846     guint length = item.get_phrase_length();
1847     if (len)
1848         *len = length;
1849     if (utf8_str)
1850         *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1851     return true;
1852 }
1853
1854 bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
1855                                       phrase_token_t token,
1856                                       guint * num){
1857     *num = 0;
1858     pinyin_context_t * & context = instance->m_context;
1859     PhraseItem item;
1860
1861     int retval = context->m_phrase_index->get_phrase_item(token, item);
1862     if (ERROR_OK != retval)
1863         return false;
1864
1865     *num = item.get_n_pronunciation();
1866     return true;
1867 }
1868
1869 bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance,
1870                                         phrase_token_t token,
1871                                         guint nth,
1872                                         ChewingKeyVector keys){
1873     g_array_set_size(keys, 0);
1874     pinyin_context_t * & context = instance->m_context;
1875     PhraseItem item;
1876     ChewingKey buffer[MAX_PHRASE_LENGTH];
1877     guint32 freq = 0;
1878
1879     int retval = context->m_phrase_index->get_phrase_item(token, item);
1880     if (ERROR_OK != retval)
1881         return false;
1882
1883     item.get_nth_pronunciation(nth, buffer, freq);
1884     guint8 len = item.get_phrase_length();
1885     g_array_append_vals(keys, buffer, len);
1886     return true;
1887 }
1888
1889 bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance,
1890                                         phrase_token_t token,
1891                                         guint * freq) {
1892     *freq = 0;
1893     pinyin_context_t * & context = instance->m_context;
1894     PhraseItem item;
1895
1896     int retval = context->m_phrase_index->get_phrase_item(token, item);
1897     if (ERROR_OK != retval)
1898         return false;
1899
1900     *freq = item.get_unigram_frequency();
1901     return true;
1902 }
1903
1904 bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance,
1905                                         phrase_token_t token,
1906                                         guint delta){
1907     pinyin_context_t * & context = instance->m_context;
1908     int retval = context->m_phrase_index->add_unigram_frequency
1909         (token, delta);
1910     return ERROR_OK == retval;
1911 }
1912
1913
1914
1915 /**
1916  *  Note: prefix is the text before the pre-edit string.
1917  */