add get_shengmu/yunmu_string
[platform/upstream/libpinyin.git] / src / storage / chewing_large_table.cpp
1 /* 
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *  
5  *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
6  *  
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  *  
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20  */
21
22 #include "chewing_large_table.h"
23 #include <assert.h>
24 #include "pinyin_phrase2.h"
25 #include "pinyin_parser2.h"
26
27
28 /* internal class definition */
29
30 namespace pinyin{
31 class ChewingLengthIndexLevel{
32
33 protected:
34     GArray * m_chewing_array_indexes;
35
36 public:
37     /* constructor/destructor */
38     ChewingLengthIndexLevel();
39     ~ChewingLengthIndexLevel();
40
41     /* load/store method */
42     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
43     bool store(MemoryChunk * new_chunk, table_offset_t offset,
44                table_offset_t & end);
45
46     /* search method */
47     int search(pinyin_option_t options, int phrase_length,
48                /* in */ ChewingKey keys[],
49                /* out */ PhraseIndexRanges ranges) const;
50
51     /* add/remove index method */
52     int add_index(int phrase_length, /* in */ ChewingKey keys[],
53                   /* in */ phrase_token_t token);
54     int remove_index(int phrase_length, /* in */ ChewingKey keys[],
55                      /* in */ phrase_token_t token);
56
57     /* get length method */
58     int get_length() const;
59
60     /* mask out method */
61     bool mask_out(phrase_token_t mask, phrase_token_t value);
62 };
63
64
65 template<size_t phrase_length>
66 class ChewingArrayIndexLevel{
67 protected:
68     typedef PinyinIndexItem2<phrase_length> IndexItem;
69
70 protected:
71     MemoryChunk m_chunk;
72
73     /* compress consecutive tokens */
74     int convert(pinyin_option_t options,
75                 ChewingKey keys[],
76                 IndexItem * begin,
77                 IndexItem * end,
78                 PhraseIndexRanges ranges) const;
79
80 public:
81     /* load/store method */
82     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
83     bool store(MemoryChunk * new_chunk, table_offset_t offset,
84                table_offset_t & end);
85
86     /* search method */
87     int search(pinyin_option_t options, /* in */ChewingKey keys[],
88                /* out */ PhraseIndexRanges ranges) const;
89
90     /* add/remove index method */
91     int add_index(/* in */ ChewingKey keys[], /* in */ phrase_token_t token);
92     int remove_index(/* in */ ChewingKey keys[],
93                      /* in */ phrase_token_t token);
94
95     /* get length method */
96     int get_length() const;
97
98     /* mask out method */
99     bool mask_out(phrase_token_t mask, phrase_token_t value);
100 };
101
102 };
103
104
105 using namespace pinyin;
106
107 /* class implementation */
108
109 ChewingBitmapIndexLevel::ChewingBitmapIndexLevel(pinyin_option_t options)
110     : m_options(options) {
111     memset(m_chewing_length_indexes, 0, sizeof(m_chewing_length_indexes));
112 }
113
114 void ChewingBitmapIndexLevel::reset() {
115     for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
116         for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
117             for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
118                 for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
119                      ++n) {
120                     ChewingLengthIndexLevel * & length_array =
121                         m_chewing_length_indexes[k][l][m][n];
122                     if (length_array)
123                         delete length_array;
124                     length_array = NULL;
125                 }
126 }
127
128
129 /* search method */
130
131 int ChewingBitmapIndexLevel::search(int phrase_length,
132                                     /* in */ ChewingKey keys[],
133                                     /* out */ PhraseIndexRanges ranges) const {
134     assert(phrase_length > 0);
135     return initial_level_search(phrase_length, keys, ranges);
136 }
137
138 int ChewingBitmapIndexLevel::initial_level_search (int phrase_length,
139     /* in */ ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const {
140
141 /* macros */
142 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN:                  \
143     {                                                                   \
144         result |= middle_and_final_level_search(ORIGIN, phrase_length,  \
145                                                 keys, ranges);          \
146         if (m_options & AMBIGUITY) {                                    \
147             result |= middle_and_final_level_search(ANOTHER,            \
148                                                     phrase_length,      \
149                                                     keys, ranges);      \
150         }                                                               \
151         return result;                                                  \
152     }
153
154     /* deal with ambiguities */
155     int result = SEARCH_NONE;
156     const ChewingKey & first_key = keys[0];
157
158     switch(first_key.m_initial) {
159         MATCH(PINYIN_AMB_C_CH, CHEWING_C, CHEWING_CH);
160         MATCH(PINYIN_AMB_C_CH, CHEWING_CH, CHEWING_C);
161         MATCH(PINYIN_AMB_Z_ZH, CHEWING_Z, CHEWING_ZH);
162         MATCH(PINYIN_AMB_Z_ZH, CHEWING_ZH, CHEWING_Z);
163         MATCH(PINYIN_AMB_S_SH, CHEWING_S, CHEWING_SH);
164         MATCH(PINYIN_AMB_S_SH, CHEWING_SH, CHEWING_S);
165         MATCH(PINYIN_AMB_L_R, CHEWING_R, CHEWING_L);
166         MATCH(PINYIN_AMB_L_N, CHEWING_N, CHEWING_L);
167         MATCH(PINYIN_AMB_F_H, CHEWING_F, CHEWING_H);
168         MATCH(PINYIN_AMB_F_H, CHEWING_H, CHEWING_F);
169         MATCH(PINYIN_AMB_G_K, CHEWING_G, CHEWING_K);
170         MATCH(PINYIN_AMB_G_K, CHEWING_K, CHEWING_G);
171
172     case CHEWING_L:
173         {
174             result |= middle_and_final_level_search
175                 (CHEWING_L, phrase_length, keys, ranges);
176
177             if (m_options & PINYIN_AMB_L_N)
178                 result |= middle_and_final_level_search
179                     (CHEWING_N, phrase_length, keys,ranges);
180
181             if (m_options & PINYIN_AMB_L_R)
182                 result |= middle_and_final_level_search
183                     (CHEWING_R, phrase_length, keys, ranges);
184             return result;
185         }
186     default:
187         {
188             result |= middle_and_final_level_search
189                 ((ChewingInitial) first_key.m_initial,
190                  phrase_length, keys, ranges);
191             return result;
192         }
193     }
194 #undef MATCH
195     return result;
196 }
197
198
199 int ChewingBitmapIndexLevel::middle_and_final_level_search
200 (ChewingInitial initial, int phrase_length, /* in */ ChewingKey keys[],
201  /* out */ PhraseIndexRanges ranges) const {
202
203 /* macros */
204 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN:                  \
205     {                                                                   \
206         result = tone_level_search                                      \
207             (initial, middle,                                           \
208              ORIGIN, phrase_length, keys, ranges);                      \
209         if (m_options & AMBIGUITY) {                                    \
210             result |= tone_level_search                                 \
211                 (initial, middle,                                       \
212                  ANOTHER, phrase_length, keys, ranges);                 \
213         }                                                               \
214         return result;                                                  \
215     }
216
217     int result = SEARCH_NONE;
218     const ChewingKey & first_key = keys[0];
219     const ChewingMiddle middle = (ChewingMiddle)first_key.m_middle;
220
221     switch(first_key.m_final) {
222     case CHEWING_ZERO_FINAL:
223         {
224             if (middle == CHEWING_ZERO_MIDDLE) { /* in-complete pinyin */
225                 if (!(m_options & PINYIN_INCOMPLETE))
226                     return result;
227                 for (int m = CHEWING_ZERO_MIDDLE;
228                      m < CHEWING_NUMBER_OF_MIDDLES; ++m)
229                     for (int n = CHEWING_ZERO_FINAL;
230                          n < CHEWING_NUMBER_OF_FINALS; ++n) {
231
232                         if (CHEWING_ZERO_MIDDLE == m &&
233                             CHEWING_ZERO_FINAL == n)
234                             continue;
235
236                         result |= tone_level_search
237                             (initial, (ChewingMiddle) m, (ChewingFinal) n,
238                              phrase_length, keys, ranges);
239                     }
240                 return result;
241             } else { /* normal pinyin */
242                 result |= tone_level_search
243                     (initial, middle, CHEWING_ZERO_FINAL,
244                      phrase_length, keys, ranges);
245                 return result;
246             }
247         }
248
249         MATCH(PINYIN_AMB_AN_ANG, CHEWING_AN, CHEWING_ANG);
250         MATCH(PINYIN_AMB_AN_ANG, CHEWING_ANG, CHEWING_AN);
251         MATCH(PINYIN_AMB_EN_ENG, CHEWING_EN, CHEWING_ENG);
252         MATCH(PINYIN_AMB_EN_ENG, CHEWING_ENG, CHEWING_EN);
253         MATCH(PINYIN_AMB_IN_ING, PINYIN_IN, PINYIN_ING);
254         MATCH(PINYIN_AMB_IN_ING, PINYIN_ING, PINYIN_IN);
255
256     default:
257         {
258             result |= tone_level_search
259                 (initial, middle, (ChewingFinal) first_key.m_final,
260                  phrase_length, keys, ranges);
261             return result;
262         }
263     }
264 #undef MATCH
265     return result;
266 }
267
268
269 int ChewingBitmapIndexLevel::tone_level_search
270 (ChewingInitial initial, ChewingMiddle middle, ChewingFinal final,
271  int phrase_length, /* in */ ChewingKey keys[],
272  /* out */ PhraseIndexRanges ranges) const {
273
274     int result = SEARCH_NONE;
275     const ChewingKey & first_key = keys[0];
276
277     switch (first_key.m_tone) {
278     case CHEWING_ZERO_TONE:
279         {
280             /* deal with zero tone in chewing large table. */
281             for (int i = CHEWING_ZERO_TONE; i < CHEWING_NUMBER_OF_TONES; ++i) {
282                 ChewingLengthIndexLevel * phrases =
283                     m_chewing_length_indexes
284                     [initial][middle][final][(ChewingTone)i];
285                 if (phrases)
286                     result |= phrases->search
287                         (m_options, phrase_length - 1, keys + 1, ranges);
288             }
289             return result;
290         }
291     default:
292         {
293             ChewingLengthIndexLevel * phrases =
294                 m_chewing_length_indexes
295                 [initial][middle][final][CHEWING_ZERO_TONE];
296             if (phrases)
297                 result |= phrases->search
298                     (m_options, phrase_length - 1, keys + 1, ranges);
299
300             phrases = m_chewing_length_indexes
301                 [initial][middle][final][(ChewingTone) first_key.m_tone];
302             if (phrases)
303                 result |= phrases->search
304                     (m_options, phrase_length - 1, keys + 1, ranges);
305             return result;
306         }
307     }
308     return result;
309 }
310
311
312 ChewingLengthIndexLevel::ChewingLengthIndexLevel() {
313     m_chewing_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
314 }
315
316 ChewingLengthIndexLevel::~ChewingLengthIndexLevel() {
317 #define CASE(len) case len:                                             \
318     {                                                                   \
319         ChewingArrayIndexLevel<len> * & array = g_array_index           \
320             (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
321         if (array)                                                      \
322             delete array;                                               \
323         array = NULL;                                                   \
324         break;                                                          \
325     }
326
327     for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
328         switch (i){
329             CASE(0);
330             CASE(1);
331             CASE(2);
332             CASE(3);
333             CASE(4);
334             CASE(5);
335             CASE(6);
336             CASE(7);
337             CASE(8);
338             CASE(9);
339             CASE(10);
340             CASE(11);
341             CASE(12);
342             CASE(13);
343             CASE(14);
344             CASE(15);
345         default:
346             assert(false);
347         }
348     }
349 #undef CASE
350     g_array_free(m_chewing_array_indexes, TRUE);
351 }
352
353
354 int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length,
355                                     /* in */ ChewingKey keys[],
356                                     /* out */ PhraseIndexRanges ranges) const {
357     int result = SEARCH_NONE;
358     if (m_chewing_array_indexes->len < phrase_length + 1)
359         return result;
360     if (m_chewing_array_indexes->len > phrase_length + 1)
361         result |= SEARCH_CONTINUED;
362
363 #define CASE(len) case len:                                             \
364     {                                                                   \
365         ChewingArrayIndexLevel<len> * & array = g_array_index           \
366             (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
367         if (!array)                                                     \
368             return result;                                              \
369         result |= array->search(options, keys, ranges);                 \
370         return result;                                                  \
371     }
372
373     switch (phrase_length) {
374         CASE(0);
375         CASE(1);
376         CASE(2);
377         CASE(3);
378         CASE(4);
379         CASE(5);
380         CASE(6);
381         CASE(7);
382         CASE(8);
383         CASE(9);
384         CASE(10);
385         CASE(11);
386         CASE(12);
387         CASE(13);
388         CASE(14);
389         CASE(15);
390     default:
391         assert(false);
392     }
393
394 #undef CASE
395 }
396
397
398 template<size_t phrase_length>
399 int ChewingArrayIndexLevel<phrase_length>::search
400 (pinyin_option_t options, /* in */ChewingKey keys[],
401  /* out */ PhraseIndexRanges ranges) const {
402     IndexItem * chunk_begin = NULL, * chunk_end = NULL;
403     chunk_begin = (IndexItem *) m_chunk.begin();
404     chunk_end = (IndexItem *) m_chunk.end();
405
406     /* do the search */
407     ChewingKey left_keys[phrase_length], right_keys[phrase_length];
408     compute_lower_value2(options, keys, left_keys, phrase_length);
409     compute_upper_value2(options, keys, right_keys, phrase_length);
410
411     IndexItem left(left_keys, -1), right(right_keys, -1);
412
413     IndexItem * begin = std_lite::lower_bound
414         (chunk_begin, chunk_end, left,
415          phrase_exact_less_than2<phrase_length>);
416     IndexItem * end   = std_lite::upper_bound
417         (chunk_begin, chunk_end, right,
418          phrase_exact_less_than2<phrase_length>);
419
420     return convert(options, keys, begin, end, ranges);
421 }
422
423 /* compress consecutive tokens */
424 template<size_t phrase_length>
425 int ChewingArrayIndexLevel<phrase_length>::convert
426 (pinyin_option_t options, ChewingKey keys[],
427  IndexItem * begin, IndexItem * end,
428  PhraseIndexRanges ranges) const {
429     IndexItem * iter = NULL;
430     PhraseIndexRange cursor;
431     GArray * head, * cursor_head = NULL;
432
433     int result = SEARCH_NONE;
434     /* TODO: check the below code */
435     cursor.m_range_begin = null_token; cursor.m_range_end = null_token;
436     for (iter = begin; iter != end; ++iter) {
437         if (0 != pinyin_compare_with_ambiguities2
438             (options, keys, iter->m_keys, phrase_length))
439             continue;
440
441         phrase_token_t token = iter->m_token;
442         head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
443         if (NULL == head)
444             continue;
445
446         result |= SEARCH_OK;
447
448         if (null_token == cursor.m_range_begin) {
449             cursor.m_range_begin = token;
450             cursor.m_range_end   = token + 1;
451             cursor_head = head;
452         } else if (cursor.m_range_end == token &&
453                    PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_begin) ==
454                    PHRASE_INDEX_LIBRARY_INDEX(token)) {
455             ++cursor.m_range_end;
456         } else {
457             g_array_append_val(cursor_head, cursor);
458             cursor.m_range_begin = token; cursor.m_range_end = token + 1;
459             cursor_head = head;
460         }
461     }
462
463     if (null_token == cursor.m_range_begin)
464         return result;
465
466     g_array_append_val(cursor_head, cursor);
467     return result;
468 }
469
470
471 /* add/remove index method */
472
473 int ChewingBitmapIndexLevel::add_index(int phrase_length,
474                                        /* in */ ChewingKey keys[],
475                                        /* in */ phrase_token_t token) {
476     const ChewingKey first_key = keys[0];
477     ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
478         [first_key.m_initial][first_key.m_middle]
479         [first_key.m_final][first_key.m_tone];
480
481     if (NULL == length_array) {
482         length_array = new ChewingLengthIndexLevel();
483     }
484
485     return length_array->add_index(phrase_length - 1, keys + 1, token);
486 }
487
488 int ChewingBitmapIndexLevel::remove_index(int phrase_length,
489                                           /* in */ ChewingKey keys[],
490                                           /* in */ phrase_token_t token) {
491     const ChewingKey first_key = keys[0];
492     ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
493         [first_key.m_initial][first_key.m_middle]
494         [first_key.m_final][first_key.m_tone];
495
496     if (NULL == length_array)
497         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
498
499     int retval = length_array->remove_index(phrase_length - 1, keys + 1, token);
500
501     /* remove empty array. */
502     if (0 == length_array->get_length()) {
503         delete length_array;
504         length_array = NULL;
505     }
506
507     return retval;
508 }
509
510 int ChewingLengthIndexLevel::add_index(int phrase_length,
511                                        /* in */ ChewingKey keys[],
512                                        /* in */ phrase_token_t token) {
513     if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
514         return ERROR_PHRASE_TOO_LONG;
515
516     if (m_chewing_array_indexes->len <= phrase_length)
517         g_array_set_size(m_chewing_array_indexes, phrase_length + 1);
518
519 #define CASE(len) case len:                                     \
520     {                                                           \
521         ChewingArrayIndexLevel<len> * & array = g_array_index   \
522             (m_chewing_array_indexes,                           \
523              ChewingArrayIndexLevel<len> *, len);               \
524         if (NULL == array)                                      \
525             array = new ChewingArrayIndexLevel<len>;            \
526         return array->add_index(keys, token);                   \
527     }
528
529     switch(phrase_length) {
530         CASE(0);
531         CASE(1);
532         CASE(2);
533         CASE(3);
534         CASE(4);
535         CASE(5);
536         CASE(6);
537         CASE(7);
538         CASE(8);
539         CASE(9);
540         CASE(10);
541         CASE(11);
542         CASE(12);
543         CASE(13);
544         CASE(14);
545         CASE(15);
546     default:
547         assert(false);
548     }
549
550 #undef CASE
551 }
552
553 int ChewingLengthIndexLevel::remove_index(int phrase_length,
554                                           /* in */ ChewingKey keys[],
555                                           /* in */ phrase_token_t token) {
556     if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
557         return ERROR_PHRASE_TOO_LONG;
558
559     if (m_chewing_array_indexes->len <= phrase_length)
560         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
561
562 #define CASE(len) case len:                                     \
563     {                                                           \
564         ChewingArrayIndexLevel<len> * & array = g_array_index   \
565             (m_chewing_array_indexes,                           \
566              ChewingArrayIndexLevel<len> *, len);               \
567         if (NULL == array)                                      \
568             return ERROR_REMOVE_ITEM_DONOT_EXISTS;              \
569         int retval = array->remove_index(keys, token);          \
570                                                                 \
571         /* remove empty array. */                               \
572         if (0 == array->get_length()) {                         \
573             delete array;                                       \
574             array = NULL;                                       \
575                                                                 \
576             /* shrink self array. */                            \
577             g_array_set_size(m_chewing_array_indexes,           \
578                              get_length());                     \
579         }                                                       \
580         return retval;                                          \
581     }
582
583     switch (phrase_length) {
584         CASE(0);
585         CASE(1);
586         CASE(2);
587         CASE(3);
588         CASE(4);
589         CASE(5);
590         CASE(6);
591         CASE(7);
592         CASE(8);
593         CASE(9);
594         CASE(10);
595         CASE(11);
596         CASE(12);
597         CASE(13);
598         CASE(14);
599         CASE(15);
600     default:
601         assert(false);
602     }
603
604 #undef CASE
605 }
606
607 template<size_t phrase_length>
608 int ChewingArrayIndexLevel<phrase_length>::add_index
609 (/* in */ ChewingKey keys[], /* in */ phrase_token_t token) {
610     IndexItem * begin, * end;
611
612     IndexItem add_elem(keys, token);
613     begin = (IndexItem *) m_chunk.begin();
614     end   = (IndexItem *) m_chunk.end();
615
616     std_lite::pair<IndexItem *, IndexItem *> range;
617     range = std_lite::equal_range
618         (begin, end, add_elem, phrase_exact_less_than2<phrase_length>);
619
620     IndexItem * cur_elem;
621     for (cur_elem = range.first;
622          cur_elem != range.second; ++cur_elem) {
623         if (cur_elem->m_token == token)
624             return ERROR_INSERT_ITEM_EXISTS;
625         if (cur_elem->m_token > token)
626             break;
627     }
628
629     int offset = (cur_elem - begin) * sizeof(IndexItem);
630     m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
631     return ERROR_OK;
632 }
633
634 template<size_t phrase_length>
635 int ChewingArrayIndexLevel<phrase_length>::remove_index
636 (/* in */ ChewingKey keys[], /* in */ phrase_token_t token) {
637     IndexItem * begin, * end;
638
639     IndexItem remove_elem(keys, token);
640     begin = (IndexItem *) m_chunk.begin();
641     end   = (IndexItem *) m_chunk.end();
642
643     std_lite::pair<IndexItem *, IndexItem *> range;
644     range = std_lite::equal_range
645         (begin, end, remove_elem, phrase_exact_less_than2<phrase_length>);
646
647     IndexItem * cur_elem;
648     for (cur_elem = range.first;
649          cur_elem != range.second; ++cur_elem) {
650         if (cur_elem->m_token == token)
651             break;
652     }
653
654     if (cur_elem == range.second)
655         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
656
657     int offset = (cur_elem - begin) * sizeof(IndexItem);
658     m_chunk.remove_content(offset, sizeof(IndexItem));
659     return ERROR_OK;
660 }
661
662
663 /* load text method */
664 bool ChewingLargeTable::load_text(FILE * infile) {
665     char pinyin[256];
666     char phrase[256];
667     phrase_token_t token;
668     size_t freq;
669
670     while (!feof(infile)) {
671         fscanf(infile, "%s", pinyin);
672         fscanf(infile, "%s", phrase);
673         fscanf(infile, "%u", &token);
674         fscanf(infile, "%ld", &freq);
675
676         if(feof(infile))
677             break;
678
679         glong len = g_utf8_strlen(phrase, -1);
680
681         FullPinyinParser2 parser;
682         ChewingKeyVector keys;
683         ChewingKeyRestVector key_rests;
684
685         keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
686         key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
687
688         pinyin_option_t options = USE_TONE;
689         parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
690
691         if (len != keys->len) {
692             fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n",
693                     pinyin, phrase, token, freq);
694             continue;
695         }
696
697         add_index(keys->len, (ChewingKey *)keys->data, token);
698
699         g_array_free(keys, TRUE);
700         g_array_free(key_rests, TRUE);
701     }
702
703     return true;
704 }
705
706
707 /* load/store method */
708
709 bool ChewingBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
710                                    table_offset_t end) {
711     reset();
712     char * begin = (char *) chunk->begin();
713     table_offset_t phrase_begin, phrase_end;
714     table_offset_t * index = (table_offset_t *) (begin + offset);
715     phrase_end = *index;
716
717     for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
718         for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
719             for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
720                 for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
721                     phrase_begin = phrase_end;
722                     index++;
723                     phrase_end = *index;
724
725                     if (phrase_begin == phrase_end) /* null pointer */
726                         continue;
727
728                     /* after reset() all phrases are null pointer. */
729                     ChewingLengthIndexLevel * phrases = new ChewingLengthIndexLevel;
730                     m_chewing_length_indexes[k][l][m][n] = phrases;
731
732                     phrases->load(chunk, phrase_begin, phrase_end - 1);
733                     assert(phrase_end <= end);
734                     assert(*(begin + phrase_end - 1)  == c_separate);
735                 }
736
737     offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
738     assert(c_separate == *(begin + offset));
739     return true;
740 }
741
742 bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk,
743                                     table_offset_t offset,
744                                     table_offset_t & end) {
745     table_offset_t phrase_end;
746     table_offset_t index = offset;
747     offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
748
749     /* add '#' */
750     new_chunk->set_content(offset, &c_separate, sizeof(char));
751     offset += sizeof(char);
752     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
753     index += sizeof(table_offset_t);
754
755     for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
756         for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
757             for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
758                 for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
759                     ChewingLengthIndexLevel * phrases =
760                         m_chewing_length_indexes[k][l][m][n];
761
762                     if (NULL == phrases) { /* null pointer */
763                         new_chunk->set_content(index, &offset,
764                                                sizeof(table_offset_t));
765                         index += sizeof(table_offset_t);
766                         continue;
767                     }
768
769                     /* has a end '#' */
770                     phrases->store(new_chunk, offset, phrase_end);
771                     offset = phrase_end;
772
773                     /* add '#' */
774                     new_chunk->set_content(offset, &c_separate, sizeof(char));
775                     offset += sizeof(char);
776                     new_chunk->set_content(index, &offset,
777                                            sizeof(table_offset_t));
778                     index += sizeof(table_offset_t);
779                 }
780
781     end = offset;
782     return true;
783 }
784
785 bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
786                                    table_offset_t end) {
787     char * begin = (char *) chunk->begin();
788     guint32 nindex = *((guint32 *)(begin + offset)); /* number of index */
789     table_offset_t * index = (table_offset_t *)
790         (begin + offset + sizeof(guint32));
791
792     table_offset_t phrase_begin, phrase_end = *index;
793     g_array_set_size(m_chewing_array_indexes, 0);
794     for (guint32 i = 0; i < nindex; ++i) {
795         phrase_begin = phrase_end;
796         index++;
797         phrase_end = *index;
798
799         if (phrase_begin == phrase_end) {
800             void * null = NULL;
801             g_array_append_val(m_chewing_array_indexes, null);
802             continue;
803         }
804
805 #define CASE(len) case len:                                             \
806         {                                                               \
807             ChewingArrayIndexLevel<len> * phrase =                      \
808                 new ChewingArrayIndexLevel<len>;                        \
809             phrase->load(chunk, phrase_begin, phrase_end - 1);          \
810             assert(*(begin + phrase_end - 1) == c_separate);            \
811             assert(phrase_end <= end);                                  \
812             g_array_append_val(m_chewing_array_indexes, phrase);        \
813             break;                                                      \
814         }
815
816         switch ( i ){
817             CASE(0);
818             CASE(1);
819             CASE(2);
820             CASE(3);
821             CASE(4);
822             CASE(5);
823             CASE(6);
824             CASE(7);
825             CASE(8);
826             CASE(9);
827             CASE(10);
828             CASE(11);
829             CASE(12);
830             CASE(13);
831             CASE(14);
832             CASE(15);
833         default:
834             assert(false);
835         }
836
837 #undef CASE
838     }
839
840     /* check '#' */
841     offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
842     assert(c_separate == *(begin + offset));
843     return true;
844 }
845
846 bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk,
847                                     table_offset_t offset,
848                                     table_offset_t & end) {
849     guint32 nindex = m_chewing_array_indexes->len; /* number of index */
850     new_chunk->set_content(offset, &nindex, sizeof(guint32));
851     table_offset_t index = offset + sizeof(guint32);
852
853     offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
854     new_chunk->set_content(offset, &c_separate, sizeof(char));
855     offset += sizeof(char);
856     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
857     index += sizeof(table_offset_t);
858
859     table_offset_t phrase_end;
860     for (guint32 i = 0; i < nindex; ++i) {
861 #define CASE(len) case len:                                             \
862         {                                                               \
863             ChewingArrayIndexLevel<len> * phrase = g_array_index        \
864                 (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
865             if (NULL == phrase) {                                       \
866                 new_chunk->set_content                                  \
867                     (index, &offset, sizeof(table_offset_t));           \
868                 index += sizeof(table_offset_t);                        \
869                 continue;                                               \
870             }                                                           \
871             phrase->store(new_chunk, offset, phrase_end);               \
872             offset = phrase_end;                                        \
873             break;                                                      \
874         }
875
876         switch ( i ){
877             CASE(0);
878             CASE(1);
879             CASE(2);
880             CASE(3);
881             CASE(4);
882             CASE(5);
883             CASE(6);
884             CASE(7);
885             CASE(8);
886             CASE(9);
887             CASE(10);
888             CASE(11);
889             CASE(12);
890             CASE(13);
891             CASE(14);
892             CASE(15);
893         default:
894             assert(false);
895         }
896 #undef CASE
897
898         /* add '#' */
899         new_chunk->set_content(offset, &c_separate, sizeof(char));
900         offset += sizeof(char);
901         new_chunk->set_content(index, &offset, sizeof(table_offset_t));
902         index += sizeof(table_offset_t);
903     }
904
905     end = offset;
906     return true;
907 }
908
909 template<size_t phrase_length>
910 bool ChewingArrayIndexLevel<phrase_length>::
911 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) {
912     char * begin = (char *) chunk->begin();
913     m_chunk.set_chunk(begin + offset, end - offset, NULL);
914     return true;
915 }
916
917 template<size_t phrase_length>
918 bool ChewingArrayIndexLevel<phrase_length>::
919 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
920     new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
921     end = offset + m_chunk.size();
922     return true;
923 }
924
925
926 /* get length method */
927
928 int ChewingLengthIndexLevel::get_length() const {
929     int length = m_chewing_array_indexes->len;
930
931     /* trim trailing zero. */
932     for (int i = length - 1; i >= 0; --i) {
933         void * array = g_array_index(m_chewing_array_indexes, void *, i);
934
935         if (NULL != array)
936             break;
937
938         --length;
939     }
940
941     return length;
942 }
943
944 template<size_t phrase_length>
945 int ChewingArrayIndexLevel<phrase_length>::get_length() const {
946     IndexItem * chunk_begin = NULL, * chunk_end = NULL;
947     chunk_begin = (IndexItem *) m_chunk.begin();
948     chunk_end = (IndexItem *) m_chunk.end();
949
950     return chunk_end - chunk_begin;
951 }
952
953
954 /* mask out method */
955
956 bool ChewingBitmapIndexLevel::mask_out(phrase_token_t mask,
957                                        phrase_token_t value) {
958     for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
959         for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
960             for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
961                 for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
962                      ++n) {
963                     ChewingLengthIndexLevel * & length_array =
964                         m_chewing_length_indexes[k][l][m][n];
965
966                     if (NULL == length_array)
967                         continue;
968
969                     length_array->mask_out(mask, value);
970
971                     if (0 == length_array->get_length()) {
972                         delete length_array;
973                         length_array = NULL;
974                     }
975                 }
976     return true;
977 }
978
979 bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask,
980                                        phrase_token_t value) {
981 #define CASE(len) case len:                                     \
982     {                                                           \
983         ChewingArrayIndexLevel<len> * & array = g_array_index   \
984             (m_chewing_array_indexes,                           \
985              ChewingArrayIndexLevel<len> *, len);               \
986                                                                 \
987         if (NULL == array)                                      \
988             continue;                                           \
989                                                                 \
990         array->mask_out(mask, value);                           \
991                                                                 \
992         if (0 == array->get_length()) {                         \
993             delete array;                                       \
994             array = NULL;                                       \
995         }                                                       \
996         break;                                                  \
997     }
998
999     for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
1000         switch (i){
1001             CASE(0);
1002             CASE(1);
1003             CASE(2);
1004             CASE(3);
1005             CASE(4);
1006             CASE(5);
1007             CASE(6);
1008             CASE(7);
1009             CASE(8);
1010             CASE(9);
1011             CASE(10);
1012             CASE(11);
1013             CASE(12);
1014             CASE(13);
1015             CASE(14);
1016             CASE(15);
1017         default:
1018             assert(false);
1019         }
1020     }
1021 #undef CASE
1022     g_array_set_size(m_chewing_array_indexes, get_length());
1023     return true;
1024 }
1025
1026 template<size_t phrase_length>
1027 bool ChewingArrayIndexLevel<phrase_length>::mask_out
1028 (phrase_token_t mask, phrase_token_t value) {
1029     IndexItem * begin = NULL, * end = NULL;
1030     begin = (IndexItem *) m_chunk.begin();
1031     end   = (IndexItem *) m_chunk.end();
1032
1033     for (IndexItem * cur = begin; cur != end; ++cur) {
1034         if ((cur->m_token & mask) != value)
1035             continue;
1036
1037         int offset = (cur - begin) * sizeof(IndexItem);
1038         m_chunk.remove_content(offset, sizeof(IndexItem));
1039
1040         /* update chunk end. */
1041         end = (IndexItem *) m_chunk.end();
1042         --cur;
1043     }
1044
1045     return true;
1046 }