3 * Library to deal with pinyin.
5 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "phrase_large_table2.h"
27 /* class definition */
31 class PhraseLengthIndexLevel2{
33 GArray * m_phrase_array_indexes;
35 PhraseLengthIndexLevel2();
36 ~PhraseLengthIndexLevel2();
38 /* load/store method */
39 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
40 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
43 int search(int phrase_length, /* in */ ucs4_t phrase[],
44 /* out */ PhraseTokens tokens) const;
46 /* add_index/remove_index method */
47 int add_index(int phrase_length, /* in */ ucs4_t phrase[],
48 /* in */ phrase_token_t token);
49 int remove_index(int phrase_length, /* in */ ucs4_t phrase[],
50 /* in */ phrase_token_t token);
52 /* get length method */
53 int get_length() const;
56 bool mask_out(phrase_token_t mask, phrase_token_t value);
60 template<size_t phrase_length>
61 struct PhraseIndexItem2{
62 phrase_token_t m_token;
63 ucs4_t m_phrase[phrase_length];
65 PhraseIndexItem2<phrase_length>(ucs4_t phrase[], phrase_token_t token){
66 memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
72 template<size_t phrase_length>
73 class PhraseArrayIndexLevel2{
75 typedef PhraseIndexItem2<phrase_length> IndexItem;
80 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
81 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
84 int search(/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const;
86 /* add_index/remove_index method */
87 int add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
88 int remove_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
90 /* get length method */
91 int get_length() const;
94 bool mask_out(phrase_token_t mask, phrase_token_t value);
99 using namespace pinyin;
101 /* class implementation */
103 template<size_t phrase_length>
104 static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs,
105 const PhraseIndexItem2<phrase_length> &rhs){
106 ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
107 ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
109 return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
112 template<size_t phrase_length>
113 static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs,
114 const PhraseIndexItem2<phrase_length> & rhs){
115 return 0 > phrase_compare2(lhs, rhs);
118 PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){
119 memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
122 void PhraseBitmapIndexLevel2::reset(){
123 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
124 PhraseLengthIndexLevel2 * & length_array =
125 m_phrase_length_indexes[i];
135 int PhraseBitmapIndexLevel2::search(int phrase_length,
136 /* in */ ucs4_t phrase[],
137 /* out */ PhraseTokens tokens) const {
138 assert(phrase_length > 0);
140 int result = SEARCH_NONE;
141 /* use the first 8-bit of the lower 16-bit for bitmap index,
142 * as most the higher 16-bit are zero.
144 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
146 PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key];
148 return phrase_array->search(phrase_length, phrase, tokens);
152 PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){
153 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
156 PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
157 #define CASE(len) case len: \
159 PhraseArrayIndexLevel2<len> * & array = g_array_index \
160 (m_phrase_array_indexes, \
161 PhraseArrayIndexLevel2<len> *, len - 1); \
169 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){
191 g_array_free(m_phrase_array_indexes, TRUE);
195 int PhraseLengthIndexLevel2::search(int phrase_length,
196 /* in */ ucs4_t phrase[],
197 /* out */ PhraseTokens tokens) const {
198 int result = SEARCH_NONE;
199 if(m_phrase_array_indexes->len < phrase_length)
201 if (m_phrase_array_indexes->len > phrase_length)
202 result |= SEARCH_CONTINUED;
204 #define CASE(len) case len: \
206 PhraseArrayIndexLevel2<len> * array = g_array_index \
207 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
210 result |= array->search(phrase, tokens); \
214 switch ( phrase_length ){
237 template<size_t phrase_length>
238 int PhraseArrayIndexLevel2<phrase_length>::search
239 (/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const {
240 int result = SEARCH_NONE;
242 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
243 chunk_begin = (IndexItem *) m_chunk.begin();
244 chunk_end = (IndexItem *) m_chunk.end();
247 IndexItem search_elem(phrase, -1);
248 std_lite::pair<IndexItem *, IndexItem *> range;
249 range = std_lite::equal_range
250 (chunk_begin, chunk_end, search_elem,
251 phrase_less_than2<phrase_length>);
253 const IndexItem * const begin = range.first;
254 const IndexItem * const end = range.second;
258 const IndexItem * iter = NULL;
259 GArray * array = NULL;
261 for (iter = begin; iter != end; ++iter) {
262 phrase_token_t token = iter->m_token;
264 /* filter out disabled sub phrase indices. */
265 array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
271 g_array_append_val(array, token);
278 /* add/remove index method */
280 int PhraseBitmapIndexLevel2::add_index(int phrase_length,
281 /* in */ ucs4_t phrase[],
282 /* in */ phrase_token_t token){
283 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
285 PhraseLengthIndexLevel2 * & length_array =
286 m_phrase_length_indexes[first_key];
288 if ( !length_array ){
289 length_array = new PhraseLengthIndexLevel2();
291 return length_array->add_index(phrase_length, phrase, token);
294 int PhraseBitmapIndexLevel2::remove_index(int phrase_length,
295 /* in */ ucs4_t phrase[],
296 /* in */ phrase_token_t token){
297 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
299 PhraseLengthIndexLevel2 * & length_array =
300 m_phrase_length_indexes[first_key];
302 if (NULL == length_array)
303 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
305 int retval = length_array->remove_index(phrase_length, phrase, token);
307 /* remove empty array. */
308 if (0 == length_array->get_length()) {
316 int PhraseLengthIndexLevel2::add_index(int phrase_length,
317 /* in */ ucs4_t phrase[],
318 /* in */ phrase_token_t token) {
319 if (phrase_length >= MAX_PHRASE_LENGTH)
320 return ERROR_PHRASE_TOO_LONG;
322 if (m_phrase_array_indexes->len < phrase_length)
323 g_array_set_size(m_phrase_array_indexes, phrase_length);
325 #define CASE(len) case len: \
327 PhraseArrayIndexLevel2<len> * & array = g_array_index \
328 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
330 array = new PhraseArrayIndexLevel2<len>; \
331 return array->add_index(phrase, token); \
334 switch(phrase_length){
358 int PhraseLengthIndexLevel2::remove_index(int phrase_length,
359 /* in */ ucs4_t phrase[],
360 /* in */ phrase_token_t token) {
361 if (phrase_length >= MAX_PHRASE_LENGTH)
362 return ERROR_PHRASE_TOO_LONG;
364 if (m_phrase_array_indexes->len < phrase_length)
365 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
367 #define CASE(len) case len: \
369 PhraseArrayIndexLevel2<len> * & array = g_array_index \
370 (m_phrase_array_indexes, \
371 PhraseArrayIndexLevel2<len> *, len - 1); \
373 return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
374 int retval = array->remove_index(phrase, token); \
376 /* remove empty array. */ \
377 if (0 == array->get_length()) { \
381 /* shrink self array. */ \
382 g_array_set_size(m_phrase_array_indexes, \
388 switch(phrase_length){
411 template<size_t phrase_length>
412 int PhraseArrayIndexLevel2<phrase_length>::add_index
413 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
414 IndexItem * begin, * end;
416 IndexItem add_elem(phrase, token);
417 begin = (IndexItem *) m_chunk.begin();
418 end = (IndexItem *) m_chunk.end();
420 std_lite::pair<IndexItem *, IndexItem *> range;
421 range = std_lite::equal_range
422 (begin, end, add_elem, phrase_less_than2<phrase_length>);
424 IndexItem * cur_elem;
425 for (cur_elem = range.first;
426 cur_elem != range.second; ++cur_elem) {
427 if (cur_elem->m_token == token)
428 return ERROR_INSERT_ITEM_EXISTS;
429 if (cur_elem->m_token > token)
433 int offset = (cur_elem - begin) * sizeof(IndexItem);
434 m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
438 template<size_t phrase_length>
439 int PhraseArrayIndexLevel2<phrase_length>::remove_index
440 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token) {
441 IndexItem * begin, * end;
443 IndexItem remove_elem(phrase, token);
444 begin = (IndexItem *) m_chunk.begin();
445 end = (IndexItem *) m_chunk.end();
447 std_lite::pair<IndexItem *, IndexItem *> range;
448 range = std_lite::equal_range
449 (begin, end, remove_elem, phrase_less_than2<phrase_length>);
451 IndexItem * cur_elem;
452 for (cur_elem = range.first;
453 cur_elem != range.second; ++cur_elem) {
454 if (cur_elem->m_token == token)
458 if (cur_elem == range.second)
459 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
461 int offset = (cur_elem - begin) * sizeof(IndexItem);
462 m_chunk.remove_content(offset, sizeof(IndexItem));
467 /* load text method */
469 bool PhraseLargeTable2::load_text(FILE * infile){
472 phrase_token_t token;
475 while ( !feof(infile) ) {
476 fscanf(infile, "%s", pinyin);
477 fscanf(infile, "%s", phrase);
478 fscanf(infile, "%u", &token);
479 fscanf(infile, "%ld", &freq);
484 glong phrase_len = g_utf8_strlen(phrase, -1);
485 ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
486 add_index(phrase_len, new_phrase, token);
494 /* load/store method */
496 bool PhraseBitmapIndexLevel2::load(MemoryChunk * chunk,
497 table_offset_t offset,
500 char * buf_begin = (char *) chunk->begin();
501 table_offset_t phrase_begin, phrase_end;
502 table_offset_t * index = (table_offset_t *) (buf_begin + offset);
505 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
506 phrase_begin = phrase_end;
509 if ( phrase_begin == phrase_end ) //null pointer
512 /* after reset() all phrases are null pointer. */
513 PhraseLengthIndexLevel2 * phrases = new PhraseLengthIndexLevel2;
514 m_phrase_length_indexes[i] = phrases;
516 phrases->load(chunk, phrase_begin, phrase_end - 1);
517 assert( phrase_end <= end );
518 assert( *(buf_begin + phrase_end - 1) == c_separate);
520 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
521 assert( c_separate == *(buf_begin + offset) );
525 bool PhraseBitmapIndexLevel2::store(MemoryChunk * new_chunk,
526 table_offset_t offset,
527 table_offset_t & end){
528 table_offset_t phrase_end;
529 table_offset_t index = offset;
530 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
532 new_chunk->set_content(offset, &c_separate, sizeof(char));
533 offset +=sizeof(char);
534 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
535 index += sizeof(table_offset_t);
536 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
537 PhraseLengthIndexLevel2 * phrases = m_phrase_length_indexes[i];
538 if ( !phrases ) { //null pointer
539 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
540 index += sizeof(table_offset_t);
543 phrases->store(new_chunk, offset, phrase_end); //has a end '#'
546 new_chunk->set_content(offset, &c_separate, sizeof(char));
547 offset += sizeof(char);
548 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
549 index += sizeof(table_offset_t);
555 bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
556 table_offset_t offset,
557 table_offset_t end) {
558 char * buf_begin = (char *) chunk->begin();
559 guint32 nindex = *((guint32 *)(buf_begin + offset));
560 table_offset_t * index = (table_offset_t *)
561 (buf_begin + offset + sizeof(guint32));
563 table_offset_t phrase_begin, phrase_end = *index;
564 g_array_set_size(m_phrase_array_indexes, 0);
565 for (size_t i = 1; i <= nindex; ++i) {
566 phrase_begin = phrase_end;
569 if ( phrase_begin == phrase_end ){
571 g_array_append_val(m_phrase_array_indexes, null);
575 #define CASE(len) case len: \
577 PhraseArrayIndexLevel2<len> * phrase = \
578 new PhraseArrayIndexLevel2<len>; \
579 phrase->load(chunk, phrase_begin, phrase_end - 1); \
580 assert( *(buf_begin + phrase_end - 1) == c_separate ); \
581 assert( phrase_end <= end ); \
582 g_array_append_val(m_phrase_array_indexes, phrase); \
607 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
608 assert ( c_separate == * (buf_begin + offset) );
612 bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk,
613 table_offset_t offset,
614 table_offset_t & end) {
615 guint32 nindex = m_phrase_array_indexes->len;
616 new_chunk->set_content(offset, &nindex, sizeof(guint32));
617 table_offset_t index = offset + sizeof(guint32);
619 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
620 new_chunk->set_content(offset, &c_separate, sizeof(char));
621 offset += sizeof(char);
622 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
623 index += sizeof(table_offset_t);
625 table_offset_t phrase_end;
626 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
627 #define CASE(len) case len: \
629 PhraseArrayIndexLevel2<len> * phrase = g_array_index \
630 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
632 new_chunk->set_content \
633 (index, &offset, sizeof(table_offset_t)); \
634 index += sizeof(table_offset_t); \
637 phrase->store(new_chunk, offset, phrase_end); \
638 offset = phrase_end; \
662 new_chunk->set_content(offset, &c_separate, sizeof(char));
663 offset += sizeof(char);
664 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
665 index += sizeof(table_offset_t);
673 template<size_t phrase_length>
674 bool PhraseArrayIndexLevel2<phrase_length>::
675 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
676 char * buf_begin = (char *) chunk->begin();
677 m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
681 template<size_t phrase_length>
682 bool PhraseArrayIndexLevel2<phrase_length>::
683 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
684 new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
685 end = offset + m_chunk.size();
690 /* get length method */
692 int PhraseLengthIndexLevel2::get_length() const {
693 int length = m_phrase_array_indexes->len;
695 /* trim trailing zero. */
696 for (int i = length - 1; i >= 0; --i) {
697 void * array = g_array_index(m_phrase_array_indexes, void *, i);
708 template<size_t phrase_length>
709 int PhraseArrayIndexLevel2<phrase_length>::get_length() const {
710 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
711 chunk_begin = (IndexItem *) m_chunk.begin();
712 chunk_end = (IndexItem *) m_chunk.end();
714 return chunk_end - chunk_begin;
718 /* mask out method */
720 bool PhraseBitmapIndexLevel2::mask_out(phrase_token_t mask,
721 phrase_token_t value){
722 for (size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
723 PhraseLengthIndexLevel2 * & length_array =
724 m_phrase_length_indexes[i];
726 if (NULL == length_array)
729 length_array->mask_out(mask, value);
731 if (0 == length_array->get_length()) {
740 bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask,
741 phrase_token_t value){
742 #define CASE(len) case len: \
744 PhraseArrayIndexLevel2<len> * & array = g_array_index \
745 (m_phrase_array_indexes, \
746 PhraseArrayIndexLevel2<len> *, len - 1); \
751 array->mask_out(mask, value); \
753 if (0 == array->get_length()) { \
760 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
782 /* shrink self array. */
783 g_array_set_size(m_phrase_array_indexes, get_length());
788 template<size_t phrase_length>
789 bool PhraseArrayIndexLevel2<phrase_length>::mask_out
790 (phrase_token_t mask, phrase_token_t value) {
791 IndexItem * begin = NULL, * end = NULL;
792 begin = (IndexItem *) m_chunk.begin();
793 end = (IndexItem *) m_chunk.end();
795 for (IndexItem * cur = begin; cur != end; ++cur) {
796 if ((cur->m_token & mask) != value)
799 int offset = (cur - begin) * sizeof(IndexItem);
800 m_chunk.remove_content(offset, sizeof(IndexItem));
802 /* update chunk end. */
803 end = (IndexItem *) m_chunk.end();