3 * Library to deal with pinyin.
5 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "phrase_large_table2.h"
27 /* class definition */
31 class PhraseLengthIndexLevel2{
33 GArray * m_phrase_array_indexes;
35 PhraseLengthIndexLevel2();
36 ~PhraseLengthIndexLevel2();
38 /* load/store method */
39 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
40 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
43 int search(int phrase_length, /* in */ const ucs4_t phrase[],
44 /* out */ PhraseTokens tokens) const;
46 /* add_index/remove_index method */
47 int add_index(int phrase_length, /* in */ const ucs4_t phrase[],
48 /* in */ phrase_token_t token);
49 int remove_index(int phrase_length, /* in */ const ucs4_t phrase[],
50 /* in */ phrase_token_t token);
52 /* get length method */
53 int get_length() const;
56 bool mask_out(phrase_token_t mask, phrase_token_t value);
60 template<size_t phrase_length>
61 struct PhraseIndexItem2{
62 phrase_token_t m_token;
63 ucs4_t m_phrase[phrase_length];
65 PhraseIndexItem2<phrase_length>(const ucs4_t phrase[], phrase_token_t token){
66 memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
72 template<size_t phrase_length>
73 class PhraseArrayIndexLevel2{
75 typedef PhraseIndexItem2<phrase_length> IndexItem;
80 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
81 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
84 int search(/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const;
86 /* add_index/remove_index method */
87 int add_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
88 int remove_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
90 /* get length method */
91 int get_length() const;
94 bool mask_out(phrase_token_t mask, phrase_token_t value);
99 using namespace pinyin;
101 /* class implementation */
103 template<size_t phrase_length>
104 static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs,
105 const PhraseIndexItem2<phrase_length> &rhs){
106 ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
107 ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
109 return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
112 template<size_t phrase_length>
113 static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs,
114 const PhraseIndexItem2<phrase_length> & rhs){
115 return 0 > phrase_compare2(lhs, rhs);
118 PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){
119 memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
122 void PhraseBitmapIndexLevel2::reset(){
123 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
124 PhraseLengthIndexLevel2 * & length_array =
125 m_phrase_length_indexes[i];
135 int PhraseBitmapIndexLevel2::search(int phrase_length,
136 /* in */ const ucs4_t phrase[],
137 /* out */ PhraseTokens tokens) const {
138 assert(phrase_length > 0);
140 int result = SEARCH_NONE;
141 /* use the first 8-bit of the lower 16-bit for bitmap index,
142 * as most the higher 16-bit are zero.
144 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
146 PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key];
148 return phrase_array->search(phrase_length, phrase, tokens);
152 PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){
153 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
156 PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
157 #define CASE(len) case len: \
159 PhraseArrayIndexLevel2<len> * & array = g_array_index \
160 (m_phrase_array_indexes, \
161 PhraseArrayIndexLevel2<len> *, len - 1); \
169 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){
191 g_array_free(m_phrase_array_indexes, TRUE);
195 int PhraseLengthIndexLevel2::search(int phrase_length,
196 /* in */ const ucs4_t phrase[],
197 /* out */ PhraseTokens tokens) const {
198 int result = SEARCH_NONE;
199 if(m_phrase_array_indexes->len < phrase_length)
201 if (m_phrase_array_indexes->len > phrase_length)
202 result |= SEARCH_CONTINUED;
204 #define CASE(len) case len: \
206 PhraseArrayIndexLevel2<len> * array = g_array_index \
207 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
210 result |= array->search(phrase, tokens); \
214 switch ( phrase_length ){
237 template<size_t phrase_length>
238 int PhraseArrayIndexLevel2<phrase_length>::search
239 (/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const {
240 int result = SEARCH_NONE;
242 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
243 chunk_begin = (IndexItem *) m_chunk.begin();
244 chunk_end = (IndexItem *) m_chunk.end();
247 IndexItem search_elem(phrase, -1);
248 std_lite::pair<IndexItem *, IndexItem *> range;
249 range = std_lite::equal_range
250 (chunk_begin, chunk_end, search_elem,
251 phrase_less_than2<phrase_length>);
253 const IndexItem * const begin = range.first;
254 const IndexItem * const end = range.second;
258 const IndexItem * iter = NULL;
259 GArray * array = NULL;
261 for (iter = begin; iter != end; ++iter) {
262 phrase_token_t token = iter->m_token;
264 /* filter out disabled sub phrase indices. */
265 array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
271 g_array_append_val(array, token);
278 /* add/remove index method */
280 int PhraseBitmapIndexLevel2::add_index(int phrase_length,
281 /* in */ const ucs4_t phrase[],
282 /* in */ phrase_token_t token){
283 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
285 PhraseLengthIndexLevel2 * & length_array =
286 m_phrase_length_indexes[first_key];
288 if ( !length_array ){
289 length_array = new PhraseLengthIndexLevel2();
291 return length_array->add_index(phrase_length, phrase, token);
294 int PhraseBitmapIndexLevel2::remove_index(int phrase_length,
295 /* in */ const ucs4_t phrase[],
296 /* in */ phrase_token_t token){
297 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
299 PhraseLengthIndexLevel2 * & length_array =
300 m_phrase_length_indexes[first_key];
302 if (NULL == length_array)
303 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
305 int retval = length_array->remove_index(phrase_length, phrase, token);
307 /* remove empty array. */
308 if (0 == length_array->get_length()) {
316 int PhraseLengthIndexLevel2::add_index(int phrase_length,
317 /* in */ const ucs4_t phrase[],
318 /* in */ phrase_token_t token) {
319 if (phrase_length >= MAX_PHRASE_LENGTH)
320 return ERROR_PHRASE_TOO_LONG;
322 if (m_phrase_array_indexes->len < phrase_length)
323 g_array_set_size(m_phrase_array_indexes, phrase_length);
325 #define CASE(len) case len: \
327 PhraseArrayIndexLevel2<len> * & array = g_array_index \
328 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
330 array = new PhraseArrayIndexLevel2<len>; \
331 return array->add_index(phrase, token); \
334 switch(phrase_length){
358 int PhraseLengthIndexLevel2::remove_index(int phrase_length,
359 /* in */ const ucs4_t phrase[],
360 /* in */ phrase_token_t token) {
361 if (phrase_length >= MAX_PHRASE_LENGTH)
362 return ERROR_PHRASE_TOO_LONG;
364 if (m_phrase_array_indexes->len < phrase_length)
365 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
367 #define CASE(len) case len: \
369 PhraseArrayIndexLevel2<len> * & array = g_array_index \
370 (m_phrase_array_indexes, \
371 PhraseArrayIndexLevel2<len> *, len - 1); \
373 return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
374 int retval = array->remove_index(phrase, token); \
376 /* remove empty array. */ \
377 if (0 == array->get_length()) { \
381 /* shrink self array. */ \
382 g_array_set_size(m_phrase_array_indexes, \
388 switch(phrase_length){
411 template<size_t phrase_length>
412 int PhraseArrayIndexLevel2<phrase_length>::add_index
413 (/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token){
414 IndexItem * begin, * end;
416 IndexItem add_elem(phrase, token);
417 begin = (IndexItem *) m_chunk.begin();
418 end = (IndexItem *) m_chunk.end();
420 std_lite::pair<IndexItem *, IndexItem *> range;
421 range = std_lite::equal_range
422 (begin, end, add_elem, phrase_less_than2<phrase_length>);
424 IndexItem * cur_elem;
425 for (cur_elem = range.first;
426 cur_elem != range.second; ++cur_elem) {
427 if (cur_elem->m_token == token)
428 return ERROR_INSERT_ITEM_EXISTS;
429 if (cur_elem->m_token > token)
433 int offset = (cur_elem - begin) * sizeof(IndexItem);
434 m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
438 template<size_t phrase_length>
439 int PhraseArrayIndexLevel2<phrase_length>::remove_index
440 (/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) {
441 IndexItem * begin, * end;
443 IndexItem remove_elem(phrase, token);
444 begin = (IndexItem *) m_chunk.begin();
445 end = (IndexItem *) m_chunk.end();
447 std_lite::pair<IndexItem *, IndexItem *> range;
448 range = std_lite::equal_range
449 (begin, end, remove_elem, phrase_less_than2<phrase_length>);
451 IndexItem * cur_elem;
452 for (cur_elem = range.first;
453 cur_elem != range.second; ++cur_elem) {
454 if (cur_elem->m_token == token)
458 if (cur_elem == range.second)
459 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
461 int offset = (cur_elem - begin) * sizeof(IndexItem);
462 m_chunk.remove_content(offset, sizeof(IndexItem));
467 /* load text method */
469 bool PhraseLargeTable2::load_text(FILE * infile){
472 phrase_token_t token;
475 while (!feof(infile)) {
476 int num = fscanf(infile, "%s %s %u %ld",
477 pinyin, phrase, &token, &freq);
485 glong phrase_len = g_utf8_strlen(phrase, -1);
486 ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
487 add_index(phrase_len, new_phrase, token);
495 /* load/store method */
497 bool PhraseBitmapIndexLevel2::load(MemoryChunk * chunk,
498 table_offset_t offset,
501 char * buf_begin = (char *) chunk->begin();
502 table_offset_t phrase_begin, phrase_end;
503 table_offset_t * index = (table_offset_t *) (buf_begin + offset);
506 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
507 phrase_begin = phrase_end;
510 if ( phrase_begin == phrase_end ) //null pointer
513 /* after reset() all phrases are null pointer. */
514 PhraseLengthIndexLevel2 * phrases = new PhraseLengthIndexLevel2;
515 m_phrase_length_indexes[i] = phrases;
517 phrases->load(chunk, phrase_begin, phrase_end - 1);
518 assert( phrase_end <= end );
519 assert( *(buf_begin + phrase_end - 1) == c_separate);
521 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
522 assert( c_separate == *(buf_begin + offset) );
526 bool PhraseBitmapIndexLevel2::store(MemoryChunk * new_chunk,
527 table_offset_t offset,
528 table_offset_t & end){
529 table_offset_t phrase_end;
530 table_offset_t index = offset;
531 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
533 new_chunk->set_content(offset, &c_separate, sizeof(char));
534 offset +=sizeof(char);
535 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
536 index += sizeof(table_offset_t);
537 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
538 PhraseLengthIndexLevel2 * phrases = m_phrase_length_indexes[i];
539 if ( !phrases ) { //null pointer
540 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
541 index += sizeof(table_offset_t);
544 phrases->store(new_chunk, offset, phrase_end); //has a end '#'
547 new_chunk->set_content(offset, &c_separate, sizeof(char));
548 offset += sizeof(char);
549 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
550 index += sizeof(table_offset_t);
556 bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
557 table_offset_t offset,
558 table_offset_t end) {
559 char * buf_begin = (char *) chunk->begin();
560 guint32 nindex = *((guint32 *)(buf_begin + offset));
561 table_offset_t * index = (table_offset_t *)
562 (buf_begin + offset + sizeof(guint32));
564 table_offset_t phrase_begin, phrase_end = *index;
565 g_array_set_size(m_phrase_array_indexes, 0);
566 for (size_t i = 1; i <= nindex; ++i) {
567 phrase_begin = phrase_end;
570 if ( phrase_begin == phrase_end ){
572 g_array_append_val(m_phrase_array_indexes, null);
576 #define CASE(len) case len: \
578 PhraseArrayIndexLevel2<len> * phrase = \
579 new PhraseArrayIndexLevel2<len>; \
580 phrase->load(chunk, phrase_begin, phrase_end - 1); \
581 assert( *(buf_begin + phrase_end - 1) == c_separate ); \
582 assert( phrase_end <= end ); \
583 g_array_append_val(m_phrase_array_indexes, phrase); \
608 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
609 assert ( c_separate == * (buf_begin + offset) );
613 bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk,
614 table_offset_t offset,
615 table_offset_t & end) {
616 guint32 nindex = m_phrase_array_indexes->len;
617 new_chunk->set_content(offset, &nindex, sizeof(guint32));
618 table_offset_t index = offset + sizeof(guint32);
620 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
621 new_chunk->set_content(offset, &c_separate, sizeof(char));
622 offset += sizeof(char);
623 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
624 index += sizeof(table_offset_t);
626 table_offset_t phrase_end;
627 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
628 #define CASE(len) case len: \
630 PhraseArrayIndexLevel2<len> * phrase = g_array_index \
631 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
633 new_chunk->set_content \
634 (index, &offset, sizeof(table_offset_t)); \
635 index += sizeof(table_offset_t); \
638 phrase->store(new_chunk, offset, phrase_end); \
639 offset = phrase_end; \
663 new_chunk->set_content(offset, &c_separate, sizeof(char));
664 offset += sizeof(char);
665 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
666 index += sizeof(table_offset_t);
674 template<size_t phrase_length>
675 bool PhraseArrayIndexLevel2<phrase_length>::
676 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
677 char * buf_begin = (char *) chunk->begin();
678 m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
682 template<size_t phrase_length>
683 bool PhraseArrayIndexLevel2<phrase_length>::
684 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
685 new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
686 end = offset + m_chunk.size();
691 /* get length method */
693 int PhraseLengthIndexLevel2::get_length() const {
694 int length = m_phrase_array_indexes->len;
696 /* trim trailing zero. */
697 for (int i = length - 1; i >= 0; --i) {
698 void * array = g_array_index(m_phrase_array_indexes, void *, i);
709 template<size_t phrase_length>
710 int PhraseArrayIndexLevel2<phrase_length>::get_length() const {
711 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
712 chunk_begin = (IndexItem *) m_chunk.begin();
713 chunk_end = (IndexItem *) m_chunk.end();
715 return chunk_end - chunk_begin;
719 /* mask out method */
721 bool PhraseBitmapIndexLevel2::mask_out(phrase_token_t mask,
722 phrase_token_t value){
723 for (size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
724 PhraseLengthIndexLevel2 * & length_array =
725 m_phrase_length_indexes[i];
727 if (NULL == length_array)
730 length_array->mask_out(mask, value);
732 if (0 == length_array->get_length()) {
741 bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask,
742 phrase_token_t value){
743 #define CASE(len) case len: \
745 PhraseArrayIndexLevel2<len> * & array = g_array_index \
746 (m_phrase_array_indexes, \
747 PhraseArrayIndexLevel2<len> *, len - 1); \
752 array->mask_out(mask, value); \
754 if (0 == array->get_length()) { \
761 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
783 /* shrink self array. */
784 g_array_set_size(m_phrase_array_indexes, get_length());
789 template<size_t phrase_length>
790 bool PhraseArrayIndexLevel2<phrase_length>::mask_out
791 (phrase_token_t mask, phrase_token_t value) {
792 IndexItem * begin = NULL, * end = NULL;
793 begin = (IndexItem *) m_chunk.begin();
794 end = (IndexItem *) m_chunk.end();
796 for (IndexItem * cur = begin; cur != end; ++cur) {
797 if ((cur->m_token & mask) != value)
800 int offset = (cur - begin) * sizeof(IndexItem);
801 m_chunk.remove_content(offset, sizeof(IndexItem));
803 /* update chunk end. */
804 end = (IndexItem *) m_chunk.end();