3 * Library to deal with pinyin.
5 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "phrase_large_table2.h"
27 /* class definition */
/* Index level keyed by phrase length.  The implementation in this file
 * creates m_phrase_array_indexes as a GArray of pointer-sized elements and
 * reads the entry for phrases of length len from slot (len - 1). */
31 class PhraseLengthIndexLevel2{
/* Table of PhraseArrayIndexLevel2<len> pointers, one slot per length. */
33 GArray * m_phrase_array_indexes;
/* Allocates the per-length table. */
35 PhraseLengthIndexLevel2();
/* Releases the per-length table with g_array_free. */
36 ~PhraseLengthIndexLevel2();
38 /* load/store method */
/* load reads this level out of `chunk` starting at byte `offset`; `end`
 * bounds the region and is checked by assertion.
 * store writes a guint32 slot count, an offset table and the per-length
 * data into `new_chunk` starting at byte `offset`.
 * NOTE(review): `end` is an output reference, presumably set to the offset
 * just past the written data -- confirm. */
39 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
40 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
/* Looks up `phrase` (phrase_length code points) and reports matches through
 * `tokens`.  Returns SEARCH_* flags OR-ed together; SEARCH_CONTINUED is set
 * when the table has slots for lengths greater than phrase_length. */
43 int search(int phrase_length, /* in */ ucs4_t phrase[],
44 /* out */ PhraseTokens tokens) const;
46 /* add_index/remove_index method */
/* Both return ERROR_PHRASE_TOO_LONG when phrase_length >= MAX_PHRASE_LENGTH;
 * otherwise the call is forwarded to the per-length index.  remove_index
 * returns ERROR_REMOVE_ITEM_DONOT_EXISTS when the table has no slot for
 * phrase_length. */
47 int add_index(int phrase_length, /* in */ ucs4_t phrase[],
48 /* in */ phrase_token_t token);
49 int remove_index(int phrase_length, /* in */ ucs4_t phrase[],
50 /* in */ phrase_token_t token);
/* One record of a fixed-length phrase index: a phrase of exactly
 * phrase_length ucs4_t code points together with its token. */
54 template<size_t phrase_length>
55 struct PhraseIndexItem2{
56 phrase_token_t m_token;
57 ucs4_t m_phrase[phrase_length];
/* Copies phrase_length code points from `phrase` into m_phrase, so `phrase`
 * must point to at least that many elements.
 * NOTE(review): `token` is presumably stored into m_token -- confirm. */
59 PhraseIndexItem2<phrase_length>(ucs4_t phrase[], phrase_token_t token){
60 memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
/* Index of all phrases that are exactly phrase_length code points long.
 * The member functions defined later in this file view m_chunk as an array
 * of IndexItem records and run equal_range over it, so the records are
 * expected to be ordered by phrase. */
66 template<size_t phrase_length>
67 class PhraseArrayIndexLevel2{
/* Record type stored in the index. */
69 typedef PhraseIndexItem2<phrase_length> IndexItem;
/* load attaches m_chunk to the bytes [offset, end) of `chunk`.
 * store copies the records into `new_chunk` at `offset` and sets `end` to
 * the offset just past them. */
74 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
75 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
/* Appends the tokens of records whose phrase equals `phrase` to the
 * per-library arrays in `tokens` (subject to the filter on disabled sub
 * phrase indices). */
78 int search(/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const;
80 /* add_index/remove_index method */
/* add_index returns ERROR_INSERT_ITEM_EXISTS for a duplicate (phrase, token)
 * pair; remove_index returns ERROR_REMOVE_ITEM_DONOT_EXISTS when the pair
 * is absent. */
81 int add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
82 int remove_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
87 using namespace pinyin;
89 /* class implementation */
/* Three-way comparison of two index items by phrase only; m_token is not
 * consulted.  The two phrase arrays are compared as raw bytes with memcmp
 * over sizeof(ucs4_t) * phrase_length bytes, so the sign of the result
 * follows byte-wise order. */
91 template<size_t phrase_length>
92 static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs,
93 const PhraseIndexItem2<phrase_length> &rhs){
/* The casts only drop constness; the arrays are read, never written. */
94 ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
95 ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
97 return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
/* Strict "less than" on the phrase of two index items, built on
 * phrase_compare2.  Passed as the comparator to std_lite::equal_range by
 * the search/add_index/remove_index implementations in this file. */
100 template<size_t phrase_length>
101 static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs,
102 const PhraseIndexItem2<phrase_length> & rhs){
103 return 0 > phrase_compare2(lhs, rhs);
/* Starts with every slot of m_phrase_length_indexes zero-filled, i.e. no
 * per-length index allocated yet (add_index allocates them on demand). */
106 PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){
107 memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
/* Visits each of the PHRASE_NUMBER_OF_BITMAP_INDEX slots and fetches the
 * PhraseLengthIndexLevel2 pointer stored there.
 * NOTE(review): the per-slot action is presumably to release the object
 * (and possibly clear the slot) -- confirm. */
110 void PhraseBitmapIndexLevel2::reset(){
111 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
112 PhraseLengthIndexLevel2 * length_array =
113 m_phrase_length_indexes[i];
/* Searches for `phrase`, which must hold phrase_length (> 0) code points.
 * Bits 8..15 of phrase[0] select one slot of m_phrase_length_indexes and
 * the lookup is forwarded to that PhraseLengthIndexLevel2; its result is
 * returned.  Matching tokens are delivered through `tokens`.
 * NOTE(review): slots can be NULL (the constructor zero-fills the table and
 * add_index allocates lazily) -- confirm the dereference below is guarded
 * and that `result` is returned for an empty slot. */
122 int PhraseBitmapIndexLevel2::search(int phrase_length,
123 /* in */ ucs4_t phrase[],
124 /* out */ PhraseTokens tokens) const {
125 assert(phrase_length > 0);
127 int result = SEARCH_NONE;
128 /* use the first 8-bit of the lower 16-bit for bitmap index,
129 * as most the higher 16-bit are zero.
131 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
133 PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key];
135 return phrase_array->search(phrase_length, phrase, tokens);
// Creates the per-length pointer table: not zero-terminated, new elements
// cleared to zero, element size equal to that of a pointer.
139 PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){
140 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
// Tears down the per-length table.  The CASE(len) helper macro takes a
// reference to slot (len - 1) typed as PhraseArrayIndexLevel2<len> pointer,
// and the loop walks lengths 1..len of the table.
// NOTE(review): each non-NULL slot is presumably deleted inside CASE before
// the table itself is freed -- confirm.
143 PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
144 #define CASE(len) case len: \
146 PhraseArrayIndexLevel2<len> * & array = g_array_index \
147 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
155 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){
// Frees the GArray together with its element storage.
177 g_array_free(m_phrase_array_indexes, TRUE);
// Looks up `phrase` in the index for exactly phrase_length code points and
// ORs the result of that lookup into the returned SEARCH_* flags.
// Matching tokens are delivered through `tokens`.
181 int PhraseLengthIndexLevel2::search(int phrase_length,
182 /* in */ ucs4_t phrase[],
183 /* out */ PhraseTokens tokens) const {
184 int result = SEARCH_NONE;
// Table has no slot for this length.
// NOTE(review): presumably returns `result` right away -- confirm.
185 if(m_phrase_array_indexes->len < phrase_length)
// Slots for longer phrases exist, so a longer match may still be possible.
187 if (m_phrase_array_indexes->len > phrase_length)
188 result |= SEARCH_CONTINUED;
// CASE(len) fetches slot (len - 1) as PhraseArrayIndexLevel2<len> and runs
// its search; the switch maps the runtime length to the template argument.
// NOTE(review): a NULL slot is presumably skipped inside CASE -- confirm.
190 #define CASE(len) case len: \
192 PhraseArrayIndexLevel2<len> * array = g_array_index \
193 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
196 result |= array->search(phrase, tokens); \
200 switch ( phrase_length ){
/* Finds the records whose phrase equals `phrase` and appends their tokens
 * to the arrays in `tokens`.
 *   phrase: phrase_length code points to look up.
 *   tokens: indexed by PHRASE_INDEX_LIBRARY_INDEX(token); the selected
 *           GArray receives the token.
 * Returns SEARCH_* flags, starting from SEARCH_NONE.
 * NOTE(review): which flag is set on a hit is not evident here -- confirm. */
223 template<size_t phrase_length>
224 int PhraseArrayIndexLevel2<phrase_length>::search
225 (/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const {
226 int result = SEARCH_NONE;
/* View the chunk as an array of IndexItem records. */
228 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
229 chunk_begin = (IndexItem *) m_chunk.begin();
230 chunk_end = (IndexItem *) m_chunk.end();
/* The token of the probe (-1) is irrelevant: the comparator only looks at
 * the phrase. */
233 IndexItem search_elem(phrase, -1);
234 std_lite::pair<IndexItem *, IndexItem *> range;
235 range = std_lite::equal_range
236 (chunk_begin, chunk_end, search_elem,
237 phrase_less_than2<phrase_length>);
/* [begin, end) now spans every record with an equal phrase. */
239 const IndexItem * const begin = range.first;
240 const IndexItem * const end = range.second;
244 const IndexItem * iter = NULL;
245 GArray * array = NULL;
247 for (iter = begin; iter != end; ++iter) {
248 phrase_token_t token = iter->m_token;
250 /* filter out disabled sub phrase indices. */
251 array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
/* NOTE(review): presumably only reached when `array` is non-NULL, per the
 * filter comment above -- confirm. */
257 g_array_append_val(array, token);
264 /* add/remove index method */
/* Adds (phrase, token) to the table.  Bits 8..15 of phrase[0] select the
 * slot; the PhraseLengthIndexLevel2 for that slot is created on first use
 * and the request is forwarded to it.  Returns whatever the per-length
 * level returns. */
266 int PhraseBitmapIndexLevel2::add_index(int phrase_length,
267 /* in */ ucs4_t phrase[],
268 /* in */ phrase_token_t token){
269 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
/* Reference to the slot, so the allocation below is stored in the table. */
271 PhraseLengthIndexLevel2 * & length_array =
272 m_phrase_length_indexes[first_key];
274 if ( !length_array ){
275 length_array = new PhraseLengthIndexLevel2();
277 return length_array->add_index(phrase_length, phrase, token);
/* Removes (phrase, token) from the table.  Bits 8..15 of phrase[0] select
 * the slot and the request is forwarded to its PhraseLengthIndexLevel2.
 * NOTE(review): the forwarding is presumably guarded by a non-NULL check on
 * the slot, with ERROR_REMOVE_ITEM_DONOT_EXISTS returned for an empty
 * slot -- confirm. */
280 int PhraseBitmapIndexLevel2::remove_index(int phrase_length,
281 /* in */ ucs4_t phrase[],
282 /* in */ phrase_token_t token){
283 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
285 PhraseLengthIndexLevel2 * & length_array =
286 m_phrase_length_indexes[first_key];
289 return length_array->remove_index(phrase_length, phrase, token);
291 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
/* Adds (phrase, token) to the index for phrases of phrase_length code
 * points.  Returns ERROR_PHRASE_TOO_LONG when phrase_length >=
 * MAX_PHRASE_LENGTH, otherwise the result of the per-length add_index.
 * NOTE(review): phrase_length <= 0 is not rejected here; callers are
 * presumably expected to pass a positive length -- confirm. */
294 int PhraseLengthIndexLevel2::add_index(int phrase_length,
295 /* in */ ucs4_t phrase[],
296 /* in */ phrase_token_t token) {
297 if (phrase_length >= MAX_PHRASE_LENGTH)
298 return ERROR_PHRASE_TOO_LONG;
/* Grow the table so that slot (phrase_length - 1) exists. */
300 if (m_phrase_array_indexes->len < phrase_length)
301 g_array_set_size(m_phrase_array_indexes, phrase_length);
/* CASE(len) binds a reference to slot (len - 1), allocates a
 * PhraseArrayIndexLevel2<len> for it and forwards the request; the switch
 * maps the runtime length to the template argument.
 * NOTE(review): the allocation is presumably done only when the slot is
 * still NULL -- confirm. */
303 #define CASE(len) case len: \
305 PhraseArrayIndexLevel2<len> * & array = g_array_index \
306 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
308 array = new PhraseArrayIndexLevel2<len>; \
309 return array->add_index(phrase, token); \
312 switch(phrase_length){
/* Removes (phrase, token) from the index for phrases of phrase_length code
 * points.  Returns ERROR_PHRASE_TOO_LONG when phrase_length >=
 * MAX_PHRASE_LENGTH, ERROR_REMOVE_ITEM_DONOT_EXISTS when the table has no
 * slot for that length, otherwise the result of the per-length
 * remove_index. */
336 int PhraseLengthIndexLevel2::remove_index(int phrase_length,
337 /* in */ ucs4_t phrase[],
338 /* in */ phrase_token_t token) {
339 if (phrase_length >= MAX_PHRASE_LENGTH)
340 return ERROR_PHRASE_TOO_LONG;
342 if (m_phrase_array_indexes->len < phrase_length)
343 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
/* CASE(len) binds a reference to slot (len - 1) and forwards the request;
 * the switch maps the runtime length to the template argument.
 * NOTE(review): the early ERROR_REMOVE_ITEM_DONOT_EXISTS inside CASE is
 * presumably taken only when the slot is NULL -- confirm. */
345 #define CASE(len) case len: \
347 PhraseArrayIndexLevel2<len> * & array = g_array_index \
348 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
350 return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
351 return array->remove_index(phrase, token); \
354 switch(phrase_length){
/* Inserts the record (phrase, token) into the chunk.
 * Records with an equal phrase are located with equal_range and scanned in
 * order: a record carrying the same token yields ERROR_INSERT_ITEM_EXISTS.
 * The new record is inserted at the position where the scan ended.
 * NOTE(review): the scan presumably stops at the first record with a larger
 * token, keeping equal phrases ordered by token -- confirm.  The value
 * returned after a successful insert is not evident here -- confirm. */
377 template<size_t phrase_length>
378 int PhraseArrayIndexLevel2<phrase_length>::add_index
379 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
380 IndexItem * begin, * end;
382 IndexItem add_elem(phrase, token);
/* View the chunk as an array of IndexItem records. */
383 begin = (IndexItem *) m_chunk.begin();
384 end = (IndexItem *) m_chunk.end();
386 std_lite::pair<IndexItem *, IndexItem *> range;
387 range = std_lite::equal_range
388 (begin, end, add_elem, phrase_less_than2<phrase_length>);
390 IndexItem * cur_elem;
391 for (cur_elem = range.first;
392 cur_elem != range.second; ++cur_elem) {
393 if (cur_elem->m_token == token)
394 return ERROR_INSERT_ITEM_EXISTS;
395 if (cur_elem->m_token > token)
/* Byte offset of the insertion point, computed before the chunk changes. */
399 int offset = (cur_elem - begin) * sizeof(IndexItem);
400 m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
/* Removes the record (phrase, token) from the chunk.
 * Records with an equal phrase are located with equal_range and scanned for
 * the given token.  When the scan reaches the end of the range the pair is
 * absent and ERROR_REMOVE_ITEM_DONOT_EXISTS is returned; otherwise one
 * record is removed at the position where the scan ended.
 * NOTE(review): the scan presumably breaks on the matching token; the value
 * returned after a successful removal is not evident here -- confirm. */
404 template<size_t phrase_length>
405 int PhraseArrayIndexLevel2<phrase_length>::remove_index
406 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token) {
407 IndexItem * begin, * end;
409 IndexItem remove_elem(phrase, token);
/* View the chunk as an array of IndexItem records. */
410 begin = (IndexItem *) m_chunk.begin();
411 end = (IndexItem *) m_chunk.end();
413 std_lite::pair<IndexItem *, IndexItem *> range;
414 range = std_lite::equal_range
415 (begin, end, remove_elem, phrase_less_than2<phrase_length>);
417 IndexItem * cur_elem;
418 for (cur_elem = range.first;
419 cur_elem != range.second; ++cur_elem) {
420 if (cur_elem->m_token == token)
424 if (cur_elem == range.second)
425 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
/* Byte offset of the record to drop. */
427 int offset = (cur_elem - begin) * sizeof(IndexItem);
428 m_chunk.remove_content(offset, sizeof(IndexItem));
433 /* load text method */
/* Populates the table from a text stream.  Each record consists of four
 * whitespace-separated fields read in this order: pinyin, phrase (UTF-8),
 * token (unsigned) and frequency.  Only phrase and token are used here:
 * the phrase is converted to UCS-4 and registered with add_index under its
 * length in code points.
 * NOTE(review): the fscanf return values are not checked, and both string
 * conversions carry no field width, so an over-long field can overrun the
 * destination buffer -- confirm buffer sizes and consider bounded formats.
 * NOTE(review): the result of add_index is discarded, so duplicates and
 * over-long phrases are silently ignored. */
435 bool PhraseLargeTable2::load_text(FILE * infile){
438 phrase_token_t token;
/* feof only turns true after a read has failed, so the last pass through
 * the loop can see stale fields.
 * NOTE(review): presumably re-checked after the reads -- confirm. */
441 while ( !feof(infile) ) {
442 fscanf(infile, "%s", pinyin);
443 fscanf(infile, "%s", phrase);
444 fscanf(infile, "%u", &token);
445 fscanf(infile, "%ld", &freq);
/* Length is counted in characters, not bytes. */
450 glong phrase_len = g_utf8_strlen(phrase, -1);
/* g_utf8_to_ucs4 returns a newly allocated buffer.
 * NOTE(review): it must be released with g_free after add_index, and it is
 * NULL for invalid UTF-8 -- confirm both are handled. */
451 ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
452 add_index(phrase_len, new_phrase, token);
460 /* load/store method */
/* Rebuilds the bitmap level from the serialized form found in `chunk` at
 * byte `offset`.  The region starts with a table of table_offset_t values;
 * consecutive values delimit the data of one slot, and equal values mean
 * the slot is empty.  Every non-empty slot gets a freshly allocated
 * PhraseLengthIndexLevel2 that is loaded from its sub-region.
 * NOTE(review): `index` is presumably advanced and phrase_end re-read from
 * it on every iteration, and any previous content is presumably released
 * first -- confirm. */
462 bool PhraseBitmapIndexLevel2::load(MemoryChunk * chunk,
463 table_offset_t offset,
466 char * buf_begin = (char *) chunk->begin();
467 table_offset_t phrase_begin, phrase_end;
/* Offset table located at the start of this level. */
468 table_offset_t * index = (table_offset_t *) (buf_begin + offset);
471 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
/* The end of the previous slot is the start of this one. */
472 phrase_begin = phrase_end;
475 if ( phrase_begin == phrase_end ) //null pointer
477 PhraseLengthIndexLevel2 * phrases = new PhraseLengthIndexLevel2;
478 m_phrase_length_indexes[i] = phrases;
/* The last byte of the slot is the separator, hence the - 1. */
479 phrases->load(chunk, phrase_begin, phrase_end - 1);
480 assert( phrase_end <= end );
481 assert( *(buf_begin + phrase_end - 1) == c_separate);
/* Skip the offset table (one entry more than there are slots); a separator
 * byte is expected right behind it. */
483 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
484 assert( c_separate == *(buf_begin + offset) );
/* Serializes the bitmap level into `new_chunk` starting at byte `offset`.
 * Layout written here: an offset table with one entry more than there are
 * slots, a separator byte, then for every non-empty slot its data followed
 * by a separator byte.  Table entry 0 holds the offset of the first data
 * byte and each following entry holds the offset reached after that slot,
 * so an empty slot repeats the previous value.
 * NOTE(review): `offset` is presumably advanced to phrase_end after each
 * nested store, and `end` presumably receives the final offset -- confirm. */
488 bool PhraseBitmapIndexLevel2::store(MemoryChunk * new_chunk,
489 table_offset_t offset,
490 table_offset_t & end){
491 table_offset_t phrase_end;
/* `index` walks the offset table while `offset` moves through the data. */
492 table_offset_t index = offset;
493 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
495 new_chunk->set_content(offset, &c_separate, sizeof(char));
496 offset +=sizeof(char);
497 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
498 index += sizeof(table_offset_t);
499 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
500 PhraseLengthIndexLevel2 * phrases = m_phrase_length_indexes[i];
501 if ( !phrases ) { //null pointer
502 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
503 index += sizeof(table_offset_t);
506 phrases->store(new_chunk, offset, phrase_end); //has a end '#'
509 new_chunk->set_content(offset, &c_separate, sizeof(char));
510 offset += sizeof(char);
511 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
512 index += sizeof(table_offset_t);
/* Rebuilds the per-length level from the serialized form found in `chunk`
 * at byte `offset`: a guint32 slot count, followed by a table of
 * table_offset_t values whose consecutive entries delimit the data of one
 * length.  Equal entries mean that length has no index and a NULL slot is
 * appended; otherwise a PhraseArrayIndexLevel2<len> is allocated, loaded
 * from its sub-region and appended.
 * NOTE(review): `index` is presumably advanced and phrase_end re-read from
 * it on every iteration -- confirm. */
518 bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
519 table_offset_t offset,
520 table_offset_t end) {
521 char * buf_begin = (char *) chunk->begin();
522 guint32 nindex = *((guint32 *)(buf_begin + offset));
/* The offset table starts right after the count. */
523 table_offset_t * index = (table_offset_t *)
524 (buf_begin + offset + sizeof(guint32));
526 table_offset_t phrase_begin, phrase_end = *index;
/* NOTE(review): the constructor already allocated m_phrase_array_indexes;
 * assigning a new GArray here without freeing the old one looks like a
 * leak -- confirm. */
527 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
528 for (size_t i = 1; i <= nindex; ++i) {
/* The end of the previous length is the start of this one. */
529 phrase_begin = phrase_end;
532 if ( phrase_begin == phrase_end ){
534 g_array_append_val(m_phrase_array_indexes, null);
/* CASE(len) loads one per-length index; the last byte of its region is the
 * separator, hence the - 1 passed to load. */
538 #define CASE(len) case len: \
540 PhraseArrayIndexLevel2<len> * phrase = \
541 new PhraseArrayIndexLevel2<len>; \
542 phrase->load(chunk, phrase_begin, phrase_end - 1); \
543 assert( *(buf_begin + phrase_end - 1) == c_separate ); \
544 assert( phrase_end <= end ); \
545 g_array_append_val(m_phrase_array_indexes, phrase); \
570 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
571 assert ( c_separate == * (buf_begin + offset) );
/* Serializes the per-length level into `new_chunk` starting at byte
 * `offset`.  Layout written here: a guint32 slot count, an offset table
 * with one entry more than the count, a separator byte, then the data of
 * each per-length index followed by a separator byte.  Table entry 0 holds
 * the offset of the first data byte and each following entry holds the
 * offset reached after that length.
 * NOTE(review): inside CASE the offset-table write is presumably the branch
 * for a NULL slot (recording an empty range) -- confirm.  `end` presumably
 * receives the final offset -- confirm. */
575 bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk,
576 table_offset_t offset,
577 table_offset_t & end) {
578 guint32 nindex = m_phrase_array_indexes->len;
579 new_chunk->set_content(offset, &nindex, sizeof(guint32));
/* `index` walks the offset table while `offset` moves through the data. */
580 table_offset_t index = offset + sizeof(guint32);
582 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
583 new_chunk->set_content(offset, &c_separate, sizeof(char));
584 offset += sizeof(char);
585 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
586 index += sizeof(table_offset_t);
588 table_offset_t phrase_end;
589 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
590 #define CASE(len) case len: \
592 PhraseArrayIndexLevel2<len> * phrase = g_array_index \
593 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
595 new_chunk->set_content \
596 (index, &offset, sizeof(table_offset_t)); \
597 index += sizeof(table_offset_t); \
600 phrase->store(new_chunk, offset, phrase_end); \
601 offset = phrase_end; \
625 new_chunk->set_content(offset, &c_separate, sizeof(char));
626 offset += sizeof(char);
627 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
628 index += sizeof(table_offset_t);
/* Points m_chunk at the bytes [offset, end) of `chunk`, which hold the
 * serialized IndexItem records of this index.
 * NOTE(review): the NULL third argument to set_chunk presumably means the
 * memory is borrowed rather than copied, in which case `chunk` has to
 * outlive this object -- confirm against MemoryChunk. */
636 template<size_t phrase_length>
637 bool PhraseArrayIndexLevel2<phrase_length>::
638 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
639 char * buf_begin = (char *) chunk->begin();
640 m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
/* Copies all records of this index into `new_chunk` at byte `offset` and
 * reports through `end` the offset just past the copied bytes. */
644 template<size_t phrase_length>
645 bool PhraseArrayIndexLevel2<phrase_length>::
646 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
647 new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
648 end = offset + m_chunk.size();