3 * Library to deal with pinyin.
5 * Copyright (C) 2010 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "phrase_large_table.h"
27 /* class definition */
31 class PhraseLengthIndexLevel{
33 GArray* m_phrase_array_indexes;
35 PhraseLengthIndexLevel();
36 ~PhraseLengthIndexLevel();
38 /* load/store method */
39 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
40 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
42 /* search/add_index/remove_index method */
43 int search( int phrase_length, /* in */ ucs4_t phrase[],
44 /* out */ phrase_token_t & token);
46 int add_index( int phrase_length, /* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
47 int remove_index( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token);
50 template<size_t phrase_length>
51 class PhraseArrayIndexLevel{
55 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
56 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
58 /* search/add_index/remove_index method */
59 int search( /* in */ ucs4_t phrase[],
60 /* out */ phrase_token_t & token);
62 int add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
63 int remove_index(/* in */ ucs4_t phrase[], /* out */ phrase_token_t & token);
68 using namespace pinyin;
70 /* class implementation */
72 template<size_t phrase_length>
73 struct PhraseIndexItem{
74 phrase_token_t m_token;
75 ucs4_t m_phrase[phrase_length];
77 PhraseIndexItem<phrase_length>(ucs4_t phrase[], phrase_token_t token){
78 memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
83 template<size_t phrase_length>
84 static int phrase_compare(const PhraseIndexItem<phrase_length> &lhs,
85 const PhraseIndexItem<phrase_length> &rhs){
86 ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
87 ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
89 return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
92 template<size_t phrase_length>
93 static bool phrase_less_than(const PhraseIndexItem<phrase_length> & lhs,
94 const PhraseIndexItem<phrase_length> & rhs){
95 return 0 > phrase_compare(lhs, rhs);
98 PhraseBitmapIndexLevel::PhraseBitmapIndexLevel(){
99 memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
102 void PhraseBitmapIndexLevel::reset(){
103 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
104 PhraseLengthIndexLevel * length_array =
105 m_phrase_length_indexes[i];
111 int PhraseBitmapIndexLevel::search( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
112 assert(phrase_length > 0);
114 int result = SEARCH_NONE;
115 /* use the lower 16-bit for bitmap index,
116 * as most the higher 16-bit are zero.
118 guint16 first_key = phrase[0] & 0xFFFF;
120 PhraseLengthIndexLevel * phrase_array = m_phrase_length_indexes[first_key];
122 return phrase_array->search(phrase_length, phrase, token);
126 PhraseLengthIndexLevel::PhraseLengthIndexLevel(){
127 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
130 PhraseLengthIndexLevel::~PhraseLengthIndexLevel(){
131 #define CASE(x) case x: \
133 PhraseArrayIndexLevel<x> * array = g_array_index \
134 (m_phrase_array_indexes, PhraseArrayIndexLevel<x> *, x); \
140 for ( size_t i = 0 ; i < m_phrase_array_indexes->len; ++i){
162 g_array_free(m_phrase_array_indexes, TRUE);
166 int PhraseLengthIndexLevel::search(int phrase_length,
167 /* in */ ucs4_t phrase[],
168 /* out */ phrase_token_t & token){
169 int result = SEARCH_NONE;
170 if(m_phrase_array_indexes->len < phrase_length + 1)
172 if (m_phrase_array_indexes->len > phrase_length + 1)
173 result |= SEARCH_CONTINUED;
175 #define CASE(len) case len: \
177 PhraseArrayIndexLevel<len> * array = g_array_index \
178 (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, len); \
181 result |= array->search(phrase, token); \
185 switch ( phrase_length ){
208 template<size_t phrase_length>
209 int PhraseArrayIndexLevel<phrase_length>::search(/* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
210 PhraseIndexItem<phrase_length> * chunk_begin, * chunk_end;
211 chunk_begin = (PhraseIndexItem<phrase_length> *)m_chunk.begin();
212 chunk_end = (PhraseIndexItem<phrase_length> *)m_chunk.end();
213 PhraseIndexItem<phrase_length> search_elem(phrase, -1);
216 std_lite::pair<PhraseIndexItem<phrase_length> *, PhraseIndexItem<phrase_length> *> range;
217 range = std_lite::equal_range(chunk_begin, chunk_end, search_elem, phrase_less_than<phrase_length>);
219 if ( range.first == range.second )
222 assert(range.second - range.first == 1);
223 token = range.first->m_token;
227 int PhraseBitmapIndexLevel::add_index( int phrase_length, /* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
228 guint16 first_key = phrase[0] & 0xFFFF;
229 PhraseLengthIndexLevel * & length_array = m_phrase_length_indexes[first_key];
230 if ( !length_array ){
231 length_array = new PhraseLengthIndexLevel();
233 return length_array->add_index(phrase_length, phrase, token);
236 int PhraseBitmapIndexLevel::remove_index( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
237 guint16 first_key = phrase[0] & 0xFFFF;
238 PhraseLengthIndexLevel * &length_array = m_phrase_length_indexes[first_key];
240 return length_array->remove_index(phrase_length, phrase, token);
241 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
244 int PhraseLengthIndexLevel::add_index( int phrase_length, /* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
245 if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
246 return ERROR_PHRASE_TOO_LONG;
248 if ( m_phrase_array_indexes -> len <= phrase_length )
249 g_array_set_size(m_phrase_array_indexes, phrase_length + 1);
251 #define CASE(len) case len: \
253 PhraseArrayIndexLevel<len> * &array = g_array_index \
254 (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, len); \
256 array = new PhraseArrayIndexLevel<len>; \
257 return array->add_index(phrase, token); \
260 switch(phrase_length){
284 int PhraseLengthIndexLevel::remove_index( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
285 if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
286 return ERROR_PHRASE_TOO_LONG;
288 if ( m_phrase_array_indexes -> len <= phrase_length )
289 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
290 #define CASE(len) case len: \
292 PhraseArrayIndexLevel<len> * &array = g_array_index \
293 (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, len); \
295 return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
296 return array->remove_index(phrase, token); \
299 switch(phrase_length){
322 template<size_t phrase_length>
323 int PhraseArrayIndexLevel<phrase_length>::add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
324 PhraseIndexItem<phrase_length> * buf_begin, * buf_end;
326 PhraseIndexItem<phrase_length> new_elem(phrase, token);
327 buf_begin = (PhraseIndexItem<phrase_length> *) m_chunk.begin();
328 buf_end = (PhraseIndexItem<phrase_length> *) m_chunk.end();
330 std_lite::pair<PhraseIndexItem<phrase_length> *, PhraseIndexItem<phrase_length> *> range;
331 range = std_lite::equal_range(buf_begin, buf_end, new_elem, phrase_less_than<phrase_length>);
333 assert(range.second - range.first <= 1);
334 if ( range.second - range.first == 1 )
335 return ERROR_INSERT_ITEM_EXISTS;
337 PhraseIndexItem<phrase_length> * cur_elem = range.first;
338 int offset = (cur_elem - buf_begin) *
339 sizeof(PhraseIndexItem<phrase_length>);
340 m_chunk.insert_content(offset, &new_elem,
341 sizeof(PhraseIndexItem<phrase_length> ));
345 template<size_t phrase_length>
346 int PhraseArrayIndexLevel<phrase_length>::remove_index(/* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
347 PhraseIndexItem<phrase_length> * buf_begin, * buf_end;
349 PhraseIndexItem<phrase_length> remove_elem(phrase, -1);
350 buf_begin = (PhraseIndexItem<phrase_length> *) m_chunk.begin();
351 buf_end = (PhraseIndexItem<phrase_length> *) m_chunk.end();
353 std_lite::pair<PhraseIndexItem<phrase_length> *, PhraseIndexItem<phrase_length> *> range;
354 range = std_lite::equal_range(buf_begin, buf_end, remove_elem, phrase_less_than<phrase_length>);
356 assert(range.second - range.first <= 1);
357 PhraseIndexItem<phrase_length> * cur_elem = range.first;
358 if ( range.first == range.second || cur_elem == buf_end)
359 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
361 token = cur_elem->m_token;
362 int offset = (cur_elem - buf_begin) *
363 sizeof(PhraseIndexItem<phrase_length>);
364 m_chunk.remove_content(offset, sizeof (PhraseIndexItem<phrase_length>));
368 bool PhraseLargeTable::load_text(FILE * infile){
371 phrase_token_t token;
374 while ( !feof(infile) ) {
375 fscanf(infile, "%s", pinyin);
376 fscanf(infile, "%s", phrase);
377 fscanf(infile, "%u", &token);
378 fscanf(infile, "%ld", &freq);
383 glong phrase_len = g_utf8_strlen(phrase, -1);
384 ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
385 add_index(phrase_len, new_phrase, token);
392 bool PhraseBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
395 char * buf_begin = (char *) chunk->begin();
396 table_offset_t phrase_begin, phrase_end;
397 table_offset_t * index = (table_offset_t *) (buf_begin + offset);
400 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
401 phrase_begin = phrase_end;
404 if ( phrase_begin == phrase_end ) //null pointer
406 PhraseLengthIndexLevel * phrases = new PhraseLengthIndexLevel;
407 m_phrase_length_indexes[i] = phrases;
408 phrases->load(chunk, phrase_begin, phrase_end - 1);
409 assert( phrase_end <= end );
410 assert( *(buf_begin + phrase_end - 1) == c_separate);
412 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
413 assert( c_separate == *(buf_begin + offset) );
417 bool PhraseBitmapIndexLevel::store(MemoryChunk * new_chunk,
418 table_offset_t offset,
419 table_offset_t & end){
420 table_offset_t phrase_end;
421 table_offset_t index = offset;
422 offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
424 new_chunk->set_content(offset, &c_separate, sizeof(char));
425 offset +=sizeof(char);
426 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
427 index += sizeof(table_offset_t);
428 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
429 PhraseLengthIndexLevel * phrases = m_phrase_length_indexes[i];
430 if ( !phrases ) { //null pointer
431 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
432 index += sizeof(table_offset_t);
435 phrases->store(new_chunk, offset, phrase_end); //has a end '#'
438 new_chunk->set_content(offset, &c_separate, sizeof(char));
439 offset += sizeof(char);
440 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
441 index += sizeof(table_offset_t);
447 bool PhraseLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
448 char * buf_begin = (char *) chunk->begin();
449 guint32 nindex = *((guint32 *)(buf_begin + offset));
450 table_offset_t * index = (table_offset_t *)
451 (buf_begin + offset + sizeof(guint32));
453 table_offset_t phrase_begin, phrase_end = *index;
454 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
455 for ( size_t i = 0; i < nindex; ++i) {
456 phrase_begin = phrase_end;
459 if ( phrase_begin == phrase_end ){
461 g_array_append_val(m_phrase_array_indexes, null);
465 #define CASE(len) case len: \
467 PhraseArrayIndexLevel<len> * phrase = new PhraseArrayIndexLevel<len>; \
468 phrase->load(chunk, phrase_begin, phrase_end - 1); \
469 assert( *(buf_begin + phrase_end - 1) == c_separate); \
470 assert( phrase_end <= end ); \
471 g_array_append_val(m_phrase_array_indexes, phrase); \
496 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
497 assert ( c_separate == * (buf_begin + offset) );
501 bool PhraseLengthIndexLevel::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
502 guint32 nindex = m_phrase_array_indexes->len;
503 new_chunk->set_content(offset, &nindex, sizeof(guint32));
504 table_offset_t index = offset + sizeof(guint32);
506 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
507 new_chunk->set_content(offset, &c_separate, sizeof(char));
508 offset += sizeof(char);
509 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
510 index += sizeof(table_offset_t);
512 table_offset_t phrase_end;
513 for ( size_t i = 0; i < m_phrase_array_indexes->len; ++i) {
514 #define CASE(len) case len: \
516 PhraseArrayIndexLevel<len> * phrase = g_array_index \
517 (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, i); \
519 new_chunk->set_content \
520 (index, &offset, sizeof(table_offset_t)); \
521 index += sizeof(table_offset_t); \
524 phrase->store(new_chunk, offset, phrase_end); \
525 offset = phrase_end; \
549 new_chunk->set_content(offset, &c_separate, sizeof(char));
550 offset += sizeof(char);
551 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
552 index += sizeof(table_offset_t);
560 template<size_t phrase_length>
561 bool PhraseArrayIndexLevel<phrase_length>::
562 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
563 char * buf_begin = (char *) chunk->begin();
564 m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
568 template<size_t phrase_length>
569 bool PhraseArrayIndexLevel<phrase_length>::
570 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
571 new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
572 end = offset + m_chunk.size();