3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 #include "novel_types.h"
25 #include "pinyin_base.h"
26 #include "pinyin_phrase.h"
27 #include "pinyin_large_table.h"
30 /* class definition */
// PinyinLengthIndexLevel: the index level that is selected by phrase length.
// The implementation in this file reads m_pinyin_array_indexes through
// g_array_index(..., PinyinArrayIndexLevel<len> *, len), i.e. it is treated as
// an array of pointers to per-length PinyinArrayIndexLevel objects, where the
// array position equals the template length.
34 class PinyinLengthIndexLevel{
36 GArray* m_pinyin_array_indexes;
38 PinyinLengthIndexLevel();
39 ~PinyinLengthIndexLevel();
// load/store work on a MemoryChunk addressed by byte offsets; `end` in store
// is passed by reference (presumably it receives the end offset of the data
// written -- confirm against the full implementation).
40 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
41 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
43 /*search/add_index method */
// In all three methods phrase_length picks the per-length sub-index and keys
// is the key sequence handled by that sub-index.
44 int search( int phrase_length, /* in */ PinyinCustomSettings * custom,
45 /* in */ PinyinKey keys[],
46 /* out */ PhraseIndexRanges ranges);
47 int add_index( int phrase_length, /* in */ PinyinKey keys[], /* in */ phrase_token_t token);
48 int remove_index( int phrase_length, /* in */ PinyinKey keys[], /* in */ phrase_token_t token);
// PinyinArrayIndexLevel<phrase_length>: leaf index for key sequences of a
// fixed, compile-time length. The implementation in this file keeps its
// entries as PinyinIndexItem<phrase_length> records inside a member m_chunk
// (the member declaration itself is not visible in this excerpt).
51 template<size_t phrase_length>
52 class PinyinArrayIndexLevel{
// convert turns the candidate item range [begin, end) into token ranges
// appended to `ranges`.
55 int convert(PinyinCustomSettings * custom,
57 PinyinIndexItem<phrase_length> * begin,
58 PinyinIndexItem<phrase_length> * end,
59 PhraseIndexRanges ranges);
// load/store address the backing MemoryChunk by byte offsets; store reports
// the end offset of the written data through `end`.
61 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
62 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
64 /*search/add_index method */
65 int search(/* in */ PinyinCustomSettings * custom,
66 /* in */ PinyinKey keys[],
67 /* out */ PhraseIndexRanges ranges);
68 int add_index(/* in */ PinyinKey keys[], /* in */ phrase_token_t token);
69 int remove_index(/* in */ PinyinKey keys[], /* in */ phrase_token_t token);
74 using namespace pinyin;
76 /* class implementation */
// Constructs the bitmap level. The visible body zero-fills the whole
// m_pinyin_length_indexes table, so every [initial][final][tone] slot starts
// out as an all-zero (null) pointer.
// NOTE(review): how `custom` is stored is not visible in this excerpt;
// presumably it initialises m_custom, which the search methods dereference.
78 PinyinBitmapIndexLevel::PinyinBitmapIndexLevel(PinyinCustomSettings * custom)
80 memset(m_pinyin_length_indexes, 0, sizeof(m_pinyin_length_indexes));
// Walks every (initial, final, tone) combination and fetches the
// PinyinLengthIndexLevel pointer stored in that slot.
// NOTE(review): what is done with each pointer is not visible in this
// excerpt; presumably it is deleted and the slot cleared -- confirm.
83 void PinyinBitmapIndexLevel::reset(){
84 for ( int k = PINYIN_ZeroInitial; k < PINYIN_Number_Of_Initials; k++)
85 for ( int m = PINYIN_ZeroFinal; m < PINYIN_Number_Of_Finals; m++)
86 for ( int n = PINYIN_ZeroTone; n < PINYIN_Number_Of_Tones; n++){
87 PinyinLengthIndexLevel * length_array =
88 m_pinyin_length_indexes[k][m][n];
// Public search entry point for the bitmap level.
// phrase_length: number of keys in `keys`; asserted to be positive.
// keys:          the pinyin keys to look up.
// ranges:        output, filled by the lower levels.
// Returns whatever initial_level_search returns (a SEARCH_* result value).
94 int PinyinBitmapIndexLevel::search( int phrase_length, /* in */ PinyinKey keys[],
95 /* out */ PhraseIndexRanges ranges) const{
96 assert(phrase_length > 0);
97 return initial_level_search(phrase_length, keys, ranges);
// First stage of the lookup: dispatches on the initial of keys[0] and
// forwards to final_level_search, widening the search to a second initial
// when the matching ambiguity option is enabled in the custom settings.
// Results of the individual searches are OR-ed together into `result`.
100 int PinyinBitmapIndexLevel::initial_level_search(int phrase_length,
101 /* in */PinyinKey keys[],
102 /* out */ PhraseIndexRanges ranges) const{
// MATCH(AMBIGUITY, ORIGIN, ANOTHER) expands to a `case ORIGIN:` that searches
// with the key's own initial and, if custom.use_ambiguities[AMBIGUITY] is
// set, additionally searches with the initial ANOTHER.
104 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
106 result |= final_level_search((PinyinInitial)first_key.m_initial,\
107 phrase_length, keys, ranges); \
108 if ( custom.use_ambiguities [AMBIGUITY] ){ \
109 result |= final_level_search(ANOTHER, \
110 phrase_length, keys, ranges); \
115 //deal with the ambiguities
117 int result = SEARCH_NONE;
118 PinyinKey& first_key = keys[0];
119 PinyinCustomSettings & custom= *m_custom;
121 switch(first_key.m_initial){
// Each ambiguity is listed in both directions (e.g. Zi->Zhi and Zhi->Zi).
123 MATCH(PINYIN_AmbZhiZi, PINYIN_Zi, PINYIN_Zhi);
124 MATCH(PINYIN_AmbZhiZi, PINYIN_Zhi, PINYIN_Zi);
125 MATCH(PINYIN_AmbChiCi, PINYIN_Ci, PINYIN_Chi);
126 MATCH(PINYIN_AmbChiCi, PINYIN_Chi, PINYIN_Ci);
127 MATCH(PINYIN_AmbShiSi, PINYIN_Si, PINYIN_Shi);
128 MATCH(PINYIN_AmbShiSi, PINYIN_Shi, PINYIN_Si);
129 MATCH(PINYIN_AmbLeRi, PINYIN_Ri, PINYIN_Le);
130 MATCH(PINYIN_AmbNeLe, PINYIN_Ne, PINYIN_Le);
131 MATCH(PINYIN_AmbFoHe, PINYIN_Fo, PINYIN_He);
132 MATCH(PINYIN_AmbFoHe, PINYIN_He, PINYIN_Fo);
133 MATCH(PINYIN_AmbGeKe, PINYIN_Ge, PINYIN_Ke);
134 MATCH(PINYIN_AmbGeKe, PINYIN_Ke, PINYIN_Ge);
// NOTE(review): the case label of this hand-written branch is not visible
// here. Because it takes part in two ambiguities (it also tries PINYIN_Ri and
// PINYIN_Ne), it is presumably the PINYIN_Le case -- confirm.
138 result |= final_level_search((PinyinInitial)first_key.m_initial,
139 phrase_length, keys, ranges);
140 if ( custom.use_ambiguities [PINYIN_AmbLeRi] )
141 result |= final_level_search(PINYIN_Ri, phrase_length,
143 if ( custom.use_ambiguities [PINYIN_AmbNeLe] )
144 result |= final_level_search(PINYIN_Ne, phrase_length,
// Remaining path: search with the key's own initial only (presumably the
// default branch; its label is not visible in this excerpt).
150 return final_level_search((PinyinInitial)first_key.m_initial,
// Second stage of the lookup: for a fixed `initial`, dispatches on the final
// of keys[0] and forwards to tone_level_search, widening the search to a
// second final when the matching ambiguity option is enabled.
158 int PinyinBitmapIndexLevel::final_level_search(PinyinInitial initial,
160 /* in */PinyinKey keys[],
161 /* out */ PhraseIndexRanges ranges) const{
// MATCH(AMBIGUITY, ORIGIN, ANOTHER) expands to a `case ORIGIN:` that searches
// with the key's own final (plain assignment to `result`, which is still
// SEARCH_NONE at that point) and, if custom.use_ambiguities[AMBIGUITY] is
// set, OR-s in a search with the final ANOTHER.
162 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
164 result = tone_level_search(initial,(PinyinFinal) first_key.m_final,\
165 phrase_length, keys, ranges); \
166 if ( custom.use_ambiguities [AMBIGUITY] ){ \
167 result |= tone_level_search(initial, ANOTHER, \
168 phrase_length, keys, ranges); \
173 int result = SEARCH_NONE;
174 PinyinKey& first_key = keys[0];
175 PinyinCustomSettings & custom= *m_custom;
177 switch(first_key.m_final){
// Key without a final: when the use_incomplete option is on, every final
// from PINYIN_A upwards is tried and the results are OR-ed together.
// NOTE(review): the statement guarded by `!custom.use_incomplete` is not
// visible here; presumably it returns early -- confirm.
178 case PINYIN_ZeroFinal:
180 if (!custom.use_incomplete )
182 for ( int i = PINYIN_A; i < PINYIN_Number_Of_Finals; ++i){
183 result |= tone_level_search(initial,(PinyinFinal)i ,
184 phrase_length, keys, ranges);
// Front/back nasal ambiguities, each listed in both directions.
189 MATCH(PINYIN_AmbAnAng, PINYIN_An, PINYIN_Ang);
190 MATCH(PINYIN_AmbAnAng, PINYIN_Ang, PINYIN_An);
191 MATCH(PINYIN_AmbEnEng, PINYIN_En, PINYIN_Eng);
192 MATCH(PINYIN_AmbEnEng, PINYIN_Eng, PINYIN_En);
193 MATCH(PINYIN_AmbInIng, PINYIN_In, PINYIN_Ing);
194 MATCH(PINYIN_AmbInIng, PINYIN_Ing, PINYIN_In);
// Remaining path: search with the key's own final only.
198 return tone_level_search(initial,(PinyinFinal)first_key.m_final,
199 phrase_length, keys, ranges);
// Third stage of the lookup: for a fixed initial and final, selects the
// PinyinLengthIndexLevel slot(s) by tone and searches them with
// phrase_length - 1 (the first key has been consumed by this bitmap level).
//  - Key tone is PINYIN_ZeroTone: every tone slot is searched and the results
//    are OR-ed together.
//  - Otherwise two slots are searched: one whose tone index is not visible in
//    this excerpt (presumably the PINYIN_ZeroTone slot -- confirm) and the
//    slot of the key's own tone.
// NOTE(review): no null check on `phrases` is visible before the ->search
// calls; slots can be null (the table is zero-filled in the constructor), so
// confirm the guards exist on the lines not shown here.
205 int PinyinBitmapIndexLevel::tone_level_search(PinyinInitial initial,
208 /* in */PinyinKey keys[],
209 /* out */ PhraseIndexRanges ranges) const{
210 int result = SEARCH_NONE;
211 PinyinKey& first_key = keys[0];
212 PinyinCustomSettings & custom= *m_custom;
214 switch ( first_key.m_tone ){
215 case PINYIN_ZeroTone:
217 //deal with ZeroTone in pinyin table files.
218 for ( int i = PINYIN_ZeroTone; i < PINYIN_Number_Of_Tones; ++i){
219 PinyinLengthIndexLevel * phrases =
220 m_pinyin_length_indexes[initial][final][(PinyinTone)i];
222 result |= phrases->search(phrase_length - 1, &custom,
229 PinyinLengthIndexLevel * phrases =
230 m_pinyin_length_indexes[initial][final]
233 result = phrases->search(phrase_length - 1, &custom,
235 phrases = m_pinyin_length_indexes[initial][final]
236 [(PinyinTone) first_key.m_tone];
238 result |= phrases->search(phrase_length - 1, &custom,
// Creates the empty per-length array. g_array_new arguments: not
// zero-terminated (FALSE), newly added elements are cleared to 0 (TRUE),
// element size is that of a pointer.
246 PinyinLengthIndexLevel::PinyinLengthIndexLevel(){
247 m_pinyin_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
// Destructor. Iterates over every element of m_pinyin_array_indexes; the
// CASE(len) helper fetches element `len` typed as PinyinArrayIndexLevel<len> *.
// Finally the GArray itself is released, including its element storage
// (second argument TRUE).
// NOTE(review): what is done with each fetched pointer is not visible in
// this excerpt; presumably it is deleted -- confirm.
250 PinyinLengthIndexLevel::~PinyinLengthIndexLevel(){
251 #define CASE(len) case len: \
253 PinyinArrayIndexLevel<len> * array = g_array_index \
254 (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \
259 for ( size_t i = 0 ; i < m_pinyin_array_indexes->len; ++i){
281 g_array_free(m_pinyin_array_indexes, TRUE);
// Searches the sub-index for key sequences of exactly `phrase_length` keys.
// phrase_length: index into m_pinyin_array_indexes and template length of the
//                PinyinArrayIndexLevel that is consulted.
// custom, keys:  forwarded to the selected PinyinArrayIndexLevel.
// ranges:        output, filled by the selected PinyinArrayIndexLevel.
// Returns a bit set of SEARCH_* flags.
285 int PinyinLengthIndexLevel::search( int phrase_length,
286 /* in */ PinyinCustomSettings * custom,
287 /* in */ PinyinKey keys[],
288 /* out */ PhraseIndexRanges ranges){
289 int result = SEARCH_NONE;
// Fewer than phrase_length + 1 slots means there is no sub-index for this
// length. (The guarded statement is not visible here; presumably an early
// return of `result`.)
// NOTE(review): GArray::len is a guint, so these comparisons mix unsigned
// and int operands.
290 if (m_pinyin_array_indexes->len < phrase_length + 1)
// More slots than phrase_length + 1: longer sub-indexes exist, which is
// reported to the caller through the SEARCH_CONTINUED flag.
292 if (m_pinyin_array_indexes->len > phrase_length + 1)
293 result |= SEARCH_CONTINUED;
// CASE(len): fetch slot `len` as PinyinArrayIndexLevel<len> * and OR its
// search result into `result`.
295 #define CASE(len) case len: \
297 PinyinArrayIndexLevel<len> * array = g_array_index \
298 (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \
301 result |= array->search(custom, keys, ranges); \
305 switch ( phrase_length ){
// Looks up `keys` in this fixed-length index.
// The records stored in m_chunk are viewed as an array of
// PinyinIndexItem<phrase_length>. Lower and upper bound key sequences are
// derived from `keys` under the custom settings, the candidate window is
// located with lower_bound/upper_bound using PhraseExactLessThan, and the
// window is handed to convert(), whose result is returned.
328 template<size_t phrase_length>
329 int PinyinArrayIndexLevel<phrase_length>::search(/* in */ PinyinCustomSettings * custom, /* in */ PinyinKey keys[], /* out */ PhraseIndexRanges ranges){
330 PhraseExactLessThan<phrase_length> m_lessthan;
331 PinyinIndexItem<phrase_length> * chunk_begin, * chunk_end;
332 chunk_begin = (PinyinIndexItem<phrase_length> *)m_chunk.begin();
333 chunk_end = (PinyinIndexItem<phrase_length> *)m_chunk.end();
// NOTE(review): if this template is instantiated with phrase_length == 0
// (the CASE labels are not visible in this excerpt), these would be
// zero-length arrays, which standard C++ does not permit -- confirm.
335 PinyinKey left_keys[phrase_length], right_keys[phrase_length];
336 compute_lower_value(*custom, keys, left_keys, phrase_length);
337 compute_upper_value(*custom, keys, right_keys, phrase_length);
// The token field of the two probe items is set to -1; only their keys are
// meant to take part in the comparison (assuming PhraseExactLessThan compares
// keys only -- its definition is not in this file).
338 PinyinIndexItem<phrase_length> left(left_keys, -1), right(right_keys, -1);
340 PinyinIndexItem<phrase_length> * begin = std_lite::lower_bound(chunk_begin, chunk_end, left, m_lessthan);
341 PinyinIndexItem<phrase_length> * end = std_lite::upper_bound(chunk_begin, chunk_end, right, m_lessthan);
343 return convert(custom, keys, begin, end, ranges);
// Converts the candidate items in [begin, end) into token ranges.
// Each item is checked with pinyin_compare_with_ambiguities against `keys`
// (the surrounding condition is not visible in this excerpt; presumably
// non-matching items are skipped). Tokens of accepted items are coalesced
// into half-open ranges [m_range_begin, m_range_end) held in `cursor`; a
// finished range is appended to the GArray taken from
// ranges[PHRASE_INDEX_LIBRARY_INDEX(token)].
346 template<size_t phrase_length>
347 int PinyinArrayIndexLevel<phrase_length>::convert(PinyinCustomSettings * custom, PinyinKey keys[], PinyinIndexItem<phrase_length> * begin, PinyinIndexItem<phrase_length> * end, PhraseIndexRanges ranges){
348 PinyinIndexItem<phrase_length> * iter;
349 PhraseIndexRange cursor;
350 GArray * head, *cursor_head = NULL;
351 int result = SEARCH_NONE;
// -1 in m_range_begin marks "no range started yet".
352 cursor.m_range_begin = -1; cursor.m_range_end = -1;
353 for ( iter = begin; iter != end; ++iter){
355 pinyin_compare_with_ambiguities
356 (*custom, keys, iter->m_keys, phrase_length))
358 phrase_token_t token = iter->m_token;
359 head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
// First accepted token: start a new one-element range.
365 if ( cursor.m_range_begin == (phrase_token_t) -1 ){
366 cursor.m_range_begin = token;
367 cursor.m_range_end = token + 1;
// Token directly continues the current range within the same library index:
// extend the range by one.
369 }else if (cursor.m_range_end == token &&
370 PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_end) ==
371 PHRASE_INDEX_LIBRARY_INDEX(token) ){
372 cursor.m_range_end++;
// Otherwise: flush the finished range and start a new one at this token.
374 g_array_append_val(cursor_head, cursor);
375 cursor.m_range_begin = token; cursor.m_range_end = token + 1;
// After the loop: if no range was ever started there is nothing to flush
// (the guarded statement is not visible here); otherwise the last open range
// is appended.
379 if ( cursor.m_range_begin == (phrase_token_t) -1 )
382 g_array_append_val(cursor_head, cursor);
// Adds `token` under the key sequence `keys` (phrase_length keys).
// The first key selects the [initial][final][tone] slot; the slot's
// PinyinLengthIndexLevel is created on demand. The remaining keys (keys + 1,
// phrase_length - 1 of them) are passed on to that level, whose return value
// is returned unchanged.
386 int PinyinBitmapIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[], /* in */ phrase_token_t token){
387 PinyinKey first_key = keys[0];
// Reference to the slot itself, so the assignment below stores the new
// object in the table.
388 PinyinLengthIndexLevel * &length_array =
389 m_pinyin_length_indexes[first_key.m_initial][first_key.m_final][first_key.m_tone];
390 if ( !length_array ){
391 length_array = new PinyinLengthIndexLevel();
393 return length_array->add_index(phrase_length - 1, keys + 1, token);
// Removes `token` from under the key sequence `keys` (phrase_length keys).
// The first key selects the [initial][final][tone] slot; the remaining keys
// are passed on to that slot's PinyinLengthIndexLevel. If the call is not
// made, REMOVE_ITEM_DONOT_EXISTS is returned.
// NOTE(review): the condition guarding the forwarding call is not visible in
// this excerpt; presumably it is a non-null check on length_array -- confirm.
396 int PinyinBitmapIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[], /* in */ phrase_token_t token){
397 PinyinKey first_key = keys[0];
398 PinyinLengthIndexLevel * &length_array =
399 m_pinyin_length_indexes[first_key.m_initial][first_key.m_final][first_key.m_tone];
401 return length_array->remove_index(phrase_length - 1, keys + 1, token);
402 return REMOVE_ITEM_DONOT_EXISTS;
// Adds `token` under `keys` in the sub-index for `phrase_length` keys.
// phrase_length + 1 must stay below MAX_PHRASE_LENGTH (asserted).
// Returns the result of PinyinArrayIndexLevel<len>::add_index.
405 int PinyinLengthIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[], /* in */ phrase_token_t token){
406 assert(phrase_length + 1 < MAX_PHRASE_LENGTH);
// Grow the slot array so that index phrase_length exists; the array was
// created with clear_ = TRUE in the constructor, so new slots start as 0.
407 if ( m_pinyin_array_indexes -> len <= phrase_length )
408 g_array_set_size(m_pinyin_array_indexes, phrase_length + 1);
// CASE(len): take a reference to slot `len`, create the
// PinyinArrayIndexLevel<len> (the guarding condition is not visible here;
// presumably only when the slot is still null), then forward the call.
409 #define CASE(len) case len: \
411 PinyinArrayIndexLevel<len> * &array = g_array_index \
412 (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \
414 array = new PinyinArrayIndexLevel<len>; \
415 return array->add_index(keys, token); \
417 switch(phrase_length){
// Removes `token` from under `keys` in the sub-index for `phrase_length` keys.
// phrase_length + 1 must stay below MAX_PHRASE_LENGTH (asserted).
// Returns REMOVE_ITEM_DONOT_EXISTS when no slot exists for this length,
// otherwise the result of PinyinArrayIndexLevel<len>::remove_index.
440 int PinyinLengthIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[], /* in */ phrase_token_t token){
441 assert(phrase_length + 1 < MAX_PHRASE_LENGTH);
// The slot array is too short to contain an entry for this length.
442 if ( m_pinyin_array_indexes -> len <= phrase_length )
443 return REMOVE_ITEM_DONOT_EXISTS;
// CASE(len): fetch slot `len`; the first return is guarded by a condition
// that is not visible here (presumably a null-slot check), the second one
// forwards the removal to the sub-index.
444 #define CASE(len) case len: \
446 PinyinArrayIndexLevel<len> * &array = g_array_index \
447 (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \
449 return REMOVE_ITEM_DONOT_EXISTS; \
450 return array->remove_index(keys, token); \
452 switch(phrase_length){
// Inserts the record (keys, token) into the record array held in m_chunk.
// Returns INSERT_ITEM_EXISTS if a record with equal keys and the same token
// is already present. The success return value is not visible in this
// excerpt.
475 template<size_t phrase_length>
476 int PinyinArrayIndexLevel<phrase_length>::add_index(/* in */ PinyinKey keys[], /* in */ phrase_token_t token){
477 PhraseExactLessThan<phrase_length> m_lessthan;
478 PinyinIndexItem<phrase_length> * buf_begin, * buf_end;
480 PinyinIndexItem<phrase_length> new_elem(keys, token);
481 buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin();
482 buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end();
// Records that compare equal to new_elem under PhraseExactLessThan.
484 std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range;
485 range = std_lite::equal_range(buf_begin, buf_end, new_elem, m_lessthan);
// Scan the equal-key records for a duplicate token and for the first record
// with a larger token (the statement taken in that case is not visible here;
// presumably the loop stops so the new record is inserted before it).
487 PinyinIndexItem<phrase_length> * cur_elem;
488 for ( cur_elem = range.first;
489 cur_elem != range.second; ++cur_elem){
490 if ( cur_elem->m_token == token )
491 return INSERT_ITEM_EXISTS;
492 if ( cur_elem->m_token > token )
// Byte offset of the insertion point inside m_chunk.
496 int offset = (cur_elem - buf_begin) *
497 sizeof(PinyinIndexItem<phrase_length>);
498 m_chunk.insert_content(offset, &new_elem,
499 sizeof ( PinyinIndexItem<phrase_length> ));
// Removes the record (keys, token) from the record array held in m_chunk.
// Returns REMOVE_ITEM_DONOT_EXISTS when the record located by the scan does
// not carry `token`. The success return value is not visible in this excerpt.
503 template<size_t phrase_length>
504 int PinyinArrayIndexLevel<phrase_length>::remove_index(/* in */ PinyinKey keys[], /* in */ phrase_token_t token){
505 PhraseExactLessThan<phrase_length> m_lessthan;
506 PinyinIndexItem<phrase_length> * buf_begin, * buf_end;
508 PinyinIndexItem<phrase_length> remove_elem(keys, token);
509 buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin();
510 buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end();
// Records that compare equal to remove_elem under PhraseExactLessThan.
512 std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range;
513 range = std_lite::equal_range(buf_begin, buf_end, remove_elem, m_lessthan);
// Look for the record carrying `token` among the equal-key records.
515 PinyinIndexItem<phrase_length> * cur_elem;
516 for ( cur_elem = range.first;
517 cur_elem != range.second; ++cur_elem){
518 if ( cur_elem->m_token == token )
// NOTE(review): if the loop finishes without a match, cur_elem equals
// range.second, which may be buf_end (one past the last record); the check
// below then reads m_token outside the equal range. Comparing
// cur_elem == range.second first would avoid that -- confirm against the
// lines not visible in this excerpt.
521 if (cur_elem->m_token != token )
522 return REMOVE_ITEM_DONOT_EXISTS;
// Byte offset of the record to drop inside m_chunk.
524 int offset = (cur_elem - buf_begin) *
525 sizeof(PinyinIndexItem<phrase_length>);
526 m_chunk.remove_content(offset, sizeof (PinyinIndexItem<phrase_length>));
// Builds the index from a text table read from `infile`.
// Each record consists of four whitespace-separated fields read in this
// order: pinyin string, phrase string, token (unsigned), frequency (long).
// The pinyin string is parsed into PinyinKeys and the token is registered
// under those keys with add_index. The phrase and frequency fields are not
// used in the lines visible in this excerpt.
530 bool PinyinLargeTable::load_text(FILE * infile){
533 phrase_token_t token;
// NOTE(review): in the visible lines the fscanf return values are not
// checked, and "%s" carries no field width, so an over-long field could
// overflow the `pinyin`/`phrase` buffers (their declarations are not visible
// here). feof() only becomes true after a read has failed -- confirm that
// the lines not shown handle the final, failed read.
536 while ( !feof(infile) ) {
537 fscanf(infile, "%s", pinyin);
538 fscanf(infile, "%s", phrase);
539 fscanf(infile, "%u", &token);
540 fscanf(infile, "%ld", &freq);
545 PinyinDefaultParser parser;
546 NullPinyinValidator validator;
547 PinyinKeyVector keys;
548 PinyinKeyPosVector poses;
// Temporary arrays for the parsed keys and their positions; both are freed
// again at the end of the iteration.
550 keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
551 poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
552 parser.parse(validator, keys, poses, pinyin);
// The number of parsed keys is used as the phrase length.
554 add_index( keys->len, (PinyinKey *)keys->data, token);
556 g_array_free(keys, true);
557 g_array_free(poses, true);
// Loads the bitmap level from `chunk`, starting at byte `offset`.
// The data begins with a table of table_offset_t entries (`index`). For every
// (initial, final, tone) slot a [phrase_begin, phrase_end) byte range is
// taken from it, where each slot's begin is the previous slot's end. An empty
// range means the slot has no data; otherwise a PinyinLengthIndexLevel is
// created, stored in the slot and loaded from that range.
// NOTE(review): the lines that first initialise phrase_end and that advance
// `index` are not visible in this excerpt.
562 bool PinyinBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
565 char * buf_begin = (char *) chunk->begin();
566 table_offset_t phrase_begin, phrase_end;
567 table_offset_t * index = (table_offset_t *) (buf_begin + offset);
569 for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m )
570 for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n)
571 for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){
572 phrase_begin = phrase_end;
575 if ( phrase_begin == phrase_end ) //null pointer
577 PinyinLengthIndexLevel * phrases = new PinyinLengthIndexLevel;
578 m_pinyin_length_indexes[m][n][k] = phrases;
// The last byte of each range is the separator character, hence the
// `phrase_end - 1` passed as the end of the payload.
579 phrases->load(chunk, phrase_begin, phrase_end - 1);
580 assert( phrase_end <= end );
581 assert( *(buf_begin + phrase_end - 1) == c_separate);
// Skip the offset table (one entry per slot plus one) and check that a
// separator character follows it.
583 offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof (table_offset_t);
584 assert( c_separate == *(buf_begin + offset) );
// Serialises the bitmap level into `new_chunk`, starting at byte `offset`.
// Layout written by the visible code:
//   1. an offset table with one table_offset_t per (initial, final, tone)
//      slot plus one extra entry,
//   2. one separator character (c_separate),
//   3. the data of every non-null slot, each followed by a separator.
// `index` is the write position inside the offset table, `offset` the write
// position inside the data area.
// NOTE(review): the assignment to `end` and the return statement are not
// visible in this excerpt.
588 bool PinyinBitmapIndexLevel::store(MemoryChunk * new_chunk,
589 table_offset_t offset,
590 table_offset_t & end){
591 table_offset_t phrase_end;
592 table_offset_t index = offset;
// Reserve room for the offset table.
593 offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t);
595 new_chunk->set_content(offset, &c_separate, sizeof(char));
596 offset += sizeof(char);
// First table entry: start of the data area.
597 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
598 index += sizeof(table_offset_t);
599 for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m)
600 for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n)
601 for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k) {
602 PinyinLengthIndexLevel * phrases = m_pinyin_length_indexes[m][n][k];
// Empty slot: repeat the current offset, so begin == end for this slot (the
// condition PinyinBitmapIndexLevel::load treats as "no data").
603 if ( !phrases ) { //null pointer
604 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
605 index += sizeof(table_offset_t);
608 phrases->store(new_chunk, offset, phrase_end); //has a end '#'
// Terminate the slot's data with a separator and record where it ends.
611 new_chunk->set_content(offset, &c_separate, sizeof(char));
612 offset += sizeof(char);
613 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
614 index += sizeof(table_offset_t);
// Loads the length level from `chunk`, starting at byte `offset`.
// The data begins with a guint32 count (`nindex`) followed by a table of
// table_offset_t entries. For each of the nindex slots a
// [phrase_begin, phrase_end) byte range is taken from the table, each slot's
// begin being the previous slot's end. An empty range appends a null entry;
// otherwise a PinyinArrayIndexLevel<len> is created, loaded from the range
// and appended.
620 bool PinyinLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
621 char * buf_begin = (char *) chunk->begin();
622 guint32 nindex = *((guint32 *)(buf_begin + offset));
623 table_offset_t * index = (table_offset_t *)
624 (buf_begin + offset + sizeof(guint32));
626 table_offset_t phrase_begin, phrase_end = *index;
// NOTE(review): the constructor already allocates m_pinyin_array_indexes and
// no g_array_free of the old array is visible before this assignment --
// possible leak of the constructor's GArray; confirm.
627 m_pinyin_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
628 for ( size_t i = 0; i < nindex; ++i) {
629 phrase_begin = phrase_end;
632 if ( phrase_begin == phrase_end ){
634 g_array_append_val(m_pinyin_array_indexes, null);
// CASE(len): the last byte of the range is the separator character, so the
// payload passed to the sub-index ends at phrase_end - 1.
638 #define CASE(len) case len: \
640 PinyinArrayIndexLevel<len> * phrase = new PinyinArrayIndexLevel<len>; \
641 phrase->load(chunk, phrase_begin, phrase_end - 1); \
642 assert( *(buf_begin + phrase_end - 1) == c_separate); \
643 assert( phrase_end <= end ); \
644 g_array_append_val(m_pinyin_array_indexes, phrase); \
670 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
671 assert ( c_separate == * (buf_begin + offset) );
// Serialises the length level into `new_chunk`, starting at byte `offset`.
// Layout written by the visible code:
//   1. a guint32 holding the number of slots (nindex),
//   2. an offset table with nindex + 1 table_offset_t entries,
//   3. one separator character (c_separate),
//   4. the data of the slots, followed by a separator.
// `index` is the write position inside the offset table, `offset` the write
// position inside the data area.
// NOTE(review): the assignment to `end` and the return statement are not
// visible in this excerpt.
675 bool PinyinLengthIndexLevel::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
676 guint32 nindex = m_pinyin_array_indexes->len;
677 new_chunk->set_content(offset, &nindex, sizeof(guint32));
678 table_offset_t index = offset + sizeof(guint32);
// Reserve room for the count and the offset table.
680 offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
681 new_chunk->set_content(offset, &c_separate, sizeof(char));
682 offset += sizeof(char);
// First table entry: start of the data area.
683 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
684 index += sizeof(table_offset_t);
686 table_offset_t phrase_end;
687 for ( size_t i = 0 ; i < m_pinyin_array_indexes->len; ++i) {
688 #define CASE(len) case len: \
690 PinyinArrayIndexLevel<len> * phrase = g_array_index \
691 (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> * , i); \
693 new_chunk->set_content \
694 (index, &offset, sizeof(table_offset_t)); \
695 index += sizeof(table_offset_t); \
698 phrase->store(new_chunk, offset, phrase_end); \
699 offset = phrase_end; \
723 new_chunk->set_content(offset, &c_separate, sizeof(char));
724 offset += sizeof(char);
725 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
726 index += sizeof(table_offset_t);
// Points m_chunk at the byte range [offset, end) of `chunk`'s buffer.
// NOTE(review): the NULL third argument to set_chunk presumably means that
// m_chunk does not take ownership of the memory -- confirm in MemoryChunk.
// The return statement is not visible in this excerpt.
734 template<size_t phrase_length>
735 bool PinyinArrayIndexLevel<phrase_length>::
736 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
737 char * buf_begin = (char *) chunk->begin();
738 m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
// Copies the full content of m_chunk into `new_chunk` at byte `offset` and
// reports the first byte after the copied data through `end`.
// The return statement is not visible in this excerpt.
742 template<size_t phrase_length>
743 bool PinyinArrayIndexLevel<phrase_length>::
744 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
745 new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
746 end = offset + m_chunk.size();