begin to write import iterator
[platform/upstream/libpinyin.git] / src / storage / phrase_large_table.cpp
1 /* 
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *  
5  *  Copyright (C) 2010 Peng Wu
6  *  
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  *  
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20  */
21
22 #include <assert.h>
23 #include <string.h>
24 #include "phrase_large_table.h"
25
26
27 /* class definition */
28
29 namespace pinyin{
30
31 class PhraseLengthIndexLevel{
32 protected:
33     GArray* m_phrase_array_indexes;
34 public:
35     PhraseLengthIndexLevel();
36     ~PhraseLengthIndexLevel();
37
38     /* load/store method */
39     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
40     bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
41
42     /* search/add_index/remove_index method */
43     int search( int phrase_length, /* in */ ucs4_t phrase[],
44                 /* out */ phrase_token_t & token);
45
46     int add_index( int phrase_length, /* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
47     int remove_index( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token);
48 };
49
50 template<size_t phrase_length>
51 class PhraseArrayIndexLevel{
52 protected:
53     MemoryChunk m_chunk;
54 public:
55     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
56     bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
57
58     /* search/add_index/remove_index method */
59     int search( /* in */ ucs4_t phrase[],
60                 /* out */ phrase_token_t & token);
61
62     int add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
63     int remove_index(/* in */ ucs4_t phrase[], /* out */ phrase_token_t & token);
64 };
65
66 };
67
68 using namespace pinyin;
69
70 /* class implementation */
71
72 template<size_t phrase_length>
73 struct PhraseIndexItem{
74     phrase_token_t m_token;
75     ucs4_t m_phrase[phrase_length];
76 public:
77     PhraseIndexItem<phrase_length>(ucs4_t phrase[], phrase_token_t token){
78         memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
79         m_token = token;
80     }
81 };
82
83 template<size_t phrase_length>
84 static int phrase_compare(const PhraseIndexItem<phrase_length> &lhs,
85                           const PhraseIndexItem<phrase_length> &rhs){
86     ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
87     ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
88
89     return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
90 }
91
92 template<size_t phrase_length>
93 static bool phrase_less_than(const PhraseIndexItem<phrase_length> & lhs,
94                              const PhraseIndexItem<phrase_length> & rhs){
95     return 0 > phrase_compare(lhs, rhs);
96 }
97
98 PhraseBitmapIndexLevel::PhraseBitmapIndexLevel(){
99     memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
100 }
101
102 void PhraseBitmapIndexLevel::reset(){
103     for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
104         PhraseLengthIndexLevel * length_array =
105             m_phrase_length_indexes[i];
106         if ( length_array )
107             delete length_array;
108     }
109 }
110
111 int PhraseBitmapIndexLevel::search( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
112     assert(phrase_length > 0);
113
114     int result = SEARCH_NONE;
115     /* use the lower 16-bit for bitmap index,
116      * as most the higher 16-bit are zero.
117      */
118     guint16 first_key = phrase[0] & 0xFFFF;
119
120     PhraseLengthIndexLevel * phrase_array = m_phrase_length_indexes[first_key];
121     if ( phrase_array )
122         return phrase_array->search(phrase_length, phrase, token);
123     return result;
124 }
125
126 PhraseLengthIndexLevel::PhraseLengthIndexLevel(){
127     m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
128 }
129
130 PhraseLengthIndexLevel::~PhraseLengthIndexLevel(){
131 #define CASE(x) case x:                                                 \
132     {                                                                   \
133         PhraseArrayIndexLevel<x> * array =  g_array_index               \
134             (m_phrase_array_indexes, PhraseArrayIndexLevel<x> *, x);    \
135         if ( array )                                                    \
136             delete array;                                               \
137         break;                                                          \
138     }
139
140     for ( size_t i = 0 ; i < m_phrase_array_indexes->len; ++i){
141         switch (i){
142             CASE(0);
143             CASE(1);
144             CASE(2);
145             CASE(3);
146             CASE(4);
147             CASE(5);
148             CASE(6);
149             CASE(7);
150             CASE(8);
151             CASE(9);
152             CASE(10);
153             CASE(11);
154             CASE(12);
155             CASE(13);
156             CASE(14);
157             CASE(15);
158         default:
159             assert(false);
160         }
161     }
162     g_array_free(m_phrase_array_indexes, TRUE);
163 #undef CASE
164 }
165
166 int PhraseLengthIndexLevel::search(int phrase_length,
167                                    /* in */ ucs4_t phrase[],
168                                    /* out */ phrase_token_t & token){
169     int result = SEARCH_NONE;
170     if(m_phrase_array_indexes->len < phrase_length + 1)
171         return result;
172     if (m_phrase_array_indexes->len > phrase_length + 1)
173         result |= SEARCH_CONTINUED;
174
175 #define CASE(len) case len:                                             \
176     {                                                                   \
177         PhraseArrayIndexLevel<len> * array = g_array_index              \
178             (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, len); \
179         if ( !array )                                                   \
180             return result;                                              \
181         result |= array->search(phrase, token);                         \
182         return result;                                                  \
183     }
184
185     switch ( phrase_length ){
186         CASE(0);
187         CASE(1);
188         CASE(2);
189         CASE(3);
190         CASE(4);
191         CASE(5);
192         CASE(6);
193         CASE(7);
194         CASE(8);
195         CASE(9);
196         CASE(10);
197         CASE(11);
198         CASE(12);
199         CASE(13);
200         CASE(14);
201         CASE(15);
202     default:
203         assert(false);
204     }
205 #undef CASE
206 }
207
208 template<size_t phrase_length>
209 int PhraseArrayIndexLevel<phrase_length>::search(/* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
210     PhraseIndexItem<phrase_length> * chunk_begin, * chunk_end;
211     chunk_begin = (PhraseIndexItem<phrase_length> *)m_chunk.begin();
212     chunk_end = (PhraseIndexItem<phrase_length> *)m_chunk.end();
213     PhraseIndexItem<phrase_length> search_elem(phrase, -1);
214
215     //do the search
216     std_lite::pair<PhraseIndexItem<phrase_length> *, PhraseIndexItem<phrase_length> *> range;
217     range = std_lite::equal_range(chunk_begin, chunk_end, search_elem, phrase_less_than<phrase_length>);
218
219     if ( range.first == range.second )
220         return SEARCH_NONE;
221
222     assert(range.second - range.first == 1);
223     token = range.first->m_token;
224     return SEARCH_OK;
225 }
226
227 int PhraseBitmapIndexLevel::add_index( int phrase_length, /* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
228     guint16 first_key =  phrase[0] & 0xFFFF;
229     PhraseLengthIndexLevel * & length_array = m_phrase_length_indexes[first_key];
230     if ( !length_array ){
231         length_array = new PhraseLengthIndexLevel();
232     }
233     return length_array->add_index(phrase_length, phrase, token);
234 }
235
236 int PhraseBitmapIndexLevel::remove_index( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
237     guint16 first_key = phrase[0] & 0xFFFF;
238     PhraseLengthIndexLevel * &length_array = m_phrase_length_indexes[first_key];
239     if ( length_array )
240         return length_array->remove_index(phrase_length, phrase, token);
241     return ERROR_REMOVE_ITEM_DONOT_EXISTS;
242 }
243
244 int PhraseLengthIndexLevel::add_index( int phrase_length, /* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
245     if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
246         return ERROR_PHRASE_TOO_LONG;
247
248     if ( m_phrase_array_indexes -> len <= phrase_length )
249         g_array_set_size(m_phrase_array_indexes, phrase_length + 1);
250
251 #define CASE(len) case len:                                             \
252     {                                                                   \
253         PhraseArrayIndexLevel<len> * &array = g_array_index             \
254             (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, len); \
255         if ( !array )                                                   \
256             array = new PhraseArrayIndexLevel<len>;                     \
257         return array->add_index(phrase, token);                         \
258     }
259
260     switch(phrase_length){
261         CASE(0);
262         CASE(1);
263         CASE(2);
264         CASE(3);
265         CASE(4);
266         CASE(5);
267         CASE(6);
268         CASE(7);
269         CASE(8);
270         CASE(9);
271         CASE(10);
272         CASE(11);
273         CASE(12);
274         CASE(13);
275         CASE(14);
276         CASE(15);
277     default:
278         assert(false);
279     }
280
281 #undef CASE
282 }
283
284 int PhraseLengthIndexLevel::remove_index( int phrase_length, /* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
285     if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
286         return ERROR_PHRASE_TOO_LONG;
287
288     if ( m_phrase_array_indexes -> len <= phrase_length )
289         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
290 #define CASE(len) case len:                                             \
291     {                                                                   \
292         PhraseArrayIndexLevel<len> * &array =  g_array_index            \
293             (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, len); \
294         if ( !array )                                                   \
295             return ERROR_REMOVE_ITEM_DONOT_EXISTS;                            \
296         return array->remove_index(phrase, token);                      \
297     }
298
299     switch(phrase_length){
300         CASE(0);
301         CASE(1);
302         CASE(2);
303         CASE(3);
304         CASE(4);
305         CASE(5);
306         CASE(6);
307         CASE(7);
308         CASE(8);
309         CASE(9);
310         CASE(10);
311         CASE(11);
312         CASE(12);
313         CASE(13);
314         CASE(14);
315         CASE(15);
316     default:
317         assert(false);
318     }
319 #undef CASE
320 }
321
322 template<size_t phrase_length>
323 int PhraseArrayIndexLevel<phrase_length>::add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
324     PhraseIndexItem<phrase_length> * buf_begin, * buf_end;
325
326     PhraseIndexItem<phrase_length> new_elem(phrase, token);
327     buf_begin = (PhraseIndexItem<phrase_length> *) m_chunk.begin();
328     buf_end = (PhraseIndexItem<phrase_length> *) m_chunk.end();
329
330     std_lite::pair<PhraseIndexItem<phrase_length> *, PhraseIndexItem<phrase_length> *> range;
331     range = std_lite::equal_range(buf_begin, buf_end, new_elem, phrase_less_than<phrase_length>);
332
333     assert(range.second - range.first <= 1);
334     if ( range.second - range.first == 1 )
335         return ERROR_INSERT_ITEM_EXISTS;
336
337     PhraseIndexItem<phrase_length> * cur_elem = range.first;
338     int offset = (cur_elem - buf_begin) *
339         sizeof(PhraseIndexItem<phrase_length>);
340     m_chunk.insert_content(offset, &new_elem,
341                            sizeof(PhraseIndexItem<phrase_length> ));
342     return ERROR_OK;
343 }
344
345 template<size_t phrase_length>
346 int PhraseArrayIndexLevel<phrase_length>::remove_index(/* in */ ucs4_t phrase[], /* out */ phrase_token_t & token){
347     PhraseIndexItem<phrase_length> * buf_begin, * buf_end;
348
349     PhraseIndexItem<phrase_length> remove_elem(phrase, -1);
350     buf_begin = (PhraseIndexItem<phrase_length> *) m_chunk.begin();
351     buf_end = (PhraseIndexItem<phrase_length> *) m_chunk.end();
352
353     std_lite::pair<PhraseIndexItem<phrase_length> *, PhraseIndexItem<phrase_length> *> range;
354     range = std_lite::equal_range(buf_begin, buf_end, remove_elem, phrase_less_than<phrase_length>);
355
356     assert(range.second - range.first <= 1);
357     PhraseIndexItem<phrase_length> * cur_elem = range.first;
358     if ( range.first == range.second || cur_elem == buf_end)
359         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
360
361     token = cur_elem->m_token;
362     int offset = (cur_elem -  buf_begin) *
363         sizeof(PhraseIndexItem<phrase_length>);
364     m_chunk.remove_content(offset, sizeof (PhraseIndexItem<phrase_length>));
365     return ERROR_OK;
366 }
367
368 bool PhraseLargeTable::load_text(FILE * infile){
369     char pinyin[256];
370     char phrase[256];
371     phrase_token_t token;
372     size_t freq;
373
374     while ( !feof(infile) ) {
375         fscanf(infile, "%s", pinyin);
376         fscanf(infile, "%s", phrase);
377         fscanf(infile, "%u", &token);
378         fscanf(infile, "%ld", &freq);
379
380         if ( feof(infile) )
381             break;
382
383         glong phrase_len = g_utf8_strlen(phrase, -1);
384         ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
385         add_index(phrase_len, new_phrase, token);
386
387         g_free(new_phrase);
388     }
389     return true;
390 }
391
392 bool PhraseBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
393                                   table_offset_t end){
394     reset();
395     char * buf_begin = (char *) chunk->begin();
396     table_offset_t phrase_begin, phrase_end;
397     table_offset_t * index = (table_offset_t *) (buf_begin + offset);
398     phrase_end = *index;
399
400     for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
401         phrase_begin = phrase_end;
402         index++;
403         phrase_end = *index;
404         if ( phrase_begin == phrase_end ) //null pointer
405             continue;
406         PhraseLengthIndexLevel * phrases = new PhraseLengthIndexLevel;
407         m_phrase_length_indexes[i] = phrases;
408         phrases->load(chunk, phrase_begin, phrase_end - 1);
409         assert( phrase_end <= end );
410         assert( *(buf_begin + phrase_end - 1) == c_separate);
411     }
412     offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
413     assert( c_separate == *(buf_begin + offset) );
414     return true;
415 }
416
417 bool PhraseBitmapIndexLevel::store(MemoryChunk * new_chunk,
418                                    table_offset_t offset,
419                                    table_offset_t & end){
420     table_offset_t phrase_end;
421     table_offset_t index = offset;
422     offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
423     //add '#'
424     new_chunk->set_content(offset, &c_separate, sizeof(char));
425     offset +=sizeof(char);
426     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
427     index += sizeof(table_offset_t);
428     for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
429         PhraseLengthIndexLevel * phrases = m_phrase_length_indexes[i];
430         if ( !phrases ) { //null pointer
431             new_chunk->set_content(index, &offset, sizeof(table_offset_t));
432             index += sizeof(table_offset_t);
433             continue;
434         }
435         phrases->store(new_chunk, offset, phrase_end); //has a end '#'
436         offset = phrase_end;
437         //add '#'
438         new_chunk->set_content(offset, &c_separate, sizeof(char));
439         offset += sizeof(char);
440         new_chunk->set_content(index, &offset, sizeof(table_offset_t));
441         index += sizeof(table_offset_t);
442     }
443     end = offset;
444     return true;
445 }
446
447 bool PhraseLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
448     char * buf_begin = (char *) chunk->begin();
449     guint32 nindex = *((guint32 *)(buf_begin + offset));
450     table_offset_t * index = (table_offset_t *)
451         (buf_begin + offset + sizeof(guint32));
452
453     table_offset_t phrase_begin, phrase_end = *index;
454     m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
455     for ( size_t i = 0; i < nindex; ++i) {
456         phrase_begin = phrase_end;
457         index++;
458         phrase_end = *index;
459         if ( phrase_begin == phrase_end ){
460             void * null = NULL;
461             g_array_append_val(m_phrase_array_indexes, null);
462             continue;
463         }
464
465 #define CASE(len) case len:                                             \
466         {                                                               \
467             PhraseArrayIndexLevel<len> * phrase = new PhraseArrayIndexLevel<len>; \
468             phrase->load(chunk, phrase_begin, phrase_end - 1);          \
469             assert( *(buf_begin + phrase_end - 1) == c_separate);       \
470             assert( phrase_end <= end );                                \
471             g_array_append_val(m_phrase_array_indexes, phrase);         \
472             break;                                                      \
473         }
474         switch ( i ){
475             CASE(0);
476             CASE(1);
477             CASE(2);
478             CASE(3);
479             CASE(4);
480             CASE(5);
481             CASE(6);
482             CASE(7);
483             CASE(8);
484             CASE(9);
485             CASE(10);
486             CASE(11);
487             CASE(12);
488             CASE(13);
489             CASE(14);
490             CASE(15);
491         default:
492             assert(false);
493         }
494 #undef CASE
495     }
496     offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
497     assert ( c_separate == * (buf_begin + offset) );
498     return true;
499 }
500
501 bool PhraseLengthIndexLevel::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
502     guint32 nindex = m_phrase_array_indexes->len;
503     new_chunk->set_content(offset, &nindex, sizeof(guint32));
504     table_offset_t index = offset + sizeof(guint32);
505
506     offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
507     new_chunk->set_content(offset, &c_separate, sizeof(char));
508     offset += sizeof(char);
509     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
510     index += sizeof(table_offset_t);
511
512     table_offset_t phrase_end;
513     for ( size_t i = 0; i < m_phrase_array_indexes->len; ++i) {
514 #define CASE(len) case len:                                             \
515         {                                                               \
516             PhraseArrayIndexLevel<len> * phrase = g_array_index         \
517                 (m_phrase_array_indexes, PhraseArrayIndexLevel<len> *, i); \
518             if ( !phrase ){                                             \
519                 new_chunk->set_content                                  \
520                     (index, &offset, sizeof(table_offset_t));           \
521                 index += sizeof(table_offset_t);                        \
522                 continue;                                               \
523             }                                                           \
524             phrase->store(new_chunk, offset, phrase_end);               \
525             offset = phrase_end;                                        \
526             break;                                                      \
527         }
528         switch ( i ){
529             CASE(0);
530             CASE(1);
531             CASE(2);
532             CASE(3);
533             CASE(4);
534             CASE(5);
535             CASE(6);
536             CASE(7);
537             CASE(8);
538             CASE(9);
539             CASE(10);
540             CASE(11);
541             CASE(12);
542             CASE(13);
543             CASE(14);
544             CASE(15);
545         default:
546             assert(false);
547         }
548         //add '#'
549         new_chunk->set_content(offset, &c_separate, sizeof(char));
550         offset += sizeof(char);
551         new_chunk->set_content(index, &offset, sizeof(table_offset_t));
552         index += sizeof(table_offset_t);
553
554 #undef CASE
555     }
556     end = offset;
557     return true;
558 }
559
560 template<size_t phrase_length>
561 bool PhraseArrayIndexLevel<phrase_length>::
562 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
563     char * buf_begin = (char *) chunk->begin();
564     m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
565     return true;
566 }
567
568 template<size_t phrase_length>
569 bool PhraseArrayIndexLevel<phrase_length>::
570 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
571     new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
572     end = offset + m_chunk.size();
573     return true;
574 }