src/storage/phrase_large_table2.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include <assert.h>
  23 #include <string.h>
  24 #include "phrase_large_table2.h"
  25
  26
  27 /* class definition */
  28
  29 namespace pinyin{
  30
  31 class PhraseLengthIndexLevel2{
  32 protected:
  33     GArray * m_phrase_array_indexes;
  34 public:
  35     PhraseLengthIndexLevel2();
  36     ~PhraseLengthIndexLevel2();
  37
  38     /* load/store method */
  39     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
  40     bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
  41
  42     /* search method */
  43     int search(int phrase_length, /* in */ ucs4_t phrase[],
  44                /* out */ PhraseTokens tokens) const;
  45
  46     /* add_index/remove_index method */
  47     int add_index(int phrase_length, /* in */ ucs4_t phrase[],
  48                   /* in */ phrase_token_t token);
  49     int remove_index(int phrase_length, /* in */ ucs4_t phrase[],
  50                      /* in */ phrase_token_t token);
  51 };
  52
  53
  54 template<size_t phrase_length>
  55 struct PhraseIndexItem2{
  56     phrase_token_t m_token;
  57     ucs4_t m_phrase[phrase_length];
  58 public:
  59     PhraseIndexItem2<phrase_length>(ucs4_t phrase[], phrase_token_t token){
  60         memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
  61         m_token = token;
  62     }
  63 };
  64
  65
  66 template<size_t phrase_length>
  67 class PhraseArrayIndexLevel2{
  68 protected:
  69     typedef PhraseIndexItem2<phrase_length> IndexItem;
  70
  71 protected:
  72     MemoryChunk m_chunk;
  73 public:
  74     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
  75     bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
  76
  77     /* search method */
  78     int search(/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const;
  79
  80     /* add_index/remove_index method */
  81     int add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
  82     int remove_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
  83 };
  84
  85 };
  86
  87 using namespace pinyin;
  88
  89 /* class implementation */
  90
  91 template<size_t phrase_length>
  92 static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs,
  93                            const PhraseIndexItem2<phrase_length> &rhs){
  94     ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
  95     ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
  96
  97     return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
  98 }
  99
 100 template<size_t phrase_length>
 101 static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs,
 102                               const PhraseIndexItem2<phrase_length> & rhs){
 103     return 0 > phrase_compare2(lhs, rhs);
 104 }
 105
 106 PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){
 107     memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
 108 }
 109
 110 void PhraseBitmapIndexLevel2::reset(){
 111     for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
 112         PhraseLengthIndexLevel2 * length_array =
 113             m_phrase_length_indexes[i];
 114         if ( length_array )
 115             delete length_array;
 116     }
 117 }
 118
 119
 120 /* search method */
 121
 122 int PhraseBitmapIndexLevel2::search(int phrase_length,
 123                                     /* in */ ucs4_t phrase[],
 124                                     /* out */ PhraseTokens tokens) const {
 125     assert(phrase_length > 0);
 126
 127     int result = SEARCH_NONE;
 128     /* use the first 8-bit of the lower 16-bit for bitmap index,
 129      * as most the higher 16-bit are zero.
 130      */
 131     guint8 first_key = (phrase[0] & 0xFF00) >> 8;
 132
 133     PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key];
 134     if ( phrase_array )
 135         return phrase_array->search(phrase_length, phrase, tokens);
 136     return result;
 137 }
 138
 139 PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){
 140     m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
 141 }
 142
 143 PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
 144 #define CASE(len) case len:                                             \
 145     {                                                                   \
 146         PhraseArrayIndexLevel2<len> * & array =  g_array_index          \
 147             (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
 148         if ( array ) {                                                  \
 149             delete array;                                               \
 150             array = NULL;                                               \
 151         }                                                               \
 152         break;                                                          \
 153     }
 154
 155     for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){
 156         switch (i){
 157             CASE(1);
 158             CASE(2);
 159             CASE(3);
 160             CASE(4);
 161             CASE(5);
 162             CASE(6);
 163             CASE(7);
 164             CASE(8);
 165             CASE(9);
 166             CASE(10);
 167             CASE(11);
 168             CASE(12);
 169             CASE(13);
 170             CASE(14);
 171             CASE(15);
 172             CASE(16);
 173         default:
 174             assert(false);
 175         }
 176     }
 177     g_array_free(m_phrase_array_indexes, TRUE);
 178 #undef CASE
 179 }
 180
 181 int PhraseLengthIndexLevel2::search(int phrase_length,
 182                                     /* in */ ucs4_t phrase[],
 183                                     /* out */ PhraseTokens tokens) const {
 184     int result = SEARCH_NONE;
 185     if(m_phrase_array_indexes->len < phrase_length)
 186         return result;
 187     if (m_phrase_array_indexes->len > phrase_length)
 188         result |= SEARCH_CONTINUED;
 189
 190 #define CASE(len) case len:                                             \
 191     {                                                                   \
 192         PhraseArrayIndexLevel2<len> * array = g_array_index             \
 193             (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
 194         if ( !array )                                                   \
 195             return result;                                              \
 196         result |= array->search(phrase, tokens);                        \
 197         return result;                                                  \
 198     }
 199
 200     switch ( phrase_length ){
 201         CASE(1);
 202         CASE(2);
 203         CASE(3);
 204         CASE(4);
 205         CASE(5);
 206         CASE(6);
 207         CASE(7);
 208         CASE(8);
 209         CASE(9);
 210         CASE(10);
 211         CASE(11);
 212         CASE(12);
 213         CASE(13);
 214         CASE(14);
 215         CASE(15);
 216         CASE(16);
 217     default:
 218         assert(false);
 219     }
 220 #undef CASE
 221 }
 222
 223 template<size_t phrase_length>
 224 int PhraseArrayIndexLevel2<phrase_length>::search
 225 (/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const {
 226     int result = SEARCH_NONE;
 227
 228     IndexItem * chunk_begin = NULL, * chunk_end = NULL;
 229     chunk_begin = (IndexItem *) m_chunk.begin();
 230     chunk_end = (IndexItem *) m_chunk.end();
 231
 232     /* do the search */
 233     IndexItem search_elem(phrase, -1);
 234     std_lite::pair<IndexItem *, IndexItem *> range;
 235     range = std_lite::equal_range
 236         (chunk_begin, chunk_end, search_elem,
 237          phrase_less_than2<phrase_length>);
 238
 239     const IndexItem * const begin = range.first;
 240     const IndexItem * const end = range.second;
 241     if (begin == end)
 242         return result;
 243
 244     const IndexItem * iter = NULL;
 245     GArray * array = NULL;
 246
 247     for (iter = begin; iter != end; ++iter) {
 248         phrase_token_t token = iter->m_token;
 249
 250         /* filter out disabled sub phrase indices. */
 251         array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
 252         if (NULL == array)
 253             continue;
 254
 255         result |= SEARCH_OK;
 256
 257         g_array_append_val(array, token);
 258     }
 259
 260     return result;
 261 }
 262
 263
 264 /* add/remove index method */
 265
 266 int PhraseBitmapIndexLevel2::add_index(int phrase_length,
 267                                        /* in */ ucs4_t phrase[],
 268                                        /* in */ phrase_token_t token){
 269     guint8 first_key =  (phrase[0] & 0xFF00) >> 8;
 270
 271     PhraseLengthIndexLevel2 * & length_array =
 272         m_phrase_length_indexes[first_key];
 273
 274     if ( !length_array ){
 275         length_array = new PhraseLengthIndexLevel2();
 276     }
 277     return length_array->add_index(phrase_length, phrase, token);
 278 }
 279
 280 int PhraseBitmapIndexLevel2::remove_index(int phrase_length,
 281                                          /* in */ ucs4_t phrase[],
 282                                          /* in */ phrase_token_t token){
 283     guint8 first_key = (phrase[0] & 0xFF00) >> 8;
 284
 285     PhraseLengthIndexLevel2 * & length_array =
 286         m_phrase_length_indexes[first_key];
 287
 288     if ( length_array )
 289         return length_array->remove_index(phrase_length, phrase, token);
 290
 291     return ERROR_REMOVE_ITEM_DONOT_EXISTS;
 292 }
 293
 294 int PhraseLengthIndexLevel2::add_index(int phrase_length,
 295                                        /* in */ ucs4_t phrase[],
 296                                        /* in */ phrase_token_t token) {
 297     if (phrase_length >= MAX_PHRASE_LENGTH)
 298         return ERROR_PHRASE_TOO_LONG;
 299
 300     if (m_phrase_array_indexes->len < phrase_length)
 301         g_array_set_size(m_phrase_array_indexes, phrase_length);
 302
 303 #define CASE(len) case len:                                             \
 304     {                                                                   \
 305         PhraseArrayIndexLevel2<len> * & array = g_array_index           \
 306             (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
 307         if ( !array )                                                   \
 308             array = new PhraseArrayIndexLevel2<len>;                    \
 309         return array->add_index(phrase, token);                         \
 310     }
 311
 312     switch(phrase_length){
 313         CASE(1);
 314         CASE(2);
 315         CASE(3);
 316         CASE(4);
 317         CASE(5);
 318         CASE(6);
 319         CASE(7);
 320         CASE(8);
 321         CASE(9);
 322         CASE(10);
 323         CASE(11);
 324         CASE(12);
 325         CASE(13);
 326         CASE(14);
 327         CASE(15);
 328         CASE(16);
 329     default:
 330         assert(false);
 331     }
 332
 333 #undef CASE
 334 }
 335
 336 int PhraseLengthIndexLevel2::remove_index(int phrase_length,
 337                                           /* in */ ucs4_t phrase[],
 338                                           /* in */ phrase_token_t token) {
 339     if (phrase_length >= MAX_PHRASE_LENGTH)
 340         return ERROR_PHRASE_TOO_LONG;
 341
 342     if (m_phrase_array_indexes->len < phrase_length)
 343         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
 344
 345 #define CASE(len) case len:                                             \
 346     {                                                                   \
 347         PhraseArrayIndexLevel2<len> * & array =  g_array_index          \
 348             (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
 349         if ( !array )                                                   \
 350             return ERROR_REMOVE_ITEM_DONOT_EXISTS;                      \
 351         return array->remove_index(phrase, token);                      \
 352     }
 353
 354     switch(phrase_length){
 355         CASE(1);
 356         CASE(2);
 357         CASE(3);
 358         CASE(4);
 359         CASE(5);
 360         CASE(6);
 361         CASE(7);
 362         CASE(8);
 363         CASE(9);
 364         CASE(10);
 365         CASE(11);
 366         CASE(12);
 367         CASE(13);
 368         CASE(14);
 369         CASE(15);
 370         CASE(16);
 371     default:
 372         assert(false);
 373     }
 374 #undef CASE
 375 }
 376
 377 template<size_t phrase_length>
 378 int PhraseArrayIndexLevel2<phrase_length>::add_index
 379 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
 380     IndexItem * begin, * end;
 381
 382     IndexItem add_elem(phrase, token);
 383     begin = (IndexItem *) m_chunk.begin();
 384     end   = (IndexItem *) m_chunk.end();
 385
 386     std_lite::pair<IndexItem *, IndexItem *> range;
 387     range = std_lite::equal_range
 388         (begin, end, add_elem, phrase_less_than2<phrase_length>);
 389
 390     IndexItem * cur_elem;
 391     for (cur_elem = range.first;
 392          cur_elem != range.second; ++cur_elem) {
 393         if (cur_elem->m_token == token)
 394             return ERROR_INSERT_ITEM_EXISTS;
 395         if (cur_elem->m_token > token)
 396             break;
 397     }
 398
 399     int offset = (cur_elem - begin) * sizeof(IndexItem);
 400     m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
 401     return ERROR_OK;
 402 }
 403
 404 template<size_t phrase_length>
 405 int PhraseArrayIndexLevel2<phrase_length>::remove_index
 406 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token) {
 407     IndexItem * begin, * end;
 408
 409     IndexItem remove_elem(phrase, token);
 410     begin = (IndexItem *) m_chunk.begin();
 411     end   = (IndexItem *) m_chunk.end();
 412
 413     std_lite::pair<IndexItem *, IndexItem *> range;
 414     range = std_lite::equal_range
 415         (begin, end, remove_elem, phrase_less_than2<phrase_length>);
 416
 417     IndexItem * cur_elem;
 418     for (cur_elem = range.first;
 419          cur_elem != range.second; ++cur_elem) {
 420         if (cur_elem->m_token == token)
 421             break;
 422     }
 423
 424     if (cur_elem == range.second)
 425         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
 426
 427     int offset = (cur_elem - begin) * sizeof(IndexItem);
 428     m_chunk.remove_content(offset, sizeof(IndexItem));
 429     return ERROR_OK;
 430 }
 431
 432
 433 /* load text method */
 434
 435 bool PhraseLargeTable2::load_text(FILE * infile){
 436     char pinyin[256];
 437     char phrase[256];
 438     phrase_token_t token;
 439     size_t freq;
 440
 441     while ( !feof(infile) ) {
 442         fscanf(infile, "%s", pinyin);
 443         fscanf(infile, "%s", phrase);
 444         fscanf(infile, "%u", &token);
 445         fscanf(infile, "%ld", &freq);
 446
 447         if ( feof(infile) )
 448             break;
 449
 450         glong phrase_len = g_utf8_strlen(phrase, -1);
 451         ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
 452         add_index(phrase_len, new_phrase, token);
 453
 454         g_free(new_phrase);
 455     }
 456     return true;
 457 }