3 * Library to deal with pinyin.
5 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "phrase_large_table2.h"
27 /* class definition */
/* Second-level phrase index: keeps one typed array index per phrase length
 * in m_phrase_array_indexes and forwards search/add/remove requests to the
 * entry that matches phrase_length.
 * NOTE(review): access specifiers and the closing brace of this class are
 * not visible in this listing - confirm against the full file. */
31 class PhraseLengthIndexLevel2{
/* GArray of PhraseArrayIndexLevel2<len> * pointers; the member functions in
 * this file read slot (len - 1) for phrases of length len. */
33 GArray * m_phrase_array_indexes;
35 PhraseLengthIndexLevel2();
36 ~PhraseLengthIndexLevel2();
38 /* load/store method */
39 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
40 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
43 int search(int phrase_length, /* in */ ucs4_t phrase[],
44 /* out */ PhraseTokens tokens) const;
46 /* add_index/remove_index method */
47 int add_index(int phrase_length, /* in */ ucs4_t phrase[],
48 /* in */ phrase_token_t token);
49 int remove_index(int phrase_length, /* in */ ucs4_t phrase[],
50 /* in */ phrase_token_t token);
/* Fixed-size index record: a token plus a phrase of exactly phrase_length
 * ucs4_t characters stored inline.
 * The constructor copies phrase_length characters from `phrase` into
 * m_phrase, so the caller must supply at least that many characters.
 * NOTE(review): the statement storing `token` into m_token is not visible
 * in this listing - presumably it follows the memmove; confirm. */
54 template<size_t phrase_length>
55 struct PhraseIndexItem2{
56 phrase_token_t m_token;
57 ucs4_t m_phrase[phrase_length];
59 PhraseIndexItem2<phrase_length>(ucs4_t phrase[], phrase_token_t token){
60 memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
/* Index over phrases of exactly phrase_length characters. Records are of
 * type IndexItem (PhraseIndexItem2<phrase_length>); the member definitions
 * in this file keep them in a member named m_chunk.
 * NOTE(review): the m_chunk declaration and the access specifiers are not
 * visible in this listing - confirm against the full file. */
66 template<size_t phrase_length>
67 class PhraseArrayIndexLevel2{
69 typedef PhraseIndexItem2<phrase_length> IndexItem;
74 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
75 bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
78 int search(/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const;
80 /* add_index/remove_index method */
81 int add_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
82 int remove_index(/* in */ ucs4_t phrase[], /* in */ phrase_token_t token);
87 using namespace pinyin;
89 /* class implementation */
/* Three-way comparison of two index items by phrase only; m_token is not
 * part of the comparison.
 * Returns the memcmp result over sizeof(ucs4_t) * phrase_length bytes:
 * negative, zero or positive.
 * The ordering is byte-wise, so it is a consistent total order but not
 * necessarily numeric code-point order (that depends on byte order). */
91 template<size_t phrase_length>
92 static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs,
93 const PhraseIndexItem2<phrase_length> &rhs){
94 ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
95 ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
97 return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
/* Strict "less than" predicate built on phrase_compare2: true when lhs
 * orders before rhs by phrase. Passed as the comparator to
 * std_lite::equal_range by the search/add_index/remove_index members. */
100 template<size_t phrase_length>
101 static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs,
102 const PhraseIndexItem2<phrase_length> & rhs){
103 return 0 > phrase_compare2(lhs, rhs);
/* Zero-fills the whole m_phrase_length_indexes table so every slot starts
 * out empty; add_index treats a zero slot as "no sub-index allocated yet". */
106 PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){
107 memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
/* Visits every one of the PHRASE_NUMBER_OF_BITMAP_INDEX slots of
 * m_phrase_length_indexes and fetches the sub-index pointer stored there.
 * NOTE(review): what is done with each pointer (presumably releasing it)
 * is not visible in this listing - confirm against the full file. */
110 void PhraseBitmapIndexLevel2::reset(){
111 for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
112 PhraseLengthIndexLevel2 * length_array =
113 m_phrase_length_indexes[i];
/* Looks up `phrase` (phrase_length characters, asserted to be > 0).
 * Bits 8..15 of phrase[0] select a slot in m_phrase_length_indexes, and the
 * lookup is delegated to that sub-index's search(), which fills `tokens`.
 * NOTE(review): no null check on the selected sub-index is visible in this
 * listing between the lookup and the dereference - confirm one exists,
 * since slots start out zeroed by the constructor. */
119 int PhraseBitmapIndexLevel2::search(int phrase_length,
120 /* in */ ucs4_t phrase[],
121 /* out */ PhraseTokens tokens) const {
122 assert(phrase_length > 0);
124 int result = SEARCH_NONE;
125 /* use the first 8-bit of the lower 16-bit for bitmap index,
126 * as most the higher 16-bit are zero.
128 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
130 PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key];
132 return phrase_array->search(phrase_length, phrase, tokens);
/* Creates the per-length table as an empty GArray of pointer-sized
 * elements: not zero-terminated (FALSE), with newly added elements cleared
 * to 0 (TRUE), so slots created later by g_array_set_size start out null. */
136 PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){
137 m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
/* Iterates over phrase lengths 1..m_phrase_array_indexes->len; the CASE(len)
 * macro binds a reference to the typed pointer stored at slot (len - 1).
 * Finally the GArray is freed together with its element storage
 * (free_segment = TRUE).
 * NOTE(review): the rest of the macro body (presumably releasing each typed
 * sub-index) and the switch that expands CASE are not visible in this
 * listing - confirm against the full file. */
140 PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
141 #define CASE(len) case len: \
143 PhraseArrayIndexLevel2<len> * & array = g_array_index \
144 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
152 for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){
174 g_array_free(m_phrase_array_indexes, TRUE);
/* Searches the typed sub-index for phrases of length phrase_length.
 * Starts from SEARCH_NONE; when the table holds more slots than
 * phrase_length, SEARCH_CONTINUED is OR-ed into the result (presumably
 * signalling that longer phrases are indexed - confirm). The CASE(len) macro
 * fetches the typed pointer at slot (len - 1) and ORs in its search result.
 * NOTE(review): the statement executed when the table has fewer slots than
 * phrase_length, and any null check inside CASE, are not visible in this
 * listing.
 * NOTE(review): GArray's len is unsigned while phrase_length is int, so the
 * two comparisons are mixed-sign; a negative phrase_length would compare as
 * a very large value. */
178 int PhraseLengthIndexLevel2::search(int phrase_length,
179 /* in */ ucs4_t phrase[],
180 /* out */ PhraseTokens tokens) const {
181 int result = SEARCH_NONE;
182 if(m_phrase_array_indexes->len < phrase_length)
184 if (m_phrase_array_indexes->len > phrase_length)
185 result |= SEARCH_CONTINUED;
187 #define CASE(len) case len: \
189 PhraseArrayIndexLevel2<len> * array = g_array_index \
190 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
193 result |= array->search(phrase, tokens); \
197 switch ( phrase_length ){
/* Collects the tokens of every record whose phrase equals `phrase`.
 * The records in m_chunk are viewed as an IndexItem array and searched with
 * std_lite::equal_range using phrase_less_than2, which requires the records
 * to be ordered by that predicate. The probe item is built with token -1;
 * the value is irrelevant because the comparator looks at the phrase only.
 * Each matching token is appended to tokens[PHRASE_INDEX_LIBRARY_INDEX(token)].
 * NOTE(review): the lines between selecting `array` and appending to it
 * (presumably skipping disabled/NULL arrays and updating `result`) and the
 * final return are not visible in this listing - confirm. */
220 template<size_t phrase_length>
221 int PhraseArrayIndexLevel2<phrase_length>::search
222 (/* in */ ucs4_t phrase[], /* out */ PhraseTokens tokens) const {
223 int result = SEARCH_NONE;
225 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
226 chunk_begin = (IndexItem *) m_chunk.begin();
227 chunk_end = (IndexItem *) m_chunk.end();
230 IndexItem search_elem(phrase, -1);
231 std_lite::pair<IndexItem *, IndexItem *> range;
232 range = std_lite::equal_range
233 (chunk_begin, chunk_end, search_elem,
234 phrase_less_than2<phrase_length>);
236 const IndexItem * const begin = range.first;
237 const IndexItem * const end = range.second;
241 const IndexItem * iter = NULL;
242 GArray * array = NULL;
244 for (iter = begin; iter != end; ++iter) {
245 phrase_token_t token = iter->m_token;
247 /* filter out disabled sub phrase indices. */
248 array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
254 g_array_append_val(array, token);
261 /* add/remove index method */
/* Adds (phrase, token) to the index.
 * Bits 8..15 of phrase[0] select the slot in m_phrase_length_indexes; the
 * slot is taken by reference so that a missing sub-index can be allocated
 * in place on first use. The insertion itself, and the return code, come
 * from the sub-index's add_index(). */
263 int PhraseBitmapIndexLevel2::add_index(int phrase_length,
264 /* in */ ucs4_t phrase[],
265 /* in */ phrase_token_t token){
266 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
268 PhraseLengthIndexLevel2 * & length_array =
269 m_phrase_length_indexes[first_key];
271 if ( !length_array ){
272 length_array = new PhraseLengthIndexLevel2();
274 return length_array->add_index(phrase_length, phrase, token);
/* Removes (phrase, token) from the index.
 * Bits 8..15 of phrase[0] select the slot in m_phrase_length_indexes; the
 * removal is delegated to that sub-index, otherwise
 * ERROR_REMOVE_ITEM_DONOT_EXISTS is returned.
 * NOTE(review): the condition guarding the delegated call (presumably a
 * null check on length_array) is not visible in this listing - confirm. */
277 int PhraseBitmapIndexLevel2::remove_index(int phrase_length,
278 /* in */ ucs4_t phrase[],
279 /* in */ phrase_token_t token){
280 guint8 first_key = (phrase[0] & 0xFF00) >> 8;
282 PhraseLengthIndexLevel2 * & length_array =
283 m_phrase_length_indexes[first_key];
286 return length_array->remove_index(phrase_length, phrase, token);
288 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
/* Adds (phrase, token) to the typed sub-index for phrase_length.
 * Returns ERROR_PHRASE_TOO_LONG when phrase_length >= MAX_PHRASE_LENGTH.
 * The table is grown to phrase_length slots when it is shorter, so that
 * slot (phrase_length - 1) exists. CASE(len) then takes that slot by
 * reference, allocates a PhraseArrayIndexLevel2<len>, and returns the result
 * of its add_index().
 * NOTE(review): the condition guarding the allocation inside CASE
 * (presumably "slot is null") and the switch cases are not visible in this
 * listing - confirm. No lower bound on phrase_length is checked in the
 * visible lines. */
291 int PhraseLengthIndexLevel2::add_index(int phrase_length,
292 /* in */ ucs4_t phrase[],
293 /* in */ phrase_token_t token) {
294 if (phrase_length >= MAX_PHRASE_LENGTH)
295 return ERROR_PHRASE_TOO_LONG;
297 if (m_phrase_array_indexes->len < phrase_length)
298 g_array_set_size(m_phrase_array_indexes, phrase_length);
300 #define CASE(len) case len: \
302 PhraseArrayIndexLevel2<len> * & array = g_array_index \
303 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
305 array = new PhraseArrayIndexLevel2<len>; \
306 return array->add_index(phrase, token); \
309 switch(phrase_length){
/* Removes (phrase, token) from the typed sub-index for phrase_length.
 * Returns ERROR_PHRASE_TOO_LONG when phrase_length >= MAX_PHRASE_LENGTH and
 * ERROR_REMOVE_ITEM_DONOT_EXISTS when the table has fewer than phrase_length
 * slots. Otherwise CASE(len) takes slot (len - 1) by reference and returns
 * the result of the typed sub-index's remove_index().
 * NOTE(review): the condition in front of the early
 * ERROR_REMOVE_ITEM_DONOT_EXISTS inside CASE (presumably "slot is null")
 * and the switch cases are not visible in this listing - confirm. */
333 int PhraseLengthIndexLevel2::remove_index(int phrase_length,
334 /* in */ ucs4_t phrase[],
335 /* in */ phrase_token_t token) {
336 if (phrase_length >= MAX_PHRASE_LENGTH)
337 return ERROR_PHRASE_TOO_LONG;
339 if (m_phrase_array_indexes->len < phrase_length)
340 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
342 #define CASE(len) case len: \
344 PhraseArrayIndexLevel2<len> * & array = g_array_index \
345 (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
347 return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
348 return array->remove_index(phrase, token); \
/* Inserts a record for (phrase, token) into m_chunk.
 * equal_range locates the run of records with the same phrase; the run is
 * scanned for the insertion point. Returns ERROR_INSERT_ITEM_EXISTS when a
 * record with the same phrase and token is already present. The new record
 * is inserted at the byte offset of cur_elem, i.e. before the element the
 * scan stopped on, or at the end of the run.
 * NOTE(review): the statement taken when cur_elem->m_token > token
 * (presumably a break, which would keep equal-phrase records ordered by
 * token) and the final return value are not visible in this listing. */
374 template<size_t phrase_length>
375 int PhraseArrayIndexLevel2<phrase_length>::add_index
376 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token){
377 IndexItem * begin, * end;
379 IndexItem add_elem(phrase, token);
380 begin = (IndexItem *) m_chunk.begin();
381 end = (IndexItem *) m_chunk.end();
383 std_lite::pair<IndexItem *, IndexItem *> range;
384 range = std_lite::equal_range
385 (begin, end, add_elem, phrase_less_than2<phrase_length>);
387 IndexItem * cur_elem;
388 for (cur_elem = range.first;
389 cur_elem != range.second; ++cur_elem) {
390 if (cur_elem->m_token == token)
391 return ERROR_INSERT_ITEM_EXISTS;
392 if (cur_elem->m_token > token)
396 int offset = (cur_elem - begin) * sizeof(IndexItem);
397 m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
/* Removes the record for (phrase, token) from m_chunk.
 * equal_range locates the run of records with the same phrase; the run is
 * scanned for the record whose token matches. If the scan reaches the end
 * of the run, ERROR_REMOVE_ITEM_DONOT_EXISTS is returned; otherwise
 * sizeof(IndexItem) bytes are removed at the byte offset of the match.
 * NOTE(review): the statement taken on a token match (presumably a break)
 * and the final return value are not visible in this listing - confirm. */
401 template<size_t phrase_length>
402 int PhraseArrayIndexLevel2<phrase_length>::remove_index
403 (/* in */ ucs4_t phrase[], /* in */ phrase_token_t token) {
404 IndexItem * begin, * end;
406 IndexItem remove_elem(phrase, token);
407 begin = (IndexItem *) m_chunk.begin();
408 end = (IndexItem *) m_chunk.end();
410 std_lite::pair<IndexItem *, IndexItem *> range;
411 range = std_lite::equal_range
412 (begin, end, remove_elem, phrase_less_than2<phrase_length>);
414 IndexItem * cur_elem;
415 for (cur_elem = range.first;
416 cur_elem != range.second; ++cur_elem) {
417 if (cur_elem->m_token == token)
421 if (cur_elem == range.second)
422 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
424 int offset = (cur_elem - begin) * sizeof(IndexItem);
425 m_chunk.remove_content(offset, sizeof(IndexItem));
430 /* load text method */
432 bool PhraseLargeTable2::load_text(FILE * infile){
435 phrase_token_t token;
438 while ( !feof(infile) ) {
439 fscanf(infile, "%s", pinyin);
440 fscanf(infile, "%s", phrase);
441 fscanf(infile, "%u", &token);
442 fscanf(infile, "%ld", &freq);
447 glong phrase_len = g_utf8_strlen(phrase, -1);
448 ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
449 add_index(phrase_len, new_phrase, token);