3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <glib/gstdio.h>
26 #include "memory_chunk.h"
27 #include "novel_types.h"
30 using namespace pinyin;
32 struct SingleGramItem{
33 phrase_token_t m_token;
37 SingleGram::SingleGram(){
38 m_chunk.set_size(sizeof(guint32));
39 memset(m_chunk.begin(), 0, sizeof(guint32));
42 SingleGram::SingleGram(void * buffer, size_t length){
43 m_chunk.set_chunk(buffer, length, NULL);
46 bool SingleGram::get_total_freq(guint32 & total) const{
47 char * buf_begin = (char *)m_chunk.begin();
48 total = *((guint32 *)buf_begin);
52 bool SingleGram::set_total_freq(guint32 total){
53 char * buf_begin = (char *)m_chunk.begin();
54 *((guint32 *)buf_begin) = total;
58 guint32 SingleGram::get_length(){
59 /* get the number of items. */
60 const SingleGramItem * begin = (const SingleGramItem *)
61 ((const char *)(m_chunk.begin()) + sizeof(guint32));
62 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
64 const guint32 length = end - begin;
67 /* no items here, total freq should be zero. */
68 guint32 total_freq = 0;
69 assert(get_total_freq(total_freq));
70 assert(0 == total_freq);
76 guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
77 guint32 removed_items = 0;
79 guint32 total_freq = 0;
80 assert(get_total_freq(total_freq));
82 const SingleGramItem * begin = (const SingleGramItem *)
83 ((const char *)(m_chunk.begin()) + sizeof(guint32));
84 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
86 for (const SingleGramItem * cur = begin; cur != end; ++cur) {
87 if ((cur->m_token & mask) != value)
90 total_freq -= cur->m_freq;
91 size_t offset = sizeof(guint32) +
92 sizeof(SingleGramItem) * (cur - begin);
93 m_chunk.remove_content(offset, sizeof(SingleGramItem));
95 /* update chunk end. */
96 end = (const SingleGramItem *) m_chunk.end();
101 assert(set_total_freq(total_freq));
102 return removed_items;
105 bool SingleGram::prune(){
108 SingleGramItem * begin = (SingleGramItem *)
109 ((const char *)(m_chunk.begin()) + sizeof(guint32));
110 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
113 for ( SingleGramItem * cur = begin; cur != end; ++cur){
116 if ( cur->m_freq == 0 ){
117 size_t offset = sizeof(guint32) + (cur - begin)
118 * sizeof(SingleGramItem) ;
119 m_chunk.remove_content(offset, sizeof(SingleGramItem));
123 assert(get_total_freq(total_freq));
124 assert(set_total_freq(total_freq - nitem));
129 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
130 return lhs.m_token < rhs.m_token;
133 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
135 const SingleGramItem * begin = (const SingleGramItem *)
136 ((const char *)(m_chunk.begin()) + sizeof(guint32));
137 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
140 BigramPhraseItemWithCount bigram_item_with_count;
141 assert(get_total_freq(total_freq));
143 for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
144 bigram_item_with_count.m_token = cur_item->m_token;
145 bigram_item_with_count.m_count = cur_item->m_freq;
146 bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
147 g_array_append_val(array, bigram_item_with_count);
153 bool SingleGram::search(/* in */ PhraseIndexRange * range,
154 /* out */ BigramPhraseArray array) const {
155 const SingleGramItem * begin = (const SingleGramItem *)
156 ((const char *)(m_chunk.begin()) + sizeof(guint32));
157 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
159 SingleGramItem compare_item;
160 compare_item.m_token = range->m_range_begin;
161 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
164 BigramPhraseItem bigram_item;
165 assert(get_total_freq(total_freq));
167 for ( ; cur_item != end; ++cur_item){
168 if ( cur_item->m_token >= range->m_range_end )
170 bigram_item.m_token = cur_item->m_token;
171 bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
172 g_array_append_val(array, bigram_item);
178 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
179 /* in */ guint32 freq){
180 SingleGramItem * begin = (SingleGramItem *)
181 ((const char *)(m_chunk.begin()) + sizeof(guint32));
182 SingleGramItem * end = (SingleGramItem *) m_chunk.end();
183 SingleGramItem compare_item;
184 compare_item.m_token = token;
185 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
187 SingleGramItem insert_item;
188 insert_item.m_token = token;
189 insert_item.m_freq = freq;
190 for ( ; cur_item != end; ++cur_item ){
191 if ( cur_item->m_token > token ){
192 size_t offset = sizeof(guint32) +
193 sizeof(SingleGramItem) * (cur_item - begin);
194 m_chunk.insert_content(offset, &insert_item,
195 sizeof(SingleGramItem));
198 if ( cur_item->m_token == token ){
202 m_chunk.insert_content(m_chunk.size(), &insert_item,
203 sizeof(SingleGramItem));
207 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
208 /* out */ guint32 & freq){
210 const SingleGramItem * begin = (const SingleGramItem *)
211 ((const char *)(m_chunk.begin()) + sizeof(guint32));
212 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
213 SingleGramItem compare_item;
214 compare_item.m_token = token;
215 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
217 for ( ; cur_item != end; ++cur_item ){
218 if ( cur_item->m_token > token )
220 if ( cur_item->m_token == token ){
221 freq = cur_item -> m_freq;
222 size_t offset = sizeof(guint32) +
223 sizeof(SingleGramItem) * (cur_item - begin);
224 m_chunk.remove_content(offset, sizeof(SingleGramItem));
231 bool SingleGram::get_freq(/* in */ phrase_token_t token,
232 /* out */ guint32 & freq) const {
234 const SingleGramItem * begin = (const SingleGramItem *)
235 ((const char *)(m_chunk.begin()) + sizeof(guint32));
236 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
237 SingleGramItem compare_item;
238 compare_item.m_token = token;
239 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
241 for ( ; cur_item != end; ++cur_item){
242 if ( cur_item->m_token > token )
244 if ( cur_item->m_token == token ){
245 freq = cur_item -> m_freq;
252 bool SingleGram::set_freq( /* in */ phrase_token_t token,
253 /* in */ guint32 freq){
254 SingleGramItem * begin = (SingleGramItem *)
255 ((const char *)(m_chunk.begin()) + sizeof(guint32));
256 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
257 SingleGramItem compare_item;
258 compare_item.m_token = token;
259 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
261 for ( ;cur_item != end; ++cur_item){
262 if ( cur_item->m_token > token ){
265 if ( cur_item->m_token == token ){
266 cur_item -> m_freq = freq;
273 bool Bigram::load_db(const char * dbfile){
276 /* create in memory db. */
277 int ret = db_create(&m_db, NULL, 0);
280 ret = m_db->open(m_db, NULL, NULL, NULL,
281 DB_HASH, DB_CREATE, 0600);
285 /* load db into memory. */
287 ret = db_create(&tmp_db, NULL, 0);
290 ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
291 DB_HASH, DB_RDONLY, 0600);
295 DBC * cursorp = NULL;
298 tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
300 /* Initialize our DBTs. */
301 memset(&key, 0, sizeof(DBT));
302 memset(&data, 0, sizeof(DBT));
304 /* Iterate over the database, retrieving each record in turn. */
305 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
306 int ret = m_db->put(m_db, NULL, &key, &data, 0);
309 assert (ret == DB_NOTFOUND);
311 /* Cursors must be closed */
312 if ( cursorp != NULL )
313 cursorp->c_close(cursorp);
315 if ( tmp_db != NULL )
316 tmp_db->close(tmp_db, 0);
321 bool Bigram::save_db(const char * dbfile){
324 int ret = unlink(dbfile);
325 if ( ret != 0 && errno != ENOENT)
328 ret = db_create(&tmp_db, NULL, 0);
331 ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
332 DB_HASH, DB_CREATE, 0600);
336 DBC * cursorp = NULL;
339 m_db->cursor(m_db, NULL, &cursorp, 0);
341 /* Initialize our DBTs. */
342 memset(&key, 0, sizeof(DBT));
343 memset(&data, 0, sizeof(DBT));
345 /* Iterate over the database, retrieving each record in turn. */
346 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
347 int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
350 assert (ret == DB_NOTFOUND);
352 /* Cursors must be closed */
353 if ( cursorp != NULL )
354 cursorp->c_close(cursorp);
356 if ( tmp_db != NULL )
357 tmp_db->close(tmp_db, 0);
362 bool Bigram::attach(const char * dbfile, guint32 flags){
364 u_int32_t db_flags = 0;
366 if ( flags & ATTACH_READONLY )
367 db_flags |= DB_RDONLY;
368 if ( flags & ATTACH_READWRITE )
369 assert( !( flags & ATTACH_READONLY ) );
370 if ( flags & ATTACH_CREATE )
371 db_flags |= DB_CREATE;
375 int ret = db_create(&m_db, NULL, 0);
379 ret = m_db->open(m_db, NULL, dbfile, NULL,
380 DB_HASH, db_flags, 0644);
387 bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
393 memset(&db_key, 0, sizeof(DBT));
394 db_key.data = &index;
395 db_key.size = sizeof(phrase_token_t);
398 memset(&db_data, 0, sizeof(DBT));
399 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
403 single_gram = new SingleGram(db_data.data, db_data.size);
407 bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
412 memset(&db_key, 0, sizeof(DBT));
413 db_key.data = &index;
414 db_key.size = sizeof(phrase_token_t);
416 memset(&db_data, 0, sizeof(DBT));
417 db_data.data = single_gram->m_chunk.begin();
418 db_data.size = single_gram->m_chunk.size();
420 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
424 bool Bigram::remove(/* in */ phrase_token_t index){
429 memset(&db_key, 0, sizeof(DBT));
430 db_key.data = &index;
431 db_key.size = sizeof(phrase_token_t);
433 int ret = m_db->del(m_db, NULL, &db_key, 0);
437 bool Bigram::get_all_items(GArray * items){
438 g_array_set_size(items, 0);
443 DBC * cursorp = NULL;
447 m_db->cursor(m_db, NULL, &cursorp, 0);
449 /* Initialize our DBTs. */
450 memset(&key, 0, sizeof(DBT));
451 memset(&data, 0, sizeof(DBT));
453 /* Iterate over the database, retrieving each record in turn. */
454 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
455 assert(key.size == sizeof(phrase_token_t));
456 phrase_token_t * token = (phrase_token_t *)key.data;
457 g_array_append_val(items, *token);
460 assert (ret == DB_NOTFOUND);
462 /* Cursors must be closed */
464 cursorp->c_close(cursorp);
469 bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
470 GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
472 if (!get_all_items(items)) {
473 g_array_free(items, TRUE);
477 for (size_t i = 0; i < items->len; ++i) {
478 phrase_token_t index = g_array_index(items, phrase_token_t, i);
480 if ((index & mask) == value) {
481 assert(remove(index));
485 SingleGram * gram = NULL;
486 assert(load(index, gram));
488 int num = gram->mask_out(mask, value);
494 if (0 == gram->get_length()) {
495 assert(remove(index));
497 assert(store(index, gram));
503 g_array_free(items, TRUE);
510 /* merge origin system info and delta user info */
511 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
512 const SingleGram * user){
513 if (NULL == system && NULL == user)
516 MemoryChunk & merged_chunk = merged->m_chunk;
518 if (NULL == system) {
519 merged_chunk.set_chunk(user->m_chunk.begin(),
520 user->m_chunk.size(), NULL);
525 merged_chunk.set_chunk(system->m_chunk.begin(),
526 system->m_chunk.size(), NULL);
531 merged_chunk.set_size(sizeof(guint32));
533 /* merge the origin info and delta info */
534 guint32 system_total, user_total;
535 assert(system->get_total_freq(system_total));
536 assert(user->get_total_freq(user_total));
537 const guint32 merged_total = system_total + user_total;
538 merged_chunk.set_content(0, &merged_total, sizeof(guint32));
540 const SingleGramItem * cur_system = (const SingleGramItem *)
541 (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
542 const SingleGramItem * system_end = (const SingleGramItem *)
543 system->m_chunk.end();
545 const SingleGramItem * cur_user = (const SingleGramItem *)
546 (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
547 const SingleGramItem * user_end = (const SingleGramItem *)
550 while (cur_system < system_end && cur_user < user_end) {
552 if (cur_system->m_token < cur_user->m_token) {
553 /* do append operation here */
554 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
556 } else if (cur_system->m_token > cur_user->m_token) {
557 /* do append operation here */
558 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
561 assert(cur_system->m_token == cur_user->m_token);
563 SingleGramItem merged_item;
564 merged_item.m_token = cur_system->m_token;
565 merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
567 merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
568 cur_system++; cur_user++;
572 /* add remained items. */
573 while (cur_system < system_end) {
574 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
578 while (cur_user < user_end) {
579 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));