3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <glib/gstdio.h>
26 #include "memory_chunk.h"
27 #include "novel_types.h"
30 using namespace pinyin;
32 struct SingleGramItem{
33 phrase_token_t m_token;
37 SingleGram::SingleGram(){
38 m_chunk.set_size(sizeof(guint32));
39 memset(m_chunk.begin(), 0, sizeof(guint32));
42 SingleGram::SingleGram(void * buffer, size_t length){
43 m_chunk.set_chunk(buffer, length, NULL);
46 bool SingleGram::get_total_freq(guint32 & total) const{
47 char * buf_begin = (char *)m_chunk.begin();
48 total = *((guint32 *)buf_begin);
52 bool SingleGram::set_total_freq(guint32 total){
53 char * buf_begin = (char *)m_chunk.begin();
54 *((guint32 *)buf_begin) = total;
58 guint32 SingleGram::get_length(){
59 /* get the number of items. */
60 const SingleGramItem * begin = (const SingleGramItem *)
61 ((const char *)(m_chunk.begin()) + sizeof(guint32));
62 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
64 const guint32 length = end - begin;
67 /* no items here, total freq should be zero. */
68 guint32 total_freq = 0;
69 assert(get_total_freq(total_freq));
70 assert(0 == total_freq);
76 guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
77 guint32 removed_items = 0;
79 guint32 total_freq = 0;
80 assert(get_total_freq(total_freq));
82 const SingleGramItem * begin = (const SingleGramItem *)
83 ((const char *)(m_chunk.begin()) + sizeof(guint32));
84 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
86 for (const SingleGramItem * cur = begin; cur != end; ++cur) {
87 if ((cur->m_token & mask) != value)
90 total_freq -= cur->m_freq;
91 size_t offset = sizeof(guint32) +
92 sizeof(SingleGramItem) * (cur - begin);
93 m_chunk.remove_content(offset, sizeof(SingleGramItem));
95 /* update chunk end. */
96 end = (const SingleGramItem *) m_chunk.end();
101 assert(set_total_freq(total_freq));
102 return removed_items;
105 bool SingleGram::prune(){
108 SingleGramItem * begin = (SingleGramItem *)
109 ((const char *)(m_chunk.begin()) + sizeof(guint32));
110 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
113 for ( SingleGramItem * cur = begin; cur != end; ++cur){
116 if ( cur->m_freq == 0 ){
117 size_t offset = sizeof(guint32) + (cur - begin)
118 * sizeof(SingleGramItem) ;
119 m_chunk.remove_content(offset, sizeof(SingleGramItem));
123 assert(get_total_freq(total_freq));
124 assert(set_total_freq(total_freq - nitem));
129 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
130 return lhs.m_token < rhs.m_token;
133 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
135 const SingleGramItem * begin = (const SingleGramItem *)
136 ((const char *)(m_chunk.begin()) + sizeof(guint32));
137 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
140 BigramPhraseItemWithCount bigram_item_with_count;
141 assert(get_total_freq(total_freq));
143 for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
144 bigram_item_with_count.m_token = cur_item->m_token;
145 bigram_item_with_count.m_count = cur_item->m_freq;
146 bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
147 g_array_append_val(array, bigram_item_with_count);
153 bool SingleGram::search(/* in */ PhraseIndexRange * range,
154 /* out */ BigramPhraseArray array) const {
155 const SingleGramItem * begin = (const SingleGramItem *)
156 ((const char *)(m_chunk.begin()) + sizeof(guint32));
157 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
159 SingleGramItem compare_item;
160 compare_item.m_token = range->m_range_begin;
161 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
164 BigramPhraseItem bigram_item;
165 assert(get_total_freq(total_freq));
167 for ( ; cur_item != end; ++cur_item){
168 if ( cur_item->m_token >= range->m_range_end )
170 bigram_item.m_token = cur_item->m_token;
171 bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
172 g_array_append_val(array, bigram_item);
178 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
179 /* in */ guint32 freq){
180 SingleGramItem * begin = (SingleGramItem *)
181 ((const char *)(m_chunk.begin()) + sizeof(guint32));
182 SingleGramItem * end = (SingleGramItem *) m_chunk.end();
183 SingleGramItem compare_item;
184 compare_item.m_token = token;
185 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
187 SingleGramItem insert_item;
188 insert_item.m_token = token;
189 insert_item.m_freq = freq;
190 for ( ; cur_item != end; ++cur_item ){
191 if ( cur_item->m_token > token ){
192 size_t offset = sizeof(guint32) +
193 sizeof(SingleGramItem) * (cur_item - begin);
194 m_chunk.insert_content(offset, &insert_item,
195 sizeof(SingleGramItem));
198 if ( cur_item->m_token == token ){
202 m_chunk.insert_content(m_chunk.size(), &insert_item,
203 sizeof(SingleGramItem));
207 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
208 /* out */ guint32 & freq){
210 const SingleGramItem * begin = (const SingleGramItem *)
211 ((const char *)(m_chunk.begin()) + sizeof(guint32));
212 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
213 SingleGramItem compare_item;
214 compare_item.m_token = token;
215 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
217 for ( ; cur_item != end; ++cur_item ){
218 if ( cur_item->m_token > token )
220 if ( cur_item->m_token == token ){
221 freq = cur_item -> m_freq;
222 size_t offset = sizeof(guint32) +
223 sizeof(SingleGramItem) * (cur_item - begin);
224 m_chunk.remove_content(offset, sizeof(SingleGramItem));
231 bool SingleGram::get_freq(/* in */ phrase_token_t token,
232 /* out */ guint32 & freq) const {
234 const SingleGramItem * begin = (const SingleGramItem *)
235 ((const char *)(m_chunk.begin()) + sizeof(guint32));
236 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
237 SingleGramItem compare_item;
238 compare_item.m_token = token;
239 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
241 for ( ; cur_item != end; ++cur_item){
242 if ( cur_item->m_token > token )
244 if ( cur_item->m_token == token ){
245 freq = cur_item -> m_freq;
252 bool SingleGram::set_freq( /* in */ phrase_token_t token,
253 /* in */ guint32 freq){
254 SingleGramItem * begin = (SingleGramItem *)
255 ((const char *)(m_chunk.begin()) + sizeof(guint32));
256 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
257 SingleGramItem compare_item;
258 compare_item.m_token = token;
259 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
261 for ( ;cur_item != end; ++cur_item){
262 if ( cur_item->m_token > token ){
265 if ( cur_item->m_token == token ){
266 cur_item -> m_freq = freq;
273 bool Bigram::load_db(const char * dbfile){
276 /* create in memory db. */
277 int ret = db_create(&m_db, NULL, 0);
280 ret = m_db->open(m_db, NULL, NULL, NULL,
281 DB_HASH, DB_CREATE, 0600);
285 /* load db into memory. */
287 ret = db_create(&tmp_db, NULL, 0);
293 ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
294 DB_HASH, DB_RDONLY, 0600);
298 DBC * cursorp = NULL;
302 tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
307 /* Initialize our DBTs. */
308 memset(&key, 0, sizeof(DBT));
309 memset(&data, 0, sizeof(DBT));
311 /* Iterate over the database, retrieving each record in turn. */
312 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
313 int ret = m_db->put(m_db, NULL, &key, &data, 0);
316 assert (ret == DB_NOTFOUND);
318 /* Cursors must be closed */
319 if ( cursorp != NULL )
320 cursorp->c_close(cursorp);
322 if ( tmp_db != NULL )
323 tmp_db->close(tmp_db, 0);
328 bool Bigram::save_db(const char * dbfile){
331 int ret = unlink(dbfile);
332 if ( ret != 0 && errno != ENOENT)
335 ret = db_create(&tmp_db, NULL, 0);
341 ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
342 DB_HASH, DB_CREATE, 0600);
346 DBC * cursorp = NULL;
349 m_db->cursor(m_db, NULL, &cursorp, 0);
354 /* Initialize our DBTs. */
355 memset(&key, 0, sizeof(DBT));
356 memset(&data, 0, sizeof(DBT));
358 /* Iterate over the database, retrieving each record in turn. */
359 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
360 int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
363 assert (ret == DB_NOTFOUND);
365 /* Cursors must be closed */
366 if ( cursorp != NULL )
367 cursorp->c_close(cursorp);
369 if ( tmp_db != NULL )
370 tmp_db->close(tmp_db, 0);
375 bool Bigram::attach(const char * dbfile, guint32 flags){
377 u_int32_t db_flags = 0;
379 if ( flags & ATTACH_READONLY )
380 db_flags |= DB_RDONLY;
381 if ( flags & ATTACH_READWRITE )
382 assert( !( flags & ATTACH_READONLY ) );
383 if ( flags & ATTACH_CREATE )
384 db_flags |= DB_CREATE;
388 int ret = db_create(&m_db, NULL, 0);
392 ret = m_db->open(m_db, NULL, dbfile, NULL,
393 DB_HASH, db_flags, 0644);
400 bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
406 memset(&db_key, 0, sizeof(DBT));
407 db_key.data = &index;
408 db_key.size = sizeof(phrase_token_t);
411 memset(&db_data, 0, sizeof(DBT));
412 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
416 single_gram = new SingleGram(db_data.data, db_data.size);
420 bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
425 memset(&db_key, 0, sizeof(DBT));
426 db_key.data = &index;
427 db_key.size = sizeof(phrase_token_t);
429 memset(&db_data, 0, sizeof(DBT));
430 db_data.data = single_gram->m_chunk.begin();
431 db_data.size = single_gram->m_chunk.size();
433 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
437 bool Bigram::remove(/* in */ phrase_token_t index){
442 memset(&db_key, 0, sizeof(DBT));
443 db_key.data = &index;
444 db_key.size = sizeof(phrase_token_t);
446 int ret = m_db->del(m_db, NULL, &db_key, 0);
450 bool Bigram::get_all_items(GArray * items){
451 g_array_set_size(items, 0);
456 DBC * cursorp = NULL;
460 m_db->cursor(m_db, NULL, &cursorp, 0);
465 /* Initialize our DBTs. */
466 memset(&key, 0, sizeof(DBT));
467 memset(&data, 0, sizeof(DBT));
469 /* Iterate over the database, retrieving each record in turn. */
470 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
471 assert(key.size == sizeof(phrase_token_t));
472 phrase_token_t * token = (phrase_token_t *)key.data;
473 g_array_append_val(items, *token);
476 assert (ret == DB_NOTFOUND);
478 /* Cursors must be closed */
480 cursorp->c_close(cursorp);
485 bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
486 GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
488 if (!get_all_items(items)) {
489 g_array_free(items, TRUE);
493 for (size_t i = 0; i < items->len; ++i) {
494 phrase_token_t index = g_array_index(items, phrase_token_t, i);
496 if ((index & mask) == value) {
497 assert(remove(index));
501 SingleGram * gram = NULL;
502 assert(load(index, gram));
504 int num = gram->mask_out(mask, value);
510 if (0 == gram->get_length()) {
511 assert(remove(index));
513 assert(store(index, gram));
519 g_array_free(items, TRUE);
526 /* merge origin system info and delta user info */
527 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
528 const SingleGram * user){
529 if (NULL == system && NULL == user)
532 MemoryChunk & merged_chunk = merged->m_chunk;
534 if (NULL == system) {
535 merged_chunk.set_chunk(user->m_chunk.begin(),
536 user->m_chunk.size(), NULL);
541 merged_chunk.set_chunk(system->m_chunk.begin(),
542 system->m_chunk.size(), NULL);
547 merged_chunk.set_size(sizeof(guint32));
549 /* merge the origin info and delta info */
550 guint32 system_total, user_total;
551 assert(system->get_total_freq(system_total));
552 assert(user->get_total_freq(user_total));
553 const guint32 merged_total = system_total + user_total;
554 merged_chunk.set_content(0, &merged_total, sizeof(guint32));
556 const SingleGramItem * cur_system = (const SingleGramItem *)
557 (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
558 const SingleGramItem * system_end = (const SingleGramItem *)
559 system->m_chunk.end();
561 const SingleGramItem * cur_user = (const SingleGramItem *)
562 (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
563 const SingleGramItem * user_end = (const SingleGramItem *)
566 while (cur_system < system_end && cur_user < user_end) {
568 if (cur_system->m_token < cur_user->m_token) {
569 /* do append operation here */
570 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
572 } else if (cur_system->m_token > cur_user->m_token) {
573 /* do append operation here */
574 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
577 assert(cur_system->m_token == cur_user->m_token);
579 SingleGramItem merged_item;
580 merged_item.m_token = cur_system->m_token;
581 merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
583 merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
584 cur_system++; cur_user++;
588 /* add remained items. */
589 while (cur_system < system_end) {
590 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
594 while (cur_user < user_end) {
595 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));