3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <glib/gstdio.h>
26 #include "memory_chunk.h"
27 #include "novel_types.h"
30 using namespace pinyin;
/* One fixed-size record of a SingleGram's item array, keyed by phrase token.
 * The rest of this file also reads and writes an m_freq member on these
 * records and treats them as laid out back-to-back after a guint32 header. */
32 struct SingleGramItem{
33 phrase_token_t m_token;
/* Default constructor: sizes the chunk to exactly one guint32 and zero-fills
 * it. That leading guint32 is the total-frequency header read by
 * get_total_freq(), so a fresh SingleGram has total 0 and no items. */
37 SingleGram::SingleGram(){
38 m_chunk.set_size(sizeof(guint32));
39 memset(m_chunk.begin(), 0, sizeof(guint32));
/* Construct a SingleGram over an existing buffer of `length` bytes.
 * The third argument to set_chunk() is NULL; presumably that means "no free
 * function", i.e. the chunk does not take ownership of `buffer` -- confirm
 * against MemoryChunk::set_chunk. */
42 SingleGram::SingleGram(void * buffer, size_t length){
43 m_chunk.set_chunk(buffer, length, NULL);
/* Copy the total frequency into `total`. The value is read as the guint32
 * stored at the very beginning of the chunk. */
46 bool SingleGram::get_total_freq(guint32 & total) const{
47 char * buf_begin = (char *)m_chunk.begin();
48 total = *((guint32 *)buf_begin);
/* Overwrite the total frequency: stores `total` into the guint32 located at
 * the very beginning of the chunk. */
52 bool SingleGram::set_total_freq(guint32 total){
53 char * buf_begin = (char *)m_chunk.begin();
54 *((guint32 *)buf_begin) = total;
/* Walk the item array and drop every item whose m_freq is 0, then reduce the
 * stored total frequency by `nitem`.
 * NOTE(review): the declaration and updates of `nitem` and `total_freq` are
 * not visible here -- confirm what `nitem` counts before relying on this. */
58 bool SingleGram::prune(){
/* Items start immediately after the guint32 total-frequency header. */
61 SingleGramItem * begin = (SingleGramItem *)
62 ((const char *)(m_chunk.begin()) + sizeof(guint32));
63 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
/* NOTE(review): `end` is captured once, before remove_content() runs inside
 * the loop; if removal shrinks the chunk, `end` (and possibly `begin`) would
 * be stale -- confirm against MemoryChunk::remove_content. */
66 for ( SingleGramItem * cur = begin; cur != end; ++cur){
69 if ( cur->m_freq == 0 ){
/* Byte offset of `cur` inside the chunk: header plus index * record size. */
70 size_t offset = sizeof(guint32) + (cur - begin)
71 * sizeof(SingleGramItem) ;
72 m_chunk.remove_content(offset, sizeof(SingleGramItem));
/* NOTE(review): both calls below sit inside assert(); when NDEBUG is defined
 * assert() expands to nothing, so neither the read nor the write would run. */
76 assert(get_total_freq(total_freq));
77 assert(set_total_freq(total_freq - nitem));
/* Comparator ordering items by m_token only (m_freq is ignored). It is the
 * predicate passed to every std_lite::lower_bound call in this file. */
82 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
83 return lhs.m_token < rhs.m_token;
/* Append one BigramPhraseItemWithCount to `array` for every stored item:
 *   m_token = the item's token,
 *   m_count = the item's raw m_freq,
 *   m_freq  = m_freq divided by the total frequency, as gfloat.
 * Existing contents of `array` are not cleared by the visible code. */
86 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
/* Items start immediately after the guint32 total-frequency header. */
88 const SingleGramItem * begin = (const SingleGramItem *)
89 ((const char *)(m_chunk.begin()) + sizeof(guint32));
90 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
93 BigramPhraseItemWithCount bigram_item_with_count;
/* NOTE(review): the read happens inside assert(); with NDEBUG defined it is
 * compiled out and total_freq would not be filled in. */
94 assert(get_total_freq(total_freq));
96 for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
97 bigram_item_with_count.m_token = cur_item->m_token;
98 bigram_item_with_count.m_count = cur_item->m_freq;
/* NOTE(review): no visible guard for total_freq == 0 before this division. */
99 bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
100 g_array_append_val(array, bigram_item_with_count);
/* Append to `array` a BigramPhraseItem (token plus m_freq / total_freq as
 * gfloat) for stored items starting at the first token >= range->m_range_begin.
 * The scan tests each token against range->m_range_end; the statement that
 * test controls is not visible here (presumably it ends the scan, making the
 * range half-open -- confirm). */
106 bool SingleGram::search(/* in */ PhraseIndexRange * range,
107 /* out */ BigramPhraseArray array) const {
/* Items start immediately after the guint32 total-frequency header. */
108 const SingleGramItem * begin = (const SingleGramItem *)
109 ((const char *)(m_chunk.begin()) + sizeof(guint32));
110 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
/* Only m_token of the probe is set; the comparator looks at nothing else. */
112 SingleGramItem compare_item;
113 compare_item.m_token = range->m_range_begin;
114 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
117 BigramPhraseItem bigram_item;
/* NOTE(review): the read happens inside assert(); with NDEBUG defined it is
 * compiled out and total_freq would not be filled in. */
118 assert(get_total_freq(total_freq));
120 for ( ; cur_item != end; ++cur_item){
121 if ( cur_item->m_token >= range->m_range_end )
123 bigram_item.m_token = cur_item->m_token;
124 bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
125 g_array_append_val(array, bigram_item);
/* Insert a new (token, freq) item at the position that keeps items ordered
 * by token. lower_bound locates the first item whose token is >= `token`:
 *   - an item with a greater token  -> the new item is inserted before it;
 *   - an item with an equal token   -> handled by a branch whose body is not
 *     visible here (presumably reports the duplicate -- confirm);
 *   - no such item                  -> the new item is written at the end of
 *     the chunk.
 * The total-frequency header is not touched by the visible code. */
131 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
132 /* in */ guint32 freq){
/* Items start immediately after the guint32 total-frequency header. */
133 SingleGramItem * begin = (SingleGramItem *)
134 ((const char *)(m_chunk.begin()) + sizeof(guint32));
135 SingleGramItem * end = (SingleGramItem *) m_chunk.end();
136 SingleGramItem compare_item;
137 compare_item.m_token = token;
138 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
140 SingleGramItem insert_item;
141 insert_item.m_token = token;
142 insert_item.m_freq = freq;
143 for ( ; cur_item != end; ++cur_item ){
144 if ( cur_item->m_token > token ){
/* Byte offset of cur_item: header plus index * record size. */
145 size_t offset = sizeof(guint32) +
146 sizeof(SingleGramItem) * (cur_item - begin);
147 m_chunk.insert_content(offset, &insert_item,
148 sizeof(SingleGramItem));
151 if ( cur_item->m_token == token ){
/* Insert at offset m_chunk.size(), i.e. append after the last item. */
155 m_chunk.insert_content(m_chunk.size(), &insert_item,
156 sizeof(SingleGramItem));
/* Remove the item for `token`, reporting its frequency through `freq`.
 * lower_bound finds the first item with token >= `token`; when an item with
 * exactly that token is met, its m_freq is copied out and its record is cut
 * from the chunk. The statement taken when a greater token is met first is
 * not visible here (presumably ends the search -- confirm).
 * The total-frequency header is not touched by the visible code. */
160 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
161 /* out */ guint32 & freq){
/* Items start immediately after the guint32 total-frequency header. */
163 const SingleGramItem * begin = (const SingleGramItem *)
164 ((const char *)(m_chunk.begin()) + sizeof(guint32));
165 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
166 SingleGramItem compare_item;
167 compare_item.m_token = token;
168 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
170 for ( ; cur_item != end; ++cur_item ){
171 if ( cur_item->m_token > token )
173 if ( cur_item->m_token == token ){
/* Read the frequency before the record is removed from the chunk. */
174 freq = cur_item -> m_freq;
175 size_t offset = sizeof(guint32) +
176 sizeof(SingleGramItem) * (cur_item - begin);
177 m_chunk.remove_content(offset, sizeof(SingleGramItem));
/* Look up the frequency stored for `token`. lower_bound finds the first item
 * with token >= `token`; if an item with exactly that token is met, its
 * m_freq is copied into `freq`. What `freq` holds when no match exists, and
 * the return values, depend on lines not visible here. */
184 bool SingleGram::get_freq(/* in */ phrase_token_t token,
185 /* out */ guint32 & freq) const {
/* Items start immediately after the guint32 total-frequency header. */
187 const SingleGramItem * begin = (const SingleGramItem *)
188 ((const char *)(m_chunk.begin()) + sizeof(guint32));
189 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
190 SingleGramItem compare_item;
191 compare_item.m_token = token;
192 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
194 for ( ; cur_item != end; ++cur_item){
195 if ( cur_item->m_token > token )
197 if ( cur_item->m_token == token ){
198 freq = cur_item -> m_freq;
/* Overwrite the frequency of an existing item. lower_bound finds the first
 * item with token >= `token`; if an item with exactly that token is met, its
 * m_freq is replaced by `freq` in place. The visible code never inserts a
 * new item and never updates the total-frequency header. */
205 bool SingleGram::set_freq( /* in */ phrase_token_t token,
206 /* in */ guint32 freq){
/* Items start immediately after the guint32 total-frequency header. */
207 SingleGramItem * begin = (SingleGramItem *)
208 ((const char *)(m_chunk.begin()) + sizeof(guint32));
209 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
210 SingleGramItem compare_item;
211 compare_item.m_token = token;
212 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
214 for ( ;cur_item != end; ++cur_item){
215 if ( cur_item->m_token > token ){
218 if ( cur_item->m_token == token ){
219 cur_item -> m_freq = freq;
/* Populate m_db with a copy of every record found in `dbfile`.
 * m_db is created and opened with a NULL file name as a DB_HASH database;
 * `dbfile` is opened read-only as a second DB_HASH handle, and a cursor
 * copies each key/data pair across with put(). The temporary cursor and
 * handle are closed afterwards. Handling of the individual `ret` codes is
 * on lines not visible here. */
226 bool Bigram::load_db(const char * dbfile){
229 /* create in memory db. */
230 int ret = db_create(&m_db, NULL, 0);
233 ret = m_db->open(m_db, NULL, NULL, NULL,
234 DB_HASH, DB_CREATE, 0600);
238 /* load db into memory. */
240 ret = db_create(&tmp_db, NULL, 0);
243 ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
244 DB_HASH, DB_RDONLY, 0600);
248 DBC * cursorp = NULL;
/* NOTE(review): the result of cursor() is not checked by the visible code. */
251 tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
253 /* Initialize our DBTs. */
254 memset(&key, 0, sizeof(DBT));
255 memset(&data, 0, sizeof(DBT));
257 /* Iterate over the database, retrieving each record in turn. */
258 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
/* NOTE(review): this `ret` shadows the outer one, so the put() result never
 * reaches the DB_NOTFOUND check after the loop. */
259 int ret = m_db->put(m_db, NULL, &key, &data, 0);
/* The loop is expected to stop only because the cursor ran out of records. */
262 assert (ret == DB_NOTFOUND);
264 /* Cursors must be closed */
265 if ( cursorp != NULL )
266 cursorp->c_close(cursorp);
268 if ( tmp_db != NULL )
269 tmp_db->close(tmp_db, 0);
/* Write every record of m_db into a freshly created database at `dbfile`.
 * Any existing file is unlinked first; a failed unlink is only treated
 * specially when errno is not ENOENT (the controlled statement is not
 * visible here). The target is opened as DB_HASH with DB_CREATE and mode
 * 0600, filled via a cursor over m_db, and then closed. */
274 bool Bigram::save_db(const char * dbfile){
277 int ret = g_unlink(dbfile);
278 if ( ret != 0 && errno != ENOENT)
281 ret = db_create(&tmp_db, NULL, 0);
284 ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
285 DB_HASH, DB_CREATE, 0600);
289 DBC * cursorp = NULL;
/* NOTE(review): the result of cursor() is not checked by the visible code. */
292 m_db->cursor(m_db, NULL, &cursorp, 0);
294 /* Initialize our DBTs. */
295 memset(&key, 0, sizeof(DBT));
296 memset(&data, 0, sizeof(DBT));
298 /* Iterate over the database, retrieving each record in turn. */
299 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
/* NOTE(review): this `ret` shadows the outer one, so the put() result never
 * reaches the DB_NOTFOUND check after the loop. */
300 int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
/* The loop is expected to stop only because the cursor ran out of records. */
303 assert (ret == DB_NOTFOUND);
305 /* Cursors must be closed */
306 if ( cursorp != NULL )
307 cursorp->c_close(cursorp);
309 if ( tmp_db != NULL )
310 tmp_db->close(tmp_db, 0);
/* Open `dbfile` directly as m_db (no in-memory copy is made by the visible
 * code). The ATTACH_* bits in `flags` are translated to Berkeley DB flags:
 *   ATTACH_READONLY  -> DB_RDONLY
 *   ATTACH_READWRITE -> adds nothing; only asserts READONLY is not also set
 *   ATTACH_CREATE    -> DB_CREATE
 * The database is opened as DB_HASH with file mode 0644. */
315 bool Bigram::attach(const char * dbfile, guint32 flags){
317 u_int32_t db_flags = 0;
319 if ( flags & ATTACH_READONLY )
320 db_flags |= DB_RDONLY;
321 if ( flags & ATTACH_READWRITE )
322 assert( !( flags & ATTACH_READONLY ) );
323 if ( flags & ATTACH_CREATE )
324 db_flags |= DB_CREATE;
328 int ret = db_create(&m_db, NULL, 0);
332 ret = m_db->open(m_db, NULL, dbfile, NULL,
333 DB_HASH, db_flags, 0644);
/* Fetch the record keyed by `index` from m_db and hand it back through
 * `single_gram` as a newly allocated SingleGram built over the returned
 * data/size. How `ret` is checked is on lines not visible here.
 * NOTE(review): the SingleGram(void*, size_t) constructor passes NULL as the
 * third set_chunk() argument, so the new object presumably refers to the
 * buffer returned by get() rather than owning a copy -- confirm lifetime.
 * The object is created with `new`; presumably the caller deletes it. */
340 bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
/* The key is the raw bytes of the phrase token. */
346 memset(&db_key, 0, sizeof(DBT));
347 db_key.data = &index;
348 db_key.size = sizeof(phrase_token_t);
351 memset(&db_data, 0, sizeof(DBT));
352 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
356 single_gram = new SingleGram(db_data.data, db_data.size);
/* Write `single_gram` into m_db under the key `index`. The stored value is
 * the SingleGram's whole chunk (header plus items), passed to put() by
 * pointer and size. How `ret` is checked is on lines not visible here. */
360 bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
/* The key is the raw bytes of the phrase token. */
365 memset(&db_key, 0, sizeof(DBT));
366 db_key.data = &index;
367 db_key.size = sizeof(phrase_token_t);
369 memset(&db_data, 0, sizeof(DBT));
370 db_data.data = single_gram->m_chunk.begin();
371 db_data.size = single_gram->m_chunk.size();
373 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
/* Replace the contents of `items` with the key of every record in m_db.
 * `items` is first truncated to length 0; each key is then asserted to be
 * exactly sizeof(phrase_token_t) bytes and appended as a phrase_token_t.
 * Order is whatever the cursor yields. */
377 bool Bigram::get_all_items(GArray * items){
378 g_array_set_size(items, 0);
383 DBC * cursorp = NULL;
/* NOTE(review): the result of cursor() is not checked by the visible code. */
387 m_db->cursor(m_db, NULL, &cursorp, 0);
389 /* Initialize our DBTs. */
390 memset(&key, 0, sizeof(DBT));
391 memset(&data, 0, sizeof(DBT));
393 /* Iterate over the database, retrieving each record in turn. */
394 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
395 assert(key.size == sizeof(phrase_token_t));
396 phrase_token_t * token = (phrase_token_t *)key.data;
397 g_array_append_val(items, *token);
/* The loop is expected to stop only because the cursor ran out of records. */
400 assert (ret == DB_NOTFOUND);
402 /* Cursors must be closed */
404 cursorp->c_close(cursorp);
/* Build `merged` from the `system` and `user` SingleGrams.
 *   - both NULL: handled by the first check (its statement is not visible).
 *   - system NULL: `merged` is pointed at the user chunk via set_chunk().
 *   - otherwise a set_chunk() onto the system chunk is visible; the condition
 *     guarding it is not (presumably user == NULL -- confirm).
 *   - both present: `merged` is rebuilt from scratch. Its header becomes
 *     system_total + user_total, and the two item arrays are combined with a
 *     two-pointer merge: the smaller token is copied through, equal tokens
 *     produce one item whose m_freq is the sum of both.
 * The merge assumes both inputs are already ordered by token.
 * NOTE(review): the set_chunk() calls pass NULL as the third argument, so
 * `merged` presumably aliases the source buffer in those cases -- confirm. */
412 /* merge origin system info and delta user info */
413 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
414 const SingleGram * user){
415 if (NULL == system && NULL == user)
418 MemoryChunk & merged_chunk = merged->m_chunk;
420 if (NULL == system) {
421 merged_chunk.set_chunk(user->m_chunk.begin(),
422 user->m_chunk.size(), NULL);
427 merged_chunk.set_chunk(system->m_chunk.begin(),
428 system->m_chunk.size(), NULL);
/* Start the merged chunk with room for the guint32 header only. */
433 merged_chunk.set_size(sizeof(guint32));
435 /* merge the origin info and delta info */
436 guint32 system_total, user_total;
/* NOTE(review): both reads sit inside assert(); with NDEBUG defined they are
 * compiled out and the totals would stay uninitialized. */
437 assert(system->get_total_freq(system_total));
438 assert(user->get_total_freq(user_total));
439 const guint32 merged_total = system_total + user_total;
440 merged_chunk.set_content(0, &merged_total, sizeof(guint32));
/* Item ranges of both inputs; items start right after each guint32 header. */
442 const SingleGramItem * cur_system = (const SingleGramItem *)
443 (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
444 const SingleGramItem * system_end = (const SingleGramItem *)
445 system->m_chunk.end();
447 const SingleGramItem * cur_user = (const SingleGramItem *)
448 (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
449 const SingleGramItem * user_end = (const SingleGramItem *)
452 while (cur_system < system_end && cur_user < user_end) {
454 if (cur_system->m_token < cur_user->m_token) {
455 /* do append operation here */
456 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
458 } else if (cur_system->m_token > cur_user->m_token) {
459 /* do append operation here */
460 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
463 assert(cur_system->m_token == cur_user->m_token);
/* Same token on both sides: emit a single item carrying the summed frequency. */
465 SingleGramItem merged_item;
466 merged_item.m_token = cur_system->m_token;
467 merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
469 merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
470 cur_system++; cur_user++;
474 /* add remained items. */
475 while (cur_system < system_end) {
476 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
480 while (cur_user < user_end) {
481 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));