3 * Library to deal with pinyin.
5 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #ifndef FLEXIBLE_NGRAM_H
25 #define FLEXIBLE_NGRAM_H
30 /* Note: the template parameters used below,
31 * MagicHeader, ArrayHeader and ArrayItem, are all struct types.
/* Alias for a GArray pointer. FlexibleSingleGram::retrieve_all() and
 * FlexibleSingleGram::search() append ArrayItemWithToken values to an
 * array of this type. */
36 typedef GArray * FlexibleBigramPhraseArray;
40 * @ArrayHeader: the struct ArrayHeader.
41 * @ArrayItem: the struct ArrayItem.
43 * The flexible single gram is mainly used for training purposes.
/* Single-gram container parameterized on the array header type and the
 * array item type. FlexibleBigram is declared a friend; its load() and
 * store() methods use the (buffer, length) constructor and the m_chunk
 * member of this class. */
47 template<typename ArrayHeader, typename ArrayItem>
48 class FlexibleSingleGram{
49 template<typename MH, typename AH,
/* NOTE(review): the end of this template parameter list is not visible
 * in this excerpt. */
51 friend class FlexibleBigram;
/* Build a single gram over an existing buffer by handing buffer and length
 * to m_chunk.set_chunk(), with NULL as the third argument.
 * FlexibleBigram::load() calls this with the data returned by the database.
 * NOTE(review): presumably the NULL third argument means the chunk does not
 * free the buffer -- confirm against the declaration of m_chunk's type. */
54 FlexibleSingleGram(void * buffer, size_t length){
55 m_chunk.set_chunk(buffer, length, NULL);
61 * Define the struct ArrayItemWithToken type.
/* Token of this entry; token_less_than() compares entries by this field.
 * NOTE(review): the rest of the struct is not visible in this excerpt; the
 * methods of this class also read and write an m_item member. */
65 phrase_token_t m_token;
/* Comparison predicate: true when the token of lhs is smaller than the
 * token of rhs. Only m_token is inspected. It is the comparator passed to
 * std_lite::lower_bound by the lookup methods of this class. */
70 static bool token_less_than(const ArrayItemWithToken & lhs,
71 const ArrayItemWithToken & rhs){
72 return lhs.m_token < rhs.m_token;
77 * FlexibleSingleGram::FlexibleSingleGram:
79 * The constructor of the FlexibleSingleGram.
/* Request a chunk of exactly sizeof(ArrayHeader) bytes and zero-fill those
 * bytes, so the header starts out cleared. Items are stored after the
 * header, so presumably the new single gram holds no items yet.
 * NOTE(review): the signature line of this constructor is not visible in
 * this excerpt. */
83 m_chunk.set_size(sizeof(ArrayHeader));
84 memset(m_chunk.begin(), 0, sizeof(ArrayHeader));
88 * FlexibleSingleGram::retrieve_all:
89 * @array: the array to store all items in this single gram.
90 * @returns: whether the retrieve operation is successful.
92 * Retrieve all items in this single gram.
/* Append a copy of every item stored in this single gram to array.
 * @array: GArray that receives the ArrayItemWithToken values.
 * Items are read starting sizeof(ArrayHeader) bytes after m_chunk.begin(),
 * i.e. directly behind the array header kept at the start of the chunk. */
95 bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){
96 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
97 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
98 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
/* NOTE(review): the operand of the cast above is not visible in this
 * excerpt; presumably it is the end of m_chunk -- confirm. */
101 ArrayItemWithToken item;
102 for ( const ArrayItemWithToken * cur_item = begin;
/* NOTE(review): the loop condition and increment are not visible in this
 * excerpt. */
105 /* Note: optimize this with g_array_append_vals? */
106 item.m_token = cur_item->m_token;
107 item.m_item = cur_item->m_item;
108 g_array_append_val(array, item);
115 * FlexibleSingleGram::search:
116 * @range: the token range.
117 * @array: the array to store the array items with token in the range.
118 * @returns: whether the search operation is successful.
120 * Search the array items with token in the range.
122 * Note: The array result may contain many items.
/* Append to array the items whose token falls inside range.
 * @range: m_range_begin is the key handed to lower_bound; m_range_end is
 *         compared against each visited token with >=.
 * @array: GArray that receives the matching ArrayItemWithToken values. */
125 bool search(/* in */ PhraseIndexRange * range,
126 /* out */ FlexibleBigramPhraseArray array){
127 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
128 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
129 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
/* NOTE(review): the operand of the cast above is not visible in this
 * excerpt; presumably it is the end of m_chunk -- confirm. */
/* Only m_token of compare_item is set, because token_less_than reads
 * nothing else. */
132 ArrayItemWithToken compare_item;
133 compare_item.m_token = range->m_range_begin;
/* Presumably std_lite::lower_bound has std::lower_bound semantics (first
 * entry whose token is not less than m_range_begin) -- confirm. */
134 const ArrayItemWithToken * cur_item = std_lite::lower_bound
135 (begin, end, compare_item, token_less_than);
137 ArrayItemWithToken item;
138 for ( ; cur_item != end; ++cur_item){
139 if ( cur_item->m_token >= range->m_range_end )
/* NOTE(review): the statement controlled by this test is not visible in
 * this excerpt; presumably it ends the scan, which would make the range
 * half-open (m_range_end excluded) -- confirm. */
141 item.m_token = cur_item->m_token;
142 item.m_item = cur_item->m_item;
143 g_array_append_val(array, item);
150 * FlexibleSingleGram::insert_array_item:
151 * @token: the phrase token to be inserted.
152 * @item: the array item of this token.
153 * @returns: whether the insert operation is successful.
155 * Insert the array item of the token.
/* Insert item under token.
 * @token: the token of the new entry.
 * @item: the payload copied into the new entry.
 * The new entry is written with m_chunk.insert_content(), either at the
 * byte offset of the first visited entry with a larger token, or at
 * m_chunk.size() (the end of the chunk) in the final call below. */
158 bool insert_array_item(/* in */ phrase_token_t token,
159 /* in */ const ArrayItem & item){
160 ArrayItemWithToken * begin = (ArrayItemWithToken *)
161 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
162 ArrayItemWithToken * end = (ArrayItemWithToken *)
/* NOTE(review): the operand of the cast above is not visible in this
 * excerpt; presumably it is the end of m_chunk -- confirm. */
165 ArrayItemWithToken compare_item;
166 compare_item.m_token = token;
167 ArrayItemWithToken * cur_item = std_lite::lower_bound
168 (begin, end, compare_item, token_less_than);
/* The complete entry (token plus payload) that will be copied into the
 * chunk. */
170 ArrayItemWithToken insert_item;
171 insert_item.m_token = token;
172 insert_item.m_item = item;
174 for ( ; cur_item != end; ++cur_item ){
175 if ( cur_item->m_token > token ){
/* Byte offset of cur_item inside the chunk: header size plus the size of
 * all entries in front of it. */
176 size_t offset = sizeof(ArrayHeader) +
177 sizeof(ArrayItemWithToken) * (cur_item - begin);
178 m_chunk.insert_content(offset, &insert_item,
179 sizeof(ArrayItemWithToken));
/* NOTE(review): the rest of this branch is not visible in this excerpt.
 * insert_content may move the chunk's storage, so begin, end and cur_item
 * should not be used afterwards -- confirm the branch leaves the function
 * here. */
182 if ( cur_item->m_token == token ){
/* NOTE(review): the body of this branch is not visible in this excerpt;
 * presumably an already-present token is rejected -- confirm. */
/* Insert at the current end of the chunk. */
186 m_chunk.insert_content(m_chunk.size(), &insert_item,
187 sizeof(ArrayItemWithToken));
192 * FlexibleSingleGram::remove_array_item:
193 * @token: the phrase token to be removed.
194 * @item: the content of the removed array item.
195 * @returns: whether the remove operation is successful.
197 * Remove the array item of the token.
/* Remove the entry stored under token and hand back its payload.
 * @token: the token whose entry is removed.
 * @item: zero-filled first; when an entry with an equal token is found its
 *        m_item is copied here before the entry is removed from the chunk. */
200 bool remove_array_item(/* in */ phrase_token_t token,
201 /* out */ ArrayItem & item)
204 memset(&item, 0, sizeof(ArrayItem));
206 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
207 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
208 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
/* NOTE(review): the operand of the cast above is not visible in this
 * excerpt; presumably it is the end of m_chunk -- confirm. */
211 ArrayItemWithToken compare_item;
212 compare_item.m_token = token;
213 const ArrayItemWithToken * cur_item = std_lite::lower_bound
214 (begin, end, compare_item, token_less_than);
216 for ( ; cur_item != end; ++cur_item){
217 if ( cur_item->m_token > token )
/* NOTE(review): the statement controlled by this test is not visible in
 * this excerpt; presumably it gives up the scan because no equal token can
 * follow -- confirm. */
219 if ( cur_item->m_token == token ){
/* Save the payload before its bytes are removed from the chunk. */
220 memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
/* Byte offset of the matching entry: header size plus the size of all
 * entries in front of it. */
221 size_t offset = sizeof(ArrayHeader) +
222 sizeof(ArrayItemWithToken) * (cur_item - begin);
223 m_chunk.remove_content(offset, sizeof(ArrayItemWithToken));
231 * FlexibleSingleGram::get_array_item:
232 * @token: the phrase token.
233 * @item: the array item of the token.
234 * @returns: whether the get operation is successful.
236 * Get the array item of the token.
/* Look up the payload stored under token.
 * @token: the token to look for.
 * @item: zero-filled first; overwritten with the m_item of the entry whose
 *        token equals token, when such an entry is found. */
239 bool get_array_item(/* in */ phrase_token_t token,
240 /* out */ ArrayItem & item)
243 memset(&item, 0, sizeof(ArrayItem));
245 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
246 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
247 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
/* NOTE(review): the operand of the cast above is not visible in this
 * excerpt; presumably it is the end of m_chunk -- confirm. */
250 ArrayItemWithToken compare_item;
251 compare_item.m_token = token;
252 const ArrayItemWithToken * cur_item = std_lite::lower_bound
253 (begin, end, compare_item, token_less_than);
255 for ( ; cur_item != end; ++cur_item){
256 if ( cur_item->m_token > token )
/* NOTE(review): the statement controlled by this test is not visible in
 * this excerpt; presumably it gives up the scan -- confirm. */
258 if ( cur_item->m_token == token ){
/* Copy the payload out of the chunk; the chunk itself is left untouched. */
259 memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
267 * FlexibleSingleGram::set_array_item:
268 * @token: the phrase token.
269 * @item: the array item of the token.
270 * @returns: whether the set operation is successful.
272 * Set the array item of the token.
/* Overwrite the payload of an existing entry.
 * @token: the token whose entry is updated.
 * @item: the new payload, copied over the m_item of the entry whose token
 *        equals token. No insert call is visible in this method. */
275 bool set_array_item(/* in */ phrase_token_t token,
276 /* in */ const ArrayItem & item){
277 ArrayItemWithToken * begin = (ArrayItemWithToken *)
278 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
279 ArrayItemWithToken * end = (ArrayItemWithToken *)
/* NOTE(review): the operand of the cast above is not visible in this
 * excerpt; presumably it is the end of m_chunk -- confirm. */
282 ArrayItemWithToken compare_item;
283 compare_item.m_token = token;
284 ArrayItemWithToken * cur_item = std_lite::lower_bound
285 (begin, end, compare_item, token_less_than);
287 for ( ; cur_item != end; ++cur_item ){
288 if ( cur_item->m_token > token ){
/* NOTE(review): the body of this branch is not visible in this excerpt;
 * presumably it reports that the token is absent -- confirm. */
291 if ( cur_item->m_token == token ){
/* Update in place, directly inside the chunk's storage. */
292 memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem));
300 * FlexibleSingleGram::get_array_header:
301 * @header: the array header of this single gram.
302 * @returns: whether the get operation is successful.
304 * Get the array header of this single gram.
/* Copy the array header out of this single gram.
 * @header: zero-filled first, then overwritten with the first
 *          sizeof(ArrayHeader) bytes of the chunk.
 * NOTE(review): no check that the chunk holds at least sizeof(ArrayHeader)
 * bytes is visible here; a shorter chunk would be over-read -- confirm the
 * callers guarantee the size. */
307 bool get_array_header(/* out */ ArrayHeader & header){
309 memset(&header, 0, sizeof(ArrayHeader));
310 char * buf_begin = (char *)m_chunk.begin();
311 memcpy(&header, buf_begin, sizeof(ArrayHeader));
316 * FlexibleSingleGram::set_array_header:
317 * @header: the array header of this single gram.
318 * @returns: whether the set operation is successful.
320 * Set the array header of this single gram.
/* Overwrite the array header of this single gram.
 * @header: copied over the first sizeof(ArrayHeader) bytes of the chunk.
 * The entries stored behind the header are not touched. */
323 bool set_array_header(/* in */ const ArrayHeader & header){
324 char * buf_begin = (char *)m_chunk.begin();
325 memcpy(buf_begin, &header, sizeof(ArrayHeader));
332 * @MagicHeader: the struct type of the magic header.
333 * @ArrayHeader: the struct type of the array header.
334 * @ArrayItem: the struct type of the array item.
336 * The flexible bi-gram is mainly used for training purposes.
/* Bi-gram store backed by a Berkeley DB handle (m_db). It maps a previous
 * token to the serialized chunk of a FlexibleSingleGram, and keeps a magic
 * number plus a MagicHeader in one extra record. */
339 template<typename MagicHeader, typename ArrayHeader,
/* NOTE(review): the end of this template parameter list is not visible in
 * this excerpt. */
341 class FlexibleBigram{
342 /* Note: some flexible bi-gram file format check should be here. */
/* Database key of the record that holds the magic number and the magic
 * header; the constructor sets both elements to null_token. */
346 phrase_token_t m_magic_header_index[2];
/* The 4-byte signature that attach() writes to a new database and
 * compares against when opening an existing one. */
348 char m_magic_number[4];
/* NOTE(review): the function that encloses the close call below is not
 * visible in this excerpt. */
353 m_db->close(m_db, 0);
360 * FlexibleBigram::FlexibleBigram:
361 * @magic_number: the 4 bytes magic number of the flexible bi-gram.
363 * The constructor of the FlexibleBigram.
/* Initialize the magic-header key and remember the magic number.
 * @magic_number: must point to at least sizeof(m_magic_number) (4) readable
 *                bytes. Exactly that many bytes are copied with memcpy, so
 *                no terminating NUL is required or stored. */
366 FlexibleBigram(const char * magic_number){
368 m_magic_header_index[0] = null_token;
369 m_magic_header_index[1] = null_token;
371 memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
375 * FlexibleBigram::~FlexibleBigram:
377 * The destructor of the FlexibleBigram.
385 * FlexibleBigram::attach:
386 * @dbfile: the path name of the flexible bi-gram.
387 * @flags: the attach flags for the Berkeley DB.
388 * @returns: whether the attach operation is successful.
390 * Attach a Berkeley DB file on the filesystem for training purposes.
/* Open (or create) the Berkeley DB hash database at dbfile and verify its
 * 4-byte magic number.
 * @dbfile: path of the database file.
 * @flags: ATTACH_READONLY adds DB_RDONLY; ATTACH_READWRITE must not be
 *         combined with ATTACH_READONLY (asserted); ATTACH_CREATE enables
 *         the create-on-failure path below. */
393 bool attach(const char * dbfile, guint32 flags){
395 u_int32_t db_flags = 0;
397 if ( flags & ATTACH_READONLY )
398 db_flags |= DB_RDONLY;
399 if ( flags & ATTACH_READWRITE )
400 assert( !(flags & ATTACH_READONLY ) );
404 int ret = db_create(&m_db, NULL, 0);
/* First attempt: open an existing file without DB_CREATE. */
408 ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
409 if ( ret != 0 && (flags & ATTACH_CREATE) ) {
410 db_flags |= DB_CREATE;
411 /* Create database file here, and write the signature. */
/* NOTE(review): this retries open() on the same handle whose open() just
 * failed. Berkeley DB documents that a handle must be discarded with
 * close() after a failed open() -- confirm whether m_db should be
 * re-created with db_create() before this second attempt. */
412 ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
/* Key: the reserved magic-header index. */
417 memset(&db_key, 0, sizeof(DBT));
418 db_key.data = m_magic_header_index;
419 db_key.size = sizeof(m_magic_header_index);
/* Value: a partial write of sizeof(m_magic_number) bytes.
 * NOTE(review): the assignment of doff is not visible in this excerpt. */
421 memset(&db_data, 0, sizeof(DBT));
422 db_data.data = m_magic_number;
423 db_data.size = sizeof(m_magic_number);
424 db_data.flags = DB_DBT_PARTIAL;
426 db_data.dlen = sizeof(m_magic_number);
428 ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
432 /* check the signature. */
434 memset(&db_key, 0, sizeof(DBT));
435 db_key.data = m_magic_header_index;
436 db_key.size = sizeof(m_magic_header_index);
/* Partial read of sizeof(m_magic_number) bytes from the magic record. */
438 memset(&db_data, 0, sizeof(DBT));
439 db_data.flags = DB_DBT_PARTIAL;
441 db_data.dlen = sizeof(m_magic_number);
442 ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
/* The stored signature must have the expected length and the same bytes
 * as m_magic_number. The statements controlled by the two tests below are
 * not visible in this excerpt. */
445 if ( sizeof(m_magic_number) != db_data.size )
447 if ( memcmp(db_data.data, m_magic_number,
448 sizeof(m_magic_number)) == 0 )
454 * FlexibleBigram::load:
455 * @index: the previous token in the flexible bi-gram.
456 * @single_gram: the single gram of the previous token.
457 * @returns: whether the load operation is successful.
459 * Load the single gram of the previous token.
/* Fetch the record stored under index and wrap it in a newly allocated
 * FlexibleSingleGram.
 * @index: the previous token; its raw bytes are the database key.
 * @single_gram: receives the object created with new. Presumably the
 *               caller is responsible for deleting it -- confirm. */
462 bool load(phrase_token_t index,
463 FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){
468 memset(&db_key, 0, sizeof(DBT));
469 db_key.data = &index;
470 db_key.size = sizeof(phrase_token_t);
475 memset(&db_data, 0, sizeof(DBT));
476 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
/* NOTE(review): the single gram is constructed directly over db_data.data;
 * no copy is visible here. With a zeroed DBT that memory belongs to
 * Berkeley DB and may be reused by later calls on the handle -- confirm
 * that the chunk copies the data or that the lifetime is sufficient. */
480 single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
481 (db_data.data, db_data.size);
487 * FlexibleBigram::store:
488 * @index: the previous token in the flexible bi-gram.
489 * @single_gram: the single gram of the previous token.
490 * @returns: whether the store operation is successful.
492 * Store the single gram of the previous token.
/* Write the whole chunk of single_gram to the database under index.
 * @index: the previous token; its raw bytes are the database key.
 * @single_gram: its m_chunk (begin() .. size() bytes) becomes the record
 *               value; the object is only read here. */
495 bool store(phrase_token_t index,
496 FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
501 memset(&db_key, 0, sizeof(DBT));
502 db_key.data = &index;
503 db_key.size = sizeof(phrase_token_t);
505 memset(&db_data, 0, sizeof(DBT));
506 db_data.data = single_gram->m_chunk.begin();
507 db_data.size = single_gram->m_chunk.size();
509 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
514 * FlexibleBigram::remove:
515 * @index: the previous token in the flexible bi-gram.
516 * @returns: whether the remove operation is successful.
518 * Remove the single gram of the previous token.
/* Delete the record stored under index from the database.
 * @index: the previous token; its raw bytes are the database key. */
521 bool remove(phrase_token_t index){
526 memset(&db_key, 0, sizeof(DBT));
527 db_key.data = &index;
528 db_key.size = sizeof(phrase_token_t);
530 int ret = m_db->del(m_db, NULL, &db_key, 0);
535 * FlexibleBigram::get_all_items:
536 * @items: the GArray to store all previous tokens.
537 * @returns: whether the get operation is successful.
539 * Get the array of all previous tokens for parameter estimation.
/* Collect the key of every single-gram record in the database.
 * @items: emptied first (size set to 0); then one phrase_token_t is
 *         appended for every record whose key is sizeof(phrase_token_t)
 *         bytes long. */
542 bool get_all_items(GArray * items){
543 g_array_set_size(items, 0);
/* NOTE(review): the return value of cursor() is not checked on this line;
 * confirm cursorp is valid before it is used in the loop below. */
552 m_db->cursor(m_db, NULL, &cursorp, 0);
554 /* Initialize our DBTs. */
555 memset(&key, 0, sizeof(DBT));
556 memset(&data, 0, sizeof(DBT));
558 /* Iterate over the database, retrieving each record in turn. */
559 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
/* Keys of another size are not single-gram keys; the magic record's key is
 * two tokens wide (sizeof(m_magic_header_index)). */
560 if (key.size != sizeof(phrase_token_t)){
561 /* skip magic header. */
/* NOTE(review): key.data is read through a phrase_token_t pointer; this
 * assumes the buffer is suitably aligned -- confirm. */
564 phrase_token_t * token = (phrase_token_t *) key.data;
565 g_array_append_val(items, *token);
/* The loop is expected to end with DB_NOTFOUND; any other code is reported
 * as an error. The rest of this branch is not visible in this excerpt. */
568 if ( ret != DB_NOTFOUND ){
569 fprintf(stderr, "training db error, exit!");
573 /* Cursors must be closed */
575 cursorp->c_close(cursorp);
580 * FlexibleBigram::get_magic_header:
581 * @header: the magic header.
582 * @returns: whether the get operation is successful.
584 * Get the magic header of the flexible bi-gram.
/* Read the MagicHeader stored in the magic record.
 * @header: zero-filled first; overwritten with the stored header when the
 *          bytes read are exactly sizeof(MagicHeader) long.
 * The header lives in the same record as the magic number, directly behind
 * it, hence the partial read at offset sizeof(m_magic_number). */
587 bool get_magic_header(MagicHeader & header){
589 memset(&header, 0, sizeof(MagicHeader));
595 memset(&db_key, 0, sizeof(DBT));
596 db_key.data = m_magic_header_index;
597 db_key.size = sizeof(m_magic_header_index);
599 memset(&db_data, 0, sizeof(DBT));
600 db_data.flags = DB_DBT_PARTIAL;
601 db_data.doff = sizeof(m_magic_number);
602 db_data.dlen = sizeof(MagicHeader);
604 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
/* A read of any other length is not copied into header; the statement
 * controlled by this test is not visible in this excerpt. */
608 if ( sizeof(MagicHeader) != db_data.size )
611 memcpy(&header, db_data.data, sizeof(MagicHeader));
616 * FlexibleBigram::set_magic_header:
617 * @header: the magic header.
618 * @returns: whether the set operation is successful.
620 * Set the magic header of the flexible bi-gram.
/* Write the MagicHeader into the magic record.
 * @header: stored with a partial put at offset sizeof(m_magic_number), so
 *          the magic number at the start of the record is not overwritten. */
623 bool set_magic_header(const MagicHeader & header){
628 memset(&db_key, 0, sizeof(DBT));
629 db_key.data = m_magic_header_index;
630 db_key.size = sizeof(m_magic_header_index);
632 memset(&db_data, 0, sizeof(DBT));
/* The cast drops const because DBT::data is a plain void pointer; the
 * header is only passed to put() here. */
633 db_data.data = (void *) &header;
634 db_data.size = sizeof(MagicHeader);
635 db_data.flags = DB_DBT_PARTIAL;
636 db_data.doff = sizeof(m_magic_number);
637 db_data.dlen = sizeof(MagicHeader);
639 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
644 * FlexibleBigram::get_array_header:
645 * @index: the previous token in the flexible bi-gram.
646 * @header: the array header in the single gram of the previous token.
647 * @returns: whether the get operation is successful.
649 * Get the array header in the single gram of the previous token.
/* Read only the ArrayHeader of the single gram stored under index, using a
 * partial get instead of loading the whole record.
 * @index: the previous token; its raw bytes are the database key.
 * @header: zero-filled first, then overwritten with the bytes read. */
652 bool get_array_header(phrase_token_t index, ArrayHeader & header){
654 memset(&header, 0, sizeof(ArrayHeader));
660 memset(&db_key, 0, sizeof(DBT));
661 db_key.data = &index;
662 db_key.size = sizeof(phrase_token_t);
/* NOTE(review): the assignment of doff is not visible in this excerpt;
 * presumably the read starts at offset 0 -- confirm. */
665 memset(&db_data, 0, sizeof(DBT));
666 db_data.flags = DB_DBT_PARTIAL;
668 db_data.dlen = sizeof(ArrayHeader);
669 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
/* NOTE(review): a short record trips this assert, whereas
 * get_magic_header() tests the size with an if. When assert is compiled
 * out, the memcpy below would read past a short record -- confirm this is
 * intended. */
673 assert(db_data.size == sizeof(ArrayHeader));
674 memcpy(&header, db_data.data, sizeof(ArrayHeader));
679 * FlexibleBigram::set_array_header:
680 * @index: the previous token of the flexible bi-gram.
681 * @header: the array header in the single gram of the previous token.
682 * @returns: whether the set operation is successful.
684 * Set the array header in the single gram of the previous token.
/* Overwrite only the ArrayHeader of the single gram stored under index,
 * using a partial put of sizeof(ArrayHeader) bytes.
 * @index: the previous token; its raw bytes are the database key.
 * @header: the new header bytes. */
687 bool set_array_header(phrase_token_t index, const ArrayHeader & header){
692 memset(&db_key, 0, sizeof(DBT));
693 db_key.data = &index;
694 db_key.size = sizeof(phrase_token_t);
696 memset(&db_data, 0, sizeof(DBT));
/* The cast drops const because DBT::data is a plain void pointer; the
 * header is only passed to put() here. */
697 db_data.data = (void *)&header;
698 db_data.size = sizeof(ArrayHeader);
699 db_data.flags = DB_DBT_PARTIAL;
/* NOTE(review): the assignment of doff is not visible in this excerpt;
 * presumably the write starts at offset 0 -- confirm. */
701 db_data.dlen = sizeof(ArrayHeader);
703 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);