3 * Library to deal with pinyin.
5 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #ifndef FLEXIBLE_NGRAM_H
25 #define FLEXIBLE_NGRAM_H
30 /* Note: the signature of the template parameters.
31 * struct MagicHeader, ArrayHeader, ArrayItem.
// Output array type taken by FlexibleSingleGram::retrieve_all() and search():
// a GArray to which ArrayItemWithToken values are appended by value.
36 typedef GArray * FlexibleBigramPhraseArray;
40 * @ArrayHeader: the struct ArrayHeader.
41 * @ArrayItem: the struct ArrayItem.
43 * The flexible single gram is mainly used for training purpose.
// A single gram kept in m_chunk: an ArrayHeader at the start of the chunk,
// followed (from byte offset sizeof(ArrayHeader)) by ArrayItemWithToken entries
// that the accessors locate with std_lite::lower_bound on the token.
47 template<typename ArrayHeader, typename ArrayItem>
48 class FlexibleSingleGram{
// FlexibleBigram is a friend: its store() reads m_chunk directly and its load()
// calls the buffer-taking constructor.
// NOTE(review): the end of this template parameter list is not visible in this listing.
49 template<typename MH, typename AH,
51 friend class FlexibleBigram;
// Builds a single gram on top of an existing buffer of `length` bytes.
// FlexibleBigram::load() calls this with the data/size returned by the DB.
54 FlexibleSingleGram(void * buffer, size_t length){
// Third argument is NULL; presumably the release callback of MemoryChunk,
// i.e. the chunk does not take ownership of `buffer` -- TODO confirm.
55 m_chunk.set_chunk(buffer, length, NULL);
61 * Define the struct ArrayItemWithToken type.
// Token carried by each stored entry; token_less_than() orders entries by this
// field and the lookup methods match on it.
65 phrase_token_t m_token;
// Ordering predicate passed to std_lite::lower_bound.
// Returns true when lhs has a strictly smaller token than rhs; only m_token is
// compared, the m_item payload plays no part in the ordering.
70 static bool token_less_than(const ArrayItemWithToken & lhs,
71 const ArrayItemWithToken & rhs){
72 return lhs.m_token < rhs.m_token;
77 * FlexibleSingleGram::FlexibleSingleGram:
79 * The constructor of the FlexibleSingleGram.
// NOTE(review): the function header for these statements is not visible in this listing.
// Size the chunk to hold exactly one ArrayHeader, so it starts with no entries.
83 m_chunk.set_size(sizeof(ArrayHeader));
// Zero-fill the header.
84 memset(m_chunk.begin(), 0, sizeof(ArrayHeader));
88 * FlexibleSingleGram::retrieve_all:
89 * @array: the array to store all items in this single gram.
90 * @returns: whether the retrieve operation is successful.
92 * Retrieve all items in this single gram.
// Copies the stored entries (token plus item) into `array`.
// @array: output GArray; entries are appended with g_array_append_val and no
//         visible line clears the array first.
95 bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){
// Entries begin immediately after the ArrayHeader at the front of the chunk.
96 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
97 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
// NOTE(review): the expression that initializes `end` is not visible in this listing.
98 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
// Scratch entry, refilled on every iteration and appended by value.
101 ArrayItemWithToken item;
// NOTE(review): the loop condition and increment are not visible here;
// presumably the loop walks from begin up to end.
102 for ( const ArrayItemWithToken * cur_item = begin;
105 /* Note: optimize this with g_array_append_vals? */
106 item.m_token = cur_item->m_token;
107 item.m_item = cur_item->m_item;
108 g_array_append_val(array, item);
115 * FlexibleSingleGram::search:
116 * @range: the token range.
117 * @array: the array to store the array items with token in the range.
118 * @returns: whether the search operation is successful.
120 * Search the array items with token in the range.
122 * Note: The array result may contain many items.
// Appends to `array` the stored entries whose token falls in the given range.
// @range: supplies m_range_begin (where the scan starts) and m_range_end
//         (tested with >=, so the end token itself is not appended by the
//         visible code).
// @array: output GArray; matching entries are appended by value.
125 bool search(/* in */ PhraseIndexRange * range,
126 /* out */ FlexibleBigramPhraseArray array){
// Entries begin immediately after the ArrayHeader at the front of the chunk.
127 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
128 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
// NOTE(review): the expression that initializes `end` is not visible in this listing.
129 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
// Probe entry: only m_token is set because token_less_than reads nothing else.
132 ArrayItemWithToken compare_item;
133 compare_item.m_token = range->m_range_begin;
// Starting position for the scan; assumes std_lite::lower_bound behaves like
// std::lower_bound (first entry whose token is not less than the probe).
134 const ArrayItemWithToken * cur_item = std_lite::lower_bound
135 (begin, end, compare_item, token_less_than);
137 ArrayItemWithToken item;
138 for ( ; cur_item != end; ++cur_item){
// NOTE(review): the statement guarded by this test is not visible here;
// presumably it ends the scan once the range end is reached.
139 if ( cur_item->m_token >= range->m_range_end )
141 item.m_token = cur_item->m_token;
142 item.m_item = cur_item->m_item;
143 g_array_append_val(array, item);
150 * FlexibleSingleGram::insert_array_item:
151 * @token: the phrase token to be inserted.
152 * @item: the array item of this token.
153 * @returns: whether the insert operation is successful.
155 * Insert the array item of the token.
// Inserts a new (token, item) entry into the chunk.
// @token: token of the new entry.
// @item:  payload stored alongside the token.
// The visible code inserts in front of the first entry with a larger token,
// or at the end of the chunk; return statements are not visible in this listing.
158 bool insert_array_item(/* in */ phrase_token_t token,
159 /* in */ const ArrayItem & item){
// Entries begin immediately after the ArrayHeader at the front of the chunk.
160 ArrayItemWithToken * begin = (ArrayItemWithToken *)
161 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
// NOTE(review): the expression that initializes `end` is not visible in this listing.
162 ArrayItemWithToken * end = (ArrayItemWithToken *)
// Probe entry: only m_token is set because token_less_than reads nothing else.
165 ArrayItemWithToken compare_item;
166 compare_item.m_token = token;
167 ArrayItemWithToken * cur_item = std_lite::lower_bound
168 (begin, end, compare_item, token_less_than);
// Fully populated entry whose bytes are copied into the chunk.
170 ArrayItemWithToken insert_item;
171 insert_item.m_token = token;
172 insert_item.m_item = item;
174 for ( ; cur_item != end; ++cur_item ){
175 if ( cur_item->m_token > token ){
// Byte offset of cur_item inside the chunk: the header plus all entries before it.
176 size_t offset = sizeof(ArrayHeader) +
177 sizeof(ArrayItemWithToken) * (cur_item - begin);
// Insert the new entry in front of the first entry with a larger token.
178 m_chunk.insert_content(offset, &insert_item,
179 sizeof(ArrayItemWithToken));
// An entry with this token is already present.
// NOTE(review): how this case is handled is not visible in this listing.
182 if ( cur_item->m_token == token ){
// Add the new entry at the current end of the chunk.
186 m_chunk.insert_content(m_chunk.size(), &insert_item,
187 sizeof(ArrayItemWithToken));
192 * FlexibleSingleGram::remove_array_item:
193 * @token: the phrase token to be removed.
194 * @item: the content of the removed array item.
195 * @returns: whether the remove operation is successful.
197 * Remove the array item of the token.
// Removes the entry stored for `token` and hands its payload back to the caller.
// @token: token of the entry to remove.
// @item:  zero-filled on entry; receives a byte copy of the removed payload
//         when a matching entry is found.
200 bool remove_array_item(/* in */ phrase_token_t token,
201 /* out */ ArrayItem & item)
// Clear the output first so it holds zeros unless a match overwrites it.
204 memset(&item, 0, sizeof(ArrayItem));
// Entries begin immediately after the ArrayHeader at the front of the chunk.
206 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
207 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
// NOTE(review): the expression that initializes `end` is not visible in this listing.
208 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
// Probe entry: only m_token is set because token_less_than reads nothing else.
211 ArrayItemWithToken compare_item;
212 compare_item.m_token = token;
213 const ArrayItemWithToken * cur_item = std_lite::lower_bound
214 (begin, end, compare_item, token_less_than);
216 for ( ; cur_item != end; ++cur_item){
// NOTE(review): the statement guarded by this test is not visible here;
// presumably the search gives up once a larger token is seen.
217 if ( cur_item->m_token > token )
219 if ( cur_item->m_token == token ){
// Save the payload before the entry's bytes are removed from the chunk.
220 memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
// Byte offset of cur_item inside the chunk: the header plus all entries before it.
221 size_t offset = sizeof(ArrayHeader) +
222 sizeof(ArrayItemWithToken) * (cur_item - begin);
223 m_chunk.remove_content(offset, sizeof(ArrayItemWithToken));
231 * FlexibleSingleGram::get_array_item:
232 * @token: the phrase token.
233 * @item: the array item of the token.
234 * @returns: whether the get operation is successful.
236 * Get the array item of the token.
// Looks up the entry stored for `token` and copies its payload out.
// @token: token to look for.
// @item:  zero-filled on entry; receives a byte copy of the payload when a
//         matching entry is found. The chunk itself is not modified by the
//         visible code.
239 bool get_array_item(/* in */ phrase_token_t token,
240 /* out */ ArrayItem & item)
// Clear the output first so it holds zeros unless a match overwrites it.
243 memset(&item, 0, sizeof(ArrayItem));
// Entries begin immediately after the ArrayHeader at the front of the chunk.
245 const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
246 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
// NOTE(review): the expression that initializes `end` is not visible in this listing.
247 const ArrayItemWithToken * end = (const ArrayItemWithToken *)
// Probe entry: only m_token is set because token_less_than reads nothing else.
250 ArrayItemWithToken compare_item;
251 compare_item.m_token = token;
252 const ArrayItemWithToken * cur_item = std_lite::lower_bound
253 (begin, end, compare_item, token_less_than);
255 for ( ; cur_item != end; ++cur_item){
// NOTE(review): the statement guarded by this test is not visible here;
// presumably the search gives up once a larger token is seen.
256 if ( cur_item->m_token > token )
258 if ( cur_item->m_token == token ){
259 memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
267 * FlexibleSingleGram::set_array_item:
268 * @token: the phrase token.
269 * @item: the array item of the token.
270 * @returns: whether the set operation is successful.
272 * Set the array item of the token.
// Overwrites the payload of the entry already stored for `token`.
// @token: token whose entry is updated.
// @item:  new payload, byte-copied over the stored one.
// The visible code only writes when an entry with an equal token is found; it
// never inserts a new entry.
275 bool set_array_item(/* in */ phrase_token_t token,
276 /* in */ const ArrayItem & item){
// Entries begin immediately after the ArrayHeader at the front of the chunk.
277 ArrayItemWithToken * begin = (ArrayItemWithToken *)
278 ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
// NOTE(review): the expression that initializes `end` is not visible in this listing.
279 ArrayItemWithToken * end = (ArrayItemWithToken *)
// Probe entry: only m_token is set because token_less_than reads nothing else.
282 ArrayItemWithToken compare_item;
283 compare_item.m_token = token;
284 ArrayItemWithToken * cur_item = std_lite::lower_bound
285 (begin, end, compare_item, token_less_than);
287 for ( ; cur_item != end; ++cur_item ){
// NOTE(review): the body of this branch is not visible in this listing.
288 if ( cur_item->m_token > token ){
291 if ( cur_item->m_token == token ){
// Write through the pointer into the chunk's own storage.
292 memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem));
300 * FlexibleSingleGram::get_array_header:
301 * @header: the array header of this single gram.
302 * @returns: whether the get operation is successful.
304 * Get the array header of this single gram.
// Copies the ArrayHeader stored at the very start of the chunk into `header`.
// @header: zero-filled first, then overwritten with sizeof(ArrayHeader) bytes
//          taken from m_chunk.begin().
307 bool get_array_header(/* out */ ArrayHeader & header){
309 memset(&header, 0, sizeof(ArrayHeader));
310 char * buf_begin = (char *)m_chunk.begin();
311 memcpy(&header, buf_begin, sizeof(ArrayHeader));
316 * FlexibleSingleGram::set_array_header:
317 * @header: the array header of this single gram.
318 * @returns: whether the set operation is successful.
320 * Set the array header of this single gram.
// Overwrites the ArrayHeader stored at the very start of the chunk.
// @header: source of the sizeof(ArrayHeader) bytes written to m_chunk.begin();
//          the entries that follow the header are not touched here.
323 bool set_array_header(/* in */ const ArrayHeader & header){
324 char * buf_begin = (char *)m_chunk.begin();
325 memcpy(buf_begin, &header, sizeof(ArrayHeader));
332 * @MagicHeader: the struct type of the magic header.
333 * @ArrayHeader: the struct type of the array header.
334 * @ArrayItem: the struct type of the array item.
336 * The flexible bi-gram is mainly used for training purpose.
// Flexible bi-gram stored in a Berkeley DB accessed through the m_db handle.
// One record, keyed by m_magic_header_index, holds the magic number with the
// MagicHeader at byte offset sizeof(m_magic_number); the remaining methods key
// their records by a single phrase_token_t.
// NOTE(review): the end of this template parameter list is not visible in this listing.
339 template<typename MagicHeader, typename ArrayHeader,
341 class FlexibleBigram{
342 /* Note: some flexible bi-gram file format check should be here. */
// Two-token key of the magic record; used as db_key in attach(),
// get_magic_header() and set_magic_header().
346 phrase_token_t m_magic_header_index[2];
// 4-byte signature written when the database is created and compared against
// the stored bytes in attach().
348 char m_magic_number[4];
// Closes the Berkeley DB handle.
// NOTE(review): the header of the function containing this call is not
// visible in this listing.
353 m_db->close(m_db, 0);
360 * FlexibleBigram::FlexibleBigram:
361 * @magic_number: the 4 bytes magic number of the flexible bi-gram.
363 * The constructor of the FlexibleBigram.
// @magic_number: must point to at least sizeof(m_magic_number) (4) readable
//                bytes, because exactly that many are copied unconditionally.
366 FlexibleBigram(const char * magic_number){
// Both halves of the magic-record key are null_token.
368 m_magic_header_index[0] = null_token;
369 m_magic_header_index[1] = null_token;
// Raw 4-byte copy; no terminating NUL is expected or stored.
371 memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
375 * FlexibleBigram::~FlexibleBigram:
377 * The destructor of the FlexibleBigram.
385 * FlexibleBigram::attach:
386 * @dbfile: the path name of the flexible bi-gram.
387 * @flags: the attach flags for the Berkeley DB.
388 * @returns: whether the attach operation is successful.
390 * Attach Berkeley DB on filesystem for training purpose.
// Opens (optionally creating) the Berkeley DB file and checks its 4-byte signature.
// @dbfile: path of the database file.
// @flags:  combination of ATTACH_READONLY / ATTACH_READWRITE / ATTACH_CREATE.
// NOTE(review): error checks and return statements between the visible lines
// are not shown in this listing.
393 bool attach(const char * dbfile, guint32 flags){
// Translate the attach flags into Berkeley DB open flags.
395 u_int32_t db_flags = 0;
397 if ( flags & ATTACH_READONLY )
398 db_flags |= DB_RDONLY;
// READWRITE and READONLY must not be combined; enforced only by assert.
399 if ( flags & ATTACH_READWRITE )
400 assert( !(flags & ATTACH_READONLY ) );
404 int ret = db_create(&m_db, NULL, 0);
// First attempt: open an existing file as a DB_HASH database.
408 ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
// If that failed and creation was requested, retry with DB_CREATE added.
409 if ( ret != 0 && (flags & ATTACH_CREATE) ) {
410 db_flags |= DB_CREATE;
411 /* Create database file here, and write the signature. */
412 ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
// Key of the magic record.
417 memset(&db_key, 0, sizeof(DBT));
418 db_key.data = m_magic_header_index;
419 db_key.size = sizeof(m_magic_header_index);
// Value: the 4-byte magic number, written as a partial record so that only
// dlen bytes of the record are replaced.
421 memset(&db_data, 0, sizeof(DBT));
422 db_data.data = m_magic_number;
423 db_data.size = sizeof(m_magic_number);
424 db_data.flags = DB_DBT_PARTIAL;
426 db_data.dlen = sizeof(m_magic_number);
428 ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
432 /* check the signature. */
434 memset(&db_key, 0, sizeof(DBT));
435 db_key.data = m_magic_header_index;
436 db_key.size = sizeof(m_magic_header_index);
// Partial read limited to sizeof(m_magic_number) bytes of the magic record.
438 memset(&db_data, 0, sizeof(DBT));
439 db_data.flags = DB_DBT_PARTIAL;
441 db_data.dlen = sizeof(m_magic_number);
442 ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
// A record shorter than the magic number cannot match.
// NOTE(review): the statement guarded by this test is not visible here.
445 if ( sizeof(m_magic_number) != db_data.size )
// Stored bytes equal this instance's magic number.
// NOTE(review): the statement guarded by this test is not visible here.
447 if ( memcmp(db_data.data, m_magic_number,
448 sizeof(m_magic_number)) == 0 )
454 * FlexibleBigram::load:
455 * @index: the previous token in the flexible bi-gram.
456 * @single_gram: the single gram of the previous token.
457 * @returns: whether the load operation is successful.
459 * Load the single gram of the previous token.
// Fetches the record keyed by `index` and wraps it in a new FlexibleSingleGram.
// @index:       key of the record (one phrase_token_t).
// @single_gram: receives a pointer to an object allocated with `new`; no
//               matching delete is visible in this block, so presumably the
//               caller owns it -- TODO confirm.
462 bool load(phrase_token_t index,
463 FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){
// The key is the raw bytes of the token.
468 memset(&db_key, 0, sizeof(DBT));
469 db_key.data = &index;
470 db_key.size = sizeof(phrase_token_t);
475 memset(&db_data, 0, sizeof(DBT));
476 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
// NOTE(review): any check of `ret` before this point is not visible in this listing.
// The new object is constructed directly over the buffer returned by get().
480 single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
481 (db_data.data, db_data.size);
487 * FlexibleBigram::store:
488 * @index: the previous token in the flexible bi-gram.
489 * @single_gram: the single gram of the previous token.
490 * @returns: whether the store operation is successful.
492 * Store the single gram of the previous token.
// Writes the whole chunk of `single_gram` as the record keyed by `index`.
// @index:       key of the record (one phrase_token_t).
// @single_gram: its m_chunk bytes (begin() .. size()) become the record value;
//               dereferenced without a visible null check.
495 bool store(phrase_token_t index,
496 FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
// The key is the raw bytes of the token.
501 memset(&db_key, 0, sizeof(DBT));
502 db_key.data = &index;
503 db_key.size = sizeof(phrase_token_t);
// The value points straight at the chunk's storage; nothing is copied here.
505 memset(&db_data, 0, sizeof(DBT));
506 db_data.data = single_gram->m_chunk.begin();
507 db_data.size = single_gram->m_chunk.size();
509 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
514 * FlexibleBigram::remove:
515 * @index: the previous token in the flexible bi-gram.
516 * @returns: whether the remove operation is successful.
518 * Remove the single gram of the previous token.
// Deletes the record keyed by `index` from the database.
// @index: key of the record (one phrase_token_t).
521 bool remove(phrase_token_t index){
// The key is the raw bytes of the token.
526 memset(&db_key, 0, sizeof(DBT));
527 db_key.data = &index;
528 db_key.size = sizeof(phrase_token_t);
// NOTE(review): how `ret` maps to the bool result is not visible in this listing.
530 int ret = m_db->del(m_db, NULL, &db_key, 0);
535 * FlexibleBigram::get_all_items:
536 * @items: the GArray to store all previous tokens.
537 * @returns: whether the get operation is successful.
539 * Get the array of all previous tokens for parameter estimation.
// Collects the key token of every single-gram record in the database.
// @items: GArray of phrase_token_t; emptied first, then one token is appended
//         per record whose key is exactly sizeof(phrase_token_t) bytes.
542 bool get_all_items(GArray * items){
// Drop whatever the caller's array already holds.
543 g_array_set_size(items, 0);
// The return value of cursor() is not checked on this line.
553 m_db->cursor(m_db, NULL, &cursorp, 0);
558 /* Initialize our DBTs. */
559 memset(&key, 0, sizeof(DBT));
560 memset(&data, 0, sizeof(DBT));
562 /* Iterate over the database, retrieving each record in turn. */
563 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
// The magic record's key is two tokens wide (m_magic_header_index), so its
// size differs from sizeof(phrase_token_t) and it is filtered out here.
564 if (key.size != sizeof(phrase_token_t)){
565 /* skip magic header. */
568 phrase_token_t * token = (phrase_token_t *) key.data;
569 g_array_append_val(items, *token);
// Any result other than DB_NOTFOUND after the loop is treated as a database
// error. The message is printed without a trailing newline.
// NOTE(review): what follows the cursor close on this path is not visible here.
572 if ( ret != DB_NOTFOUND ){
573 fprintf(stderr, "training db error, exit!");
576 cursorp->c_close(cursorp);
581 /* Cursors must be closed */
583 cursorp->c_close(cursorp);
588 * FlexibleBigram::get_magic_header:
589 * @header: the magic header.
590 * @returns: whether the get operation is successful.
592 * Get the magic header of the flexible bi-gram.
// Reads the MagicHeader stored behind the magic number in the magic record.
// @header: zero-filled first; overwritten with sizeof(MagicHeader) bytes from
//          the record when the visible copy at the end is reached.
595 bool get_magic_header(MagicHeader & header){
597 memset(&header, 0, sizeof(MagicHeader));
// Key of the magic record.
603 memset(&db_key, 0, sizeof(DBT));
604 db_key.data = m_magic_header_index;
605 db_key.size = sizeof(m_magic_header_index);
// Partial read: skip the 4-byte magic number (doff) and fetch
// sizeof(MagicHeader) bytes (dlen).
607 memset(&db_data, 0, sizeof(DBT));
608 db_data.flags = DB_DBT_PARTIAL;
609 db_data.doff = sizeof(m_magic_number);
610 db_data.dlen = sizeof(MagicHeader);
612 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
// Fewer bytes than a full MagicHeader came back.
// NOTE(review): the statement guarded by this test is not visible here.
616 if ( sizeof(MagicHeader) != db_data.size )
619 memcpy(&header, db_data.data, sizeof(MagicHeader));
624 * FlexibleBigram::set_magic_header:
625 * @header: the magic header.
626 * @returns: whether the set operation is successful.
628 * Set the magic header of the flexible bi-gram.
// Writes `header` into the magic record, directly behind the magic number.
// @header: source of the sizeof(MagicHeader) bytes that are stored.
631 bool set_magic_header(const MagicHeader & header){
// Key of the magic record.
636 memset(&db_key, 0, sizeof(DBT));
637 db_key.data = m_magic_header_index;
638 db_key.size = sizeof(m_magic_header_index);
// Partial write: replace sizeof(MagicHeader) bytes starting at offset
// sizeof(m_magic_number), leaving the 4-byte signature in front untouched.
// The cast drops const because DBT::data is a non-const pointer.
640 memset(&db_data, 0, sizeof(DBT));
641 db_data.data = (void *) &header;
642 db_data.size = sizeof(MagicHeader);
643 db_data.flags = DB_DBT_PARTIAL;
644 db_data.doff = sizeof(m_magic_number);
645 db_data.dlen = sizeof(MagicHeader);
// NOTE(review): how `ret` maps to the bool result is not visible in this listing.
647 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
652 * FlexibleBigram::get_array_header:
653 * @index: the previous token in the flexible bi-gram.
654 * @header: the array header in the single gram of the previous token.
655 * @returns: whether the get operation is successful.
657 * Get the array header in the single gram of the previous token.
// Reads only the ArrayHeader of the record keyed by `index`, without loading
// the whole single gram.
// @index:  key of the record (one phrase_token_t).
// @header: zero-filled first; overwritten with sizeof(ArrayHeader) bytes from
//          the record when the visible copy at the end is reached.
660 bool get_array_header(phrase_token_t index, ArrayHeader & header){
662 memset(&header, 0, sizeof(ArrayHeader));
// The key is the raw bytes of the token.
668 memset(&db_key, 0, sizeof(DBT));
669 db_key.data = &index;
670 db_key.size = sizeof(phrase_token_t);
// Partial read limited to sizeof(ArrayHeader) bytes.
673 memset(&db_data, 0, sizeof(DBT));
674 db_data.flags = DB_DBT_PARTIAL;
676 db_data.dlen = sizeof(ArrayHeader);
677 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
// A short record is treated as a programming error (assert), not as a
// recoverable failure.
681 assert(db_data.size == sizeof(ArrayHeader));
682 memcpy(&header, db_data.data, sizeof(ArrayHeader));
687 * FlexibleBigram::set_array_header:
688 * @index: the previous token of the flexible bi-gram.
689 * @header: the array header in the single gram of the previous token.
690 * @returns: whether the set operation is successful.
692 * Set the array header in the single gram of the previous token.
// Overwrites only the ArrayHeader of the record keyed by `index`.
// @index:  key of the record (one phrase_token_t).
// @header: source of the sizeof(ArrayHeader) bytes that are stored.
695 bool set_array_header(phrase_token_t index, const ArrayHeader & header){
// The key is the raw bytes of the token.
700 memset(&db_key, 0, sizeof(DBT));
701 db_key.data = &index;
702 db_key.size = sizeof(phrase_token_t);
// Partial write of sizeof(ArrayHeader) bytes, so the rest of the record is
// left as it is. The cast drops const because DBT::data is a non-const pointer.
704 memset(&db_data, 0, sizeof(DBT));
705 db_data.data = (void *)&header;
706 db_data.size = sizeof(ArrayHeader);
707 db_data.flags = DB_DBT_PARTIAL;
709 db_data.dlen = sizeof(ArrayHeader);
// NOTE(review): how `ret` maps to the bool result is not visible in this listing.
711 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);