3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #ifndef PHRASE_INDEX_H
23 #define PHRASE_INDEX_H
27 #include "novel_types.h"
28 #include "chewing_key.h"
29 #include "pinyin_parser2.h"
30 #include "pinyin_phrase2.h"
31 #include "memory_chunk.h"
32 #include "phrase_index_logger.h"
35 * Phrase Index File Format
37 * Indirect Index: Index by Token
38 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
39 * + Phrase Offset + Phrase Offset + Phrase Offset + ...... +
40 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
42 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
43 * + Phrase Length + number of Pronunciations + Uni-gram Frequency+
44 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45 * + Phrase String(UCS4) + n Pronunciations with Frequency +
46 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
53 /* Store delta info by phrase index logger in user home directory.
// Size in bytes of the fixed header at the start of every phrase item:
// phrase length (guint8), number of pronunciations (guint8) and
// uni-gram frequency (guint32), in that order.
56 const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
61 * The PhraseItem to access the items in phrase index.
65 friend class SubPhraseIndex;
68 bool set_n_pronunciation(guint8 n_prouns);
71 * PhraseItem::PhraseItem:
73 * The constructor of the PhraseItem.
77 m_chunk.set_size(phrase_item_header);
78 memset(m_chunk.begin(), 0, m_chunk.size());
// Build a phrase item from an existing memory chunk by handing the chunk's
// bytes to m_chunk.set_content at offset 0.
// `chunk` is a reference, so its members are reached with `.`, not `->`.
82 PhraseItem(MemoryChunk & chunk){
83 m_chunk.set_content(0, chunk.begin(), chunk.size());
// The data taken over must hold at least the fixed-size item header.
84 assert ( m_chunk.size() >= phrase_item_header);
89 * PhraseItem::get_phrase_length:
90 * @returns: the length of this phrase item.
92 * Get the length of this phrase item.
95 guint8 get_phrase_length(){
// The phrase length is the first byte of the item header.
96 char * buf_begin = (char *)m_chunk.begin();
97 return (*(guint8 *)buf_begin);
101 * PhraseItem::get_n_pronunciation:
102 * @returns: the number of the pronunciations.
104 * Get the number of the pronunciations.
107 guint8 get_n_pronunciation(){
108 char * buf_begin = ( char *) m_chunk.begin();
// The pronunciation count is the second header byte, right after the
// one-byte phrase length.
109 return (*(guint8 *)(buf_begin + sizeof(guint8)));
113 * PhraseItem::get_unigram_frequency:
114 * @returns: the uni-gram frequency of this phrase item.
116 * Get the uni-gram frequency of this phrase item.
119 guint32 get_unigram_frequency(){
120 char * buf_begin = (char *)m_chunk.begin();
// The 32-bit frequency follows the two one-byte header fields
// (phrase length and pronunciation count).
121 return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
125 * PhraseItem::get_pronunciation_possibility:
126 * @options: the pinyin options.
127 * @keys: the pronunciation keys.
128 * @returns: the possibility that this phrase item is pronounced with the given pinyin keys.
130 * Get the possibility that this phrase item is pronounced with the given pinyin keys.
// Walks the pronunciation records stored in m_chunk and returns
// matched / total_freq as a float (see the last statement below).
133 gfloat get_pronunciation_possibility(pinyin_option_t options,
135 guint8 phrase_length = get_phrase_length();
136 guint8 npron = get_n_pronunciation();
// The pronunciation records start after the fixed header and the
// phrase_length ucs4_t characters of the phrase string.
137 size_t offset = phrase_item_header + phrase_length * sizeof (ucs4_t);
138 char * buf_begin = (char *)m_chunk.begin();
139 guint32 matched = 0, total_freq =0;
140 for ( int i = 0 ; i < npron ; ++i){
// Record i occupies phrase_length ChewingKeys followed by one guint32
// frequency, so records are laid out back to back with that stride.
141 char * chewing_begin = buf_begin + offset +
142 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
// The frequency sits directly behind the keys of the same record.
143 guint32 * freq = (guint32 *)(chewing_begin +
144 phrase_length * sizeof(ChewingKey));
// A result of 0 from the comparison is treated as "this record matches".
146 if ( 0 == pinyin_compare_with_ambiguities2
148 (ChewingKey *)chewing_begin,phrase_length) ){
152 // use preprocessor to avoid zero freq, in gen_pinyin_table.
// Zero total frequency is special-cased before the division below.
154 if ( 0 == total_freq )
// Cast to gfloat first so the ratio is not truncated by integer division.
157 gfloat retval = matched / (gfloat) total_freq;
166 * PhraseItem::increase_pronunciation_possibility:
167 * @options: the pinyin options.
168 * @keys: the pronunciation keys.
169 * @delta: the delta to be added to the pronunciation keys.
171 * Add the delta to the pronunciation of the pronunciation keys.
174 void increase_pronunciation_possibility(pinyin_option_t options,
179 * PhraseItem::get_phrase_string:
180 * @phrase: the ucs4 character buffer.
181 * @returns: whether the get operation is successful.
183 * Get the ucs4 characters of this phrase item.
186 bool get_phrase_string(ucs4_t * phrase);
189 * PhraseItem::set_phrase_string:
190 * @phrase_length: the ucs4 character length of this phrase item.
191 * @phrase: the ucs4 character buffer.
192 * @returns: whether the set operation is successful.
194 * Set the length and ucs4 characters of this phrase item.
197 bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase);
200 * PhraseItem::get_nth_pronunciation:
201 * @index: the pronunciation index.
202 * @keys: the pronunciation keys.
203 * @freq: the frequency of the pronunciation.
204 * @returns: whether the get operation is successful.
206 * Get the nth pronunciation of this phrase item.
209 bool get_nth_pronunciation(size_t index,
210 /* out */ ChewingKey * keys,
211 /* out */ guint32 & freq);
214 * PhraseItem::append_pronunciation:
215 * @keys: the pronunciation keys.
216 * @freq: the frequency of the pronunciation.
218 * Append one pronunciation.
221 void append_pronunciation(ChewingKey * keys, guint32 freq);
224 * PhraseItem::remove_nth_pronunciation:
225 * @index: the pronunciation index.
227 * Remove the nth pronunciation.
229 * Note: Normally don't change the first pronunciation,
230 * which decides the token number.
233 void remove_nth_pronunciation(size_t index);
// Byte-wise equality of two phrase items: the chunk sizes are compared
// first, then the raw chunk contents over m_chunk.size() bytes.
// NOTE(review): the statement taken on a size mismatch is not visible
// here - presumably it returns false; confirm.
235 bool operator == (const PhraseItem & rhs) const{
236 if (m_chunk.size() != rhs.m_chunk.size())
238 return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
239 m_chunk.size()) == 0;
// Inequality is defined as the negation of operator ==.
242 bool operator != (const PhraseItem & rhs) const{
243 return ! (*this == rhs);
248 * In Sub Phrase Index, token == (token & PHRASE_MASK).
254 * The SubPhraseIndex class for internal usage.
257 class SubPhraseIndex{
259 guint32 m_total_freq;
260 MemoryChunk m_phrase_index;
261 MemoryChunk m_phrase_content;
262 MemoryChunk * m_chunk;
266 m_phrase_index.set_size(0);
267 m_phrase_content.set_size(0);
276 * SubPhraseIndex::SubPhraseIndex:
278 * The constructor of the SubPhraseIndex.
281 SubPhraseIndex():m_total_freq(0){
286 * SubPhraseIndex::~SubPhraseIndex:
288 * The destructor of the SubPhraseIndex.
296 * SubPhraseIndex::load:
297 * @chunk: the memory chunk of the binary sub phrase index.
298 * @offset: the begin of binary data in the memory chunk.
299 * @end: the end of binary data in the memory chunk.
300 * @returns: whether the load operation is successful.
302 * Load the sub phrase index from the memory chunk.
305 bool load(MemoryChunk * chunk,
306 table_offset_t offset, table_offset_t end);
309 * SubPhraseIndex::store:
310 * @new_chunk: the new memory chunk to store this sub phrase index.
311 * @offset: the begin of binary data in the memory chunk.
312 * @end: the end of stored binary data in the memory chunk.
313 * @returns: whether the store operation is successful.
315 * Store the sub phrase index to the new memory chunk.
318 bool store(MemoryChunk * new_chunk,
319 table_offset_t offset, table_offset_t & end);
322 * SubPhraseIndex::diff:
323 * @oldone: the original content of sub phrase index.
324 * @logger: the delta information of user self-learning data.
325 * @returns: whether the diff operation is successful.
327 * Compare this sub phrase index with the original content of the system
328 * sub phrase index to generate the logger of difference.
330 * Note: Switch to logger format to reduce user space storage.
333 bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
336 * SubPhraseIndex::merge:
337 * @logger: the logger of difference in user home directory.
338 * @returns: whether the merge operation is successful.
340 * Merge the user logger of difference with this sub phrase index.
343 bool merge(PhraseIndexLogger * logger);
346 * SubPhraseIndex::get_range:
347 * @range: the token range.
348 * @returns: the status of the get operation.
350 * Get the token range in this sub phrase index.
353 int get_range(/* out */ PhraseIndexRange & range);
356 * SubPhraseIndex::get_phrase_index_total_freq:
357 * @returns: the total frequency of this sub phrase index.
359 * Get the total frequency of this sub phrase index.
361 * Note: maybe call it "Zero-gram".
364 guint32 get_phrase_index_total_freq();
367 * SubPhraseIndex::add_unigram_frequency:
368 * @token: the phrase token.
369 * @delta: the delta value of the phrase token.
370 * @returns: the status of the add operation.
372 * Add delta value to the phrase of the token.
374 * Note: this method is a fast path to add delta value.
375 * Maybe use the get_phrase_item method instead in future.
378 int add_unigram_frequency(phrase_token_t token, guint32 delta);
381 * SubPhraseIndex::get_phrase_item:
382 * @token: the phrase token.
383 * @item: the phrase item of the token.
384 * @returns: the status of the get operation.
386 * Get the phrase item from this sub phrase index.
388 * Note: get_phrase_item can't change the size of the phrase item,
389 * but it can increment the frequency of a particular pronunciation,
390 * or change the content as long as the size does not grow.
393 int get_phrase_item(phrase_token_t token, PhraseItem & item);
396 * SubPhraseIndex::add_phrase_item:
397 * @token: the phrase token.
398 * @item: the phrase item of the token.
399 * @returns: the status of the add operation.
401 * Add the phrase item to this sub phrase index.
404 int add_phrase_item(phrase_token_t token, PhraseItem * item);
407 * SubPhraseIndex::remove_phrase_item:
408 * @token: the phrase token.
409 * @item: the removed phrase item of the token.
410 * @returns: the status of the remove operation.
412 * Remove the phrase item of the token.
414 * Note: this remove_phrase_item method will subtract the unigram
415 * frequency of the removed item from m_total_freq.
418 int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
425 * The facade class of phrase index.
428 class FacadePhraseIndex{
430 guint32 m_total_freq;
431 SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
434 * FacadePhraseIndex::FacadePhraseIndex:
436 * The constructor of the FacadePhraseIndex.
441 memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
445 * FacadePhraseIndex::~FacadePhraseIndex:
447 * The destructor of the FacadePhraseIndex.
450 ~FacadePhraseIndex(){
// Visit every slot; only slots holding a sub phrase index are deleted.
451 for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
452 if ( m_sub_phrase_indices[i] ){
453 delete m_sub_phrase_indices[i];
// Null the slot so it no longer points at the deleted object.
454 m_sub_phrase_indices[i] = NULL;
460 * FacadePhraseIndex::load_text:
461 * @phrase_index: the index of sub phrase index to be loaded.
462 * @infile: the textual format file of the phrase table.
463 * @returns: whether the load operation is successful.
465 * Load one sub phrase index from the textual format file.
466 * Note: load sub phrase index according to the config in future.
469 bool load_text(guint8 phrase_index, FILE * infile);
472 * FacadePhraseIndex::load:
473 * @phrase_index: the index of sub phrase index to be loaded.
474 * @chunk: the memory chunk of sub phrase index to be loaded.
475 * @returns: whether the load operation is successful.
477 * Load one sub phrase index from the memory chunk.
480 bool load(guint8 phrase_index, MemoryChunk * chunk);
483 * FacadePhraseIndex::store:
484 * @phrase_index: the index of sub phrase index to be stored.
485 * @new_chunk: the memory chunk of sub phrase index to be stored.
486 * @returns: whether the store operation is successful.
488 * Store one sub phrase index to the memory chunk.
491 bool store(guint8 phrase_index, MemoryChunk * new_chunk);
494 * FacadePhraseIndex::unload:
495 * @phrase_index: the index of sub phrase index to be unloaded.
496 * @returns: whether the unload operation is successful.
498 * Unload one sub phrase index.
501 bool unload(guint8 phrase_index);
505 * FacadePhraseIndex::diff:
506 * @phrase_index: the index of sub phrase index to be diffed.
507 * @oldchunk: the original content of sub phrase index.
508 * @newlog: the delta information of user self-learning data.
509 * @returns: whether the diff operation is successful.
511 * Store user delta information in the logger format.
513 * Note: the ownership of oldchunk is transferred here.
516 bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
517 MemoryChunk * newlog);
520 * FacadePhraseIndex::merge:
521 * @phrase_index: the index of sub phrase index to be merged.
522 * @log: the logger of difference in user home directory.
523 * @returns: whether the merge operation is successful.
525 * Merge the user logger of difference with the sub phrase index.
527 * Note: the ownership of log is transferred here.
530 bool merge(guint8 phrase_index, MemoryChunk * log);
533 * FacadePhraseIndex::compact:
534 * @returns: whether the compact operation is successful.
536 * Compact the memory usage of all sub phrase indices.
542 * FacadePhraseIndex::get_sub_phrase_range:
543 * @min_index: the minimal sub phrase index.
544 * @max_index: the maximal sub phrase index.
545 * @returns: the status of the get operation.
547 * Get the minimum and maximum of the sub phrase index.
550 int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
553 * FacadePhraseIndex::get_range:
554 * @phrase_index: the index of sub phrase index.
555 * @range: the token range of the sub phrase index.
556 * @returns: the status of the get operation.
558 * Get the token range of the sub phrase index.
561 int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
564 * FacadePhraseIndex::get_phrase_index_total_freq:
565 * @returns: the total freq of the facade phrase index.
567 * Get the total freq of the facade phrase index.
569 * Note: maybe call it "Zero-gram".
572 guint32 get_phrase_index_total_freq(){
577 * FacadePhraseIndex::add_unigram_frequency:
578 * @token: the phrase token.
579 * @delta: the delta value of the phrase token.
580 * @returns: the status of the add operation.
582 * Add delta value to the phrase of the token.
585 int add_unigram_frequency(phrase_token_t token, guint32 delta){
// PHRASE_INDEX_LIBRARY_INDEX maps the token to its slot in
// m_sub_phrase_indices.
586 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
587 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
589 return ERROR_NO_SUB_PHRASE_INDEX;
// NOTE(review): the facade-wide total is increased before the sub index
// is updated and is not adjusted if that call reports an error.
590 m_total_freq += delta;
591 return sub_phrase->add_unigram_frequency(token, delta);
595 * FacadePhraseIndex::get_phrase_item:
596 * @token: the phrase token.
597 * @item: the phrase item of the token.
598 * @returns: the status of the get operation.
600 * Get the phrase item from the facade phrase index.
603 int get_phrase_item(phrase_token_t token, PhraseItem & item){
// PHRASE_INDEX_LIBRARY_INDEX maps the token to its slot in
// m_sub_phrase_indices.
604 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
605 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
607 return ERROR_NO_SUB_PHRASE_INDEX;
// The lookup itself is delegated to the selected sub phrase index.
608 return sub_phrase->get_phrase_item(token, item);
612 * FacadePhraseIndex::add_phrase_item:
613 * @token: the phrase token.
614 * @item: the phrase item of the token.
615 * @returns: the status of the add operation.
617 * Add the phrase item to the facade phrase index.
620 int add_phrase_item(phrase_token_t token, PhraseItem * item){
621 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
// Bound by reference so the allocation below is stored in
// m_sub_phrase_indices[index] itself.
622 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
624 sub_phrase = new SubPhraseIndex;
// The item's uni-gram frequency is added to the facade-wide total before
// the item is handed to the sub phrase index.
626 m_total_freq += item->get_unigram_frequency();
627 return sub_phrase->add_phrase_item(token, item);
631 * FacadePhraseIndex::remove_phrase_item:
632 * @token: the phrase token.
633 * @item: the removed phrase item of the token.
634 * @returns: the status of the remove operation.
636 * Remove the phrase item of the token.
639 int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
640 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
641 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
643 return ERROR_NO_SUB_PHRASE_INDEX;
// The sub phrase index performs the removal and passes the removed item
// back through the `item` reference.
645 int result = sub_phrase->remove_phrase_item(token, item);
// The removed item's uni-gram frequency is taken off the facade-wide total.
648 m_total_freq -= item->get_unigram_frequency();
653 * FacadePhraseIndex::prepare_ranges:
654 * @ranges: the ranges to be prepared.
655 * @returns: whether the prepare operation is successful.
657 * Prepare the ranges.
660 bool prepare_ranges(PhraseIndexRanges ranges) {
661 /* assume memset(ranges, 0, sizeof(ranges)); */
662 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
663 GArray * & range = ranges[i];
// Every slot must still be empty on entry; a non-NULL slot trips the assert.
664 assert(NULL == range);
666 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
// g_array_new(zero_terminated = FALSE, clear_ = FALSE, element size):
// an array of PhraseIndexRange elements, stored into ranges[i] through
// the reference.
668 range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
675 * FacadePhraseIndex::clear_ranges:
676 * @ranges: the ranges to be cleared.
677 * @returns: whether the clear operation is successful.
682 bool clear_ranges(PhraseIndexRanges ranges) {
683 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
684 GArray * range = ranges[i];
// Shrink the array to zero elements; the GArray object itself is kept.
686 g_array_set_size(range, 0);
693 * FacadePhraseIndex::destroy_ranges:
694 * @ranges: the ranges to be destroyed.
695 * @returns: whether the destroy operation is successful.
697 * Destroy the ranges.
700 bool destroy_ranges(PhraseIndexRanges ranges) {
701 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
702 GArray * & range = ranges[i];
// Passing TRUE frees the element storage together with the GArray wrapper.
704 g_array_free(range, TRUE);
712 * FacadePhraseIndex::create_sub_phrase:
713 * @index: the phrase index to be created.
714 * @returns: the result of the create operation.
716 * Create the sub phrase index.
// NOTE(review): no bounds check on `index` is visible here - confirm that
// callers keep it below PHRASE_INDEX_LIBRARY_COUNT.
719 int create_sub_phrase(guint8 index) {
// Bound by reference so the allocation below is stored in
// m_sub_phrase_indices[index] itself.
720 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
722 return ERROR_ALREADY_EXISTS;
725 sub_phrase = new SubPhraseIndex;
732 NOT_USED, /* not used. */
733 SYSTEM_FILE, /* system phrase file. */
734 USER_FILE, /* user only phrase file. */
738 const char * m_table_filename;
739 const char * m_system_filename;
740 const char * m_user_filename;
741 PHRASE_FILE_TYPE m_file_type;
742 } pinyin_table_info_t;
744 extern const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT];