3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #ifndef PHRASE_INDEX_H
23 #define PHRASE_INDEX_H
27 #include "novel_types.h"
28 #include "chewing_key.h"
29 #include "pinyin_parser2.h"
30 #include "pinyin_phrase2.h"
31 #include "memory_chunk.h"
32 #include "phrase_index_logger.h"
35 * Phrase Index File Format
37 * Indirect Index: Index by Token
38 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
39 * + Phrase Offset + Phrase Offset + Phrase Offset + ...... +
40 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
42 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
43 * + Phrase Length + number of Pronunciations + Uni-gram Frequency+
44 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45 * + Phrase String(UCS4) + n Pronunciations with Frequency +
46 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
51 /* Store delta info by phrase index logger in user home directory.
/* Size in bytes of the fixed header at the start of a phrase item:
 * two guint8 fields followed by one guint32.  The PhraseItem accessors
 * read these as phrase length, number of pronunciations and uni-gram
 * frequency, in that order. */
54 const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
59 * The PhraseItem to access the items in phrase index.
63 friend class SubPhraseIndex;
64 friend bool _compute_new_header(PhraseIndexLogger * logger,
67 guint32 & new_total_freq);
71 bool set_n_pronunciation(guint8 n_prouns);
74 * PhraseItem::PhraseItem:
76 * The constructor of the PhraseItem.
80 m_chunk.set_size(phrase_item_header);
81 memset(m_chunk.begin(), 0, m_chunk.size());
/* Construct a PhraseItem from an existing memory chunk.
 * @chunk: the chunk whose bytes (begin() .. begin() + size()) are handed
 *         to m_chunk.set_content() at offset 0.
 * The resulting item must be at least phrase_item_header bytes long,
 * which is checked with assert.
 * Fix: @chunk is a reference, so its members are accessed with '.',
 * not '->'. */
85 PhraseItem(MemoryChunk & chunk){
86 m_chunk.set_content(0, chunk.begin(), chunk.size());
87 assert ( m_chunk.size() >= phrase_item_header);
92 * PhraseItem::get_phrase_length:
93 * @returns: the length of this phrase item.
95 * Get the length of this phrase item.
/* Return the phrase length, read as a guint8 from the very first byte
 * of m_chunk (the first header field). */
98 guint8 get_phrase_length(){
99 char * buf_begin = (char *)m_chunk.begin();
100 return (*(guint8 *)buf_begin);
104 * PhraseItem::get_n_pronunciation:
105 * @returns: the number of the pronunciations.
107 * Get the number of the pronunciations.
/* Return the number of pronunciations, read as a guint8 located
 * sizeof(guint8) bytes after the start of m_chunk (the second header
 * field, right after the phrase length). */
110 guint8 get_n_pronunciation(){
111 char * buf_begin = ( char *) m_chunk.begin();
112 return (*(guint8 *)(buf_begin + sizeof(guint8)));
116 * PhraseItem::get_unigram_frequency:
117 * @returns: the uni-gram frequency of this phrase item.
119 * Get the uni-gram frequency of this phrase item.
/* Return the uni-gram frequency, read as a guint32 located after the
 * two guint8 header fields of m_chunk.
 * NOTE(review): the guint32 is dereferenced directly at that byte offset
 * through a pointer cast; whether this is suitably aligned depends on
 * how m_chunk's buffer is allocated -- confirm. */
122 guint32 get_unigram_frequency(){
123 char * buf_begin = (char *)m_chunk.begin();
124 return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
128 * PhraseItem::get_pronunciation_possibility:
129 * @options: the pinyin options.
130 * @keys: the pronunciation keys.
131 * @returns: the possibility of this phrase item pronounces the pinyin.
133 * Get the possibility of this phrase item pronounces the pinyin.
/* Compute matched / total_freq over the pronunciation records stored in
 * this item, as a gfloat.
 * NOTE(review): several lines of this body (the remaining parameter(s),
 * the accumulation statements and the return) are not visible in this
 * listing; the comments below describe only the visible statements. */
136 gfloat get_pronunciation_possibility(pinyin_option_t options,
138 guint8 phrase_length = get_phrase_length();
139 guint8 npron = get_n_pronunciation();
140 size_t offset = phrase_item_header + phrase_length * sizeof (ucs4_t); /* pronunciation records start after the header and the UCS4 phrase string */
141 char * buf_begin = (char *)m_chunk.begin();
142 guint32 matched = 0, total_freq =0;
143 for ( int i = 0 ; i < npron ; ++i){
144 char * chewing_begin = buf_begin + offset +
145 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); /* record i: phrase_length ChewingKeys followed by one guint32 */
146 guint32 * freq = (guint32 *)(chewing_begin +
147 phrase_length * sizeof(ChewingKey)); /* frequency sits right after the keys of this record */
149 if ( 0 == pinyin_compare_with_ambiguities2 /* a 0 result presumably means the keys match under the given options -- confirm */
151 (ChewingKey *)chewing_begin,phrase_length) ){
157 /* an additional safe guard for chewing. */
158 if ( 0 == total_freq ) /* guards the division below against a zero total; the guarded statement is not visible here */
162 /* used preprocessor to avoid zero freq, in gen_chewing_table. */
163 gfloat retval = matched / (gfloat) total_freq; /* cast forces floating-point division */
168 * PhraseItem::increase_pronunciation_possibility:
169 * @options: the pinyin options.
170 * @keys: the pronunciation keys.
171 * @delta: the delta to be added to the pronunciation keys.
173 * Add the delta to the pronunciation of the pronunciation keys.
176 void increase_pronunciation_possibility(pinyin_option_t options,
181 * PhraseItem::get_phrase_string:
182 * @phrase: the ucs4 character buffer.
183 * @returns: whether the get operation is successful.
185 * Get the ucs4 characters of this phrase item.
188 bool get_phrase_string(ucs4_t * phrase);
191 * PhraseItem::set_phrase_string:
192 * @phrase_length: the ucs4 character length of this phrase item.
193 * @phrase: the ucs4 character buffer.
194 * @returns: whether the set operation is successful.
196 * Set the length and ucs4 characters of this phrase item.
199 bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase);
202 * PhraseItem::get_nth_pronunciation:
203 * @index: the pronunciation index.
204 * @keys: the pronunciation keys.
205 * @freq: the frequency of the pronunciation.
206 * @returns: whether the get operation is successful.
208 * Get the nth pronunciation of this phrase item.
211 bool get_nth_pronunciation(size_t index,
212 /* out */ ChewingKey * keys,
213 /* out */ guint32 & freq);
216 * PhraseItem::append_pronunciation:
217 * @keys: the pronunciation keys.
218 * @freq: the frequency of the pronunciation.
220 * Append one pronunciation.
223 void append_pronunciation(ChewingKey * keys, guint32 freq);
226 * PhraseItem::remove_nth_pronunciation:
227 * @index: the pronunciation index.
229 * Remove the nth pronunciation.
231 * Note: Normally don't change the first pronunciation,
232 * which decides the token number.
235 void remove_nth_pronunciation(size_t index);
/* Byte-wise equality of two phrase items: the chunk sizes are compared
 * first, then the raw contents with memcmp over m_chunk.size() bytes.
 * NOTE(review): the statement executed when the sizes differ is not
 * visible in this listing -- presumably it returns false. */
237 bool operator == (const PhraseItem & rhs) const{
238 if (m_chunk.size() != rhs.m_chunk.size())
240 return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
241 m_chunk.size()) == 0;
/* Inequality, defined as the logical negation of operator ==. */
244 bool operator != (const PhraseItem & rhs) const{
245 return ! (*this == rhs);
250 * In Sub Phrase Index, token == (token & PHRASE_MASK).
256 * The SubPhraseIndex class for internal usage.
259 class SubPhraseIndex{
261 guint32 m_total_freq;
262 MemoryChunk m_phrase_index;
263 MemoryChunk m_phrase_content;
264 MemoryChunk * m_chunk;
268 m_phrase_index.set_size(0);
269 m_phrase_content.set_size(0);
278 * SubPhraseIndex::SubPhraseIndex:
280 * The constructor of the SubPhraseIndex.
283 SubPhraseIndex():m_total_freq(0){
288 * SubPhraseIndex::~SubPhraseIndex:
290 * The destructor of the SubPhraseIndex.
298 * SubPhraseIndex::load:
299 * @chunk: the memory chunk of the binary sub phrase index.
300 * @offset: the begin of binary data in the memory chunk.
301 * @end: the end of binary data in the memory chunk.
302 * @returns: whether the load operation is successful.
304 * Load the sub phrase index from the memory chunk.
307 bool load(MemoryChunk * chunk,
308 table_offset_t offset, table_offset_t end);
311 * SubPhraseIndex::store:
312 * @new_chunk: the new memory chunk to store this sub phrase index.
313 * @offset: the begin of binary data in the memory chunk.
314 * @end: the end of stored binary data in the memory chunk.
315 * @returns: whether the store operation is successful.
317 * Store the sub phrase index to the new memory chunk.
320 bool store(MemoryChunk * new_chunk,
321 table_offset_t offset, table_offset_t & end);
324 * SubPhraseIndex::diff:
325 * @oldone: the original content of sub phrase index.
326 * @logger: the delta information of user self-learning data.
327 * @returns: whether the diff operation is successful.
329 * Compare this sub phrase index with the original content of the system
330 * sub phrase index to generate the logger of difference.
332 * Note: Switch to logger format to reduce user space storage.
335 bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
338 * SubPhraseIndex::merge:
339 * @logger: the logger of difference in user home directory.
340 * @returns: whether the merge operation is successful.
342 * Merge the user logger of difference with this sub phrase index.
345 bool merge(PhraseIndexLogger * logger);
348 * SubPhraseIndex::get_range:
349 * @range: the token range.
350 * @returns: the status of the get operation.
352 * Get the token range in this sub phrase index.
355 int get_range(/* out */ PhraseIndexRange & range);
358 * SubPhraseIndex::get_phrase_index_total_freq:
359 * @returns: the total frequency of this sub phrase index.
361 * Get the total frequency of this sub phrase index.
363 * Note: maybe call it "Zero-gram".
366 guint32 get_phrase_index_total_freq();
369 * SubPhraseIndex::add_unigram_frequency:
370 * @token: the phrase token.
371 * @delta: the delta value of the phrase token.
372 * @returns: the status of the add operation.
374 * Add delta value to the phrase of the token.
376 * Note: this method is a fast path to add delta value.
377 * Maybe use the get_phrase_item method instead in future.
380 int add_unigram_frequency(phrase_token_t token, guint32 delta);
383 * SubPhraseIndex::get_phrase_item:
384 * @token: the phrase token.
385 * @item: the phrase item of the token.
386 * @returns: the status of the get operation.
388 * Get the phrase item from this sub phrase index.
390 * Note: get_phrase_item function can't modify the phrase item size,
391 * but can increment the freq of the special pronunciation,
392 * or change the content without size increasing.
395 int get_phrase_item(phrase_token_t token, PhraseItem & item);
398 * SubPhraseIndex::add_phrase_item:
399 * @token: the phrase token.
400 * @item: the phrase item of the token.
401 * @returns: the status of the add operation.
403 * Add the phrase item to this sub phrase index.
406 int add_phrase_item(phrase_token_t token, PhraseItem * item);
409 * SubPhraseIndex::remove_phrase_item:
410 * @token: the phrase token.
411 * @item: the removed phrase item of the token.
412 * @returns: the status of the remove operation.
414 * Remove the phrase item of the token.
416 * Note: this remove_phrase_item method will subtract the unigram
417 * frequency of the removed item from m_total_freq.
420 int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
423 * SubPhraseIndex::mask_out:
426 * @returns: whether the mask out operation is successful.
428 * Mask out the matched phrase items.
431 bool mask_out(phrase_token_t mask, phrase_token_t value);
437 * The facade class of phrase index.
440 class FacadePhraseIndex{
442 guint32 m_total_freq;
443 SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
446 * FacadePhraseIndex::FacadePhraseIndex:
448 * The constructor of the FacadePhraseIndex.
453 memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
457 * FacadePhraseIndex::~FacadePhraseIndex:
459 * The destructor of the FacadePhraseIndex.
/* Walk all PHRASE_INDEX_LIBRARY_COUNT slots, delete every sub phrase
 * index that is present and reset its slot to NULL. */
462 ~FacadePhraseIndex(){
463 for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
464 if ( m_sub_phrase_indices[i] ){
465 delete m_sub_phrase_indices[i];
466 m_sub_phrase_indices[i] = NULL;
472 * FacadePhraseIndex::load_text:
473 * @phrase_index: the index of sub phrase index to be loaded.
474 * @infile: the textual format file of the phrase table.
475 * @returns: whether the load operation is successful.
477 * Load one sub phrase index from the textual format file.
478 * Note: load sub phrase index according to the config in future.
481 bool load_text(guint8 phrase_index, FILE * infile);
484 * FacadePhraseIndex::load:
485 * @phrase_index: the index of sub phrase index to be loaded.
486 * @chunk: the memory chunk of sub phrase index to be loaded.
487 * @returns: whether the load operation is successful.
489 * Load one sub phrase index from the memory chunk.
492 bool load(guint8 phrase_index, MemoryChunk * chunk);
495 * FacadePhraseIndex::store:
496 * @phrase_index: the index of sub phrase index to be stored.
497 * @new_chunk: the memory chunk of sub phrase index to be stored.
498 * @returns: whether the store operation is successful.
500 * Store one sub phrase index to the memory chunk.
503 bool store(guint8 phrase_index, MemoryChunk * new_chunk);
506 * FacadePhraseIndex::unload:
507 * @phrase_index: the index of sub phrase index to be unloaded.
508 * @returns: whether the unload operation is successful.
510 * Unload one sub phrase index.
513 bool unload(guint8 phrase_index);
517 * FacadePhraseIndex::diff:
518 * @phrase_index: the index of sub phrase index to be differed.
519 * @oldchunk: the original content of sub phrase index.
520 * @newlog: the delta information of user self-learning data.
521 * @returns: whether the diff operation is successful.
523 * Store user delta information in the logger format.
525 * Note: the ownership of oldchunk is transferred here.
528 bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
529 MemoryChunk * newlog);
532 * FacadePhraseIndex::merge:
533 * @phrase_index: the index of sub phrase index to be merged.
534 * @log: the logger of difference in user home directory.
535 * @returns: whether the merge operation is successful.
537 * Merge the user logger of difference with the sub phrase index.
539 * Note: the ownership of log is transferred here.
542 bool merge(guint8 phrase_index, MemoryChunk * log);
545 * FacadePhraseIndex::merge_with_mask:
546 * @phrase_index: the index of sub phrase index to be merged.
547 * @log: the logger of difference in user home directory.
550 * @returns: whether the merge operation is successful.
552 * Merge the user logger of difference with mask operation.
554 * Note: the ownership of log is transferred here.
557 bool merge_with_mask(guint8 phrase_index, MemoryChunk * log,
558 phrase_token_t mask, phrase_token_t value);
561 * FacadePhraseIndex::compact:
562 * @returns: whether the compact operation is successful.
564 * Compact all sub phrase index memory usage.
570 * FacadePhraseIndex::mask_out:
571 * @phrase_index: the index of sub phrase index.
574 * @returns: whether the mask out operation is successful.
576 * Mask out the matched phrase items.
578 * Note: should call compact() after the mask out operation.
581 bool mask_out(guint8 phrase_index,
582 phrase_token_t mask, phrase_token_t value);
585 * FacadePhraseIndex::get_sub_phrase_range:
586 * @min_index: the minimal sub phrase index.
587 * @max_index: the maximal sub phrase index.
588 * @returns: the status of the get operation.
590 * Get the minimum and maximum of the sub phrase index.
593 int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
596 * FacadePhraseIndex::get_range:
597 * @phrase_index: the index of sub phrase index.
598 * @range: the token range of the sub phrase index.
599 * @returns: the status of the get operation.
601 * Get the token range of the sub phrase index.
604 int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
607 * FacadePhraseIndex::get_phrase_index_total_freq:
608 * @returns: the total freq of the facade phrase index.
610 * Get the total freq of the facade phrase index.
612 * Note: maybe call it "Zero-gram".
615 guint32 get_phrase_index_total_freq(){
620 * FacadePhraseIndex::add_unigram_frequency:
621 * @token: the phrase token.
622 * @delta: the delta value of the phrase token.
623 * @returns: the status of the add operation.
625 * Add delta value to the phrase of the token.
/* Select the sub phrase index for @token via PHRASE_INDEX_LIBRARY_INDEX
 * and forward the request to it.
 * NOTE(review): the condition guarding the ERROR_NO_SUB_PHRASE_INDEX
 * return is not visible in this listing -- presumably the slot is empty.
 * NOTE(review): m_total_freq is increased by @delta before the sub index
 * is called, so it is updated whatever status that call returns. */
628 int add_unigram_frequency(phrase_token_t token, guint32 delta){
629 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
630 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
632 return ERROR_NO_SUB_PHRASE_INDEX;
633 m_total_freq += delta;
634 return sub_phrase->add_unigram_frequency(token, delta);
638 * FacadePhraseIndex::get_phrase_item:
639 * @token: the phrase token.
640 * @item: the phrase item of the token.
641 * @returns: the status of the get operation.
643 * Get the phrase item from the facade phrase index.
/* Select the sub phrase index for @token via PHRASE_INDEX_LIBRARY_INDEX
 * and return the status of its get_phrase_item(token, item).
 * NOTE(review): the condition guarding the ERROR_NO_SUB_PHRASE_INDEX
 * return is not visible in this listing -- presumably the slot is empty. */
646 int get_phrase_item(phrase_token_t token, PhraseItem & item){
647 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
648 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
650 return ERROR_NO_SUB_PHRASE_INDEX;
651 return sub_phrase->get_phrase_item(token, item);
655 * FacadePhraseIndex::add_phrase_item:
656 * @token: the phrase token.
657 * @item: the phrase item of the token.
658 * @returns: the status of the add operation.
660 * Add the phrase item to the facade phrase index.
/* Add @item under @token to the sub phrase index selected by
 * PHRASE_INDEX_LIBRARY_INDEX(token).
 * sub_phrase is a reference to the array slot, so the assignment of a
 * new SubPhraseIndex stores the new object in m_sub_phrase_indices.
 * NOTE(review): the condition guarding that allocation is not visible in
 * this listing -- presumably it runs only when the slot is empty.
 * NOTE(review): m_total_freq is increased by the item's uni-gram
 * frequency before the sub index is called, whatever status it returns. */
663 int add_phrase_item(phrase_token_t token, PhraseItem * item){
664 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
665 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
667 sub_phrase = new SubPhraseIndex;
669 m_total_freq += item->get_unigram_frequency();
670 return sub_phrase->add_phrase_item(token, item);
674 * FacadePhraseIndex::remove_phrase_item:
675 * @token: the phrase token.
676 * @item: the removed phrase item of the token.
677 * @returns: the status of the remove operation.
679 * Remove the phrase item of the token.
/* Remove the item of @token from the sub phrase index selected by
 * PHRASE_INDEX_LIBRARY_INDEX(token); @item receives the removed item.
 * The facade's m_total_freq is reduced by the removed item's uni-gram
 * frequency.
 * NOTE(review): the conditions guarding the ERROR_NO_SUB_PHRASE_INDEX
 * return and the m_total_freq update, and the final return, are not
 * visible in this listing. */
682 int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
683 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
684 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
686 return ERROR_NO_SUB_PHRASE_INDEX;
688 int result = sub_phrase->remove_phrase_item(token, item);
691 m_total_freq -= item->get_unigram_frequency();
696 * FacadePhraseIndex::prepare_ranges:
697 * @ranges: the ranges to be prepared.
698 * @returns: whether the prepare operation is successful.
700 * Prepare the ranges.
/* Allocate the per-library GArray slots of @ranges.
 * Every slot is asserted to be NULL on entry, so the caller must pass a
 * zeroed PhraseIndexRanges.  Each array is created with
 * g_array_new(FALSE, FALSE, ...): not zero-terminated, elements not
 * cleared, element size sizeof(PhraseIndexRange).
 * NOTE(review): the check on sub_phrase that decides whether a slot gets
 * an array is not visible in this listing. */
703 bool prepare_ranges(PhraseIndexRanges ranges) {
704 /* assume memset(ranges, 0, sizeof(ranges)); */
705 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
706 GArray * & range = ranges[i];
707 assert(NULL == range);
709 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
711 range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
718 * FacadePhraseIndex::clear_ranges:
719 * @ranges: the ranges to be cleared.
720 * @returns: whether the clear operation is successful.
/* Empty the per-library arrays of @ranges by setting their length to 0
 * with g_array_set_size; the arrays themselves are kept.
 * NOTE(review): the guard between reading ranges[i] and resizing it is
 * not visible in this listing -- presumably NULL slots are skipped. */
725 bool clear_ranges(PhraseIndexRanges ranges) {
726 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
727 GArray * range = ranges[i];
729 g_array_set_size(range, 0);
736 * FacadePhraseIndex::destroy_ranges:
737 * @ranges: the ranges to be destroyed.
738 * @returns: whether the destroy operation is successful.
740 * Destroy the ranges.
/* Release the per-library arrays of @ranges with g_array_free(range,
 * TRUE), which frees the element data together with the GArray.
 * range is a reference to the slot in @ranges.
 * NOTE(review): the guard before the free and any reset of the slot are
 * not visible in this listing. */
743 bool destroy_ranges(PhraseIndexRanges ranges) {
744 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
745 GArray * & range = ranges[i];
747 g_array_free(range, TRUE);
755 * FacadePhraseIndex::prepare_tokens:
756 * @tokens: the tokens to be prepared.
757 * @returns: whether the prepare operation is successful.
759 * Prepare the tokens.
/* Allocate the per-library GArray slots of @tokens.
 * Every slot is asserted to be NULL on entry, so the caller must pass a
 * zeroed PhraseTokens.  Each array is created with
 * g_array_new(FALSE, FALSE, ...): not zero-terminated, elements not
 * cleared, element size sizeof(phrase_token_t).
 * NOTE(review): the check on sub_phrase that decides whether a slot gets
 * an array is not visible in this listing. */
762 bool prepare_tokens(PhraseTokens tokens) {
763 /* assume memset(tokens, 0, sizeof(tokens)); */
764 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
765 GArray * & token = tokens[i];
766 assert(NULL == token);
768 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
770 token = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
777 * FacadePhraseIndex::clear_tokens:
778 * @tokens: the tokens to be cleared.
779 * @returns: whether the clear operation is successful.
/* Empty the per-library arrays of @tokens by setting their length to 0
 * with g_array_set_size; the arrays themselves are kept.
 * NOTE(review): the guard between reading tokens[i] and resizing it is
 * not visible in this listing -- presumably NULL slots are skipped. */
784 bool clear_tokens(PhraseTokens tokens) {
785 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
786 GArray * token = tokens[i];
788 g_array_set_size(token, 0);
795 * FacadePhraseIndex::destroy_tokens:
796 * @tokens: the tokens to be destroyed.
797 * @returns: whether the destroy operation is successful.
799 * Destroy the tokens.
/* Release the per-library arrays of @tokens with g_array_free(token,
 * TRUE), which frees the element data together with the GArray.
 * token is a reference to the slot in @tokens.
 * NOTE(review): the guard before the free and any reset of the slot are
 * not visible in this listing. */
802 bool destroy_tokens(PhraseTokens tokens) {
803 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
804 GArray * & token = tokens[i];
806 g_array_free(token, TRUE);
814 * FacadePhraseIndex::create_sub_phrase:
815 * @index: the phrase index to be created.
816 * @returns: the result of the create operation.
818 * Create the sub phrase index.
/* Create the sub phrase index in slot @index.
 * sub_phrase is a reference to the array slot, so the new SubPhraseIndex
 * is stored directly in m_sub_phrase_indices[index].
 * NOTE(review): the condition guarding the ERROR_ALREADY_EXISTS return
 * and the final return are not visible in this listing -- presumably the
 * error is returned when the slot is already occupied.
 * NOTE(review): no range check on @index is visible here -- confirm that
 * callers pass index < PHRASE_INDEX_LIBRARY_COUNT. */
821 int create_sub_phrase(guint8 index) {
822 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
824 return ERROR_ALREADY_EXISTS;
827 sub_phrase = new SubPhraseIndex;
834 NOT_USED, /* not used. */
835 SYSTEM_FILE, /* system phrase file. */
836 DICTIONARY, /* professional dictionary. */
837 USER_FILE, /* user only phrase file. */
841 const PHRASE_INDEX_LIBRARIES m_dict_index; /* for assert purpose. */
842 const char * m_table_filename;
843 const char * m_system_filename;
844 const char * m_user_filename;
845 PHRASE_FILE_TYPE m_file_type;
846 } pinyin_table_info_t;
848 extern const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT];
850 PhraseIndexLogger * mask_out_phrase_index_logger
851 (PhraseIndexLogger * oldlogger, phrase_token_t mask, phrase_token_t value);