3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #ifndef PHRASE_INDEX_H
23 #define PHRASE_INDEX_H
27 #include "novel_types.h"
28 #include "chewing_key.h"
29 #include "pinyin_parser2.h"
30 #include "pinyin_phrase2.h"
31 #include "memory_chunk.h"
32 #include "phrase_index_logger.h"
35 * Phrase Index File Format
37 * Indirect Index: Index by Token
38 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
39 * + Phrase Offset + Phrase Offset + Phrase Offset + ...... +
40 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
42 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
43 * + Phrase Length + number of Pronunciations + Uni-gram Frequency+
44 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45 * + Phrase String(UCS4) + n Pronunciations with Frequency +
46 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
51 /* Store delta info by phrase index logger in user home directory.
// Byte size of the fixed header at the start of every phrase item:
// phrase length (guint8), number of pronunciations (guint8) and
// uni-gram frequency (guint32), in that order.
54 const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
59 * The PhraseItem to access the items in phrase index.
63 friend class SubPhraseIndex;
64 friend bool _compute_new_header(PhraseIndexLogger * logger,
67 guint32 & new_total_freq);
71 bool set_n_pronunciation(guint8 n_prouns);
74 * PhraseItem::PhraseItem:
76 * The constructor of the PhraseItem.
80 m_chunk.set_size(phrase_item_header);
81 memset(m_chunk.begin(), 0, m_chunk.size());
// Build a PhraseItem from an existing chunk: m_chunk is filled through
// set_content(0, begin, size) and must end up at least as large as the
// fixed item header.
// NOTE(review): chunk is a reference, yet it is dereferenced with '->'
// below; unless MemoryChunk provides operator->, this should read
// chunk.begin() / chunk.size() -- confirm.
85 PhraseItem(MemoryChunk & chunk){
86 m_chunk.set_content(0, chunk->begin(), chunk->size());
87 assert ( m_chunk.size() >= phrase_item_header);
92 * PhraseItem::get_phrase_length:
93 * @returns: the length of this phrase item.
95 * Get the length of this phrase item.
// The phrase length is the first byte of the item header.
98 guint8 get_phrase_length(){
99 char * buf_begin = (char *)m_chunk.begin();
100 return (*(guint8 *)buf_begin);
104 * PhraseItem::get_n_pronunciation:
105 * @returns: the number of the pronunciations.
107 * Get the number of the pronunciations.
// The pronunciation count is the second header byte, located right after
// the guint8 phrase length.
110 guint8 get_n_pronunciation(){
111 char * buf_begin = ( char *) m_chunk.begin();
112 return (*(guint8 *)(buf_begin + sizeof(guint8)));
116 * PhraseItem::get_unigram_frequency:
117 * @returns: the uni-gram frequency of this phrase item.
119 * Get the uni-gram frequency of this phrase item.
// The uni-gram frequency is a guint32 stored after the two guint8 header
// fields; it is read in place through a pointer cast.
122 guint32 get_unigram_frequency(){
123 char * buf_begin = (char *)m_chunk.begin();
124 return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
128 * PhraseItem::get_pronunciation_possibility:
129 * @options: the pinyin options.
130 * @keys: the pronunciation keys.
131 * @returns: the probability that this phrase item is pronounced as the given pinyin keys.
133 * Get the probability that this phrase item is pronounced as the given pinyin keys.
// Iterates over the npron pronunciation records of this item, checks each
// record with pinyin_compare_with_ambiguities2 and computes the ratio
// matched / total_freq as a gfloat.
136 gfloat get_pronunciation_possibility(pinyin_option_t options,
138 guint8 phrase_length = get_phrase_length();
139 guint8 npron = get_n_pronunciation();
// Pronunciation records begin after the fixed header and the UCS4 phrase string.
140 size_t offset = phrase_item_header + phrase_length * sizeof (ucs4_t);
141 char * buf_begin = (char *)m_chunk.begin();
142 guint32 matched = 0, total_freq =0;
143 for ( int i = 0 ; i < npron ; ++i){
// Record i consists of phrase_length ChewingKey entries followed by one
// guint32 frequency; freq points at that trailing frequency.
144 char * chewing_begin = buf_begin + offset +
145 i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
146 guint32 * freq = (guint32 *)(chewing_begin +
147 phrase_length * sizeof(ChewingKey));
// A comparison result of 0 is the condition tested for this record.
149 if ( 0 == pinyin_compare_with_ambiguities2
151 (ChewingKey *)chewing_begin,phrase_length) ){
157 /* an additional safe guard for chewing. */
158 if ( 0 == total_freq )
162 /* used preprocessor to avoid zero freq, in gen_chewing_table. */
// One operand is cast to gfloat so the quotient is computed in floating point.
163 gfloat retval = matched / (gfloat) total_freq;
168 * PhraseItem::increase_pronunciation_possibility:
169 * @options: the pinyin options.
170 * @keys: the pronunciation keys.
171 * @delta: the delta to be added to the pronunciation keys.
173 * Add the delta to the pronunciation of the pronunciation keys.
176 void increase_pronunciation_possibility(pinyin_option_t options,
181 * PhraseItem::get_phrase_string:
182 * @phrase: the ucs4 character buffer.
183 * @returns: whether the get operation is successful.
185 * Get the ucs4 characters of this phrase item.
188 bool get_phrase_string(ucs4_t * phrase);
191 * PhraseItem::set_phrase_string:
192 * @phrase_length: the ucs4 character length of this phrase item.
193 * @phrase: the ucs4 character buffer.
194 * @returns: whether the set operation is successful.
196 * Set the length and ucs4 characters of this phrase item.
199 bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase);
202 * PhraseItem::get_nth_pronunciation:
203 * @index: the pronunciation index.
204 * @keys: the pronunciation keys.
205 * @freq: the frequency of the pronunciation.
206 * @returns: whether the get operation is successful.
208 * Get the nth pronunciation of this phrase item.
211 bool get_nth_pronunciation(size_t index,
212 /* out */ ChewingKey * keys,
213 /* out */ guint32 & freq);
216 * PhraseItem::add_pronunciation:
217 * @keys: the pronunciation keys.
218 * @delta: the delta of the frequency of the pronunciation.
219 * @returns: whether the add operation is successful.
221 * Add one pronunciation.
224 bool add_pronunciation(ChewingKey * keys, guint32 delta);
227 * PhraseItem::remove_nth_pronunciation:
228 * @index: the pronunciation index.
230 * Remove the nth pronunciation.
232 * Note: Normally don't change the first pronunciation,
233 * which decides the token number.
236 void remove_nth_pronunciation(size_t index);
// Equality is decided on the raw chunk contents: the chunk sizes are
// compared first, then the bytes with memcmp over m_chunk.size() bytes.
238 bool operator == (const PhraseItem & rhs) const{
239 if (m_chunk.size() != rhs.m_chunk.size())
241 return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
242 m_chunk.size()) == 0;
// Defined as the negation of operator== so both operators always agree.
245 bool operator != (const PhraseItem & rhs) const{
246 return ! (*this == rhs);
251 * In Sub Phrase Index, token == (token & PHRASE_MASK).
257 * The SubPhraseIndex class for internal usage.
260 class SubPhraseIndex{
262 guint32 m_total_freq;
263 MemoryChunk m_phrase_index;
264 MemoryChunk m_phrase_content;
265 MemoryChunk * m_chunk;
269 m_phrase_index.set_size(0);
270 m_phrase_content.set_size(0);
279 * SubPhraseIndex::SubPhraseIndex:
281 * The constructor of the SubPhraseIndex.
284 SubPhraseIndex():m_total_freq(0){
289 * SubPhraseIndex::~SubPhraseIndex:
291 * The destructor of the SubPhraseIndex.
299 * SubPhraseIndex::load:
300 * @chunk: the memory chunk of the binary sub phrase index.
301 * @offset: the begin of binary data in the memory chunk.
302 * @end: the end of binary data in the memory chunk.
303 * @returns: whether the load operation is successful.
305 * Load the sub phrase index from the memory chunk.
308 bool load(MemoryChunk * chunk,
309 table_offset_t offset, table_offset_t end);
312 * SubPhraseIndex::store:
313 * @new_chunk: the new memory chunk to store this sub phrase index.
314 * @offset: the begin of binary data in the memory chunk.
315 * @end: the end of stored binary data in the memory chunk.
316 * @returns: whether the store operation is successful.
318 * Store the sub phrase index to the new memory chunk.
321 bool store(MemoryChunk * new_chunk,
322 table_offset_t offset, table_offset_t & end);
325 * SubPhraseIndex::diff:
326 * @oldone: the original content of sub phrase index.
327 * @logger: the delta information of user self-learning data.
328 * @returns: whether the diff operation is successful.
330 * Compare this sub phrase index with the original content of the system
331 * sub phrase index to generate the logger of difference.
333 * Note: Switch to logger format to reduce user space storage.
336 bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
339 * SubPhraseIndex::merge:
340 * @logger: the logger of difference in user home directory.
341 * @returns: whether the merge operation is successful.
343 * Merge the user logger of difference with this sub phrase index.
346 bool merge(PhraseIndexLogger * logger);
349 * SubPhraseIndex::get_range:
350 * @range: the token range.
351 * @returns: the status of the get operation.
353 * Get the token range in this sub phrase index.
356 int get_range(/* out */ PhraseIndexRange & range);
359 * SubPhraseIndex::get_phrase_index_total_freq:
360 * @returns: the total frequency of this sub phrase index.
362 * Get the total frequency of this sub phrase index.
364 * Note: maybe call it "Zero-gram".
367 guint32 get_phrase_index_total_freq();
370 * SubPhraseIndex::add_unigram_frequency:
371 * @token: the phrase token.
372 * @delta: the delta value of the phrase token.
373 * @returns: the status of the add operation.
375 * Add delta value to the phrase of the token.
377 * Note: this method is a fast path to add delta value.
378 * Maybe use the get_phrase_item method instead in future.
381 int add_unigram_frequency(phrase_token_t token, guint32 delta);
384 * SubPhraseIndex::get_phrase_item:
385 * @token: the phrase token.
386 * @item: the phrase item of the token.
387 * @returns: the status of the get operation.
389 * Get the phrase item from this sub phrase index.
391 * Note: get_phrase_item function can't modify the phrase item size,
392 * but can increment the freq of the special pronunciation,
393 * or change the content without size increasing.
396 int get_phrase_item(phrase_token_t token, PhraseItem & item);
399 * SubPhraseIndex::add_phrase_item:
400 * @token: the phrase token.
401 * @item: the phrase item of the token.
402 * @returns: the status of the add operation.
404 * Add the phrase item to this sub phrase index.
407 int add_phrase_item(phrase_token_t token, PhraseItem * item);
410 * SubPhraseIndex::remove_phrase_item:
411 * @token: the phrase token.
412 * @item: the removed phrase item of the token.
413 * @returns: the status of the remove operation.
415 * Remove the phrase item of the token.
417 * Note: this remove_phrase_item method will subtract the unigram
418 * frequency of the removed item from m_total_freq.
421 int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
424 * SubPhraseIndex::mask_out:
427 * @returns: whether the mask out operation is successful.
429 * Mask out the matched phrase items.
432 bool mask_out(phrase_token_t mask, phrase_token_t value);
438 * The facade class of phrase index.
441 class FacadePhraseIndex{
443 guint32 m_total_freq;
444 SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
447 * FacadePhraseIndex::FacadePhraseIndex:
449 * The constructor of the FacadePhraseIndex.
454 memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
458 * FacadePhraseIndex::~FacadePhraseIndex:
460 * The destructor of the FacadePhraseIndex.
// Delete every sub phrase index that is present and reset its slot to NULL.
463 ~FacadePhraseIndex(){
464 for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
465 if ( m_sub_phrase_indices[i] ){
466 delete m_sub_phrase_indices[i];
467 m_sub_phrase_indices[i] = NULL;
473 * FacadePhraseIndex::load_text:
474 * @phrase_index: the index of sub phrase index to be loaded.
475 * @infile: the textual format file of the phrase table.
476 * @returns: whether the load operation is successful.
478 * Load one sub phrase index from the textual format file.
479 * Note: load sub phrase index according to the config in future.
482 bool load_text(guint8 phrase_index, FILE * infile);
485 * FacadePhraseIndex::load:
486 * @phrase_index: the index of sub phrase index to be loaded.
487 * @chunk: the memory chunk of sub phrase index to be loaded.
488 * @returns: whether the load operation is successful.
490 * Load one sub phrase index from the memory chunk.
493 bool load(guint8 phrase_index, MemoryChunk * chunk);
496 * FacadePhraseIndex::store:
497 * @phrase_index: the index of sub phrase index to be stored.
498 * @new_chunk: the memory chunk of sub phrase index to be stored.
499 * @returns: whether the store operation is successful.
501 * Store one sub phrase index to the memory chunk.
504 bool store(guint8 phrase_index, MemoryChunk * new_chunk);
507 * FacadePhraseIndex::unload:
508 * @phrase_index: the index of sub phrase index to be unloaded.
509 * @returns: whether the unload operation is successful.
511 * Unload one sub phrase index.
514 bool unload(guint8 phrase_index);
518 * FacadePhraseIndex::diff:
519 * @phrase_index: the index of sub phrase index to be diffed.
520 * @oldchunk: the original content of sub phrase index.
521 * @newlog: the delta information of user self-learning data.
522 * @returns: whether the diff operation is successful.
524 * Store user delta information in the logger format.
526 * Note: the ownership of oldchunk is transferred here.
529 bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
530 MemoryChunk * newlog);
533 * FacadePhraseIndex::merge:
534 * @phrase_index: the index of sub phrase index to be merged.
535 * @log: the logger of difference in user home directory.
536 * @returns: whether the merge operation is successful.
538 * Merge the user logger of difference with the sub phrase index.
540 * Note: the ownership of log is transferred here.
543 bool merge(guint8 phrase_index, MemoryChunk * log);
546 * FacadePhraseIndex::merge_with_mask:
547 * @phrase_index: the index of sub phrase index to be merged.
548 * @log: the logger of difference in user home directory.
551 * @returns: whether the merge operation is successful.
553 * Merge the user logger of difference with mask operation.
555 * Note: the ownership of log is transferred here.
558 bool merge_with_mask(guint8 phrase_index, MemoryChunk * log,
559 phrase_token_t mask, phrase_token_t value);
562 * FacadePhraseIndex::compact:
563 * @returns: whether the compact operation is successful.
565 * Compact all sub phrase index memory usage.
571 * FacadePhraseIndex::mask_out:
572 * @phrase_index: the index of sub phrase index.
575 * @returns: whether the mask out operation is successful.
577 * Mask out the matched phrase items.
579 * Note: should call compact() after the mask out operation.
582 bool mask_out(guint8 phrase_index,
583 phrase_token_t mask, phrase_token_t value);
586 * FacadePhraseIndex::get_sub_phrase_range:
587 * @min_index: the minimal sub phrase index.
588 * @max_index: the maximal sub phrase index.
589 * @returns: the status of the get operation.
591 * Get the minimum and maximum of the sub phrase index.
594 int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
597 * FacadePhraseIndex::get_range:
598 * @phrase_index: the index of sub phrase index.
599 * @range: the token range of the sub phrase index.
600 * @returns: the status of the get operation.
602 * Get the token range of the sub phrase index.
605 int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
608 * FacadePhraseIndex::get_phrase_index_total_freq:
609 * @returns: the total freq of the facade phrase index.
611 * Get the total freq of the facade phrase index.
613 * Note: maybe call it "Zero-gram".
616 guint32 get_phrase_index_total_freq(){
621 * FacadePhraseIndex::add_unigram_frequency:
622 * @token: the phrase token.
623 * @delta: the delta value of the phrase token.
624 * @returns: the status of the add operation.
626 * Add delta value to the phrase of the token.
// Select the sub phrase index through PHRASE_INDEX_LIBRARY_INDEX(token),
// add delta to the facade-wide total and forward the update to that index.
// NOTE(review): m_total_freq is increased before the sub phrase index
// reports its status, so a failing add_unigram_frequency() would leave the
// facade total ahead of the per-index data -- confirm whether intended.
629 int add_unigram_frequency(phrase_token_t token, guint32 delta){
630 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
631 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
633 return ERROR_NO_SUB_PHRASE_INDEX;
634 m_total_freq += delta;
635 return sub_phrase->add_unigram_frequency(token, delta);
639 * FacadePhraseIndex::get_phrase_item:
640 * @token: the phrase token.
641 * @item: the phrase item of the token.
642 * @returns: the status of the get operation.
644 * Get the phrase item from the facade phrase index.
// Forward the lookup to the sub phrase index selected by
// PHRASE_INDEX_LIBRARY_INDEX(token) and hand back its status unchanged.
647 int get_phrase_item(phrase_token_t token, PhraseItem & item){
648 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
649 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
651 return ERROR_NO_SUB_PHRASE_INDEX;
652 return sub_phrase->get_phrase_item(token, item);
656 * FacadePhraseIndex::add_phrase_item:
657 * @token: the phrase token.
658 * @item: the phrase item of the token.
659 * @returns: the status of the add operation.
661 * Add the phrase item to the facade phrase index.
// The slot is bound by reference so that a newly allocated SubPhraseIndex
// is stored back into m_sub_phrase_indices. The item's unigram frequency
// is added to the facade-wide total, then the add is delegated.
// NOTE(review): m_total_freq is increased before the sub phrase index
// reports its status, so a failing add_phrase_item() would leave the
// facade total too high -- confirm whether intended.
664 int add_phrase_item(phrase_token_t token, PhraseItem * item){
665 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
666 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
668 sub_phrase = new SubPhraseIndex;
670 m_total_freq += item->get_unigram_frequency();
671 return sub_phrase->add_phrase_item(token, item);
675 * FacadePhraseIndex::remove_phrase_item:
676 * @token: the phrase token.
677 * @item: the removed phrase item of the token.
678 * @returns: the status of the remove operation.
680 * Remove the phrase item of the token.
// Delegate the removal to the sub phrase index selected by the token, then
// reduce the facade-wide total by the unigram frequency of the item handed
// back through the item out-parameter.
683 int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
684 guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
685 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
687 return ERROR_NO_SUB_PHRASE_INDEX;
689 int result = sub_phrase->remove_phrase_item(token, item);
692 m_total_freq -= item->get_unigram_frequency();
697 * FacadePhraseIndex::prepare_ranges:
698 * @ranges: the ranges to be prepared.
699 * @returns: whether the prepare operation is successful.
701 * Prepare the ranges.
// Each slot of ranges must still be NULL on entry (asserted). A GArray
// assigned to a slot holds PhraseIndexRange elements and is created with
// g_array_new(FALSE, FALSE, ...): not zero-terminated and not cleared.
704 bool prepare_ranges(PhraseIndexRanges ranges) {
705 /* assume memset(ranges, 0, sizeof(ranges)); */
706 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
// range aliases ranges[i], so assigning to it updates the caller's slot.
707 GArray * & range = ranges[i];
708 assert(NULL == range);
710 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
712 range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
719 * FacadePhraseIndex::clear_ranges:
720 * @ranges: the ranges to be cleared.
721 * @returns: whether the clear operation is successful.
// Empty the range arrays without releasing them: g_array_set_size(range, 0)
// drops all elements but keeps the GArray itself for reuse.
726 bool clear_ranges(PhraseIndexRanges ranges) {
727 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
728 GArray * range = ranges[i];
730 g_array_set_size(range, 0);
737 * FacadePhraseIndex::destroy_ranges:
738 * @ranges: the ranges to be destroyed.
739 * @returns: whether the destroy operation is successful.
741 * Destroy the ranges.
// Release the range arrays: g_array_free(range, TRUE) frees the GArray
// together with its element storage. range aliases ranges[i].
744 bool destroy_ranges(PhraseIndexRanges ranges) {
745 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
746 GArray * & range = ranges[i];
748 g_array_free(range, TRUE);
756 * FacadePhraseIndex::prepare_tokens:
757 * @tokens: the tokens to be prepared.
758 * @returns: whether the prepare operation is successful.
760 * Prepare the tokens.
// Each slot of tokens must still be NULL on entry (asserted). A GArray
// assigned to a slot holds phrase_token_t elements and is created with
// g_array_new(FALSE, FALSE, ...): not zero-terminated and not cleared.
763 bool prepare_tokens(PhraseTokens tokens) {
764 /* assume memset(tokens, 0, sizeof(tokens)); */
765 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
// token aliases tokens[i], so assigning to it updates the caller's slot.
766 GArray * & token = tokens[i];
767 assert(NULL == token);
769 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
771 token = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
778 * FacadePhraseIndex::clear_tokens:
779 * @tokens: the tokens to be cleared.
780 * @returns: whether the clear operation is successful.
// Empty the token arrays without releasing them: g_array_set_size(token, 0)
// drops all elements but keeps the GArray itself for reuse.
785 bool clear_tokens(PhraseTokens tokens) {
786 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
787 GArray * token = tokens[i];
789 g_array_set_size(token, 0);
796 * FacadePhraseIndex::destroy_tokens:
797 * @tokens: the tokens to be destroyed.
798 * @returns: whether the destroy operation is successful.
800 * Destroy the tokens.
// Release the token arrays: g_array_free(token, TRUE) frees the GArray
// together with its element storage. token aliases tokens[i].
803 bool destroy_tokens(PhraseTokens tokens) {
804 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
805 GArray * & token = tokens[i];
807 g_array_free(token, TRUE);
815 * FacadePhraseIndex::create_sub_phrase:
816 * @index: the phrase index to be created.
817 * @returns: the result of the create operation.
819 * Create the sub phrase index.
// The slot is bound by reference so that the newly allocated
// SubPhraseIndex is stored in m_sub_phrase_indices[index].
// NOTE(review): index subscripts m_sub_phrase_indices directly, with no
// range check against PHRASE_INDEX_LIBRARY_COUNT before the access --
// confirm that callers guarantee a valid index.
822 int create_sub_phrase(guint8 index) {
823 SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
825 return ERROR_ALREADY_EXISTS;
828 sub_phrase = new SubPhraseIndex;
835 NOT_USED, /* not used. */
836 SYSTEM_FILE, /* system phrase file. */
837 DICTIONARY, /* professional dictionary. */
838 USER_FILE, /* user only phrase file. */
842 const PHRASE_INDEX_LIBRARIES m_dict_index; /* for assert purpose. */
843 const char * m_table_filename;
844 const char * m_system_filename;
845 const char * m_user_filename;
846 PHRASE_FILE_TYPE m_file_type;
847 } pinyin_table_info_t;
849 extern const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT];
// Free function declared here; its definition is not part of this header.
// Presumably produces a logger derived from oldlogger with the records
// selected by mask/value masked out -- TODO confirm the exact semantics
// and the ownership of the returned pointer against the definition.
851 PhraseIndexLogger * mask_out_phrase_index_logger
852 (PhraseIndexLogger * oldlogger, phrase_token_t mask, phrase_token_t value);