3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #ifndef PHRASE_LOOKUP_H
23 #define PHRASE_LOOKUP_H
25 #include "novel_types.h"
32 * The definitions of phrase lookup related classes and structs.
41 * The phrase lookup class to convert the sentence to phrase tokens.
// Interpolation weights. Both are const with no in-class initializer, so they
// must be set in the constructor's member-initializer list.
// NOTE(review): presumably both are derived from the constructor's single
// `lambda` argument -- confirm against the implementation.
46 const gfloat bigram_lambda;
47 const gfloat unigram_lambda;
// NOTE(review): judging by the names these look like reusable scratch objects
// (a cached phrase item and a merged system/user single-gram) -- confirm usage
// in the implementation file.
49 PhraseItem m_cache_phrase_item;
50 SingleGram m_merged_single_gram;
// Phrase table, phrase index and system/user bi-grams; the types match the
// constructor's four pointer arguments.
// NOTE(review): ownership of these pointers is not visible in this header --
// confirm whether this class or the caller is responsible for freeing them.
53 FacadePhraseTable2 * m_phrase_table;
54 FacadePhraseIndex * m_phrase_index;
55 Bigram * m_system_bigram;
56 Bigram * m_user_bigram;
// Per-step working arrays; the block comment after each declaration names the
// element type stored in that GPtrArray.
58 //internal step data structure
59 GPtrArray * m_steps_index;
60 /* Array of LookupStepIndex */
61 GPtrArray * m_steps_content;
62 /* Array of LookupStepContent */
// NOTE(review): presumably the sentence_length passed to get_best_match for
// the sentence currently being processed -- confirm.
65 int m_sentence_length;
69 /* Explicitly search the next phrase,
70 * to avoid double phrase lookup as the next token has only one.
// Internal lookup helpers. All of them return bool; the meaning of the return
// value is not documented in this header.
// NOTE(review): presumably true means "made progress / succeeded" -- confirm.
//
// search_unigram2 / search_bigram2: take a step index (nstep) and a set of
// phrase tokens; see the "Explicitly search the next phrase" note above.
72 bool search_unigram2(int nstep, PhraseTokens tokens);
73 bool search_bigram2(int nstep, PhraseTokens tokens);
// unigram_gen_next_step / bigram_gen_next_step: take the current step index,
// a pointer to the current lookup value and a candidate token; the bi-gram
// variant additionally takes bigram_poss.
// NOTE(review): bigram_poss is presumably the bi-gram probability of the
// token -- confirm.
75 bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
76 bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
// save_next_step: takes the position of the next step plus pointers to the
// current value and the candidate next-step value.
// NOTE(review): presumably stores next_step into the step arrays -- confirm.
78 bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);
// final_step: receives `results` by non-const reference, so it can write the
// output of the lookup into it.
80 bool final_step(MatchResults & results);
83 * PhraseLookup::PhraseLookup:
84 * @lambda: the lambda parameter for the interpolation model.
85 * @phrase_table: the phrase table.
86 * @phrase_index: the phrase index.
87 * @system_bigram: the system bi-gram.
88 * @user_bigram: the user bi-gram.
90 * The constructor of the PhraseLookup.
// Constructor declaration only; the definition is not in this header.
// Takes the interpolation parameter plus pointers to the phrase table, the
// phrase index and the system/user bi-grams.
// NOTE(review): how `lambda` maps onto the bigram_lambda / unigram_lambda
// members, and who owns the four pointers, is not visible here -- confirm
// against the implementation.
93 PhraseLookup(const gfloat lambda,
94 FacadePhraseTable2 * phrase_table,
95 FacadePhraseIndex * phrase_index,
96 Bigram * system_bigram,
97 Bigram * user_bigram);
100 * PhraseLookup::~PhraseLookup:
102 * The destructor of the PhraseLookup.
108 * PhraseLookup::get_best_match:
109 * @sentence_length: the length of the sentence in ucs4 characters.
110 * @sentence: the ucs4 characters of the sentence.
111 * @results: the segmented sentence in the form of phrase tokens.
112 * @returns: whether the segment operation is successful.
114 * Segment the sentence into phrase tokens.
116 * Note: this method only accepts characters that are in the phrase large table.
// Declaration only. `sentence[]` is an array parameter and therefore decays to
// ucs4_t *; sentence_length carries its length in ucs4 characters. `results`
// is a non-const reference that receives the phrase tokens.
119 bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);
122 * PhraseLookup::convert_to_utf8:
123 * @results: the guessed sentence in the form of phrase tokens.
124 * @result_string: the converted sentence in utf8 string.
125 * @returns: whether the convert operation is successful.
127 * Convert the sentence from phrase tokens to the utf8 string.
129 * Note: the caller must free result_string with g_free().
132 bool convert_to_utf8(MatchResults results,
133 /* out */ char * & result_string)
135 return pinyin::convert_to_utf8(m_phrase_index, results,
136 "\n", true, result_string);