3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "phrase_index.h"
// Record how many pronunciations this item holds.
// The count is a single guint8 written at byte offset sizeof(guint8) of
// m_chunk, i.e. directly after the phrase-length byte that
// set_phrase_string() stores at offset 0.
//   n_prouns - new pronunciation count.
24 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
25 m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
// Fetch the index-th pronunciation record of this item.
// Layout implied by the offset arithmetic below: after the header
// (phrase_item_header bytes) and the phrase text (phrase_length utf16_t
// units) come fixed-size records, each holding phrase_length PinyinKey values
// followed by one guint32 frequency.
//   index  - zero-based record number.
//   pinyin - out: receives phrase_length PinyinKey values; the caller must
//            provide room for that many.
//   freq   - out: receives the record's frequency.
// The value returned on the final line is the result of reading the frequency.
29 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
30 guint8 phrase_length = get_phrase_length();
// Byte offset of record `index`: header + phrase text + index whole records.
31 table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
// First read: the keys of the record.
32 bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
// Second read: the frequency stored immediately after the keys.
35 return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
// Append one pronunciation record (keys followed by frequency) to this item.
//   pinyin - phrase_length PinyinKey values, written at the end of m_chunk.
//   freq   - frequency written right after the keys.
// The stored count is bumped first, then the record is written at
// m_chunk.size(); this is consistent with the record offsets used by
// get_nth_pronunciation() only if the records are the last thing in the chunk.
// NOTE(review): set_n_pronunciation() takes a guint8, so the incremented count
// is narrowed to 8 bits -- confirm callers never exceed 255 pronunciations.
38 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
39 guint8 phrase_length = get_phrase_length();
40 set_n_pronunciation(get_n_pronunciation() + 1);
// m_chunk.size() is re-evaluated for the second write, so the frequency is
// placed after the keys provided the first set_content grew the chunk
// (presumably it does -- confirm against MemoryChunk::set_content).
41 m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
42 m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
// Delete the index-th pronunciation record from this item.
// The stored count is decremented, then the record's bytes (phrase_length
// PinyinKey values plus one guint32) are cut out of m_chunk, using the same
// offset formula as get_nth_pronunciation().
//   index - zero-based record number.
// NOTE(review): neither an empty item (a count of 0 would wrap when narrowed
// to guint8) nor an out-of-range index is rejected here -- confirm callers
// guarantee both.
45 void PhraseItem::remove_nth_pronunciation(size_t index){
46 guint8 phrase_length = get_phrase_length();
47 set_n_pronunciation(get_n_pronunciation() - 1);
// Byte offset of record `index`: header + phrase text + index whole records.
48 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
49 m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
// Copy the phrase text of this item into phrase.
//   phrase - out: receives get_phrase_length() utf16_t units read from offset
//            phrase_item_header; the caller must provide room for that many.
//            Only those units are requested, so no terminator is written by
//            this call.
// Returns the result of m_chunk.get_content().
52 bool PhraseItem::get_phrase_string(utf16_t * phrase){
53 guint8 phrase_length = get_phrase_length();
54 return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Store the phrase text and its length in this item.
//   phrase_length - number of utf16_t units in phrase; written as the single
//                   byte at offset 0 of m_chunk.
//   phrase        - the text; phrase_length units are written starting at
//                   offset phrase_item_header.
57 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
58 m_chunk.set_content(0, &phrase_length, sizeof(guint8));
59 m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Walk every pronunciation record of this item and compare its keys with
// pinyin_keys through pinyin_compare_with_ambiguities(); a result of 0 is
// treated as a match. Presumably the frequency of matching records is raised
// by delta (as the function name suggests) -- confirm.
//   custom      - settings forwarded to pinyin_compare_with_ambiguities().
//   pinyin_keys - keys to compare each record against.
// The records are addressed by raw pointers into m_chunk's buffer rather than
// through get_content().
// NOTE(review): freq is a guint32* formed from a char* at an arbitrary byte
// offset; confirm the target platforms tolerate unaligned access.
63 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
64 PinyinKey * pinyin_keys,
66 guint8 phrase_length = get_phrase_length();
67 guint8 npron = get_n_pronunciation();
// Offset of the first pronunciation record: header + phrase text.
68 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
69 char * buf_begin = (char *) m_chunk.begin();
// Running total used by the overflow guard further down.
70 guint32 total_freq = 0;
71 for ( int i = 0 ; i < npron ; ++i){
// Start of record i; each record is phrase_length keys plus one guint32.
72 char * pinyin_begin = buf_begin + offset +
73 i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
// Points at the frequency stored inside the buffer, right after the keys.
74 guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
76 if ( 0 == pinyin_compare_with_ambiguities(custom,
77 (PinyinKey *)pinyin_begin,
80 //protect against total_freq overflow.
// The comparison is true when total_freq + delta wraps around; this relies on
// the sum being computed in an unsigned type (total_freq is guint32).
81 if ( delta > 0 && total_freq > total_freq + delta )
// Total frequency of this sub index, as a guint32.
// FacadePhraseIndex::load(), unload() and load_text() add or subtract this
// value from the facade's own m_total_freq.
// NOTE(review): presumably this is simply m_total_freq -- confirm.
90 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
// Add delta to the frequency stored for token and to m_total_freq.
//   token - phrase token; the bits selected by PHRASE_MASK pick the slot in
//           m_phrase_index that holds the item's offset in m_phrase_content.
//   delta - amount added to m_total_freq (and presumably to the item's own
//           frequency before it is written back -- confirm).
// Error codes used on the failure paths: ERROR_OUT_OF_RANGE, ERROR_NO_ITEM,
// ERROR_FILE_CORRUPTION and ERROR_INTEGER_OVERFLOW.
// The item's frequency is the guint32 located two guint8 header bytes past
// the start of the item.
94 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
95 table_offset_t offset;
// Fetch the item's content offset from the index slot of this token.
97 bool result = m_phrase_index.get_content
98 ((token & PHRASE_MASK)
99 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
102 return ERROR_OUT_OF_RANGE;
105 return ERROR_NO_ITEM;
// Read the current frequency, skipping the two one-byte header fields.
107 result = m_phrase_content.get_content
108 (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
111 return ERROR_FILE_CORRUPTION;
113 //protect total_freq overflow
// True when m_total_freq + delta wraps around; relies on unsigned arithmetic
// (m_total_freq is serialised as a guint32 elsewhere in this file).
114 if ( delta > 0 && m_total_freq > m_total_freq + delta )
115 return ERROR_INTEGER_OVERFLOW;
118 m_total_freq += delta;
// Write the frequency back to the same location it was read from.
119 m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
// Look up the phrase item stored for token and expose it through item.
//   token - phrase token; the bits selected by PHRASE_MASK pick the slot in
//           m_phrase_index that holds the item's offset in m_phrase_content.
//   item  - out: its m_chunk is pointed at the item's bytes inside
//           m_phrase_content (set_chunk with a NULL third argument), so it
//           presumably remains valid only while that buffer is neither
//           reallocated nor freed -- confirm against MemoryChunk::set_chunk.
// Error codes used on the failure paths: ERROR_OUT_OF_RANGE, ERROR_NO_ITEM
// and ERROR_FILE_CORRUPTION (the latter appears after each header-byte read).
124 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
125 table_offset_t offset;
126 guint8 phrase_length;
// Fetch the item's content offset from the index slot of this token.
129 bool result = m_phrase_index.get_content
130 ((token & PHRASE_MASK)
131 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
134 return ERROR_OUT_OF_RANGE;
137 return ERROR_NO_ITEM;
// Header byte 0: phrase length in utf16_t units.
139 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
141 return ERROR_FILE_CORRUPTION;
// Header byte 1: number of pronunciation records.
143 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
145 return ERROR_FILE_CORRUPTION;
// Total item size: header + phrase text + n_prons records of
// (phrase_length keys + one guint32 frequency).
147 size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
148 item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
// Store a copy of item's bytes in m_phrase_content and register it for token.
//   token - phrase token; the bits selected by PHRASE_MASK pick the slot in
//           m_phrase_index that receives the item's offset.
//   item  - item whose m_chunk bytes are copied; its unigram frequency is
//           added to m_total_freq.
// NOTE(review): remove_phrase_item() writes 0 into an index slot to clear it,
// so offset 0 must never be handed out for a real item -- confirm how the
// very first item (when m_phrase_content is still empty) is handled.
152 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
// Start from the current end of the content area.
153 table_offset_t offset = m_phrase_content.size();
156 m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
// Remember where the item lives.
157 m_phrase_index.set_content((token & PHRASE_MASK)
158 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
159 m_total_freq += item->get_unigram_frequency();
// Unregister the item stored for token and hand a copy of it to the caller.
//   token - phrase token of the item to remove.
//   item  - out: set to a PhraseItem allocated with new that holds a copy of
//           the removed item's bytes; presumably the caller is responsible
//           for deleting it -- confirm.
// The index slot of the token is overwritten with 0 and the item's unigram
// frequency is subtracted from m_total_freq. The statements shown here do not
// remove the item's bytes from m_phrase_content.
163 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
// Locate the existing item; any result other than ERROR_OK is handled by the
// check that follows.
166 int result = get_phrase_item(token, old_item);
167 if (result != ERROR_OK)
170 item = new PhraseItem;
171 //implicitly copy data from m_phrase_content: old_item's chunk was pointed at that buffer by get_phrase_item().
172 item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
// Clear the index slot by storing offset 0.
174 const table_offset_t zero_const = 0;
175 m_phrase_index.set_content((token & PHRASE_MASK)
176 * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
177 m_total_freq -= item->get_unigram_frequency();
// Load the binary sub phrase index number phrase_index from chunk.
//   phrase_index - slot in m_sub_phrase_indices to load into.
//   chunk        - serialised data; the whole chunk (offset 0 up to
//                  chunk->size()) is passed to SubPhraseIndex::load().
// After loading, the sub index's total frequency is added to the facade's
// m_total_freq.
// NOTE(review): a SubPhraseIndex is allocated with new for the slot,
// presumably only when the slot is still empty -- confirm.
181 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
// Reference to the slot, so the assignment below updates the table itself.
182 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
184 sub_phrases = new SubPhraseIndex;
187 bool retval = sub_phrases->load(chunk, 0, chunk->size());
190 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Serialise sub phrase index number phrase_index into new_chunk.
//   phrase_index - slot in m_sub_phrase_indices to store.
//   new_chunk    - destination; writing starts at offset 0.
// SubPhraseIndex::store() takes `end` by reference as an output argument.
194 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
196 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
200 sub_phrases->store(new_chunk, 0, end);
// Unload sub phrase index number phrase_index.
// The sub index's total frequency is subtracted from the facade's
// m_total_freq, mirroring the addition done by load().
// NOTE(review): presumably the SubPhraseIndex object is also destroyed and
// the slot reset -- confirm.
204 bool FacadePhraseIndex::unload(guint8 phrase_index){
205 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
208 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
// Attach this sub index to serialised data inside chunk.
//   chunk  - buffer holding the data written by SubPhraseIndex::store().
//   offset - position of the header inside chunk.
//   end    - upper bound for the third stored offset.
// Header read here: m_total_freq (guint32) followed by three table_offset_t
// values. They are used as offsets from the start of chunk's buffer (not
// relative to `offset`): index_one is where the index table starts, index_two
// where the content starts, and index_three is one past the final separator.
// m_phrase_index and m_phrase_content are pointed into chunk's buffer with
// set_chunk(..., NULL), so chunk presumably must outlive this object --
// confirm against MemoryChunk::set_chunk.
// NOTE(review): the results of chunk->get_content() are not checked, and
// index_three is compared with `end` only after buf_begin + index_three - 1
// has been dereferenced and both chunks have been set. g_return_val_if_fail
// is also compiled out when GLib's G_DISABLE_CHECKS is defined, so these
// checks are not a reliable defence against a corrupt file.
214 bool SubPhraseIndex::load(MemoryChunk * chunk,
215 table_offset_t offset, table_offset_t end){
216 //save the memory chunk
223 char * buf_begin = (char *)chunk->begin();
224 chunk->get_content(offset, &m_total_freq, sizeof(guint32));
225 offset += sizeof(guint32);
226 table_offset_t index_one, index_two, index_three;
227 chunk->get_content(offset, &index_one, sizeof(table_offset_t));
228 offset += sizeof(table_offset_t);
229 chunk->get_content(offset, &index_two, sizeof(table_offset_t));
230 offset += sizeof(table_offset_t);
231 chunk->get_content(offset, &index_three, sizeof(table_offset_t));
232 offset += sizeof(table_offset_t);
// A separator byte must follow the header and precede index_two and
// index_three.
233 g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
234 g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
235 g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
// Each section ends one byte before the next stored offset (the separator).
236 m_phrase_index.set_chunk(buf_begin + index_one,
237 index_two - 1 - index_one, NULL);
238 m_phrase_content.set_chunk(buf_begin + index_two,
239 index_three - 1 - index_two, NULL);
240 g_return_val_if_fail( index_three <= end, FALSE);
// Serialise this sub index into new_chunk, starting at offset.
//   new_chunk - destination buffer.
//   offset    - position at which the header is written.
//   end       - out parameter (taken by reference); presumably receives the
//               offset just past the data written -- confirm.
// Layout produced by the writes below:
//   offset               m_total_freq (guint32)
//   + sizeof(guint32)    three table_offset_t slots, filled in as the sections
//                        are written
//                        c_separate
//   1st stored offset -> bytes of m_phrase_index
//                        c_separate
//   2nd stored offset -> bytes of m_phrase_content
//                        c_separate
//   3rd stored offset -> first byte after the final separator
244 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
245 table_offset_t offset, table_offset_t& end){
246 new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
// `index` walks the three header slots; `offset` walks the data area.
247 table_offset_t index = offset + sizeof(guint32);
// Skip the three slots, then write the separator that closes the header.
249 offset = index + sizeof(table_offset_t) * 3 ;
250 new_chunk->set_content(offset, &c_separate, sizeof(char));
251 offset += sizeof(char);
// Slot 1: start of the index table.
253 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
254 index += sizeof(table_offset_t);
255 new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
256 offset += m_phrase_index.size();
257 new_chunk->set_content(offset, &c_separate, sizeof(char));
258 offset += sizeof(char);
// Slot 2: start of the phrase content.
260 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
261 index += sizeof(table_offset_t);
263 new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
264 offset += m_phrase_content.size();
265 new_chunk->set_content(offset, &c_separate, sizeof(char));
266 offset += sizeof(char);
// Slot 3: position just past the final separator.
267 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
// Build sub phrase index number phrase_index from a text file.
//   phrase_index - slot in m_sub_phrase_indices to fill.
//   infile       - open stream of whitespace-separated records, each being
//                  four fields in this order: pinyin, phrase (UTF-8), token,
//                  frequency.
// Consecutive records carrying the same token are merged into one PhraseItem
// with several pronunciations; when the token changes the finished item is
// added through add_phrase_item() and a fresh one is started. The last item
// is added after the loop. cur_token == 0 means "no item started yet".
// Finally the sub index's total frequency is added to m_total_freq.
// NOTE(review): the "%s" conversions have no field width, so an over-long
// pinyin or phrase field can overrun its buffer; "%ld" is used to fill a
// phrase_token_t (and freq), which is only correct if those types are
// long-sized; the fscanf() results are not checked on the lines shown and the
// loop is driven by feof(). Confirm against the buffer declarations and
// typedefs.
// NOTE(review): PhraseItem objects are allocated with new here -- confirm
// each one is deleted once it has been added.
271 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
272 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
274 sub_phrases = new SubPhraseIndex;
279 phrase_token_t token;
281 PhraseItem * item_ptr = new PhraseItem;
282 phrase_token_t cur_token = 0;
283 while ( !feof(infile)){
// One record: pinyin, phrase, token, frequency.
284 fscanf(infile, "%s", pinyin);
285 fscanf(infile, "%s", phrase);
286 fscanf(infile, "%ld", &token);
287 fscanf(infile, "%ld", &freq);
// Convert the UTF-8 phrase to UTF-16; the result is released with g_free()
// at the end of the iteration.
292 utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
295 if ( 0 == cur_token ){
297 item_ptr->set_phrase_string(written, phrase_utf16);
300 if ( cur_token != token ){
301 add_phrase_item( cur_token, item_ptr);
303 item_ptr = new PhraseItem;
305 item_ptr->set_phrase_string(written, phrase_utf16);
// Parse the pinyin text into keys; poses is filled by the parser as well but
// is not used afterwards on the lines shown.
308 PinyinDefaultParser parser;
309 NullPinyinValidator validator;
310 PinyinKeyVector keys;
311 PinyinKeyPosVector poses;
313 keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
314 poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
315 parser.parse(validator, keys, poses, pinyin);
// The number of parsed keys must equal the phrase length; this is enforced
// only by assert(), so it vanishes when NDEBUG is defined.
317 assert ( item_ptr->get_phrase_length() == keys->len );
318 item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
320 g_array_free(keys, TRUE);
321 g_array_free(poses, TRUE);
322 g_free(phrase_utf16);
// Flush the item that was still being built when the input ended.
325 add_phrase_item( cur_token, item_ptr);
327 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
// Report the token range covered by sub phrase index number phrase_index.
//   phrase_index - slot in m_sub_phrase_indices to query.
//   range        - out: filled by SubPhraseIndex::get_range(), then both ends
//                  are rewritten with
//                  PHRASE_INDEX_MAKE_TOKEN(phrase_index, ...) so they become
//                  facade-level tokens.
// ERROR_NO_SUB_PHRASE_INDEX is one of the possible results, presumably when
// the slot holds no sub index -- confirm.
331 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
332 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
334 return ERROR_NO_SUB_PHRASE_INDEX;
336 int result = sub_phrase->get_range(range);
340 range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
341 range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
// Report the range of slot numbers covered by this sub index.
//   range - out: m_range_begin is set to 0 and m_range_end to the number of
//           table_offset_t slots in m_phrase_index (the pointer difference
//           between its end and its beginning), so the end is exclusive.
345 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
346 const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
347 const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
349 range.m_range_begin = 0;
350 range.m_range_end = end - begin;