3 * Library to deal with pinyin.
5 * Copyright (C) 2006-2007 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 #include "phrase_index.h"
24 using namespace pinyin;
// Store the pronunciation count of this phrase item.
// Writes the single guint8 n_prouns into m_chunk at byte offset
// sizeof(guint8), i.e. the byte that follows the phrase length byte
// written at offset 0 by set_phrase_string.
// NOTE(review): the return statement is not present in the lines shown
// here; presumably the function reports success -- confirm.
26 bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
27 m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8));
// Read one pronunciation record of this phrase item.
// Records are laid out back to back after phrase_item_header bytes and
// the UTF-16 phrase text; each record holds phrase_length PinyinKey
// values followed by one guint32 frequency.
//   index  - record number, counted from 0 (record 0 starts right after
//            the phrase text). No range check against the stored count
//            is visible here; presumably m_chunk.get_content rejects
//            reads past the end of the chunk -- confirm.
//   pinyin - output buffer receiving phrase_length PinyinKey values.
//   freq   - output, receives the frequency stored with that record.
// Returns the result of the get_content call that reads the frequency.
// NOTE(review): retval from the key read is assigned below, but the
// statement that consumes it is not present in the lines shown here.
31 bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
32 guint8 phrase_length = get_phrase_length();
// Byte offset of the requested record inside m_chunk.
33 table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
34 bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
// The frequency sits immediately after the keys of the same record.
37 return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
// Append one pronunciation record to the end of this phrase item.
// Increases the stored pronunciation count by one, then writes
// phrase_length PinyinKey values followed by the guint32 frequency at
// the current end of m_chunk.
//   pinyin - source keys; phrase_length of them are copied (the caller
//            must supply at least that many, nothing is checked here).
//   freq   - frequency stored with the new pronunciation.
// NOTE(review): set_n_pronunciation takes a guint8, so the value
// get_n_pronunciation() + 1 is narrowed on the call; no guard is visible
// for an item whose count is already at the guint8 maximum.
40 void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
41 guint8 phrase_length = get_phrase_length();
42 set_n_pronunciation(get_n_pronunciation() + 1);
// Both writes target m_chunk.size(); the second one relies on the size
// having grown after the first write (presumably set_content extends
// the chunk -- confirm).
43 m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
44 m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
// Remove one pronunciation record from this phrase item.
// Decrements the stored pronunciation count, then deletes the bytes of
// the record (phrase_length PinyinKey values plus one guint32 frequency)
// from m_chunk.
//   index - record number, counted from 0.
// NOTE(review): index is not validated against the current count in the
// visible code, and the count is decremented before the removal, so a
// call on an item with zero pronunciations would wrap the guint8 count.
47 void PhraseItem::remove_nth_pronunciation(size_t index){
48 guint8 phrase_length = get_phrase_length();
49 set_n_pronunciation(get_n_pronunciation() - 1);
// Same record addressing as in get_nth_pronunciation.
50 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
51 m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
// Copy the phrase text out of this item.
//   phrase - output buffer; receives phrase_length utf16_t code units
//            read from m_chunk starting at phrase_item_header. This
//            function writes no terminator and does not check the
//            buffer size, so the caller must provide enough room.
// Returns whatever m_chunk.get_content reports for the read.
54 bool PhraseItem::get_phrase_string(utf16_t * phrase){
55 guint8 phrase_length = get_phrase_length();
56 return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Store the phrase length and the phrase text in this item.
//   phrase_length - number of utf16_t code units in phrase; written as a
//                   single byte at offset 0 of m_chunk.
//   phrase        - source text; phrase_length code units are written
//                   starting at phrase_item_header.
// NOTE(review): the return statement is not present in the lines shown
// here; presumably the function reports success -- confirm.
59 bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
60 m_chunk.set_content(0, &phrase_length, sizeof(guint8));
61 m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t));
// Adjust the frequency of the pronunciations of this item that match
// pinyin_keys.
// Walks the npron pronunciation records stored in m_chunk. For each
// record whose keys compare equal (result 0) under
// pinyin_compare_with_ambiguities, an overflow guard involving
// total_freq and delta is evaluated.
//   custom      - ambiguity settings (presumably forwarded to the
//                 comparison; that argument line is not shown here).
//   pinyin_keys - keys to match against each stored pronunciation.
// NOTE(review): the parameter line that declares delta, the first
// argument line of the comparison call, and the statements that update
// total_freq and apply delta are not present in the lines shown here --
// confirm against the full source before relying on this description.
// NOTE(review): freq below aliases record bytes inside m_chunk through a
// guint32 pointer cast; this assumes the record offset is acceptable
// for a guint32 access on the target platform.
65 void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
66 PinyinKey * pinyin_keys,
68 guint8 phrase_length = get_phrase_length();
69 guint8 npron = get_n_pronunciation();
// Offset of the first pronunciation record (header plus phrase text).
70 size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
71 char * buf_begin = (char *) m_chunk.begin();
72 guint32 total_freq = 0;
73 for ( int i = 0 ; i < npron ; ++i){
74 char * pinyin_begin = buf_begin + offset +
75 i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
// The frequency follows the keys inside the same record.
76 guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
78 if ( 0 == pinyin_compare_with_ambiguities
80 (PinyinKey *)pinyin_begin, phrase_length) ){
81 //protect against total_freq overflow.
82 if ( delta > 0 && total_freq > total_freq + delta )
// Total frequency of this sub phrase index.
// NOTE(review): the body is not present in the lines shown here; other
// members of SubPhraseIndex maintain m_total_freq, which is presumably
// the value returned -- confirm.
91 guint32 SubPhraseIndex::get_phrase_index_total_freq(){
// Add delta to the unigram frequency of the phrase identified by token
// and to the running total m_total_freq.
// Steps visible here:
//   1. look up the content offset stored for (token & PHRASE_MASK) in
//      m_phrase_index;
//   2. read the guint32 frequency located two bytes into the item, i.e.
//      after the phrase length byte and the pronunciation count byte;
//   3. refuse the update when m_total_freq + delta would wrap around;
//   4. add delta to m_total_freq and write freq back into
//      m_phrase_content at the same position.
// Returns ERROR_OUT_OF_RANGE, ERROR_NO_ITEM, ERROR_FILE_CORRUPTION or
// ERROR_INTEGER_OVERFLOW on the failure paths below.
// NOTE(review): the conditions guarding the first three returns, the
// declaration of freq, the statement that changes freq and the success
// return are not present in the lines shown here.
// NOTE(review): only m_total_freq is guarded against wrap-around in the
// visible code; no separate guard for the per item frequency is shown.
95 int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
96 table_offset_t offset;
98 bool result = m_phrase_index.get_content
99 ((token & PHRASE_MASK)
100 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
103 return ERROR_OUT_OF_RANGE;
106 return ERROR_NO_ITEM;
108 result = m_phrase_content.get_content
109 (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
112 return ERROR_FILE_CORRUPTION;
114 //protect total_freq overflow
115 if ( delta > 0 && m_total_freq > m_total_freq + delta )
116 return ERROR_INTEGER_OVERFLOW;
119 m_total_freq += delta;
120 m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
// Make item refer to the stored record of token.
// Looks up the content offset of (token & PHRASE_MASK) in
// m_phrase_index, reads the phrase length and the pronunciation count
// from the first two bytes of the record in m_phrase_content, computes
// the full record length, and hands item.m_chunk a pointer to that range
// of m_phrase_content through set_chunk.
// The in place editing path of SubPhraseIndex::merge relies on item
// pointing at the stored bytes rather than at a copy.
// Returns ERROR_OUT_OF_RANGE, ERROR_NO_ITEM or ERROR_FILE_CORRUPTION on
// the failure paths below.
// NOTE(review): the conditions guarding those returns, the declaration
// of n_prons and the success return are not present in the lines shown
// here.
125 int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
126 table_offset_t offset;
127 guint8 phrase_length;
130 bool result = m_phrase_index.get_content
131 ((token & PHRASE_MASK)
132 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
135 return ERROR_OUT_OF_RANGE;
138 return ERROR_NO_ITEM;
140 result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
142 return ERROR_FILE_CORRUPTION;
144 result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
146 return ERROR_FILE_CORRUPTION;
// Header, phrase text, then n_prons records of keys plus frequency.
148 size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
149 item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
// Store a copy of item under token.
// The bytes of item->m_chunk are written at offset inside
// m_phrase_content (offset starts as the current size, i.e. the end),
// that offset is recorded in the index slot of (token & PHRASE_MASK),
// and the unigram frequency of the item is added to m_total_freq.
// NOTE(review): in the visible code an existing entry for the same token
// is not removed first; the index slot is simply overwritten.
// NOTE(review): statements between the offset declaration and the first
// write, as well as the return statement, are not present in the lines
// shown here.
153 int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
154 table_offset_t offset = m_phrase_content.size();
157 m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
158 m_phrase_index.set_content((token & PHRASE_MASK)
159 * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
160 m_total_freq += item->get_unigram_frequency();
// Remove the phrase item of token from the index and hand a copy of it
// to the caller.
//   item - output; set to a PhraseItem allocated with new that holds a
//          copy of the stored bytes. Presumably the caller is expected
//          to delete it -- confirm against the callers.
// The index slot of (token & PHRASE_MASK) is overwritten with 0 and the
// unigram frequency of the removed item is subtracted from m_total_freq.
// No statement in the visible code removes the item bytes from
// m_phrase_content.
// NOTE(review): the declaration of old_item, the statement executed when
// the lookup fails and the final return are not present in the lines
// shown here.
164 int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
167 int result = get_phrase_item(token, old_item);
168 if (result != ERROR_OK)
171 item = new PhraseItem;
172 //implicitly copy data from m_chunk_content.
173 item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
175 const table_offset_t zero_const = 0;
176 m_phrase_index.set_content((token & PHRASE_MASK)
177 * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
178 m_total_freq -= item->get_unigram_frequency();
// Load a serialized sub phrase index into slot phrase_index.
//   phrase_index - slot in m_sub_phrase_indices.
//   chunk        - serialized image; passed whole (offset 0 up to
//                  chunk->size()) to SubPhraseIndex::load.
// The facade total m_total_freq is kept consistent by subtracting the
// total of the sub index before the load and adding it back afterwards.
// NOTE(review): the condition under which a new SubPhraseIndex is
// created (presumably an empty slot), the handling of retval and the
// return statement are not present in the lines shown here.
182 bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
183 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
185 sub_phrases = new SubPhraseIndex;
188 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
189 bool retval = sub_phrases->load(chunk, 0, chunk->size());
192 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Serialize the sub phrase index in slot phrase_index into new_chunk,
// starting at offset 0 of the chunk.
// NOTE(review): the declaration of end, any check for an empty slot and
// the return statement are not present in the lines shown here.
196 bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
198 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
202 sub_phrases->store(new_chunk, 0, end);
// Detach the sub phrase index in slot phrase_index from the facade.
// The total of the sub index is subtracted from the facade total
// m_total_freq.
// NOTE(review): any check for an empty slot, the release of the sub
// index object and the return statement are not present in the lines
// shown here.
206 bool FacadePhraseIndex::unload(guint8 phrase_index){
207 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
210 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
// Produce a change log describing how the sub phrase index in slot
// phrase_index differs from an older serialized image.
//   oldchunk - serialized image of the old state; loaded into a
//              temporary SubPhraseIndex.
//   newlog   - output chunk that receives the stored log.
// NOTE(review): the result of old_sub_phrases.load is not checked in the
// visible code, so a malformed oldchunk is diffed anyway.
// NOTE(review): any check for an empty slot and the return statement are
// not present in the lines shown here.
216 bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
217 MemoryChunk * newlog){
218 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
222 SubPhraseIndex old_sub_phrases;
223 old_sub_phrases.load(oldchunk, 0, oldchunk->size());
224 PhraseIndexLogger logger;
226 bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
227 logger.store(newlog);
// Apply a change log to the sub phrase index in slot phrase_index.
//   log - serialized change log (presumably loaded into logger by a
//         statement that is not shown here).
// The facade total m_total_freq is kept consistent by subtracting the
// total of the sub index before the merge and adding it back afterwards.
// NOTE(review): any check for an empty slot, the statement that feeds
// log into logger and the return statement are not present in the lines
// shown here.
231 bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
232 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
236 m_total_freq -= sub_phrases->get_phrase_index_total_freq();
237 PhraseIndexLogger logger;
240 bool retval = sub_phrases->merge(&logger);
241 m_total_freq += sub_phrases->get_phrase_index_total_freq();
// Attach this sub phrase index to the serialized image held in chunk.
//   chunk  - buffer containing the image; m_phrase_index and
//            m_phrase_content are handed pointers into it via set_chunk.
//   offset - byte position of the image header inside chunk.
//   end    - upper bound that the third stored offset must not exceed.
// Header layout read here: one guint32 total frequency followed by three
// table_offset_t values (index_one, index_two, index_three), then a
// c_separate byte. The phrase index table occupies the bytes from
// index_one up to index_two - 1 and the phrase content the bytes from
// index_two up to index_three - 1; each region must be followed by a
// c_separate byte.
// Returns FALSE through g_return_val_if_fail when a separator check or
// the final bound check fails.
// NOTE(review): the code that follows the save the memory chunk comment
// and the final return are not present in the lines shown here.
// NOTE(review): index_two and index_three come from the image and are
// used for reads and for set_chunk before index_three is compared with
// end (last check below); the results of chunk->get_content are not
// checked either. A malformed image can therefore cause out of range
// reads before it is rejected.
246 bool SubPhraseIndex::load(MemoryChunk * chunk,
247 table_offset_t offset, table_offset_t end){
248 //save the memory chunk
255 char * buf_begin = (char *)chunk->begin();
256 chunk->get_content(offset, &m_total_freq, sizeof(guint32));
257 offset += sizeof(guint32);
258 table_offset_t index_one, index_two, index_three;
259 chunk->get_content(offset, &index_one, sizeof(table_offset_t));
260 offset += sizeof(table_offset_t);
261 chunk->get_content(offset, &index_two, sizeof(table_offset_t));
262 offset += sizeof(table_offset_t);
263 chunk->get_content(offset, &index_three, sizeof(table_offset_t));
264 offset += sizeof(table_offset_t);
// Separator after the header and after each of the two regions.
265 g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
266 g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
267 g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
268 m_phrase_index.set_chunk(buf_begin + index_one,
269 index_two - 1 - index_one, NULL);
270 m_phrase_content.set_chunk(buf_begin + index_two,
271 index_three - 1 - index_two, NULL);
272 g_return_val_if_fail( index_three <= end, FALSE);
// Serialize this sub phrase index into new_chunk starting at offset.
// Layout written, in order:
//   guint32 m_total_freq
//   three table_offset_t slots (filled in as each region is finished)
//   one c_separate byte
//   the bytes of m_phrase_index, then one c_separate byte
//   the bytes of m_phrase_content, then one c_separate byte
// The three slots receive, in order, the start of the phrase index
// region, the start of the phrase content region, and the offset just
// past the final separator. This matches what SubPhraseIndex::load
// reads back.
//   end - output reference; the statement assigning it and the return
//         statement are not present in the lines shown here (presumably
//         end receives the final offset -- confirm).
276 bool SubPhraseIndex::store(MemoryChunk * new_chunk,
277 table_offset_t offset, table_offset_t& end){
278 new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
// index tracks the next header slot to fill.
279 table_offset_t index = offset + sizeof(guint32);
// Skip the three header slots, then write the first separator.
281 offset = index + sizeof(table_offset_t) * 3 ;
282 new_chunk->set_content(offset, &c_separate, sizeof(char));
283 offset += sizeof(char);
// Slot 1: start of the phrase index region.
285 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
286 index += sizeof(table_offset_t);
287 new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
288 offset += m_phrase_index.size();
289 new_chunk->set_content(offset, &c_separate, sizeof(char));
290 offset += sizeof(char);
// Slot 2: start of the phrase content region.
292 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
293 index += sizeof(table_offset_t);
295 new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
296 offset += m_phrase_content.size();
297 new_chunk->set_content(offset, &c_separate, sizeof(char));
298 offset += sizeof(char);
// Slot 3: offset just past the final separator.
299 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
// Record the differences between oldone and this index into logger.
// First a LOG_MODIFY_HEADER record carrying the old and the new total
// frequency is appended (unconditionally in the visible code). Then
// every token in the union of both token ranges is visited:
//   present in both  - LOG_MODIFY_RECORD with the old and new bytes
//                      (equal items are presumably skipped; the
//                      statement after the equality test is not shown)
//   only in oldone   - LOG_REMOVE_RECORD with the old bytes
//   only in this one - LOG_ADD_RECORD with the new bytes
//   in neither       - nothing is logged
// NOTE(review): the test on oldretval that selects between the first two
// and the last two cases, the end of the loop and the return statement
// are not present in the lines shown here.
303 bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
304 /* diff the header */
305 MemoryChunk oldheader, newheader;
306 guint32 total_freq = oldone->get_phrase_index_total_freq();
307 oldheader.set_content(0, &total_freq, sizeof(guint32));
308 total_freq = get_phrase_index_total_freq();
309 newheader.set_content(0, &total_freq, sizeof(guint32));
310 logger->append_record(LOG_MODIFY_HEADER, null_token,
311 &oldheader, &newheader);
313 /* diff phrase items */
314 PhraseIndexRange oldrange, currange, range;
315 oldone->get_range(oldrange); get_range(currange);
// Union of the two ranges: smallest begin, largest end.
316 range.m_range_begin = std_lite::min(oldrange.m_range_begin,
317 currange.m_range_begin);
318 range.m_range_end = std_lite::max(oldrange.m_range_end,
319 currange.m_range_end);
320 PhraseItem olditem, newitem;
322 for (phrase_token_t token = range.m_range_begin;
323 token < range.m_range_end; ++token ){
324 bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
325 bool newretval = ERROR_OK == get_phrase_item(token, newitem);
328 if ( newretval ) { /* compare phrase item. */
329 if ( olditem == newitem )
331 logger->append_record(LOG_MODIFY_RECORD, token,
332 &(olditem.m_chunk), &(newitem.m_chunk));
333 } else { /* remove phrase item. */
334 logger->append_record(LOG_REMOVE_RECORD, token,
335 &(olditem.m_chunk), NULL);
338 if ( newretval ){ /* add phrase item. */
339 logger->append_record(LOG_ADD_RECORD, token,
340 NULL, &(newitem.m_chunk));
341 } else { /* both empty. */
// Replay the records held by logger onto this index.
// For every record the old and new chunks are fetched with next_record
// and handled according to the record type:
//   LOG_ADD_RECORD    - the old chunk is asserted to be empty; the new
//                       bytes are added under token.
//   LOG_REMOVE_RECORD - the new chunk is asserted to be empty; the item
//                       is removed and compared with the logged old
//                       bytes.
//   LOG_MODIFY_RECORD - when the new bytes are larger than the stored
//                       item, the item is removed and added again;
//                       otherwise the new bytes are copied over the
//                       stored item in place with memmove.
//   LOG_MODIFY_HEADER - the logged old total frequency is compared with
//                       the current one and m_total_freq is set to the
//                       logged new value.
// NOTE(review): expectations about the log content are enforced with
// assert, which is compiled out when NDEBUG is defined; a malformed log
// is then applied without those checks.
// NOTE(review): the results of add_phrase_item, remove_phrase_item and
// get_phrase_item are not checked in the visible code.
// NOTE(review): the switch statement, the break statements, the actions
// taken when a comparison fails, the release of tmpitem and the return
// statement are not present in the lines shown here.
350 bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
351 LOG_TYPE log_type; phrase_token_t token;
352 MemoryChunk oldchunk, newchunk;
353 PhraseItem olditem, newitem, item, * tmpitem;
355 while(logger->has_next_record()){
356 logger->next_record(log_type, token, &oldchunk, &newchunk);
359 case LOG_ADD_RECORD:{
360 assert( 0 == oldchunk.size() );
361 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
363 add_phrase_item(token, &newitem);
366 case LOG_REMOVE_RECORD:{
367 assert( 0 == newchunk.size() );
369 remove_phrase_item(token, tmpitem);
371 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
373 if (olditem != *tmpitem)
379 case LOG_MODIFY_RECORD:{
380 get_phrase_item(token, item);
381 olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
383 newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
388 if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
390 remove_phrase_item(token, tmpitem);
391 assert(olditem == *tmpitem);
392 add_phrase_item(token, &newitem);
394 } else { /* in place editing. */
395 /* newchunk.size() <= item.m_chunk.size() */
396 /* Hack here: we assume the behaviour of get_phrase_item
397 * point to the actual data positon, so changes to item
398 * will be saved in SubPhraseIndex immediately.
400 memmove(item.m_chunk.begin(), newchunk.begin(),
405 case LOG_MODIFY_HEADER:{
406 guint32 total_freq = get_phrase_index_total_freq();
407 guint32 tmp_freq = 0;
408 assert(null_token == token);
409 assert(oldchunk.size() == newchunk.size());
410 oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
411 if (total_freq != tmp_freq)
413 newchunk.get_content(0, &tmp_freq, sizeof(guint32));
414 m_total_freq = tmp_freq;
// Build the sub phrase index in slot phrase_index from a text table.
//   phrase_index - slot in m_sub_phrase_indices; every token read is
//                  asserted to belong to this slot.
//   infile       - open text stream; each record is read with four
//                  fscanf calls: pinyin string, phrase string, token
//                  (unsigned) and frequency.
// Every record is parsed into PinyinKey values and appended as one
// pronunciation to the PhraseItem being built. When the token read
// differs from cur_token, the item built so far is added under cur_token
// and a fresh item is started with the new phrase text. After the loop
// the last item is added and the facade total m_total_freq is increased
// by the total of the sub index.
// NOTE(review): the two string reads use a bare %s conversion without a
// field width, so an overlong field overruns the destination buffer
// (the buffer declarations are not shown here).
// NOTE(review): none of the fscanf results are checked, and the loop is
// driven by feof, so a short or malformed last record is processed with
// stale values unless a check exists in lines not shown here.
// NOTE(review): the frequency is read with %ld into freq, whose
// declaration is not shown; confirm that its type matches long.
// NOTE(review): the declarations of pinyin, phrase, freq and written,
// the updates of cur_token, the release of the PhraseItem objects
// allocated with new and the return statement are not present in the
// lines shown here.
424 bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
425 SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
427 sub_phrases = new SubPhraseIndex;
432 phrase_token_t token;
434 PhraseItem * item_ptr = new PhraseItem;
// 0 marks that no item has been started yet.
435 phrase_token_t cur_token = 0;
436 while ( !feof(infile)){
437 fscanf(infile, "%s", pinyin);
438 fscanf(infile, "%s", phrase);
439 fscanf(infile, "%u", &token);
440 fscanf(infile, "%ld", &freq);
444 assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
447 utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
// First record: start the first item.
450 if ( 0 == cur_token ){
452 item_ptr->set_phrase_string(written, phrase_utf16);
// Token changed: flush the finished item and start a new one.
455 if ( cur_token != token ){
456 add_phrase_item( cur_token, item_ptr);
458 item_ptr = new PhraseItem;
460 item_ptr->set_phrase_string(written, phrase_utf16);
463 PinyinDefaultParser parser;
464 NullPinyinValidator validator;
465 PinyinKeyVector keys;
466 PinyinKeyPosVector poses;
468 keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
469 poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
470 parser.parse(validator, keys, poses, pinyin);
// One parsed key is expected per character of the phrase.
472 assert ( item_ptr->get_phrase_length() == keys->len );
473 item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
475 g_array_free(keys, TRUE);
476 g_array_free(poses, TRUE);
477 g_free(phrase_utf16);
// Flush the item that was still being built when input ended.
480 add_phrase_item( cur_token, item_ptr);
482 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
// Report the lowest and highest occupied slots of m_sub_phrase_indices.
//   min_index - output; starts at PHRASE_INDEX_LIBRARY_COUNT and is
//               lowered to the index of each occupied slot.
//   max_index - output; starts at 0 and is raised to the index of each
//               occupied slot (its parameter line is not shown here).
// When no slot is occupied, min_index stays at
// PHRASE_INDEX_LIBRARY_COUNT and max_index stays at 0.
// NOTE(review): the loop counter is a guint8, so the loop only ends if
// PHRASE_INDEX_LIBRARY_COUNT fits in a guint8 -- confirm its value.
// NOTE(review): the return statement is not present in the lines shown
// here.
486 int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
488 min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
489 for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
490 if ( m_sub_phrase_indices[i] ) {
491 min_index = std_lite::min(min_index, i);
492 max_index = std_lite::max(max_index, i);
// Fetch the token range of the sub phrase index in slot phrase_index,
// expressed as full tokens.
//   range - output; the begin and end values reported by
//           SubPhraseIndex::get_range are combined with phrase_index
//           through PHRASE_INDEX_MAKE_TOKEN.
// Returns ERROR_NO_SUB_PHRASE_INDEX on the path below (the guarding
// condition is not shown; presumably an empty slot).
// NOTE(review): the handling of result and the final return are not
// present in the lines shown here.
498 int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
499 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
501 return ERROR_NO_SUB_PHRASE_INDEX;
503 int result = sub_phrase->get_range(range);
507 range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
508 range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
// Report the token range covered by this sub phrase index.
//   range - output; m_range_begin is fixed at 1 and m_range_end is the
//           number of table_offset_t slots held in m_phrase_index
//           (pointer difference end - begin). Loops elsewhere in this
//           file treat m_range_end as exclusive.
// NOTE(review): the return statement is not present in the lines shown
// here.
512 int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
513 const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
514 const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
516 range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
517 range.m_range_end = end - begin;
// Rebuild every sub phrase index from the items that can still be looked
// up (the name presumably stands for compaction -- confirm).
// For each slot a new SubPhraseIndex is created, every token in the
// range of the old sub index is looked up, and each item found is added
// to the new sub index, which then replaces the old one in
// m_sub_phrase_indices.
// NOTE(review): the check that skips empty slots, the declaration of
// item, the statements executed when get_range or get_phrase_item fail
// (beyond deleting new_sub_phrase), the release of the old sub index and
// the end of the function are not present in the lines shown here.
522 bool FacadePhraseIndex::compat(){
523 for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
524 SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
528 SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
529 PhraseIndexRange range;
530 int result = sub_phrase->get_range(range);
531 if ( result != ERROR_OK ) {
532 delete new_sub_phrase;
// Copy over every item that is still reachable through the old index.
537 for ( phrase_token_t token = range.m_range_begin;
538 token < range.m_range_end;
540 result = sub_phrase->get_phrase_item(token, item);
541 if ( result != ERROR_OK )
543 new_sub_phrase->add_phrase_item(token, &item);
547 m_sub_phrase_indices[index] = new_sub_phrase;