port phrase index
author Peng Wu <alexepico@gmail.com>
Thu, 8 Dec 2011 08:41:17 +0000 (16:41 +0800)
committer Peng Wu <alexepico@gmail.com>
Fri, 9 Dec 2011 05:34:47 +0000 (13:34 +0800)
src/storage/phrase_index.cpp
src/storage/phrase_index.h

index c42b077c49d680cedb579cbbd3cd1ed96e8b2b1a..640927232b7b22aaa703e93ea9578fdca6065405 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include "phrase_index.h"
+#include "pinyin_custom2.h"
 
 using namespace pinyin;
 
@@ -28,27 +29,33 @@ bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
     return true;
 }
 
-bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
+bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
+                                       guint32 & freq){
     guint8 phrase_length = get_phrase_length();
-    table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
-    bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
+    table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32));
+
+    bool retval = m_chunk.get_content
+        (offset, keys, phrase_length * sizeof(ChewingKey));
     if ( !retval )
        return retval;
-    return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
+    return m_chunk.get_content
+        (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
 }
 
-void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
+void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
     guint8 phrase_length = get_phrase_length();
     set_n_pronunciation(get_n_pronunciation() + 1);
-    m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey));
+    m_chunk.set_content(m_chunk.size(), keys,
+                        phrase_length * sizeof(ChewingKey));
     m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
 }
 
 void PhraseItem::remove_nth_pronunciation(size_t index){
     guint8 phrase_length = get_phrase_length();
     set_n_pronunciation(get_n_pronunciation() - 1);
-    size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
-    m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
+    size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) +
+        index * (phrase_length * sizeof (ChewingKey) + sizeof(guint32));
+    m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
 }
 
 bool PhraseItem::get_phrase_string(utf16_t * phrase){
@@ -62,8 +69,8 @@ bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
     return true;
 }
 
-void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
-                                            PinyinKey * pinyin_keys,
+void PhraseItem::increase_pinyin_possibility(pinyin_option_t options,
+                                            ChewingKey * keys,
                                             gint32 delta){
     guint8 phrase_length = get_phrase_length();
     guint8 npron = get_n_pronunciation();
@@ -71,13 +78,14 @@ void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
     char * buf_begin = (char *) m_chunk.begin();
     guint32 total_freq = 0;
     for ( int i = 0 ; i < npron ; ++i){
-       char * pinyin_begin = buf_begin + offset +
-           i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
-       guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
+       char * chewing_begin = buf_begin + offset +
+           i * ( phrase_length * sizeof(ChewingKey) + sizeof(guint32) );
+       guint32 * freq = (guint32 *)(chewing_begin +
+                                     phrase_length * sizeof(ChewingKey));
        total_freq += *freq;
-       if ( 0 == pinyin_compare_with_ambiguities
-             (custom, pinyin_keys,
-              (PinyinKey *)pinyin_begin, phrase_length) ){
+       if ( 0 == pinyin_compare_with_ambiguities2
+             (options, keys,
+              (ChewingKey *)chewing_begin, phrase_length) ){
            //protect against total_freq overflow.
            if ( delta > 0 && total_freq > total_freq + delta )
                return;
@@ -145,7 +153,7 @@ int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
     if ( !result ) 
        return ERROR_FILE_CORRUPTION;
 
-    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
     item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
     return ERROR_OK;
 }
@@ -460,20 +468,19 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
            item_ptr->set_phrase_string(written, phrase_utf16);
        }
 
-       PinyinDefaultParser parser;
-       NullPinyinValidator validator;
-       PinyinKeyVector keys;
-       PinyinKeyPosVector poses;
-       
-       keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
-       poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
-       parser.parse(validator, keys, poses, pinyin);
+        pinyin_option_t options = USE_TONE;
+       FullPinyinParser2 parser;
+       ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+       ChewingKeyRestVector key_rests =
+            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+       parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
        
-       assert ( item_ptr->get_phrase_length() == keys->len );
-       item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
+       assert(item_ptr->get_phrase_length() == keys->len);
+       item_ptr->append_pronunciation((ChewingKey *)keys->data, freq);
 
        g_array_free(keys, TRUE);
-       g_array_free(poses, TRUE);
+       g_array_free(key_rests, TRUE);
        g_free(phrase_utf16);
     }
 
index 3c81b915b3c3f57b59f7875b088cda52c0336d40..69f39d94a0b732da8d175b6d42eaf343cedf1f1e 100644 (file)
@@ -25,8 +25,9 @@
 #include <stdio.h>
 #include <glib.h>
 #include "novel_types.h"
-#include "pinyin_base.h"
-#include "pinyin_phrase.h"
+#include "chewing_key.h"
+#include "pinyin_parser2.h"
+#include "pinyin_phrase2.h"
 #include "memory_chunk.h"
 #include "phrase_index_logger.h"
 
@@ -49,8 +50,7 @@ namespace pinyin{
 
 class PinyinLookup;
 
-/* Because this is not large,
- * Store this in user home directory.
+/* Store delta info by phrase index logger in user home directory.
  */
 
 const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
@@ -90,21 +90,22 @@ public:
        return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
     }
 
-    gfloat get_pinyin_possibility(PinyinCustomSettings & custom, 
-                                 PinyinKey * pinyin_keys){
+    gfloat get_pinyin_possibility(pinyin_option_t options,
+                                 ChewingKey * keys){
        guint8 phrase_length = get_phrase_length();
        guint8 npron = get_n_pronunciation();
-       size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
+       size_t offset = phrase_item_header + phrase_length * sizeof (utf16_t);
        char * buf_begin = (char *)m_chunk.begin();
        guint32 matched = 0, total_freq =0;
        for ( int i = 0 ; i < npron ; ++i){
-           char * pinyin_begin = buf_begin + offset + 
-               i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
-           guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
+           char * chewing_begin = buf_begin + offset +
+               i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+           guint32 * freq = (guint32 *)(chewing_begin +
+                                         phrase_length * sizeof(ChewingKey));
            total_freq += *freq;
-           if ( 0 == pinyin_compare_with_ambiguities
-                 (custom,  pinyin_keys,
-                  (PinyinKey *)pinyin_begin,phrase_length) ){
+           if ( 0 == pinyin_compare_with_ambiguities2
+                 (options,  keys,
+                  (ChewingKey *)chewing_begin,phrase_length) ){
                matched += *freq;
            }
        }
@@ -121,19 +122,19 @@ public:
        return retval;
     }
     
-    void increase_pinyin_possibility(PinyinCustomSettings & custom,
-                                    PinyinKey * pinyin_keys,
+    void increase_pinyin_possibility(pinyin_option_t options,
+                                    ChewingKey * keys,
                                     gint32 delta);
 
     bool get_phrase_string(utf16_t * phrase);
     bool set_phrase_string(guint8 phrase_length, utf16_t * phrase);
     bool get_nth_pronunciation(size_t index, 
-                              /* out */ PinyinKey * pinyin, 
+                              /* out */ ChewingKey * keys,
                               /* out */ guint32 & freq);
     /* Normally don't change the first pronunciation,
      * which decides the token number.
      */
-    void append_pronunciation(PinyinKey * pinyin, guint32 freq);
+    void append_pronunciation(ChewingKey * keys, guint32 freq);
     void remove_nth_pronunciation(size_t index);
 
     bool operator == (const PhraseItem & rhs) const{