write pinyin_get_full_pinyin_candidates in progress
author Peng Wu <alexepico@gmail.com>
Wed, 18 Apr 2012 08:06:05 +0000 (16:06 +0800)
committer Peng Wu <alexepico@gmail.com>
Wed, 18 Apr 2012 08:07:35 +0000 (16:07 +0800)
src/pinyin.cpp
src/storage/pinyin_parser2.cpp
src/storage/pinyin_parser2.h

index bef326c..1cc0e3d 100644 (file)
@@ -597,7 +597,7 @@ bool pinyin_get_candidates(pinyin_instance_t * instance,
         if ( !(retval & SEARCH_OK) )
             continue;
 
-        /* reduce to a single GArray. */
+        /* reduce and append to a single GArray. */
         for (size_t m = min_index; m <= max_index; ++m) {
             for (size_t n = 0; n < ranges[m]->len; ++n) {
                 PhraseIndexRange * range =
@@ -693,10 +693,9 @@ bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
     pinyin_context_t * & context = instance->m_context;
     pinyin_option_t & options = context->m_options;
     ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+    ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
     g_array_set_size(candidates, 0);
 
-    ChewingKey * keys = &g_array_index
-        (pinyin_keys, ChewingKey, offset);
     size_t pinyin_len = pinyin_keys->len - offset;
     ssize_t i;
 
@@ -733,8 +732,147 @@ bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
 
     if (pinyin_len == 1) {
         if (options & USE_DIVIDED_TABLE) {
+            g_array_set_size(items, 0);
             /* handle "^xian$" -> "xi'an" here */
-            assert(FALSE);
+
+            /* because there is only one pinyin left,
+             *  the following for-loop will not produce 2 character candidates.
+             * the if-branch will fill the candidate list with
+             *  2 character candidates.
+             */
+
+            ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
+            ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
+                                                   ChewingKeyRest, offset);
+            ChewingKeyRest orig_rest = *rest;
+            guint16 tone = CHEWING_ZERO_TONE;
+
+            const divided_table_item_t * item = NULL;
+
+            /* back up tone */
+            if (options & USE_TONE) {
+                tone = key->m_tone;
+                if (CHEWING_ZERO_TONE != tone) {
+                    key->m_tone = CHEWING_ZERO_TONE;
+                    rest->m_raw_end --;
+                }
+            }
+
+            item = context->m_full_pinyin_parser->retrieve_divided_item
+                (options, offset, pinyin_keys, pinyin_key_rests,
+                 instance->m_raw_full_pinyin,
+                 strlen(instance->m_raw_full_pinyin));
+
+            ChewingKey divided_keys[2];
+            assert(context->m_full_pinyin_parser->
+                   parse_one_key(options, divided_keys[0], item->m_new_keys[0],
+                                 strlen(item->m_new_keys[0])));
+            assert(context->m_full_pinyin_parser->
+                   parse_one_key(options, divided_keys[1], item->m_new_keys[1],
+                                 strlen(item->m_new_keys[1])));
+
+            gchar * new_pinyins = g_strdup_printf
+                ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
+
+            /* propagate the tone */
+            if (options & USE_TONE) {
+                if (CHEWING_ZERO_TONE != tone) {
+                    assert(0 < tone && tone <= 5);
+                    gchar * tmp_str = g_strdup_printf
+                        ("%s%d", new_pinyins, tone);
+                    divided_keys[1].m_tone = tone;
+                    g_free(new_pinyins);
+                    new_pinyins = tmp_str;
+                }
+            }
+
+            /* do pinyin search. */
+            int retval = context->m_pinyin_table->search
+                (2, divided_keys, ranges);
+
+            if (retval & SEARCH_OK) {
+                /* reduce and append to a single GArray. */
+                for (size_t m = min_index; m <= max_index; ++m) {
+                    for (size_t n = 0; n < ranges[m]->len; ++n) {
+                        PhraseIndexRange * range =
+                            &g_array_index(ranges[m], PhraseIndexRange, n);
+                        for (size_t k = range->m_range_begin;
+                             k < range->m_range_end; ++k) {
+                            lookup_candidate_t item;
+                            item.m_candidate_type = DIVIDED_CANDIDATE;
+                            item.m_token = k; item.m_orig_rest = orig_rest;
+                            item.m_new_pinyins = g_strdup(new_pinyins);
+                            g_array_append_val(items, item);
+                        }
+                    }
+                }
+                g_free(new_pinyins);
+
+                g_array_sort(items, compare_item_with_token);
+
+                /* remove the duplicated items. */
+                phrase_token_t last_token = null_token;
+                for (size_t n = 0; n < items->len; ++n) {
+                    lookup_candidate_t * item = &g_array_index
+                        (items, lookup_candidate_t, n);
+                    if (last_token == item->m_token) {
+                        g_array_remove_index(items, n);
+                        n--;
+                    }
+                    last_token = item->m_token;
+                }
+
+                PhraseItem cached_item;
+                /* compute all freqs. */
+                for (i = 0; i < items->len; ++i) {
+                    lookup_candidate_t * item = &g_array_index
+                        (items, lookup_candidate_t, i);
+                    phrase_token_t & token = item->m_token;
+
+                    gfloat bigram_poss = 0; guint32 total_freq = 0;
+                    if (options & DYNAMIC_ADJUST) {
+                        if (null_token != prev_token) {
+                            guint32 bigram_freq = 0;
+                            merged_gram.get_total_freq(total_freq);
+                            merged_gram.get_freq(token, bigram_freq);
+                            if (0 != total_freq)
+                                bigram_poss = bigram_freq / (gfloat)total_freq;
+                        }
+                    }
+
+                    /* compute the m_freq. */
+                    FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+                    phrase_index->get_phrase_item(token, cached_item);
+                    total_freq = phrase_index->get_phrase_index_total_freq();
+                    assert (0 < total_freq);
+
+                    /* Note: possibility value <= 1.0. */
+                    guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
+                                    (1 - LAMBDA_PARAMETER) *
+                                    cached_item.get_unigram_frequency() /
+                                    (gfloat) total_freq) * 256 * 256 * 256;
+                    item->m_freq = freq;
+                }
+
+                /* sort the candidates of the same length by frequency. */
+                g_array_sort(items, compare_item_with_frequency);
+
+                /* transfer back items to tokens, and save it into candidates */
+                for (i = 0; i < items->len; ++i) {
+                    lookup_candidate_t * item = &g_array_index
+                        (items, lookup_candidate_t, i);
+                    g_array_append_val(candidates, item->m_token);
+                }
+
+            }
+
+            /* restore tones */
+            if (options & USE_TONE) {
+                if (CHEWING_ZERO_TONE != tone) {
+                    key->m_tone = tone;
+                    rest->m_raw_end ++;
+                }
+            }
         }
     }
 
index 662ee1b..b4175b1 100644 (file)
@@ -484,7 +484,7 @@ bool FullPinyinParser2::post_process2(pinyin_option_t options,
                                       int len) const {
     int i;
     assert(keys->len == key_rests->len);
-    gint16 num_keys = keys->len;
+    gint num_keys = keys->len;
 
     ChewingKey * cur_key = NULL, * next_key = NULL;
     ChewingKeyRest * cur_rest = NULL, * next_rest = NULL;
@@ -561,7 +561,7 @@ bool FullPinyinParser2::post_process2(pinyin_option_t options,
             assert(parse_one_key(options, *next_key, onepinyin, len));
         }
 
-        /* save back tones */
+        /* restore tones */
         if (options & USE_TONE) {
             if (CHEWING_ZERO_TONE != next_tone) {
                 next_key->m_tone = next_tone;
@@ -573,6 +573,49 @@ bool FullPinyinParser2::post_process2(pinyin_option_t options,
     return true;
 }
 
+/**
+ * FullPinyinParser2::retrieve_divided_item:
+ * @options: the pinyin options (currently unused here).
+ * @offset: index of the key to examine in @keys / @key_rests.
+ * @keys: the parsed keys; must have the same length as @key_rests.
+ * @key_rests: per-key raw positions (m_raw_begin, length()) inside @str.
+ * @str: raw pinyin text; a key's text starts at @str + m_raw_begin.
+ * @len: length of @str as passed by the caller (currently unused here).
+ * @returns: the divided table item whose m_orig_key equals the raw text
+ *  of the key at @offset, or NULL when no item matches.
+ * Callers must check for NULL before dereferencing the result.
+ */
+const divided_table_item_t * FullPinyinParser2::retrieve_divided_item
+(pinyin_option_t options, size_t offset,
+ ChewingKeyVector & keys, ChewingKeyRestVector & key_rests,
+ const char * str, int len) const {
+    assert(keys->len == key_rests->len);
+    assert(offset < keys->len);
+
+    ChewingKeyRest * rest = &g_array_index(key_rests,
+                                           ChewingKeyRest, offset);
+    /* raw text of the examined key; does not change across the loop. */
+    const char * onepinyin = str + rest->m_raw_begin;
+
+    /* lookup divided table */
+    for (size_t k = 0; k < G_N_ELEMENTS(divided_table); ++k) {
+        const divided_table_item_t * item = divided_table + k;
+
+        /* every table entry is expected to carry a positive m_new_freq. */
+        assert(item->m_new_freq > 0);
+
+        size_t key_len = strlen(item->m_orig_key);
+        if (rest->length() != key_len)
+            continue;
+
+        /* found the match */
+        if (0 == strncmp(onepinyin, item->m_orig_key, key_len))
+            return item;
+    }
+
+    return NULL;
+}
+
 #define IS_KEY(x)   (('a' <= x && x <= 'z') || x == ';')
 
 bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
index 09469e7..77ed3d4 100644 (file)
@@ -152,6 +152,11 @@ protected:
                        const char * str, int len) const;
 
 public:
+    /* find the divided table item for the key at offset; NULL if none. */
+    const divided_table_item_t * retrieve_divided_item
+    (pinyin_option_t options, size_t offset,
+     ChewingKeyVector & keys, ChewingKeyRestVector & key_rests,
+     const char * str, int len) const;
     FullPinyinParser2();
     virtual ~FullPinyinParser2() {
         g_array_free(m_parse_steps, TRUE);