fixes phrase_index.h
[platform/upstream/libpinyin.git] / src / storage / pinyin_parser2.cpp
index d8cfaa6..131231c 100644 (file)
@@ -299,51 +299,10 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
             next_sep = k;
         }
 
-        /* Heuristic Method:
-         *   do maximum forward match first. */
-        for (size_t pos = i; pos < next_sep; ++pos) {
-            curstep = &g_array_index(m_parse_steps, parse_value_t, pos);
-            size_t try_len = std_lite::min
-                (pos + max_full_pinyin_length, next_sep);
-            for (size_t n = try_len; n > pos; --n) {
-                nextstep = &g_array_index(m_parse_steps, parse_value_t, n);
-
-                /* gen next step */
-                const char * onepinyin = input + pos;
-                gint16 onepinyinlen = n - pos;
-                value = parse_value_t();
-
-                ChewingKey key; ChewingKeyRest rest;
-                bool parsed = parse_one_key
-                    (options, key, onepinyin, onepinyinlen);
-                rest.m_raw_begin = pos; rest.m_raw_end = n;
-
-                if (!parsed)
-                    continue;
-
-                //printf("onepinyin:%s len:%d\n", onepinyin, onepinyinlen);
-                value.m_key = key; value.m_key_rest = rest;
-                value.m_num_keys = curstep->m_num_keys + 1;
-                value.m_parsed_len = curstep->m_parsed_len + onepinyinlen;
-                value.m_last_step = pos;
-
-                /* save next step */
-                if (-1 == nextstep->m_last_step)
-                    *nextstep = value;
-                if (value.m_parsed_len > nextstep->m_parsed_len)
-                    *nextstep = value;
-                if (value.m_parsed_len == nextstep->m_parsed_len &&
-                    value.m_num_keys < nextstep->m_num_keys)
-                    *nextstep = value;
-
-                /* maximum forward, set pos to n in next iteration. */
-                pos = n - 1;
-                break;
-            }
-        }
-
         /* dynamic programming here. */
-        for (size_t m = i; m < next_sep; ++m) {
+        /* for (size_t m = i; m < next_sep; ++m) */
+        {
+            size_t m = i;
             curstep = &g_array_index(m_parse_steps, parse_value_t, m);
             size_t try_len = std_lite::min
                 (m + max_full_pinyin_length, next_sep);
@@ -370,13 +329,56 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
                 value.m_last_step = m;
 
                 /* save next step */
+                /* no previous result */
                 if (-1 == nextstep->m_last_step)
                     *nextstep = value;
+                /* prefer the longest pinyin */
                 if (value.m_parsed_len > nextstep->m_parsed_len)
                     *nextstep = value;
+                /* prefer the shortest keys with the same pinyin length */
                 if (value.m_parsed_len == nextstep->m_parsed_len &&
                     value.m_num_keys < nextstep->m_num_keys)
                     *nextstep = value;
+
+                /* tie-break: same pinyin length and same number of keys */
+                if (value.m_parsed_len == nextstep->m_parsed_len &&
+                    value.m_num_keys == nextstep->m_num_keys) {
+
+#if 0
+                    /* prefer the complete pinyin with shengmu
+                     * over without shengmu,
+                     * ex: "kaneiji" -> "ka'nei'ji".
+                     */
+                    if ((value.m_key.m_initial != CHEWING_ZERO_INITIAL &&
+                         !(value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+                           value.m_key.m_final == CHEWING_ZERO_FINAL)) &&
+                        nextstep->m_key.m_initial == CHEWING_ZERO_INITIAL)
+                        *nextstep = value;
+
+                    /* prefer the complete pinyin 'er'
+                     * over the incomplete pinyin 'r',
+                     * ex: "xierqi" -> "xi'er'qi."
+                     */
+                    if ((value.m_key.m_initial == CHEWING_ZERO_INITIAL &&
+                        value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+                        value.m_key.m_final == CHEWING_ER) &&
+                        (nextstep->m_key.m_initial == CHEWING_R &&
+                         nextstep->m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+                         nextstep->m_key.m_final == CHEWING_ZERO_FINAL))
+                        *nextstep = value;
+#endif
+
+                    /* prefer the 'a' at the end of clause,
+                     * ex: "zheyanga$" -> "zhe'yang'a$".
+                     */
+                    if (value.m_parsed_len == len &&
+                        (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL &&
+                         nextstep->m_key.m_final == CHEWING_A) &&
+                        (value.m_key.m_initial == CHEWING_ZERO_INITIAL &&
+                         value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+                         value.m_key.m_final == CHEWING_A))
+                        *nextstep = value;
+                }
             }
         }
     }
@@ -386,7 +388,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
 
     /* post processing for re-split table. */
     if (options & USE_RESPLIT_TABLE) {
-        post_process(options, keys, key_rests);
+        post_process2(options, keys, key_rests, str, len);
     }
 
     g_free(input);
@@ -416,7 +418,7 @@ int FullPinyinParser2::final_step(size_t step_len, ChewingKeyVector & keys,
         gint16 pos = curstep->m_num_keys - 1;
 
         /* skip "'" */
-        if (0 != curstep->m_key_rest.m_table_index) {
+        if (0 != curstep->m_key.get_table_index()) {
             ChewingKey * key = &g_array_index(keys, ChewingKey, pos);
             ChewingKeyRest * rest = &g_array_index
                 (key_rests, ChewingKeyRest, pos);
@@ -430,17 +432,18 @@ int FullPinyinParser2::final_step(size_t step_len, ChewingKeyVector & keys,
     return parsed_len;
 }
 
-
-bool FullPinyinParser2::post_process(pinyin_option_t options,
-                                     ChewingKeyVector & keys,
-                                     ChewingKeyRestVector & key_rests) const {
+bool FullPinyinParser2::post_process2(pinyin_option_t options,
+                                      ChewingKeyVector & keys,
+                                      ChewingKeyRestVector & key_rests,
+                                      const char * str,
+                                      int len) const {
     int i;
     assert(keys->len == key_rests->len);
-    gint16 num_keys = keys->len;
+    gint num_keys = keys->len;
 
     ChewingKey * cur_key = NULL, * next_key = NULL;
     ChewingKeyRest * cur_rest = NULL, * next_rest = NULL;
-    guint16 cur_tone = CHEWING_ZERO_TONE, next_tone = CHEWING_ZERO_TONE;
+    guint16 next_tone = CHEWING_ZERO_TONE;
 
     for (i = 0; i < num_keys - 1; ++i) {
         cur_rest = &g_array_index(key_rests, ChewingKeyRest, i);
@@ -453,54 +456,173 @@ bool FullPinyinParser2::post_process(pinyin_option_t options,
         cur_key = &g_array_index(keys, ChewingKey, i);
         next_key = &g_array_index(keys, ChewingKey, i + 1);
 
+        /* skip keys that already carry a tone */
+        if (CHEWING_ZERO_TONE != cur_key->m_tone)
+            continue;
+
+        /* back up tone */
         if (options & USE_TONE) {
-            cur_tone = cur_key->m_tone;
             next_tone = next_key->m_tone;
-            cur_key->m_tone = next_key->m_tone = CHEWING_ZERO_TONE;
+            if (CHEWING_ZERO_TONE != next_tone) {
+                next_key->m_tone = CHEWING_ZERO_TONE;
+                next_rest->m_raw_end --;
+            }
         }
 
         /* lookup re-split table */
-        size_t k;
         const resplit_table_item_t * item = NULL;
-        for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) {
-            item = resplit_table + k;
+
+        item = retrieve_resplit_item_by_original_pinyins
+            (options, cur_key, cur_rest, next_key, next_rest, str, len);
+
+        if (item) {
             /* no ops */
             if (item->m_orig_freq >= item->m_new_freq)
                 continue;
 
-            /* use pinyin_exact_compare2 here. */
-            if (0 == pinyin_exact_compare2(item->m_orig_keys,
-                                           cur_key, 2))
-                break;
+            /* do re-split */
+            const char * onepinyin = str + cur_rest->m_raw_begin;
+            size_t len = strlen(item->m_new_keys[0]);
 
-        }
+            assert(parse_one_key(options, *cur_key, onepinyin, len));
+            cur_rest->m_raw_end = cur_rest->m_raw_begin + len;
 
-        /* find the match */
-        if (k < G_N_ELEMENTS(resplit_table)) {
-            /* do re-split */
-            item = resplit_table + k;
-            *cur_key = item->m_new_keys[0];
-            *next_key = item->m_new_keys[1];
-            /* assumes only moved one char in gen_all_resplit script. */
-            cur_rest->m_raw_end --;
-            next_rest->m_raw_begin --;
+            next_rest->m_raw_begin = cur_rest->m_raw_end;
+            onepinyin = str + next_rest->m_raw_begin;
+            len = strlen(item->m_new_keys[1]);
+
+            assert(parse_one_key(options, *next_key, onepinyin, len));
         }
 
-        /* save back tones */
+        /* restore tones */
         if (options & USE_TONE) {
-            cur_key->m_tone = cur_tone;
-            next_key->m_tone = next_tone;
+            if (CHEWING_ZERO_TONE != next_tone) {
+                next_key->m_tone = next_tone;
+                next_rest->m_raw_end ++;
+            }
         }
     }
 
     return true;
 }
 
+const divided_table_item_t * FullPinyinParser2::retrieve_divided_item
+(pinyin_option_t options, ChewingKey * key, ChewingKeyRest * rest,
+ const char * str, int len) const {
+    /* Return the first divided_table entry whose m_orig_key equals the raw text of rest in str, or NULL. */
+    /* lookup divided table (options and key are not read by this function) */
+    size_t k;
+    const divided_table_item_t * item = NULL;
+    for (k = 0; k < G_N_ELEMENTS(divided_table); ++k) {
+        item = divided_table + k;
+
+        const char * onepinyin = str + rest->m_raw_begin; /* start of the raw text for this key */
+        size_t len = strlen(item->m_orig_key); /* NOTE: shadows the len parameter */
+
+        if (rest->length() != len) /* lengths must agree before comparing bytes */
+            continue;
+
+        if (0 == strncmp(onepinyin, item->m_orig_key, len))
+            break; /* k now indexes the matching entry */
+    }
+
+    /* found the match: the loop exited early, so k is still inside the table */
+    if (k < G_N_ELEMENTS(divided_table)) {
+        /* hand back the matching entry (nothing is divided here) */
+        item = divided_table + k;
+        return item;
+    }
+
+    return NULL; /* no entry matched */
+}
+
+
+const resplit_table_item_t * FullPinyinParser2::retrieve_resplit_item_by_original_pinyins
+(pinyin_option_t options,
+ ChewingKey * cur_key, ChewingKeyRest * cur_rest,
+ ChewingKey * next_key, ChewingKeyRest * next_rest,
+ const char * str, int len) const{
+    /* lookup re-split table: first entry whose m_orig_keys pair equals the raw text of cur_rest and next_rest, else NULL */
+    size_t k;
+    const resplit_table_item_t * item = NULL;
+    /* options, cur_key and next_key are not read by this function */
+    for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) {
+        item = resplit_table + k;
+
+        const char * onepinyin = str + cur_rest->m_raw_begin; /* raw text of the current key */
+        size_t len = strlen(item->m_orig_keys[0]); /* NOTE: shadows the len parameter */
+
+        if (cur_rest->length() != len) /* first pinyin: length must agree */
+            continue;
+
+        if (0 != strncmp(onepinyin, item->m_orig_keys[0], len))
+            continue;
+
+        onepinyin = str + next_rest->m_raw_begin; /* raw text of the following key */
+        len = strlen(item->m_orig_keys[1]);
+
+        if (next_rest->length() != len) /* second pinyin: length must agree */
+            continue;
+
+        if (0 == strncmp(onepinyin, item->m_orig_keys[1], len))
+            break; /* both halves match; k indexes the entry */
+    }
+
+    /* found the match: the loop exited early, so k is still inside the table */
+    if (k < G_N_ELEMENTS(resplit_table)) {
+        item = resplit_table + k;
+        return item;
+    }
+
+    return NULL; /* no entry matched */
+}
+
+const resplit_table_item_t * FullPinyinParser2::retrieve_resplit_item_by_resplit_pinyins
+(pinyin_option_t options,
+ ChewingKey * cur_key, ChewingKeyRest * cur_rest,
+ ChewingKey * next_key, ChewingKeyRest * next_rest,
+ const char * str, int len) const {
+    /* lookup re-split table: first entry whose m_new_keys pair equals the raw text of cur_rest and next_rest, else NULL */
+    size_t k;
+    const resplit_table_item_t * item = NULL;
+    /* options, cur_key and next_key are not read by this function */
+    for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) {
+        item = resplit_table + k;
+
+        const char * onepinyin = str + cur_rest->m_raw_begin; /* raw text of the current key */
+        size_t len = strlen(item->m_new_keys[0]); /* NOTE: shadows the len parameter */
+
+        if (cur_rest->length() != len) /* first pinyin: length must agree */
+            continue;
+
+        if (0 != strncmp(onepinyin, item->m_new_keys[0], len))
+            continue;
+
+        onepinyin = str + next_rest->m_raw_begin; /* raw text of the following key */
+        len = strlen(item->m_new_keys[1]);
+
+        if (next_rest->length() != len) /* second pinyin: length must agree */
+            continue;
+
+        if (0 == strncmp(onepinyin, item->m_new_keys[1], len))
+            break; /* both halves match; k indexes the entry */
+    }
+
+    /* found the match: the loop exited early, so k is still inside the table */
+    if (k < G_N_ELEMENTS(resplit_table)) {
+        item = resplit_table + k;
+        return item;
+    }
+
+    return NULL; /* no entry matched */
+}
+
 #define IS_KEY(x)   (('a' <= x && x <= 'z') || x == ';')
 
 bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
                                         ChewingKey & key,
                                         const char *str, int len) const {
+    options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL);
 
     if (1 == len) {
         if (!(options & PINYIN_INCOMPLETE))
@@ -523,7 +645,8 @@ bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
     }
 
     ChewingTone tone = CHEWING_ZERO_TONE;
-    options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL);
+    options &= ~(PINYIN_INCOMPLETE|CHEWING_INCOMPLETE);
+    options |= PINYIN_CORRECT_UE_VE | PINYIN_CORRECT_V_U;
 
     /* parse tone */
     if (3 == len) {
@@ -556,6 +679,9 @@ bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
         charid = ch == ';' ? 26 : ch - 'a';
         /* first yunmu */
         const char * yun = m_yunmu_table[charid].m_yunmus[0];
+        if (NULL == yun)
+            return false;
+
         gchar * pinyin = g_strdup_printf("%s%s", sheng, yun);
         if (search_pinyin_index(options, pinyin, key)) {
             key.m_tone = tone;
@@ -566,6 +692,9 @@ bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
 
         /* second yunmu */
         yun = m_yunmu_table[charid].m_yunmus[1];
+        if (NULL == yun)
+            return false;
+
         pinyin = g_strdup_printf("%s%s", sheng, yun);
         if (search_pinyin_index(options, pinyin, key)) {
             key.m_tone = tone;
@@ -662,12 +791,12 @@ bool DoublePinyinParser2::set_scheme(DoublePinyinScheme scheme) {
 
 /* the chewing string must be freed with g_free. */
 static bool search_chewing_symbols(const chewing_symbol_item_t * symbol_table,
-                                   const char key, char ** chewing) {
+                                   const char key, const char ** chewing) {
     *chewing = NULL;
     /* just iterate the table, as we only have < 50 items. */
     while (symbol_table->m_input != '\0') {
         if (symbol_table->m_input == key) {
-            *chewing = g_strdup(symbol_table->m_chewing);
+            *chewing = symbol_table->m_chewing;
             return true;
         }
         symbol_table ++;
@@ -693,6 +822,7 @@ static bool search_chewing_tones(const chewing_tone_item_t * tone_table,
 bool ChewingParser2::parse_one_key(pinyin_option_t options,
                                    ChewingKey & key,
                                    const char *str, int len) const {
+    options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL);
     char tone = CHEWING_ZERO_TONE;
 
     int symbols_len = len;
@@ -705,12 +835,11 @@ bool ChewingParser2::parse_one_key(pinyin_option_t options,
     }
 
     int i;
-    gchar * chewing = NULL, * onechar = NULL;
+    gchar * chewing = NULL; const char * onechar = NULL;
 
     /* probe the possible chewing map in the rest of str. */
     for (i = 0; i < symbols_len; ++i) {
         if (!search_chewing_symbols(m_symbol_table, str[i], &onechar)) {
-            g_free(onechar);
             g_free(chewing);
             return false;
         }
@@ -722,11 +851,10 @@ bool ChewingParser2::parse_one_key(pinyin_option_t options,
             chewing = g_strconcat(chewing, onechar, NULL);
             g_free(tmp);
         }
-        g_free(onechar);
     }
 
     /* search the chewing in the chewing index table. */
-    if (search_chewing_index(options, chewing, key)) {
+    if (chewing && search_chewing_index(options, chewing, key)) {
         /* save back tone if available. */
         key.m_tone = tone;
         g_free(chewing);
@@ -748,7 +876,7 @@ int ChewingParser2::parse(pinyin_option_t options, ChewingKeyVector & keys,
     int maximum_len = 0; int i;
     /* probe the longest possible chewing string. */
     for (i = 0; i < len; ++i) {
-        if (!in_chewing_scheme(str[i]))
+        if (!in_chewing_scheme(options, str[i], NULL))
             break;
     }
     maximum_len = i;
@@ -806,13 +934,26 @@ bool ChewingParser2::set_scheme(ChewingScheme scheme) {
 }
 
 
-bool ChewingParser2::in_chewing_scheme(const char key) const {
-    gchar * chewing = NULL;
+bool ChewingParser2::in_chewing_scheme(pinyin_option_t options,
+                                       const char key, const char ** symbol)
+ const { /* true when key is a symbol key, or a tone key while USE_TONE is set */
+    const gchar * chewing = NULL;
     char tone = CHEWING_ZERO_TONE;
 
-    bool retval = search_chewing_symbols(m_symbol_table, key, &chewing) ||
-        search_chewing_tones(m_tone_table, key, &tone);
-    g_free(chewing);
+    if (search_chewing_symbols(m_symbol_table, key, &chewing)) { /* symbol keys are checked first */
+        if (symbol) /* symbol is an optional out-parameter; NULL is accepted */
+            *symbol = chewing;
+        return true;
+    }
 
-    return retval;
+    if (!(options & USE_TONE)) /* tone keys are only accepted when USE_TONE is set */
+        return false;
+
+    if (search_chewing_tones(m_tone_table, key, &tone)) {
+        if (symbol)
+            *symbol = chewing_tone_table[tone]; /* tone set by search_chewing_tones indexes chewing_tone_table */
+        return true;
+    }
+
+    return false; /* neither a symbol key nor an accepted tone key */
 }