move tag utility to src/storage

author Peng Wu <alexepico@gmail.com>

Fri, 20 May 2011 03:02:55 +0000 (11:02 +0800)

committer Peng Wu <alexepico@gmail.com>

Fri, 20 May 2011 03:02:55 +0000 (11:02 +0800)
author Peng Wu <alexepico@gmail.com>
Fri, 20 May 2011 03:02:55 +0000 (11:02 +0800)
committer Peng Wu <alexepico@gmail.com>
Fri, 20 May 2011 03:02:55 +0000 (11:02 +0800)
diff --git a/src/pinyin.h b/src/pinyin.h

index 3cf6e714ffc01fc07f9b378f637ebded53b0718d..6e3c81eb4004f0e0165fd9505251450529f8dc0c 100644 (file)
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -10,6 +10,7 @@
  #include "lookup.h"
  #include "pinyin_lookup.h"
  #include "phrase_lookup.h"
+#include "tag_utility.h"
  
  /* training module */
  #include "flexible_ngram.h"
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am

index b2d5b1aeee25b5f36ebd9133f4dbc6930175599c..8c10cdf86a7bc781dc81abf00f05fa03b1322a61 100644 (file)
--- a/src/storage/Makefile.am
+++ b/src/storage/Makefile.am
@@ -21,20 +21,22 @@ INCLUDES                = -I$(top_srcdir)/src/include \
  
  noinst_HEADERS          = pinyin_large_table.h \
                           pinyin_base.h \
-                         pinyin_phrase.h \
+                         pinyin_phrase.h \
                           phrase_index.h \
-                         pinyin_zhuyin_map_data.h \
+                         pinyin_zhuyin_map_data.h \
                           phrase_large_table.h \
                           ngram.h \
-                         flexible_ngram.h
+                         flexible_ngram.h \
+                         tag_utility.h
  
  noinst_LTLIBRARIES      = libstorage.la
  
  libstorage_la_LDFLAGS  = -static
  
  libstorage_la_SOURCES    = pinyin_base.cpp \
-                         pinyin_large_table.cpp \
-                         phrase_index.cpp \
-                         phrase_large_table.cpp \
-                         ngram.cpp
+                          pinyin_large_table.cpp \
+                          phrase_index.cpp \
+                          phrase_large_table.cpp \
+                          ngram.cpp \
+                          tag_utility.cpp
  
diff --git a/src/storage/tag_utility.cpp b/src/storage/tag_utility.cpp

new file mode 100644 (file)

index 0000000..dc1f520
--- /dev/null
+++ b/src/storage/tag_utility.cpp
@@ -0,0 +1,389 @@
+#include <glib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "phrase_large_table.h"
+#include "tag_utility.h"
+
+/* internal taglib structure */
+struct tag_entry{
+    int m_line_type;         /* id handed back through taglib_read's line_type. */
+    char * m_line_tag;       /* first token of a matching line; owned copy. */
+    int m_num_of_values;     /* number of positional values after the line tag. */
+    char ** m_required_tags; /* NULL-terminated vector of required tags; owned. */
+    /* char ** m_optional_tags; */
+    /* int m_optional_count = 0; */
+    char ** m_ignored_tags;  /* NULL-terminated vector of ignored tags; owned. */
+};
+
+tag_entry tag_entry_copy(int line_type, const char * line_tag,
+                         int num_of_values,
+                         char * required_tags[],
+                         char * ignored_tags[]){
+    tag_entry entry;
+    entry.m_line_type = line_type;
+    entry.m_line_tag = g_strdup( line_tag );
+    entry.m_num_of_values = num_of_values;
+    entry.m_required_tags = g_strdupv( required_tags );
+    entry.m_ignored_tags = g_strdupv( ignored_tags );
+    return entry;
+}
+
+/* Duplicate an existing entry, strings included. */
+tag_entry tag_entry_clone(tag_entry * entry){
+    tag_entry duplicate = tag_entry_copy(entry->m_line_type,
+                                         entry->m_line_tag,
+                                         entry->m_num_of_values,
+                                         entry->m_required_tags,
+                                         entry->m_ignored_tags);
+    return duplicate;
+}
+
+/* Free the strings held by entry; the entry storage itself is untouched. */
+void tag_entry_reclaim(tag_entry * entry){
+    g_strfreev( entry->m_ignored_tags );
+    g_strfreev( entry->m_required_tags );
+    g_free( entry->m_line_tag );
+}
+
+/* Reclaim every entry stored in tag_array, then free the array itself.
+ * Always returns true. */
+static bool taglib_free_tag_array(GArray * tag_array){
+    size_t index = 0;
+    while ( index < tag_array->len ) {
+        tag_entry_reclaim(&g_array_index(tag_array, tag_entry, index));
+        ++index;
+    }
+    g_array_free(tag_array, TRUE);
+    return true;
+}
+
+/* special unichar to be handled in split_line. */
+static gunichar backslash = 0;
+static gunichar quote = 0;
+
+/* Decode the two special characters used by split_line.
+ * Always returns TRUE. */
+static gboolean split_line_init(){
+    quote = g_utf8_get_char("\"");
+    backslash = g_utf8_get_char("\\");
+    return TRUE;
+}
+
+/* Pointer Array of Array of tag_entry */
+static GPtrArray * g_tagutils_stack = NULL;
+
+/* Create the state stack with one empty tag array on it and set up
+ * split_line. Asserts that no stack exists yet. */
+bool taglib_init(){
+    assert( g_tagutils_stack == NULL);
+
+    /* init split_line. */
+    split_line_init();
+
+    GArray * bottom = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+    g_tagutils_stack = g_ptr_array_new();
+    g_ptr_array_add(g_tagutils_stack, bottom);
+    return true;
+}
+
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
+                    const char * required_tags, const char * ignored_tags){
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+                                     g_tagutils_stack->len - 1);
+
+    /* some duplicate tagname or line_type check here. */
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if ( entry->m_line_type == line_type ||
+             strcmp( entry->m_line_tag, line_tag ) == 0 )
+            return false;
+    }
+
+    char ** required = g_strsplit_set(required_tags, ",:", -1);
+    char ** ignored = g_strsplit_set(ignored_tags, ",:", -1);
+
+    tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
+                                     required, ignored);
+    g_array_append_val(tag_array, entry);
+
+    g_strfreev(required);
+    g_strfreev(ignored);
+    return true;
+}
+
+/* g_ptr_array_foreach callback: frees one array element with g_free.
+ * user_data is unused. */
+static void ptr_array_entry_free(gpointer data, gpointer user_data){
+    g_free(data);
+}
+
+/* g_hash_table_foreach_steal callback: frees both key and value with
+ * g_free and returns TRUE for every pair. user_data is unused. */
+static gboolean hash_table_key_value_free(gpointer key, gpointer value,
+                                          gpointer user_data){
+    g_free(value);
+    g_free(key);
+    return TRUE;
+}
+
+/* split the line into tokens. */
+static gchar ** split_line(const gchar * line){
+    /* array for tokens. */
+    GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
+
+    for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
+        gunichar unichar = g_utf8_get_char(cur);
+        const gchar * begin = cur;
+        gchar * token = NULL;
+
+        if ( g_unichar_isspace (unichar) ) {
+            continue;
+        }else if ( unichar == quote ) {
+            /* handles "\"". */
+            /* skip the first '"'. */
+            begin = cur = g_utf8_next_char(cur);
+            while (*cur) {
+                unichar = g_utf8_get_char(cur);
+                if ( unichar == backslash ) {
+                    cur = g_utf8_next_char(cur);
+                    g_return_val_if_fail(*cur, NULL);
+                } else if ( unichar == quote ){
+                    break;
+                }
+                cur = g_utf8_next_char(cur);
+            }
+            gchar * tmp = g_strndup( begin, cur - begin);
+            /* TODO: switch to own strdup_escape implementation
+               for \"->" transforming. */
+            token = g_strdup_printf(tmp);
+            g_free(tmp);
+        } else {
+            /* handles other tokens. */
+            while(*cur) {
+                unichar = g_utf8_get_char(cur);
+                if ( g_unichar_isgraph(unichar) ) {
+                    /* next unichar */
+                    cur = g_utf8_next_char(cur);
+                } else {
+                    /* space and other characters handles. */
+                    break;
+                }
+            }
+            token = g_strndup( begin, cur - begin );
+        }
+
+        g_array_append_val(tokens, token);
+        if ( !*cur )
+            break;
+    }
+
+    return (gchar **)g_array_free(tokens, FALSE);
+}
+
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
+                 GHashTable * required){
+    /* reset values and required. */
+    g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
+    g_ptr_array_set_size(values, 0);
+    g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
+
+    /* use own version of split_line
+       instead of g_strsplit_set for special token.*/
+    char ** tokens = split_line(input_line);
+    int num_of_tokens = g_strv_length(tokens);
+
+    char * line_tag = tokens[0];
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+
+    tag_entry * cur_entry = NULL;
+    /* find line type. */
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
+            cur_entry = entry;
+            break;
+        }
+    }
+
+    if ( !cur_entry )
+        return false;
+
+    line_type = cur_entry->m_line_type;
+
+    for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) {
+        g_return_val_if_fail(i < num_of_tokens, false);
+        char * value = g_strdup( tokens[i] );
+        g_ptr_array_add(values, value);
+    }
+
+    int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
+    int required_len = g_strv_length( cur_entry->m_required_tags);
+
+    for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
+        g_return_val_if_fail(i < num_of_tokens, false);
+        const char * tmp = tokens[i];
+
+        /* check ignored tags. */
+        bool tag_ignored = false;
+        for ( int m = 0; m < ignored_len; ++m) {
+            if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) {
+                tag_ignored = true;
+                break;
+            }
+        }
+
+        if ( tag_ignored ) {
+            ++i;
+            continue;
+        }
+
+        /* check required tags. */
+        bool tag_required = false;
+        for ( int m = 0; m < required_len; ++m) {
+            if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) {
+                tag_required = true;
+                break;
+            }
+        }
+
+        /* warning on the un-expected tags. */
+        if ( !tag_required ) {
+            g_warning("un-expected tags:%s.\n", tmp);
+            ++i;
+            continue;
+        }
+
+        char * key = g_strdup(tokens[i]);
+        ++i;
+        g_return_val_if_fail(i < num_of_tokens, false);
+        char * value = g_strdup(tokens[i]);
+        g_hash_table_insert(required, key, value);
+    }
+
+    /* check for all required tags. */
+    for ( int i = 0; i < required_len; ++i) {
+        const char * required_tag_str = cur_entry->m_required_tags[i];
+        gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL);
+        if ( !result ) {
+            g_warning("missed required tags: %s.\n", required_tag_str);
+            g_strfreev(tokens);
+            return false;
+        }
+    }
+
+    g_strfreev(tokens);
+    return true;
+}
+
+bool taglib_remove_tag(int line_type){
+    /* Note: duplicate entry check is in taglib_add_tag. */
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if (entry->m_line_type != line_type)
+            continue;
+        tag_entry_reclaim(entry);
+        g_array_remove_index(tag_array, i);
+        return true;
+    }
+    return false;
+}
+
+bool taglib_push_state(){
+    assert(g_tagutils_stack->len >= 1);
+    GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+    GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+    for ( size_t i = 0; i < prev_tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
+        tag_entry new_entry = tag_entry_clone(entry);
+        g_array_append_val(next_tag_array, new_entry);
+    }
+    g_ptr_array_add(g_tagutils_stack, next_tag_array);
+    return true;
+}
+
+/* Drop the top-most state and free its entries. Asserts that more than
+ * one state is on the stack, so the bottom state is never popped. */
+bool taglib_pop_state(){
+    assert(g_tagutils_stack->len > 1);
+    guint top = g_tagutils_stack->len - 1;
+    GArray * discarded = (GArray *) g_ptr_array_index(g_tagutils_stack, top);
+    g_ptr_array_remove_index(g_tagutils_stack, top);
+    taglib_free_tag_array(discarded);
+    return true;
+}
+
+/* Free every state on the stack and the stack itself, then reset the
+ * global pointer to NULL. Always returns true. */
+bool taglib_fini(){
+    size_t depth = g_tagutils_stack->len;
+    for ( size_t level = 0; level < depth; ++level ){
+        taglib_free_tag_array((GArray *) g_ptr_array_index(g_tagutils_stack, level));
+    }
+    g_ptr_array_free(g_tagutils_stack, TRUE);
+    g_tagutils_stack = NULL;
+    return true;
+}
+
+static phrase_token_t taglib_special_string_to_token(const char * string){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    const token_pair * pair = tokens;
+    while (pair->string) {
+        if ( strcmp(string, pair->string ) == 0 ){
+            return pair->token;
+        }
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' ) {
+        return taglib_special_string_to_token(string);
+    }
+
+    glong phrase_len = g_utf8_strlen(string, -1);
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
+    int result = phrases->search(phrase_len, phrase, token);
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+
+    g_free(phrase);
+    return token;
+}
+
+static const char * taglib_special_token_to_string(phrase_token_t token){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    const token_pair * pair = tokens;
+    while (pair->token) {
+        if ( token == pair->token )
+            return pair->string;
+    }
+
+    fprintf(stderr, "error: unknown token:%d.\n", token);
+    return NULL;
+}
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token) {
+    PhraseItem item;
+    utf16_t buffer[MAX_PHRASE_LENGTH];
+
+    gchar * phrase;
+    /* deal with the special phrase index, for "<start>..." */
+    if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
+        return g_strdup(taglib_special_token_to_string(token));
+    }
+
+    int result = phrase_index->get_phrase_item(token, item);
+    if (result != ERROR_OK) {
+        fprintf(stderr, "error: unknown token:%d.\n", token);
+        return NULL;
+    }
+
+    item.get_phrase_string(buffer);
+    guint8 length = item.get_phrase_length();
+    phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+    return phrase;
+}
diff --git a/src/storage/tag_utility.h b/src/storage/tag_utility.h

new file mode 100644 (file)

index 0000000..67d8946
--- /dev/null
+++ b/src/storage/tag_utility.h
@@ -0,0 +1,68 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef TAG_UTILITY_H
+#define TAG_UTILITY_H
+
+#include "novel_types.h"
+
+/* Note: the optional tag has been removed from the first implementation.
+ * Maybe the optional tag will be added back later.
+ */
+
+bool taglib_init();
+
+/* Note: most tags are separated by ',' or ':' . */
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags);
+
+/* values is a pointer array of strings; required is a hash table mapping a tag string to its value string (all char *). */
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required);
+
+/* Note: taglib_write is omitted, as printf is more suitable for this. */
+
+/* Note the following function is only available when the optional tag exists.
+ * bool taglib_report_status(int line_type);
+ */
+
+/* remove the tag of type line_type. */
+bool taglib_remove_tag(int line_type);
+
+/* the following functions are used to save current known tag list in stack.
+ * Used when the parsing context is changed.
+ */
+bool taglib_push_state();
+bool taglib_pop_state();
+
+bool taglib_fini();
+
+namespace pinyin{
+    class PhraseLargeTable;
+};
+
+using namespace pinyin;
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
+                                      const char * string);
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token);
+
+#endif
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am

index f314a1a2273949cce9322e224e3e70bd09b6231b..bc033f1077c26ead335ffd6e234c4783ad5f6c1d 100644 (file)
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -21,8 +21,6 @@ INCLUDES                = -I$(top_srcdir)/src \
                            -I$(top_srcdir)/src/lookup \
                            @GLIB2_CPPFLAGS@
  
-noinst_HEADERS         = tag_utility.h
-
  noinst_PROGRAMS          = gen_pinyin_table gen_binary_files export_interpolation import_interpolation
  
  gen_pinyin_table_SOURCES    = gen_pinyin_table.cpp
@@ -33,16 +31,10 @@ gen_binary_files_SOURCES    = gen_binary_files.cpp
  
  gen_binary_files_LDADD      = ../../src/libpinyin.la @GLIB2_LDFLAGS@
  
-noinst_LTLIBRARIES        = libtagutils.la
-
-libtagutils_la_LDFLAGS    = -static
-
-libtagutils_la_SOURCES    = tag_utility.cpp
-
  import_interpolation_SOURCES = import_interpolation.cpp
  
-import_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+import_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
  
  export_interpolation_SOURCES = export_interpolation.cpp
  
-export_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+export_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/tag_utility.cpp b/utils/storage/tag_utility.cpp

deleted file mode 100644 (file)

index 5dcb35a..0000000
--- a/utils/storage/tag_utility.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-#include "pinyin.h"
-#include <glib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "tag_utility.h"
-
-/* internal taglib structure */
-struct tag_entry{
-    int m_line_type;
-    char * m_line_tag;
-    int m_num_of_values;
-    char ** m_required_tags;
-    /* char ** m_optional_tags; */
-    /* int m_optional_count = 0; */
-    char ** m_ignored_tags;
-};
-
-tag_entry tag_entry_copy(int line_type, const char * line_tag,
-                         int num_of_values,
-                         char * required_tags[],
-                         char * ignored_tags[]){
-    tag_entry entry;
-    entry.m_line_type = line_type;
-    entry.m_line_tag = g_strdup( line_tag );
-    entry.m_num_of_values = num_of_values;
-    entry.m_required_tags = g_strdupv( required_tags );
-    entry.m_ignored_tags = g_strdupv( ignored_tags );
-    return entry;
-}
-
-tag_entry tag_entry_clone(tag_entry * entry){
-    return tag_entry_copy(entry->m_line_type, entry->m_line_tag,
-                          entry->m_num_of_values,
-                          entry->m_required_tags, entry->m_ignored_tags);
-}
-
-void tag_entry_reclaim(tag_entry * entry){
-    g_free( entry->m_line_tag );
-    g_strfreev( entry->m_required_tags );
-    g_strfreev(entry->m_ignored_tags);
-}
-
-static bool taglib_free_tag_array(GArray * tag_array){
-    for ( size_t i = 0; i < tag_array->len; ++i) {
-        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
-        tag_entry_reclaim(entry);
-    }
-    g_array_free(tag_array, TRUE);
-    return true;
-}
-
-/* special unichar to be handled in split_line. */
-static gunichar backslash = 0;
-static gunichar quote = 0;
-
-static gboolean split_line_init(){
-    backslash = g_utf8_get_char("\\");
-    quote = g_utf8_get_char("\"");
-    return TRUE;
-}
-
-/* Pointer Array of Array of tag_entry */
-static GPtrArray * g_tagutils_stack = NULL;
-
-bool taglib_init(){
-    assert( g_tagutils_stack == NULL);
-    g_tagutils_stack = g_ptr_array_new();
-    GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
-    g_ptr_array_add(g_tagutils_stack, tag_array);
-
-    /* init split_line. */
-    split_line_init();
-    return true;
-}
-
-bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
-                    const char * required_tags, const char * ignored_tags){
-    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
-                                     g_tagutils_stack->len - 1);
-
-    /* some duplicate tagname or line_type check here. */
-    for ( size_t i = 0; i < tag_array->len; ++i) {
-        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
-        if ( entry->m_line_type == line_type ||
-             strcmp( entry->m_line_tag, line_tag ) == 0 )
-            return false;
-    }
-
-    char ** required = g_strsplit_set(required_tags, ",:", -1);
-    char ** ignored = g_strsplit_set(ignored_tags, ",:", -1);
-
-    tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
-                                     required, ignored);
-    g_array_append_val(tag_array, entry);
-
-    g_strfreev(required);
-    g_strfreev(ignored);
-    return true;
-}
-
-static void ptr_array_entry_free(gpointer data, gpointer user_data){
-    g_free(data);
-}
-
-static gboolean hash_table_key_value_free(gpointer key, gpointer value,
-                                          gpointer user_data){
-    g_free(key);
-    g_free(value);
-    return TRUE;
-}
-
-/* split the line into tokens. */
-static gchar ** split_line(const gchar * line){
-    /* array for tokens. */
-    GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
-
-    for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
-        gunichar unichar = g_utf8_get_char(cur);
-        const gchar * begin = cur;
-        gchar * token = NULL;
-
-        if ( g_unichar_isspace (unichar) ) {
-            continue;
-        }else if ( unichar == quote ) {
-            /* handles "\"". */
-            /* skip the first '"'. */
-            begin = cur = g_utf8_next_char(cur);
-            while (*cur) {
-                unichar = g_utf8_get_char(cur);
-                if ( unichar == backslash ) {
-                    cur = g_utf8_next_char(cur);
-                    g_return_val_if_fail(*cur, NULL);
-                } else if ( unichar == quote ){
-                    break;
-                }
-                cur = g_utf8_next_char(cur);
-            }
-            gchar * tmp = g_strndup( begin, cur - begin);
-            /* TODO: switch to own strdup_escape implementation
-               for \"->" transforming. */
-            token = g_strdup_printf(tmp);
-            g_free(tmp);
-        } else {
-            /* handles other tokens. */
-            while(*cur) {
-                unichar = g_utf8_get_char(cur);
-                if ( g_unichar_isgraph(unichar) ) {
-                    /* next unichar */
-                    cur = g_utf8_next_char(cur);
-                } else {
-                    /* space and other characters handles. */
-                    break;
-                }
-            }
-            token = g_strndup( begin, cur - begin );
-        }
-
-        g_array_append_val(tokens, token);
-        if ( !*cur )
-            break;
-    }
-
-    return (gchar **)g_array_free(tokens, FALSE);
-}
-
-bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
-                 GHashTable * required){
-    /* reset values and required. */
-    g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
-    g_ptr_array_set_size(values, 0);
-    g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
-
-    /* use own version of split_line
-       instead of g_strsplit_set for special token.*/
-    char ** tokens = split_line(input_line);
-    int num_of_tokens = g_strv_length(tokens);
-
-    char * line_tag = tokens[0];
-    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-
-    tag_entry * cur_entry = NULL;
-    /* find line type. */
-    for ( size_t i = 0; i < tag_array->len; ++i) {
-        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
-        if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
-            cur_entry = entry;
-            break;
-        }
-    }
-
-    if ( !cur_entry )
-        return false;
-
-    line_type = cur_entry->m_line_type;
-
-    for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) {
-        g_return_val_if_fail(i < num_of_tokens, false);
-        char * value = g_strdup( tokens[i] );
-        g_ptr_array_add(values, value);
-    }
-
-    int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
-    int required_len = g_strv_length( cur_entry->m_required_tags);
-
-    for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
-        g_return_val_if_fail(i < num_of_tokens, false);
-        const char * tmp = tokens[i];
-
-        /* check ignored tags. */
-        bool tag_ignored = false;
-        for ( int m = 0; m < ignored_len; ++m) {
-            if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) {
-                tag_ignored = true;
-                break;
-            }
-        }
-
-        if ( tag_ignored ) {
-            ++i;
-            continue;
-        }
-
-        /* check required tags. */
-        bool tag_required = false;
-        for ( int m = 0; m < required_len; ++m) {
-            if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) {
-                tag_required = true;
-                break;
-            }
-        }
-
-        /* warning on the un-expected tags. */
-        if ( !tag_required ) {
-            g_warning("un-expected tags:%s.\n", tmp);
-            ++i;
-            continue;
-        }
-
-        char * key = g_strdup(tokens[i]);
-        ++i;
-        g_return_val_if_fail(i < num_of_tokens, false);
-        char * value = g_strdup(tokens[i]);
-        g_hash_table_insert(required, key, value);
-    }
-
-    /* check for all required tags. */
-    for ( int i = 0; i < required_len; ++i) {
-        const char * required_tag_str = cur_entry->m_required_tags[i];
-        gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL);
-        if ( !result ) {
-            g_warning("missed required tags: %s.\n", required_tag_str);
-            g_strfreev(tokens);
-            return false;
-        }
-    }
-
-    g_strfreev(tokens);
-    return true;
-}
-
-bool taglib_remove_tag(int line_type){
-    /* Note: duplicate entry check is in taglib_add_tag. */
-    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-    for ( size_t i = 0; i < tag_array->len; ++i) {
-        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
-        if (entry->m_line_type != line_type)
-            continue;
-        tag_entry_reclaim(entry);
-        g_array_remove_index(tag_array, i);
-        return true;
-    }
-    return false;
-}
-
-bool taglib_push_state(){
-    assert(g_tagutils_stack->len >= 1);
-    GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
-    GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-    for ( size_t i = 0; i < prev_tag_array->len; ++i) {
-        tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
-        tag_entry new_entry = tag_entry_clone(entry);
-        g_array_append_val(next_tag_array, new_entry);
-    }
-    g_ptr_array_add(g_tagutils_stack, next_tag_array);
-    return true;
-}
-
-bool taglib_pop_state(){
-    assert(g_tagutils_stack->len > 1);
-    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-    g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-    taglib_free_tag_array(tag_array);
-    return true;
-}
-
-bool taglib_fini(){
-    for ( size_t i = 0; i < g_tagutils_stack->len; ++i){
-        GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i);
-        taglib_free_tag_array(tag_array);
-    }
-    g_ptr_array_free(g_tagutils_stack, TRUE);
-    g_tagutils_stack = NULL;
-    return true;
-}
-
-static phrase_token_t taglib_special_string_to_token(const char * string){
-    struct token_pair{
-        phrase_token_t token;
-        const char * string;
-    };
-
-    static const token_pair tokens [] = {
-        {sentence_start, "<start>"},
-        {0, NULL}
-    };
-
-    const token_pair * pair = tokens;
-    while (pair->string) {
-        if ( strcmp(string, pair->string ) == 0 ){
-            return pair->token;
-        }
-    }
-
-    fprintf(stderr, "error: unknown token:%s.\n", string);
-    return 0;
-}
-
-phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
-    phrase_token_t token = 0;
-    if ( string[0] == '<' ) {
-        return taglib_special_string_to_token(string);
-    }
-
-    glong phrase_len = g_utf8_strlen(string, -1);
-    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
-    int result = phrases->search(phrase_len, phrase, token);
-    if ( !(result & SEARCH_OK) )
-        fprintf(stderr, "error: unknown token:%s.\n", string);
-
-    g_free(phrase);
-    return token;
-}
-
-static const char * taglib_special_token_to_string(phrase_token_t token){
-    struct token_pair{
-        phrase_token_t token;
-        const char * string;
-    };
-
-    static const token_pair tokens [] = {
-        {sentence_start, "<start>"},
-        {0, NULL}
-    };
-
-    const token_pair * pair = tokens;
-    while (pair->token) {
-        if ( token == pair->token )
-            return pair->string;
-    }
-
-    fprintf(stderr, "error: unknown token:%d.\n", token);
-    return NULL;
-}
-
-char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
-                              phrase_token_t token) {
-    PhraseItem item;
-    utf16_t buffer[MAX_PHRASE_LENGTH];
-
-    gchar * phrase;
-    /* deal with the special phrase index, for "<start>..." */
-    if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
-        return g_strdup(taglib_special_token_to_string(token));
-    }
-
-    int result = phrase_index->get_phrase_item(token, item);
-    if (result != ERROR_OK) {
-        fprintf(stderr, "error: unknown token:%d.\n", token);
-        return NULL;
-    }
-
-    item.get_phrase_string(buffer);
-    guint8 length = item.get_phrase_length();
-    phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
-    return phrase;
-}
diff --git a/utils/storage/tag_utility.h b/utils/storage/tag_utility.h

deleted file mode 100644 (file)

index 67d8946..0000000
--- a/utils/storage/tag_utility.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* 
- *  libpinyin
- *  Library to deal with pinyin.
- *  
- *  Copyright (C) 2010 Peng Wu
- *  
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- * 
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *  GNU General Public License for more details.
- *  
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef TAG_UTILITY_H
-#define TAG_UTILITY_H
-
-#include "novel_types.h"
-
-/* Note: the optional tag has been removed from the first implementation.
- * Maybe the optional tag will be added back later.
- */
-
-bool taglib_init();
-
-/* Note: most tags are separated by ',' or ':' . */
-bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags);
-
-/* most parameters are hash table of string (const char *). */
-bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required);
-
-/* Note: taglib_write is omited, as printf is more suitable for this. */
-
-/* Note the following function is only available when the optional tag exists.
- * bool taglib_report_status(int line_type);
- */
-
-/* remove the tag of type line_type. */
-bool taglib_remove_tag(int line_type);
-
-/* the following functions are used to save current known tag list in stack.
- * Used when the parsing context is changed.
- */
-bool taglib_push_state();
-bool taglib_pop_state();
-
-bool taglib_fini();
-
-namespace pinyin{
-    class PhraseLargeTable;
-};
-
-using namespace pinyin;
-
-phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
-                                      const char * string);
-
-char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
-                              phrase_token_t token);
-
-#endif
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am

index 7411e787aaf6a93d8f30979da25555547f24946e..0915479cdb99b8c4ff4a403ffeacc27595c705e4 100644 (file)
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -20,8 +20,7 @@ MAINTAINERCLEANFILES    = Makefile.in
  INCLUDES               = -I$(top_srcdir)/src \
                           -I$(top_srcdir)/src/include \
                           -I$(top_srcdir)/src/storage \
-              -I$(top_srcdir)/src/lookup \
-                         -I$(top_srcdir)/utils/storage \
+                         -I$(top_srcdir)/src/lookup \
                           @GLIB2_CPPFLAGS@
  
  noinst_HEADERS         = k_mixture_model.h
@@ -72,12 +71,12 @@ prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
  
  import_k_mixture_model_SOURCES = import_k_mixture_model.cpp
  
-import_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
+import_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
  
  export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
  
-export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
+export_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
  
  k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
  
-k_mixture_model_to_interpolation_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
-\ No newline at end of file
+k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
+\ No newline at end of file
author	Peng Wu <alexepico@gmail.com>
	Fri, 20 May 2011 03:02:55 +0000 (11:02 +0800)
committer	Peng Wu <alexepico@gmail.com>
	Fri, 20 May 2011 03:02:55 +0000 (11:02 +0800)
src/pinyin.h		patch \| blob \| history
src/storage/Makefile.am		patch \| blob \| history
src/storage/tag_utility.cpp	[new file with mode: 0644]	patch \| blob
src/storage/tag_utility.h	[new file with mode: 0644]	patch \| blob
utils/storage/Makefile.am		patch \| blob \| history
utils/storage/tag_utility.cpp	[deleted file]	patch \| blob \| history
utils/storage/tag_utility.h	[deleted file]	patch \| blob \| history
utils/training/Makefile.am		patch \| blob \| history