utils/segment/ngseg.cpp

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2010 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <locale.h>
  25 #include "pinyin_internal.h"
  26 #include "utils_helper.h"
  27
  28
  29 void print_help(){
  30     printf("Usage: ngseg [--generate-extra-enter]  [-o outputfile] [inputfile]\n");
  31 }
  32
  33
  34 static gboolean gen_extra_enter = FALSE;
  35 static gchar * outputfile = NULL;
  36
  37 static GOptionEntry entries[] =
  38 {
  39     {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
  40     {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
  41     {NULL}
  42 };
  43
  44
  45 /* n-gram based sentence segment. */
  46
  47 /* Note:
  48  * Currently libpinyin supports ucs4 characters.
  49  * This is a pre-processor tool for raw corpus,
  50  * and skips non-Chinese characters.
  51  */
  52
  53 /* TODO:
  54  * Try to add punctuation mark and english support,
  55  * such as ',', '.', '?', '!', <english>, and other punctuations.
  56  */
  57
  58 enum CONTEXT_STATE{
  59     CONTEXT_INIT,
  60     CONTEXT_SEGMENTABLE,
  61     CONTEXT_UNKNOWN
  62 };
  63
  64 bool deal_with_segmentable(PhraseLookup * phrase_lookup,
  65                            GArray * current_ucs4,
  66                            FILE * output){
  67     char * result_string = NULL;
  68     MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
  69     phrase_lookup->get_best_match(current_ucs4->len,
  70                                   (ucs4_t *) current_ucs4->data, results);
  71
  72     phrase_lookup->convert_to_utf8(results, result_string);
  73
  74     if (result_string) {
  75         fprintf(output, "%s\n", result_string);
  76     } else {
  77         char * tmp_string = g_ucs4_to_utf8
  78             ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
  79               NULL, NULL, NULL);
  80         fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
  81                 tmp_string);
  82         g_array_free(results, TRUE);
  83         return false;
  84     }
  85     g_array_free(results, TRUE);
  86     g_free(result_string);
  87     return true;
  88 }
  89
  90 bool deal_with_unknown(GArray * current_ucs4, FILE * output){
  91     char * result_string = g_ucs4_to_utf8
  92         ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
  93           NULL, NULL, NULL);
  94     fprintf(output, "%d %s\n", null_token, result_string);
  95     g_free(result_string);
  96     return true;
  97 }
  98
  99
 100 int main(int argc, char * argv[]){
 101     FILE * input = stdin;
 102     FILE * output = stdout;
 103
 104     setlocale(LC_ALL, "");
 105
 106     GError * error = NULL;
 107     GOptionContext * context;
 108
 109     context = g_option_context_new("- n-gram segment");
 110     g_option_context_add_main_entries(context, entries, NULL);
 111     if (!g_option_context_parse(context, &argc, &argv, &error)) {
 112         g_print("option parsing failed:%s\n", error->message);
 113         exit(EINVAL);
 114     }
 115
 116     if (outputfile) {
 117         output = fopen(outputfile, "w");
 118         if (NULL == output) {
 119             perror("open file failed");
 120             exit(EINVAL);
 121         }
 122     }
 123
 124     if (argc > 2) {
 125         fprintf(stderr, "too many arguments.\n");
 126         exit(EINVAL);
 127     }
 128
 129     if (2 == argc) {
 130         input = fopen(argv[1], "r");
 131         if (NULL == input) {
 132             perror("open file failed");
 133             exit(EINVAL);
 134         }
 135     }
 136
 137     SystemTableInfo system_table_info;
 138
 139     bool retval = system_table_info.load("table.conf");
 140     if (!retval) {
 141         fprintf(stderr, "load table.conf failed.\n");
 142         exit(ENOENT);
 143     }
 144
 145     /* init phrase table */
 146     FacadePhraseTable2 phrase_table;
 147     MemoryChunk * chunk = new MemoryChunk;
 148     chunk->load("phrase_index.bin");
 149     phrase_table.load(chunk, NULL);
 150
 151     /* init phrase index */
 152     FacadePhraseIndex phrase_index;
 153
 154     const pinyin_table_info_t * phrase_files =
 155         system_table_info.get_table_info();
 156
 157     if (!load_phrase_index(phrase_files, &phrase_index))
 158         exit(ENOENT);
 159
 160     /* init bi-gram */
 161     Bigram system_bigram;
 162     system_bigram.attach("bigram.db", ATTACH_READONLY);
 163     Bigram user_bigram;
 164
 165     /* init phrase lookup */
 166     PhraseLookup phrase_lookup(&phrase_table, &phrase_index,
 167                                &system_bigram, &user_bigram);
 168
 169
 170     CONTEXT_STATE state, next_state;
 171     GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
 172
 173     PhraseTokens tokens;
 174     memset(tokens, 0, sizeof(PhraseTokens));
 175     phrase_index.prepare_tokens(tokens);
 176
 177     /* split the sentence */
 178     char * linebuf = NULL; size_t size = 0; ssize_t read;
 179     while( (read = getline(&linebuf, &size, input)) != -1 ){
 180         if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
 181             linebuf[strlen(linebuf) - 1] = '\0';
 182         }
 183
 184         /* check non-ucs4 characters */
 185         const glong num_of_chars = g_utf8_strlen(linebuf, -1);
 186         glong len = 0;
 187         ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
 188         if ( len != num_of_chars ) {
 189             fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
 190             fprintf(output, "%d \n", null_token);
 191             continue;
 192         }
 193
 194         /* only new-line persists. */
 195         if ( 0  == num_of_chars ) {
 196             fprintf(output, "%d \n", null_token);
 197             continue;
 198         }
 199
 200         state = CONTEXT_INIT;
 201         int result = phrase_table.search( 1, sentence, tokens);
 202         g_array_append_val( current_ucs4, sentence[0]);
 203         if ( result & SEARCH_OK )
 204             state = CONTEXT_SEGMENTABLE;
 205         else
 206             state = CONTEXT_UNKNOWN;
 207
 208         for ( int i = 1; i < num_of_chars; ++i) {
 209             int result = phrase_table.search( 1, sentence + i, tokens);
 210             if ( result & SEARCH_OK )
 211                 next_state = CONTEXT_SEGMENTABLE;
 212             else
 213                 next_state = CONTEXT_UNKNOWN;
 214
 215             if ( state == next_state ){
 216                 g_array_append_val(current_ucs4, sentence[i]);
 217                 continue;
 218             }
 219
 220             assert ( state != next_state );
 221             if ( state == CONTEXT_SEGMENTABLE )
 222                 deal_with_segmentable(&phrase_lookup, current_ucs4, output);
 223
 224             if ( state == CONTEXT_UNKNOWN )
 225                 deal_with_unknown(current_ucs4, output);
 226
 227             /* save the current character */
 228             g_array_set_size(current_ucs4, 0);
 229             g_array_append_val(current_ucs4, sentence[i]);
 230             state = next_state;
 231         }
 232
 233         if ( current_ucs4->len ) {
 234             /* this seems always true. */
 235             if ( state == CONTEXT_SEGMENTABLE )
 236                 deal_with_segmentable(&phrase_lookup, current_ucs4, output);
 237
 238             if ( state == CONTEXT_UNKNOWN )
 239                 deal_with_unknown(current_ucs4, output);
 240             g_array_set_size(current_ucs4, 0);
 241         }
 242
 243         /* print extra enter */
 244         if ( gen_extra_enter )
 245             fprintf(output, "%d \n", null_token);
 246     }
 247     phrase_index.destroy_tokens(tokens);
 248
 249     /* print enter at file tail */
 250     fprintf(output, "%d \n", null_token);
 251     g_array_free(current_ucs4, TRUE);
 252     free(linebuf);
 253     fclose(input);
 254     fclose(output);
 255     return 0;
 256 }