3 * Library to deal with pinyin.
5 * Copyright (C) 2010 Peng Wu
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include "pinyin_internal.h"
26 #include "utils_helper.h"
/* Write the command-line synopsis to stdout: an optional
 * --generate-extra-enter flag, an optional -o output file and an
 * optional input file. */
30 printf("Usage: ngseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
/* Flag bound to --generate-extra-enter; FALSE unless the option is given.
 * main() emits one extra null_token line per input line when it is set. */
34 static gboolean gen_extra_enter = FALSE;
/* Output path bound to -o/--outputfile; initially NULL. */
35 static gchar * outputfile = NULL;
/* GOption table handed to g_option_context_add_main_entries() in main(). */
37 static GOptionEntry entries[] =
/* -o / --outputfile <filename>: stored into outputfile. */
39 {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
/* --generate-extra-enter: no short name, takes no argument, sets gen_extra_enter.
 * NOTE(review): the help text "generate " looks truncated -- confirm. */
40 {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
45 /* n-gram based sentence segmentation. */
48 * Currently libpinyin supports ucs4 characters.
49 * This is a pre-processor tool for raw corpus,
50 * and skips non-Chinese characters.
54 * Try to add support for punctuation marks and English,
55 * such as ',', '.', '?', '!', <English>, and other punctuation.
/*
 * Segment the run of characters held in current_ucs4 with phrase_lookup
 * and print the resulting UTF-8 string as one line on output.  A
 * diagnostic carrying the raw text is written to stderr for sentences
 * that could not be segmented.
 *
 * Callers in main() pass the output stream as the third argument.
 * NOTE(review): the third parameter declaration, the branch structure
 * and the return statements are not visible in this excerpt.
 */
64 bool deal_with_segmentable(PhraseLookup * phrase_lookup,
65 GArray * current_ucs4,
/* Filled in by convert_to_utf8() below and released with g_free(). */
67 char * result_string = NULL;
/* Array of phrase_token_t that receives the best match. */
68 MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
/* Look up the best match over the current_ucs4->len collected characters. */
69 phrase_lookup->get_best_match(current_ucs4->len,
70 (ucs4_t *) current_ucs4->data, results);
/* Render the matched tokens into result_string. */
72 phrase_lookup->convert_to_utf8(results, result_string);
/* Normal path: emit the segmented sentence. */
75 fprintf(output, "%s\n", result_string);
/* Diagnostic path: convert the raw input back to UTF-8 for the message.
 * NOTE(review): no g_free(tmp_string) is visible here -- confirm that
 * the converted string is released. */
77 char * tmp_string = g_ucs4_to_utf8
78 ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
80 fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
/* results is freed at two places; presumably this first one belongs to
 * the diagnostic path and is followed by an early return -- TODO confirm. */
82 g_array_free(results, TRUE);
/* Release the match array and the rendered string. */
85 g_array_free(results, TRUE);
86 g_free(result_string);
/*
 * Print a run of characters for which main() found no phrase-table match.
 * The run is converted to UTF-8 and written to output as a single line of
 * the form "<null_token> <text>".
 *
 * NOTE(review): the remaining g_ucs4_to_utf8 arguments and the return
 * statement are not visible in this excerpt.
 */
90 bool deal_with_unknown(GArray * current_ucs4, FILE * output){
/* UTF-8 copy of the collected characters; freed below. */
91 char * result_string = g_ucs4_to_utf8
92 ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
/* null_token is printed as a decimal number in front of the text. */
94 fprintf(output, "%d %s\n", null_token, result_string);
95 g_free(result_string);
/*
 * Entry point of the segmenter.  It parses the command line, loads the
 * phrase table, phrase index and bigram data, then reads the input line
 * by line.  Each line is split into runs of characters that are found in
 * the phrase table (segmentable) and runs that are not (unknown), and
 * each run is written to the output.
 *
 * NOTE(review): several lines of this function (conditions, braces,
 * continue/exit statements, some declarations) are not visible in this
 * excerpt; comments below describe only what the visible lines show.
 */
100 int main(int argc, char * argv[]){
/* Default to stdin/stdout; both may be replaced by fopen() below. */
101 FILE * input = stdin;
102 FILE * output = stdout;
/* Adopt the locale from the environment. */
104 setlocale(LC_ALL, "");
106 GError * error = NULL;
107 GOptionContext * context;
/* Parse the options declared in the entries table. */
109 context = g_option_context_new("- n-gram segment");
110 g_option_context_add_main_entries(context, entries, NULL);
111 if (!g_option_context_parse(context, &argc, &argv, &error)) {
112 g_print("option parsing failed:%s\n", error->message);
/* Open the requested output file for writing. */
117 output = fopen(outputfile, "w");
118 if (NULL == output) {
119 perror("open file failed");
125 fprintf(stderr, "too many arguments.\n");
/* The first remaining positional argument is the input file. */
130 input = fopen(argv[1], "r");
132 perror("open file failed");
137 SystemTableInfo system_table_info;
/* Table metadata is read from table.conf. */
139 bool retval = system_table_info.load("table.conf");
141 fprintf(stderr, "load table.conf failed.\n");
145 /* init phrase table */
146 FacadePhraseTable2 phrase_table;
147 MemoryChunk * chunk = new MemoryChunk;
/* NOTE(review): any result of load() is discarded on this line --
 * confirm whether a missing phrase_index.bin should be reported. */
148 chunk->load("phrase_index.bin");
149 phrase_table.load(chunk, NULL);
151 /* init phrase index */
152 FacadePhraseIndex phrase_index;
154 const pinyin_table_info_t * phrase_files =
155 system_table_info.get_table_info();
157 if (!load_phrase_index(phrase_files, &phrase_index))
161 Bigram system_bigram;
/* The system bigram is attached read-only.  NOTE(review): any result of
 * attach() is discarded on this line. */
162 system_bigram.attach("bigram.db", ATTACH_READONLY);
165 /* init phrase lookup */
/* user_bigram is declared on a line not visible in this excerpt. */
166 PhraseLookup phrase_lookup(&phrase_table, &phrase_index,
167 &system_bigram, &user_bigram);
/* state describes the run being collected; next_state the current character. */
170 CONTEXT_STATE state, next_state;
/* Buffer holding the characters of the current run. */
171 GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
/* tokens is declared on a line not visible in this excerpt; it is
 * zeroed and then prepared by the phrase index for use with search(). */
174 memset(tokens, 0, sizeof(PhraseTokens));
175 phrase_index.prepare_tokens(tokens);
177 /* split the sentence */
178 char * linebuf = NULL; size_t size = 0; ssize_t read;
179 while( (read = getline(&linebuf, &size, input)) != -1 ){
/* Strip the trailing newline, if any.
 * NOTE(review): strlen() stops at an embedded NUL, so a line starting
 * with a NUL byte would index linebuf[-1] -- confirm inputs cannot
 * contain NUL bytes. */
180 if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
181 linebuf[strlen(linebuf) - 1] = '\0';
184 /* check non-ucs4 characters */
185 const glong num_of_chars = g_utf8_strlen(linebuf, -1);
/* len (declared on a line not visible here) is passed as the
 * items_written out-parameter of the conversion.
 * NOTE(review): no release of sentence is visible in this excerpt --
 * confirm it is freed on every path. */
187 ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
/* A count mismatch is reported on stderr and a bare null_token line is
 * written in place of the sentence. */
188 if ( len != num_of_chars ) {
189 fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
190 fprintf(output, "%d \n", null_token);
194 /* the line contained nothing but the newline. */
195 if ( 0 == num_of_chars ) {
196 fprintf(output, "%d \n", null_token);
/* Classify the first character and start the first run with it. */
200 state = CONTEXT_INIT;
201 int result = phrase_table.search( 1, sentence, tokens);
202 g_array_append_val( current_ucs4, sentence[0]);
203 if ( result & SEARCH_OK )
204 state = CONTEXT_SEGMENTABLE;
206 state = CONTEXT_UNKNOWN;
/* Classify each following character by a single-character table search. */
208 for ( int i = 1; i < num_of_chars; ++i) {
209 int result = phrase_table.search( 1, sentence + i, tokens);
210 if ( result & SEARCH_OK )
211 next_state = CONTEXT_SEGMENTABLE;
213 next_state = CONTEXT_UNKNOWN;
/* Same class as the current run: extend the run. */
215 if ( state == next_state ){
216 g_array_append_val(current_ucs4, sentence[i]);
/* The class changed: flush the finished run.  The assert implies the
 * equal case has already left this iteration (presumably via a continue
 * on a line not visible here). */
220 assert ( state != next_state );
221 if ( state == CONTEXT_SEGMENTABLE )
222 deal_with_segmentable(&phrase_lookup, current_ucs4, output);
224 if ( state == CONTEXT_UNKNOWN )
225 deal_with_unknown(current_ucs4, output);
227 /* save the current character */
228 g_array_set_size(current_ucs4, 0);
229 g_array_append_val(current_ucs4, sentence[i]);
/* Flush the last run of the line. */
233 if ( current_ucs4->len ) {
234 /* this seems to be always true. */
235 if ( state == CONTEXT_SEGMENTABLE )
236 deal_with_segmentable(&phrase_lookup, current_ucs4, output);
238 if ( state == CONTEXT_UNKNOWN )
239 deal_with_unknown(current_ucs4, output);
240 g_array_set_size(current_ucs4, 0);
243 /* print extra enter */
244 if ( gen_extra_enter )
245 fprintf(output, "%d \n", null_token);
/* Release the token arrays prepared before the loop. */
247 phrase_index.destroy_tokens(tokens);
249 /* print enter at file tail */
250 fprintf(output, "%d \n", null_token);
/* NOTE(review): no release of linebuf (allocated by getline) and no
 * fclose() of input/output is visible in this excerpt -- confirm. */
251 g_array_free(current_ucs4, TRUE);