2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
54 #include "../sim_dict.h"
55 #include "../sim_sen.h"
// Long-option table passed to getopt_long(). Per the getopt_long(3) layout
// each row is { name, has_arg, flag, val }: has_arg == 1 means the option
// requires an argument, flag == 0 makes getopt_long return val, and val is the
// matching short-option letter.
57 static struct option long_options[] =
59 { "dict", 1, 0, 'd' },
60 { "format", 1, 0, 'f' },
61 { "show-id", 0, 0, 'i' },
// NOTE(review): this long name is "s-tok", while the usage text advertises
// "--stok"; the two should be reconciled.
62 { "s-tok", 1, 0, 's' },
// The misspelling "ambiguious" is part of the user-visible option name, so
// renaming it would break existing command lines.
63 { "ambiguious-id", 1, 0, 'a' },
// Path of the dictionary file. getParameters() stores a strdup() copy of an
// option argument here and main() passes it to s_dict->parseText().
67 static char* s_strDictFile = NULL;
// True only when the format option argument compares equal to "text".
68 static bool s_bTextOut = false;
// When set, output() prints "(id)" after a word whose id is not
// SIM_ID_NOT_WORD.
69 static bool s_bShowId = false;
// Sentence-token id (default 10). Replaced by atoi() of an option argument in
// getParameters(), emitted by output_stok(), and used as the initial previous
// id in processSingleFile().
70 static TSIMWordId s_iSTOKID = 10;
// Id used to mark ambiguous spans. The default 0 tests false in output(), so
// no <ambi> markup is produced unless a non-zero id is supplied.
71 static TSIMWordId s_iAmbiID = 0;
// Dictionary used for longest-match lookups; allocated in main().
73 static CSIMDict *s_dict = NULL;
// Prints the command-line help text to stderr: the synopsis, one paragraph per
// option (-f, -s, -i, -a) and two notes about how unknown words are emitted.
// All strings below are user-facing output and are kept verbatim, including
// the spelling "Ambiguious".
78 fprintf(stderr, "\nUsage:\n");
81 "mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n");
82 fprintf(stderr, " -f --format:\n");
84 " Output Format, can be 'text' or 'bin'. default 'bin'\n");
87 " Normally, in text mode, word text are output, while in binary mode,\n");
89 " binary short integer of the word-ids are written to stdout.\n");
// NOTE(review): the help shows "--stok", but the long-option table registers
// the name "s-tok"; a user typing --stok as documented would not match it.
90 fprintf(stderr, " -s --stok:\n");
91 fprintf(stderr, " Sentence token id. Default 10.\n");
94 " It will be written to output in binary mode after every sentence.\n");
95 fprintf(stderr, " -i --show-id:\n");
98 " Show Id info. Under text output format mode, attach id after known.\n");
99 fprintf(stderr, " words. If under binary mode, print id(s) in text.\n");
100 fprintf(stderr, " -a --ambiguious-id:\n");
103 " Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n");
106 " The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n");
109 " is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n");
110 fprintf(stderr, " is 0.\n");
111 fprintf(stderr, "\n");
112 fprintf(stderr, "Notes:\n");
114 " Under binary mode, consecutive id of 0 are merged into one 0.\n");
117 " Under text mode, no space are inserted between unknown-words. \n");
118 fprintf(stderr, "\n");
119 fprintf(stderr, "\n");
// Parses the command line with getopt_long() and stores the results in the
// file-scope option variables.
//   argc, argv: the program arguments, forwarded unchanged to getopt_long().
124 getParameters(int argc, char* argv[])
// Short-option string: d, f, s and a take an argument (trailing ':'), i does
// not. long_options supplies the long spellings.
128 getopt_long(argc, argv, "d:if:s:a:", long_options,
// strdup() makes an owned copy of the argument; no matching free() is visible
// in this excerpt.
132 s_strDictFile = strdup(optarg);
// Any value other than exactly "text" leaves binary output selected.
138 s_bTextOut = (strcmp(optarg, "text") == 0);
// NOTE(review): atoi() cannot report malformed or out-of-range input; a
// non-numeric argument silently becomes 0. strtol() with end-pointer checking
// would allow validation.
141 s_iSTOKID = atoi(optarg);
144 s_iAmbiID = atoi(optarg);
// A dictionary file is mandatory: this tests whether one was supplied (the
// handling that follows is not shown in this excerpt).
151 if (s_strDictFile == NULL)
// Emits the sentence-token id s_iSTOKID to stdout, either as decimal text or
// as a raw TSIMWordId value (the condition selecting between the two forms is
// not shown in this excerpt).
//   nWords: running word counter, taken by reference.
156 output_stok(int& nWords)
// NOTE(review): "%d" is paired with an unsigned argument; "%u" would match the
// argument type.
161 printf("%d", unsigned(s_iSTOKID));
// Binary form: writes the in-memory bytes of the id, so the output depends on
// sizeof(TSIMWordId) and the host byte order.
163 fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout);
// Emits one segmented unit of `len` characters starting at `p`, either as text
// (optionally wrapped in <ambi>...</ambi> and followed by "(id)") or as an id
// written in decimal or raw binary form.
// Scratch buffers are static and fixed at 1024 elements.
// NOTE(review): no check of `len` against the buffer size is visible in this
// excerpt - confirm longer units cannot reach this code.
175 static char mbword[1024];
176 static TWCHAR wcword[1024];
// bRealGap is false only when both the current and the previous unit are
// unknown (SIM_ID_NOT_WORD); the checks below use it to decide whether a
// boundary between the two units is emitted.
178 bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
// Walks the `len` characters of the unit (loop body not shown here).
180 for (int i = 0; i < len; ++i, ++p)
// Converts the wide-character copy to multibyte text, bounded by the size of
// mbword.
183 WCSTOMBS(mbword, wcword, sizeof(mbword));
// Prints the id of a preceding unknown unit once that run has ended.
// NOTE(review): "%d" receives an unsigned value here and below; "%u" would
// match.
184 if (bRealGap && idprev == SIM_ID_NOT_WORD)
185 printf("(%d)", unsigned(idprev));
// A separator is only considered after the first word (nWords > 0).
186 if (bRealGap && (nWords > 0))
// Ambiguous spans are tagged only when an ambiguity id is configured
// (non-zero) and this unit carries that id.
188 (s_iAmbiID && idcur == s_iAmbiID) ? printf("<ambi>%s</ambi>", mbword) :
189 printf("%s", mbword);
190 if (s_bShowId && idcur != SIM_ID_NOT_WORD)
191 printf("(%d)", unsigned(idcur));
// Non-text output: the id as decimal text, or as raw TSIMWordId bytes.
197 printf("%d", unsigned(idcur));
199 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout);
207 * Return the maximum overlapping (crossing) ambiguity length. For example, ABCDEF if ABC, CD and DEF are words.
208 * If the returned len > word_len, then an ambiguity exists in the span [p, p+len)...
// Scans the interior of a matched word for dictionary words that start inside
// it and run past its end.
//   p:        start of the matched word within the sentence.
//   word_len: length of the word matched at p.
211 getAmbiLen(const TWCHAR* p, int word_len)
213 const CSIMDict::TState* pstate;
// Try every start offset strictly inside the word, stopping at the end of the
// string.
215 for (int i = 1; i < word_len && *(p + i) != WCH_NULL; ++i) {
// Longest dictionary match beginning at p + i, searched from the root state.
216 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p + i);
// True when that inner match reaches beyond the current span (the action taken
// is not shown in this excerpt).
217 if (word_len < i + len)
// Segments the text read from `fp` sentence by sentence using longest-match
// dictionary lookup, and passes each unit to output().
//   fp:     input stream to read from.
//   nWords: running count of emitted words, updated through the reference.
//   nAmbis: running count of ambiguous spans, updated through the reference.
225 processSingleFile(FILE* fp, int &nWords, int &nAmbis)
// NOTE(review): the reader is heap-allocated; no matching delete is visible in
// this excerpt - confirm it is released.
231 CSIMCharReader *pReader = new CSIMCharReader(fp);
232 CSIMCharReader::iterator iter = pReader->begin();
// The previous id starts as the sentence token, i.e. as if a sentence boundary
// had just been seen.
233 TSIMWordId idcur, idprev = s_iSTOKID;
// ReadSentence() returning false is tested here (what follows is not shown).
239 if (ReadSentence(sntnc, iter, false) == false)
// Walk the sentence; the loop header has no increment, so p is advanced inside
// the body.
242 for (const TWCHAR *p = sntnc.c_str(); (*p); ) {
243 const CSIMDict::TState* pstate;
// Longest dictionary word starting at p.
244 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
// idcur is set either to the unknown-word marker or to the id stored in the
// matched state (the selecting condition is not shown).
246 idcur = SIM_ID_NOT_WORD;
249 idcur = pstate->word_id;
// Ambiguity detection runs only when an ambiguity id is configured.
// NOTE(review): s_iAmbiID is a word id but is compared against the character
// constant WCH_NULL; comparing against 0 would state the intent directly.
251 if (s_iAmbiID != WCH_NULL) {
252 int ambiLen = getAmbiLen(p, len);
260 output(len, p, idprev, idcur, nWords);
// Program entry point: parses options, loads the dictionary, then segments
// either stdin or each file named on the command line, reporting progress and
// totals on stderr.
277 main(int argc, char *argv[])
// Adopt the environment's locale.
281 setlocale(LC_ALL, "");
282 getParameters(argc, argv);
286 fprintf(stderr, "Loading lexicon..."); fflush(stderr);
287 s_dict = new CSIMDict();
// A dictionary that fails to parse is reported as "fail".
288 if (!s_dict->parseText(s_strDictFile)) {
289 fprintf(stderr, "fail\n"); fflush(stderr);
292 fprintf(stderr, "done"); fflush(stderr);
295 fprintf(stderr, "\nProcessing from stdin..."); fflush(stderr);
296 processSingleFile(stdin, nWords, nAmbis);
297 fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis);
// NOTE(review): the loop starts at index 0, which is only correct if argc/argv
// were shifted past the parsed options in lines not shown here (presumably via
// optind) - otherwise argv[0] is the program name. Confirm.
300 for (int i = 0; i < argc; ++i) {
301 fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
302 FILE *fp = fopen(argv[i], "r");
304 processSingleFile(fp, nWords, nAmbis);
// The "%ld" argument is not visible in this excerpt.
// NOTE(review): no fclose(fp) is visible either - confirm the stream is closed
// after each file.
306 "@Offset %ld, %d words, %d ambiguious. Done!\n",
309 nAmbis); fflush(stderr);
// Reported when fopen() failed for this path.
311 fprintf(stderr, "Can not Open!!!!!!!\n"); fflush(stderr);