2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
58 #include "../sim_dict.h"
59 #include "../sim_sen.h"
// Long-option table passed to getopt_long() in getParameters(). Each visible
// entry is { long name, has-argument flag, flag pointer, short option char }.
// NOTE(review): the sentence-token option is registered here under the long
// name s-tok, while the usage text advertises --stok; one of them should change.
// NOTE(review): the opening brace, any all-zero terminator entry and the closing
// brace are not visible in this excerpt -- confirm they exist.
62 static struct option long_options[] =
64 { "dict", 1, 0, 'd' },
65 { "format", 1, 0, 'f' },
66 { "show-id", 0, 0, 'i' },
67 { "s-tok", 1, 0, 's' },
68 { "model", 1, 0, 'm' },
// File-scope settings filled in by getParameters() and read by the rest of the tool.
// Dictionary and language-model file names: strdup() copies of option arguments,
// later handed to parseText() and load() in main(). Both start out as NULL.
72 static char* s_strDictFile = NULL;
73 static char* s_strSlmFile = NULL;
// Set to true only when the format argument compares equal to text; default is binary output.
74 static bool s_bTextOut = false;
// Checked by the word output code before printing a known word id in parentheses.
// NOTE(review): the statement that sets this flag is not visible in this excerpt.
75 static bool s_bShowId = false;
// Sentence token id, default 10; overwritten with atoi() of an option argument.
76 static TSIMWordId s_iSTOKID = 10;
// Lexicon and language model objects; both are allocated in main().
78 static CSIMDict *s_dict = NULL;
79 static CThreadSlm *s_tslm = NULL;
// Help text, written piece by piece to stderr.
// NOTE(review): the enclosing function header and several of the fprintf(stderr,
// openers that precede the bare string lines are not visible in this excerpt.
// NOTE(review): this text documents the long option as --stok, but the
// long_options table registers it as s-tok; align the two.
// The stated default output format (bin) agrees with s_bTextOut starting as false.
84 fprintf(stderr, "\nUsage:\n");
87 "slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n");
88 fprintf(stderr, " -f --format:\n");
90 " Output Format, can be 'text' or 'bin'. default 'bin'\n");
93 " Normally, in text mode, word text are output, while in binary mode,\n");
95 " binary short integer of the word-ids are writed to stdout.\n");
96 fprintf(stderr, " -s --stok:\n");
97 fprintf(stderr, " Sentence token id. Default 10.\n");
100 " It will be write to output in binary mode after every sentence.\n");
101 fprintf(stderr, " -i --show-id:\n");
104 " Show Id info. Under text output format mode, Attach id after known-words.\n");
105 fprintf(stderr, " Under binary mode, print id in text.\n");
106 fprintf(stderr, " -m --model:\n");
107 fprintf(stderr, " Language model file name");
108 fprintf(stderr, "\n");
109 fprintf(stderr, "Notes:\n");
111 " Under binary mode, consecutive id of 0 are merged into one 0.\n");
113 " Under text mode, no space are insert between unknown-words. \n");
114 fprintf(stderr, "\n");
115 fprintf(stderr, "\n");
// Parses the command line with getopt_long(), using the short-option string
// d:if:s:m: (d, f, s and m take an argument, i does not) together with the
// long_options table, and stores the results in the file-scope s_* variables.
// NOTE(review): the return type, the option loop, the case labels and the error
// handling are not visible in this excerpt; the notes below describe each
// visible statement on its own.
120 getParameters(int argc, char* argv[])
124 getopt_long(argc, argv, "d:if:s:m:", long_options,
// Keep a private heap copy of the argument as the dictionary file name.
128 s_strDictFile = strdup(optarg);
// Text output is chosen only for the exact argument text; any other value means binary.
134 s_bTextOut = (strcmp(optarg, "text") == 0);
// atoi() reports no errors: an argument with no leading number yields 0.
137 s_iSTOKID = atoi(optarg);
// Keep a private heap copy of the argument as the language model file name.
140 s_strSlmFile = strdup(optarg);
// The dictionary file name is tested for absence after parsing.
// NOTE(review): the action taken when it is absent is not visible here.
147 if (s_strDictFile == NULL)
// Writes the sentence token id s_iSTOKID to stdout, either as decimal text
// (printf) or as the raw in-memory bytes of one TSIMWordId (fwrite).
// NOTE(review): the conditions selecting between the two forms, the return type
// and what happens to nWords (taken by reference) are not visible in this excerpt.
152 output_stok(int& nWords)
157 printf("%d", unsigned(s_iSTOKID));
159 fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout);
// Body of the per-word output routine.
// NOTE(review): the function header is not visible in this excerpt. The call
// output(len, p, idprev, idcur, nWords) in processSingleFile() uses the same
// names as the code below, so this is presumably that function -- confirm.
// Function-local static scratch buffers: the word as wide chars and its multibyte form.
171 static char mbword[1024];
172 static TWCHAR wcword[1024];
// The gap before this word counts as real unless both the previous and the
// current id are SIM_ID_NOT_WORD.
174 bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
// NOTE(review): the loop body is not visible; presumably it copies len chars from
// p into wcword. Confirm that len is kept below the 1024-element buffer size.
176 for (int i = 0; i < len; ++i, ++p)
// Convert the wide word to multibyte form, bounded by the size of mbword.
179 WCSTOMBS(mbword, wcword, sizeof(mbword));
// When a real gap follows a not-a-word id, print that previous id in parentheses.
// NOTE(review): unlike the idcur case below, no s_bShowId test is visible on this
// condition -- confirm whether that is intended or handled by an enclosing test.
180 if (bRealGap && idprev == SIM_ID_NOT_WORD)
181 printf("(%d)", unsigned(idprev));
// NOTE(review): the statement controlled by this condition is not visible.
182 if (bRealGap && (nWords > 0))
184 printf("%s", mbword);
// With show-id enabled, known words are followed by their id in parentheses.
185 if (s_bShowId && idcur != SIM_ID_NOT_WORD)
186 printf("(%d)", unsigned(idcur));
// Two alternative forms for emitting the id itself: decimal text or raw bytes.
// NOTE(review): the conditions choosing between them are not visible.
192 printf("%d", unsigned(idcur));
194 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout);
// One candidate word placed in the lattice.
// The constructor copies left, right and wid into m_left, m_right and m_wordId;
// all three default to 0. Elsewhere in this file words are created as
// (start, start + length, word id), insertLatticeWord() files a word under
// column m_left, and searchBest() uses m_right as the index of the column the
// word leads to.
// NOTE(review): the member declarations are not visible in this excerpt.
201 struct TLatticeWord {
206 TLatticeWord(int left = 0, int right = 0, int wid = 0)
207 : m_left(left), m_right(right), m_wordId(wid)
// Sequence of lattice words; used for per-column word lists and for the segmentation result.
212 typedef std::vector<TLatticeWord> TLatticeWordVec;
// Value stored for one language-model state in one lattice column.
// searchBest() fills it with the accumulated score (m_pr), a pointer to the word
// that led here (mp_btword) and the state that word started from (m_btstate);
// getBestPath() follows mp_btword and m_btstate backwards through the lattice.
// NOTE(review): the declaration of m_pr is not visible; the constructor
// initialises it from a double.
214 struct TLatticeStateValue {
// Back-pointer to the word on the best path into this state; NULL by default.
216 TLatticeWord* mp_btword;
// State in column mp_btword->m_left that this entry was reached from.
217 CThreadSlm::TState m_btstate;
219 TLatticeStateValue(double pr = 0.0,
220 TLatticeWord* btword = NULL,
221 CThreadSlm::TState btstate = CThreadSlm::TState())
222 : m_pr(pr), mp_btword(btword), m_btstate(btstate)
// All states of one column, keyed by language-model state (ordered map).
227 typedef std::map<CThreadSlm::TState, TLatticeStateValue> TLatticeColumnStates;
// One position in the lattice.
229 struct TLatticeColumn {
// Words whose m_left equals the index of this column (see insertLatticeWord()).
230 TLatticeWordVec m_wordstarting;
// Language-model states reached at this column, with their best score and back-pointer.
231 TLatticeColumnStates m_states;
// The whole lattice: a vector of columns indexed by position; buildLattice()
// sizes it to the sentence length plus 2.
234 typedef std::vector<TLatticeColumn> CLattice;
// Appends word (taken by value) to the m_wordstarting list of the column at
// index word.m_left. The index goes through vector operator[], so it is not
// range-checked; callers must have sized the lattice beforehand.
// NOTE(review): the return type is not visible in this excerpt.
237 insertLatticeWord(CLattice& lattice, TLatticeWord word)
239 lattice[word.m_left].m_wordstarting.push_back(word);
// Examines the word_len characters starting at p for overlapping dictionary
// matches: for each offset i from 1 to word_len - 1 (stopping early at WCH_NULL)
// it runs a longest match from the dictionary root at p + i and tests whether
// that match reaches past the current word_len (word_len < i + len).
// NOTE(review): the return type, the statement executed when the test succeeds
// and the returned value are not visible in this excerpt. buildLattice()
// compares the result with the original match length using ambilen <= len.
243 getAmbiLen(const TWCHAR* p, int word_len)
// Receives the dictionary state of each match; its value is not used in the visible lines.
245 const CSIMDict::TState* pstate;
247 for (int i = 1; (i < word_len) && *(p + i) != WCH_NULL; ++i) {
248 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p + i);
249 if (word_len < i + len)
// Adds candidate words for the span of sntnc that starts at left and is len
// characters long: for every start position in the span, the dictionary is
// walked one character at a time from its root, and a word is inserted for each
// prefix whose dictionary state carries a real word id.
// NOTE(review): the return type and several statements are not visible in this
// excerpt (the action when step() returns NULL, the word id argument of the
// first insert, and the condition guarding the single-character insert).
257 fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice)
// right is fixed once from the incoming left and len; left then advances to it.
259 for (int right = left + len; left < right; ++left) {
262 const TWCHAR* p = sntnc.c_str() + left;
263 const CSIMDict::TState* pds = s_dict->getRoot();
// The parameter len is reused here as the running prefix length minus one.
264 for (len = 0; left + len < right; ++len) {
// A NULL result from step() is tested here; presumably it ends the walk -- confirm.
265 if ((pds = s_dict->step(pds, *p++)) == NULL)
267 if (pds->word_id != SIM_ID_NOT_WORD) {
// The inserted word covers positions left up to, but not including, left + len + 1.
269 insertLatticeWord(lattice,
270 TLatticeWord(left, left + len + 1,
// Single-character entry with id SIM_ID_NOT_WORD.
// NOTE(review): presumably a fallback so every position has an outgoing word -- confirm.
275 insertLatticeWord(lattice,
276 TLatticeWord(left, left + 1, SIM_ID_NOT_WORD));
281 * The lattice head (column 0) must hold exactly one state, keyed by the slm's
282 * root TState, with pr == 0 and mp_btword == NULL.
283 * The lattice tail must contain no word, and the column before it must contain
284 * exactly one word, with right == left + 1 and right == tail.
285 * The lattice must guarantee that a path from head to tail exists.
// Fills lattice with candidate words for the sentence sntnc.
// Visible steps: size the lattice, seed column 0 with a state keyed by a
// default-constructed TState, scan the sentence with longest dictionary matches,
// and finally add a word starting at column sntnc.size().
// NOTE(review): the return type, the arguments of the seed state, the conditions
// around the two idcur assignments and the update of i are not visible here.
288 buildLattice(wstring &sntnc, CLattice& lattice)
// Two columns more than there are characters in the sentence.
291 lattice.resize(sntnc.size() + 2);
293 unsigned int idcur = SIM_ID_NOT_WORD;
294 lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(
// The loop header has no increment, so i must be advanced inside the body
// (in lines not visible in this excerpt).
300 for (int i = 0, sz = sntnc.size(); i < sz; ) {
301 const CSIMDict::TState* pstate;
302 const TWCHAR* p = sntnc.c_str() + i;
303 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
// idcur becomes either SIM_ID_NOT_WORD or the word id of the matched state.
305 idcur = SIM_ID_NOT_WORD;
308 idcur = pstate->word_id;
310 int ambilen = getAmbiLen(p, len);
// When getAmbiLen() does not report a longer span, the longest match is added as one word.
312 if (ambilen <= len) {
313 insertLatticeWord(lattice, TLatticeWord(i, i + len, idcur));
// Otherwise (presumably the else branch -- confirm) the span of ambilen
// characters is expanded into candidate words by fullSegBuildLattice().
316 fullSegBuildLattice(sntnc, i, ambilen, lattice);
// Word starting at the column just past the last character.
// NOTE(review): its remaining constructor arguments are not visible.
320 lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(),
// Forward pass over the lattice. Columns are visited in index order; for every
// state in a column and every word starting there, the word is scored against
// the state with the language model, and the resulting state is recorded in the
// column at the word's m_right, keeping the entry with the smaller accumulated
// score. Each entry stores a back-pointer (word and originating state) for
// getBestPath().
// NOTE(review): the return type and the line between the two branches below are
// not visible in this excerpt.
326 searchBest(CLattice& lattice)
328 for (int i = 0, sz = lattice.size(); i < sz; ++i) {
329 TLatticeColumnStates & states = lattice[i].m_states;
330 TLatticeColumnStates::iterator itss = states.begin();
331 TLatticeColumnStates::iterator itse = states.end();
332 for (; itss != itse; ++itss) {
333 TLatticeWordVec::iterator itws = lattice[i].m_wordstarting.begin();
334 TLatticeWordVec::iterator itwe = lattice[i].m_wordstarting.end();
335 for (; itws != itwe; ++itws) {
// Work on a copy of the state key; it is passed as both input and output argument.
336 CThreadSlm::TState his = itss->first;
337 double pr = itss->second.m_pr;
// Scores are added, so pr is presumably a negative log probability (going by the
// callee name) -- smaller is better in the comparison below.
338 pr += s_tslm->transferNegLog(his, itws->m_wordId, his);
339 TLatticeColumnStates & rss = lattice[itws->m_right].m_states;
// NOTE(review): historify() is opaque here; presumably it normalises the state
// so that equivalent histories share one map key -- confirm.
340 s_tslm->historify(his);
341 TLatticeColumnStates::iterator itn = rss.find(his);
// First arrival at this state in the target column: record it.
// The stored pointer refers to an element of m_wordstarting, so that vector
// must not be modified while the back-pointers are in use.
342 if (itn == rss.end()) {
343 rss[his] = TLatticeStateValue(pr, &(*itws), itss->first);
// State already present: replace it only when the new score is smaller.
// NOTE(review): this dereferences itn, so it must sit in an else branch of the
// test above -- the connecting line is not visible.
345 if (itn->second.m_pr > pr) {
346 rss[his] = TLatticeStateValue(pr, &(*itws), itss->first);
// Backtracks from the last lattice column to collect the chosen words into
// segResult, in left-to-right order.
// It starts from the first state of the last column, then repeatedly follows the
// stored back-pointer word and back state to the column at that word's m_left.
// NOTE(review): no check is visible that the last column has a state or that its
// mp_btword is non-NULL before the first dereference; the return type and the
// loop construct are also not visible in this excerpt.
355 getBestPath(CLattice& lattice, TLatticeWordVec& segResult)
357 TLatticeColumnStates & states = lattice.back().m_states;
358 TLatticeColumnStates::iterator its = states.begin();
// The word leading into the last column is used only to find the previous
// state; no push of it into segResult is visible.
360 TLatticeWord* pbtword = its->second.mp_btword;
361 CThreadSlm::TState btstate = its->second.m_btstate;
362 its = lattice[pbtword->m_left].m_states.find(btstate);
// The back state must exist in the column the word started from.
363 assert(its != lattice[pbtword->m_left].m_states.end());
367 pbtword = its->second.mp_btword;
// A NULL back-pointer is tested for here; the column-0 seed state uses the NULL default.
368 if (pbtword != NULL) {
// Two build variants: append now and reverse once at the end, or (when
// HOST_OS_GNUC_2 is defined) insert at the front each time.
369 #ifndef HOST_OS_GNUC_2
370 segResult.push_back(*pbtword);
371 #else // HOST_OS_GNUC_2
372 segResult.insert(segResult.begin(), *pbtword);
373 #endif // !HOST_OS_GNUC_2
374 btstate = its->second.m_btstate;
375 its = lattice[pbtword->m_left].m_states.find(btstate);
376 assert(its != lattice[pbtword->m_left].m_states.end());
// Words were appended from last to first, so put them into sentence order.
381 #ifndef HOST_OS_GNUC_2
382 std::reverse(segResult.begin(), segResult.end());
383 #endif // HOST_OS_GNUC_2
// Segments the text read from fp sentence by sentence and writes each resulting
// word through output(). nWords and nAmbis are taken by reference; main()
// prints them as the word and ambiguity counts.
// NOTE(review): the return type, the sentence loop, the declarations of sntnc and
// lattice, the step between buildLattice() and getBestPath(), and the updates
// of nWords, nAmbis and idprev are not visible in this excerpt.
387 processSingleFile(FILE* fp, int &nWords, int &nAmbis)
// NOTE(review): the reader is allocated with new; no matching delete is visible -- confirm.
393 CSIMCharReader *pReader = new CSIMCharReader(fp);
394 CSIMCharReader::iterator iter = pReader->begin();
// idprev starts as the sentence token id; idcur is assigned per word below.
395 TSIMWordId idcur, idprev = s_iSTOKID;
// A false result from ReadSentence() is tested here; presumably it ends processing -- confirm.
401 if (ReadSentence(sntnc, iter, false) == false)
405 buildLattice(sntnc, lattice);
408 TLatticeWordVec segResult;
409 getBestPath(lattice, segResult);
// Emit every word of the best path: start pointer, length and word id.
411 for (int i = 0, sz = segResult.size(); i < sz; ++i) {
412 const TWCHAR *p = sntnc.c_str() + segResult[i].m_left;
413 int len = segResult[i].m_right - segResult[i].m_left;
414 idcur = segResult[i].m_wordId;
416 output(len, p, idprev, idcur, nWords);
// Program entry: parses options, loads the lexicon and the language model, then
// segments either stdin or each file named on the command line, reporting
// progress and counts on stderr.
// NOTE(review): the return type, the declarations of nWords and nAmbis, the
// failure exits and the stdin-versus-files decision are not visible here.
431 main(int argc, char *argv[])
// An empty locale name selects the locale from the environment.
435 setlocale(LC_ALL, "");
436 getParameters(argc, argv);
440 fprintf(stderr, "Loading lexicon...");
442 s_dict = new CSIMDict();
443 s_tslm = new CThreadSlm();
444 if (!s_dict->parseText(s_strDictFile)) {
445 fprintf(stderr, "fail to open Lexicon file!\n");
// NOTE(review): s_strSlmFile is still NULL when no model option was given (the
// usage text marks it optional); no guard is visible before this call -- confirm.
449 if (!s_tslm->load(s_strSlmFile, true)) {
450 fprintf(stderr, "fail to open slm file!\n");
454 fprintf(stderr, "done");
458 fprintf(stderr, "\nProcessing from stdin...");
460 processSingleFile(stdin, nWords, nAmbis);
461 fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis);
// The loop starts at index 0, so it relies on argc and argv having been advanced
// past the parsed options beforehand.
// NOTE(review): that adjustment is not visible here; without it argv[0], the
// program name, would be opened as input.
464 for (int i = 0; i < argc; ++i) {
465 fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
466 FILE *fp = fopen(argv[i], "r");
468 processSingleFile(fp, nWords, nAmbis);
// Report the final file offset along with the counts.
469 fprintf(stderr, "@Offset %ld, %d words, %d ambiguious. Done!\n",
470 ftell(fp), nWords, nAmbis);
// NOTE(review): presumably the fopen failure branch; no fclose of fp is visible either.
473 fprintf(stderr, "Can not Open!!!!!!!\n");