src/slm/slmseg/slmseg.cpp

   1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
   3  *
   4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
   5  *
   6  * The contents of this file are subject to the terms of either the GNU Lesser
   7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
   8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
   9  * file except in compliance with the License. You can obtain a copy of the CDDL at
  10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
  11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
  12  * specific language governing permissions and limitations under the License. When
  13  * distributing the software, include this License Header Notice in each file and
  14  * include the full text of the License in the License file as well as the
  15  * following notice:
  16  *
  17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  18  * (CDDL)
  19  * For Covered Software in this distribution, this License shall be governed by the
  20  * laws of the State of California (excluding conflict-of-law provisions).
  21  * Any litigation relating to this License shall be subject to the jurisdiction of
  22  * the Federal Courts of the Northern District of California and the state courts
  23  * of the State of California, with venue lying in Santa Clara County, California.
  24  *
  25  * Contributor(s):
  26  *
  27  * If you wish your version of this file to be governed by only the CDDL or only
  28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
  29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
  30  * license." If you don't indicate a single choice of license, a recipient has the
  31  * option to distribute your version of this file under either the CDDL or the LGPL
  32  * Version 2.1, or to extend the choice of license to its licensees as provided
  33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
  34  * Version 2 license, then the option applies only if the new code is made subject
  35  * to such option by the copyright holder.
  36  */
  37
  38 #ifdef HAVE_CONFIG_H
  39 #include "config.h"
  40 #endif
  41
  42 #ifdef HAVE_ASSERT_H
  43 #include <assert.h>
  44 #endif
  45
  46 #ifdef HAVE_GETOPT_H
  47 #include <getopt.h>
  48 #endif
  49
  50 #include <stdio.h>
  51 #include <unistd.h>
  52 #include <locale.h>
  53
  54 #include <vector>
  55 #include <map>
  56 #include <algorithm>
  57
  58 #include "../sim_dict.h"
  59 #include "../sim_sen.h"
  60 #include "../slm.h"
  61
  62 static struct option long_options[] =
  63 {
  64     { "dict", 1, 0, 'd' },
  65     { "format", 1, 0, 'f' },
  66     { "show-id", 0, 0, 'i' },
  67     { "s-tok", 1, 0, 's' },
  68     { "model", 1, 0, 'm' },
  69     { 0, 0, 0, 0 }
  70 };
  71
  72 static char* s_strDictFile = NULL;
  73 static char* s_strSlmFile = NULL;
  74 static bool s_bTextOut = false;
  75 static bool s_bShowId = false;
  76 static TSIMWordId s_iSTOKID = 10;
  77
  78 static CSIMDict *s_dict = NULL;
  79 static CThreadSlm *s_tslm = NULL;
  80
  81 static void
  82 ShowUsage()
  83 {
  84     fprintf(stderr, "\nUsage:\n");
  85     fprintf(
  86         stderr,
  87         "slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n");
  88     fprintf(stderr, "  -f --format:\n");
  89     fprintf(stderr,
  90             "    Output Format, can be 'text' or 'bin'. default 'bin'\n");
  91     fprintf(
  92         stderr,
  93         "    Normally, in text mode, word text are output, while in binary mode,\n");
  94     fprintf(stderr,
  95             "    binary short integer of the word-ids are writed to stdout.\n");
  96     fprintf(stderr, "  -s --stok:\n");
  97     fprintf(stderr, "    Sentence token id. Default 10.\n");
  98     fprintf(
  99         stderr,
 100         "    It will be write to output in binary mode after every sentence.\n");
 101     fprintf(stderr, "  -i --show-id:\n");
 102     fprintf(
 103         stderr,
 104         "    Show Id info. Under text output format mode, Attach id after known-words.\n");
 105     fprintf(stderr, "                  Under binary mode, print id in text.\n");
 106     fprintf(stderr, "  -m --model:\n");
 107     fprintf(stderr, "    Language model file name");
 108     fprintf(stderr, "\n");
 109     fprintf(stderr, "Notes:\n");
 110     fprintf(stderr,
 111             "  Under binary mode, consecutive id of 0 are merged into one 0.\n");
 112     fprintf(stderr,
 113             "  Under text mode, no space are insert between unknown-words. \n");
 114     fprintf(stderr, "\n");
 115     fprintf(stderr, "\n");
 116     exit(1000);
 117 }
 118
 119 static void
 120 getParameters(int argc, char* argv[])
 121 {
 122     int c;
 123     while ((c =
 124                 getopt_long(argc, argv, "d:if:s:m:", long_options,
 125                             NULL)) != -1) {
 126         switch (c) {
 127         case 'd':
 128             s_strDictFile = strdup(optarg);
 129             break;
 130         case 'i':
 131             s_bShowId = true;
 132             break;
 133         case 'f':
 134             s_bTextOut = (strcmp(optarg, "text") == 0);
 135             break;
 136         case 's':
 137             s_iSTOKID = atoi(optarg);
 138             break;
 139         case 'm':
 140             s_strSlmFile = strdup(optarg);
 141             break;
 142         default:
 143             ShowUsage();
 144             break;
 145         }
 146     }
 147     if (s_strDictFile == NULL)
 148         ShowUsage();
 149 }
 150
 151 static void
 152 output_stok(int& nWords)
 153 {
 154     if (s_bShowId) {
 155         if (nWords > 0)
 156             printf(" ");
 157         printf("%d", unsigned(s_iSTOKID));
 158     } else {
 159         fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout);
 160     }
 161     ++nWords;
 162 }
 163
 164 static void
 165 output(int len,
 166        const TWCHAR* p,
 167        TSIMWordId idprev,
 168        TSIMWordId idcur,
 169        int& nWords)
 170 {
 171     static char mbword[1024];
 172     static TWCHAR wcword[1024];
 173
 174     bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
 175     if (s_bTextOut) {
 176         for (int i = 0; i < len; ++i, ++p)
 177             wcword[i] = *p;
 178         wcword[len] = 0;
 179         WCSTOMBS(mbword, wcword, sizeof(mbword));
 180         if (bRealGap && idprev == SIM_ID_NOT_WORD)
 181             printf("(%d)", unsigned(idprev));
 182         if (bRealGap && (nWords > 0))
 183             printf(" ");
 184         printf("%s", mbword);
 185         if (s_bShowId && idcur != SIM_ID_NOT_WORD)
 186             printf("(%d)", unsigned(idcur));
 187     } else {
 188         if (bRealGap) {
 189             if (s_bShowId) {
 190                 if (nWords > 0)
 191                     printf(" ");
 192                 printf("%d", unsigned(idcur));
 193             } else
 194                 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout);
 195         }
 196     }
 197     if (bRealGap)
 198         ++nWords;
 199 }
 200
 201 struct TLatticeWord {
 202     int m_left;
 203     int m_right;
 204     int m_wordId;
 205
 206     TLatticeWord(int left = 0, int right = 0, int wid = 0)
 207         : m_left(left), m_right(right), m_wordId(wid)
 208     {
 209     }
 210 };
 211
 212 typedef std::vector<TLatticeWord> TLatticeWordVec;
 213
 214 struct TLatticeStateValue {
 215     double m_pr;
 216     TLatticeWord*         mp_btword;
 217     CThreadSlm::TState m_btstate;
 218
 219     TLatticeStateValue(double pr = 0.0,
 220                        TLatticeWord* btword = NULL,
 221                        CThreadSlm::TState btstate = CThreadSlm::TState())
 222         : m_pr(pr), mp_btword(btword), m_btstate(btstate)
 223     {
 224     }
 225 };
 226
 227 typedef std::map<CThreadSlm::TState, TLatticeStateValue> TLatticeColumnStates;
 228
/**
 * One column of the lattice, i.e. one position in the sentence.
 */
struct TLatticeColumn {
    TLatticeWordVec m_wordstarting;     /* candidate words whose m_left is this column */
    TLatticeColumnStates m_states;      /* language-model states reached at this column */
};

/* The whole lattice, indexed by character position. */
typedef std::vector<TLatticeColumn> CLattice;
 235
 236 inline void
 237 insertLatticeWord(CLattice& lattice, TLatticeWord word)
 238 {
 239     lattice[word.m_left].m_wordstarting.push_back(word);
 240 }
 241
 242 int
 243 getAmbiLen(const TWCHAR* p, int word_len)
 244 {
 245     const CSIMDict::TState* pstate;
 246
 247     for (int i = 1; (i < word_len) && *(p + i) != WCH_NULL; ++i) {
 248         int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p + i);
 249         if (word_len < i + len)
 250             word_len = i + len;
 251     }
 252
 253     return word_len;
 254 }
 255
 256 void
 257 fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice)
 258 {
 259     for (int right = left + len; left < right; ++left) {
 260         bool found = false;
 261
 262         const TWCHAR* p = sntnc.c_str() + left;
 263         const CSIMDict::TState* pds = s_dict->getRoot();
 264         for (len = 0; left + len < right; ++len) {
 265             if ((pds = s_dict->step(pds, *p++)) == NULL)
 266                 break;
 267             if (pds->word_id != SIM_ID_NOT_WORD) {
 268                 found = true;
 269                 insertLatticeWord(lattice,
 270                                   TLatticeWord(left, left + len + 1,
 271                                                pds->word_id));
 272             }
 273         }
 274         if (!found)
 275             insertLatticeWord(lattice,
 276                               TLatticeWord(left, left + 1, SIM_ID_NOT_WORD));
 277     }
 278 }
 279
 280 /**
 281  * Lattice head should have one state, with its TState using slm's root. its
 282  * pr = 0 and its mp_btword == NULL;
 283  * Lattice tail must contain no word, and it previous node contain only one word
 284  * with its right = left+1, right == tail.
 285  * The lattice should ensure the lattice path existing
 286  */
 287 void
 288 buildLattice(wstring &sntnc, CLattice& lattice)
 289 {
 290     lattice.clear();
 291     lattice.resize(sntnc.size() + 2);
 292
 293     unsigned int idcur = SIM_ID_NOT_WORD;
 294     lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(
 295         0.0,
 296         NULL,
 297         CThreadSlm::
 298         TState());
 299
 300     for (int i = 0, sz = sntnc.size(); i < sz; ) {
 301         const CSIMDict::TState* pstate;
 302         const TWCHAR* p = sntnc.c_str() + i;
 303         int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
 304         if (len <= 0) {
 305             idcur = SIM_ID_NOT_WORD;
 306             len = 1;
 307         } else {
 308             idcur = pstate->word_id;
 309         }
 310         int ambilen = getAmbiLen(p, len);
 311
 312         if (ambilen <= len) {
 313             insertLatticeWord(lattice, TLatticeWord(i, i + len, idcur));
 314             i += len;
 315         } else {
 316             fullSegBuildLattice(sntnc, i, ambilen, lattice);
 317             i += ambilen;
 318         }
 319     }
 320     lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(),
 321                                                                 sntnc.size() +
 322                                                                 1, s_iSTOKID));
 323 }
 324
 325 void
 326 searchBest(CLattice& lattice)
 327 {
 328     for (int i = 0, sz = lattice.size(); i < sz; ++i) {
 329         TLatticeColumnStates & states = lattice[i].m_states;
 330         TLatticeColumnStates::iterator itss = states.begin();
 331         TLatticeColumnStates::iterator itse = states.end();
 332         for (; itss != itse; ++itss) {
 333             TLatticeWordVec::iterator itws = lattice[i].m_wordstarting.begin();
 334             TLatticeWordVec::iterator itwe = lattice[i].m_wordstarting.end();
 335             for (; itws != itwe; ++itws) {
 336                 CThreadSlm::TState his = itss->first;
 337                 double pr = itss->second.m_pr;
 338                 pr += s_tslm->transferNegLog(his, itws->m_wordId, his);
 339                 TLatticeColumnStates & rss = lattice[itws->m_right].m_states;
 340                 s_tslm->historify(his);
 341                 TLatticeColumnStates::iterator itn = rss.find(his);
 342                 if (itn == rss.end()) {
 343                     rss[his] = TLatticeStateValue(pr, &(*itws), itss->first);
 344                 } else {
 345                     if (itn->second.m_pr > pr) {
 346                         rss[his] = TLatticeStateValue(pr, &(*itws), itss->first);
 347                     }
 348                 }
 349             }
 350         }
 351     }
 352 }
 353
 354 void
 355 getBestPath(CLattice& lattice, TLatticeWordVec& segResult)
 356 {
 357     TLatticeColumnStates & states = lattice.back().m_states;
 358     TLatticeColumnStates::iterator its = states.begin();
 359
 360     TLatticeWord* pbtword = its->second.mp_btword;
 361     CThreadSlm::TState btstate = its->second.m_btstate;
 362     its = lattice[pbtword->m_left].m_states.find(btstate);
 363     assert(its != lattice[pbtword->m_left].m_states.end());
 364
 365     segResult.clear();
 366     while (true) {
 367         pbtword = its->second.mp_btword;
 368         if (pbtword != NULL) {
 369 #ifndef HOST_OS_GNUC_2
 370             segResult.push_back(*pbtword);
 371 #else // HOST_OS_GNUC_2
 372             segResult.insert(segResult.begin(), *pbtword);
 373 #endif // !HOST_OS_GNUC_2
 374             btstate = its->second.m_btstate;
 375             its = lattice[pbtword->m_left].m_states.find(btstate);
 376             assert(its != lattice[pbtword->m_left].m_states.end());
 377         } else {
 378             break;
 379         }
 380     }
 381 #ifndef HOST_OS_GNUC_2
 382     std::reverse(segResult.begin(), segResult.end());
 383 #endif // HOST_OS_GNUC_2
 384 }
 385
 386 static bool
 387 processSingleFile(FILE* fp, int &nWords, int &nAmbis)
 388 {
 389     nWords = 0;
 390     nAmbis = 0;
 391
 392     wstring sntnc;
 393     CSIMCharReader *pReader = new CSIMCharReader(fp);
 394     CSIMCharReader::iterator iter = pReader->begin();
 395     TSIMWordId idcur, idprev = s_iSTOKID;
 396
 397     if (!s_bTextOut)
 398         output_stok(nWords);
 399
 400     while (true) {
 401         if (ReadSentence(sntnc, iter, false) == false)
 402             break;
 403
 404         CLattice lattice;
 405         buildLattice(sntnc, lattice);
 406         searchBest(lattice);
 407
 408         TLatticeWordVec segResult;
 409         getBestPath(lattice, segResult);
 410
 411         for (int i = 0, sz = segResult.size(); i < sz; ++i) {
 412             const TWCHAR *p = sntnc.c_str() + segResult[i].m_left;
 413             int len = segResult[i].m_right - segResult[i].m_left;
 414             idcur = segResult[i].m_wordId;
 415
 416             output(len, p, idprev, idcur, nWords);
 417             idprev = idcur;
 418         }
 419
 420         if (!s_bTextOut) {
 421             output_stok(nWords);
 422             idprev = s_iSTOKID;
 423         }
 424     }
 425
 426     fflush(stdout);
 427     return true;
 428 }
 429
 430 int
 431 main(int argc, char *argv[])
 432 {
 433     int nWords, nAmbis;
 434
 435     setlocale(LC_ALL, "");
 436     getParameters(argc, argv);
 437     argc -= optind;
 438     argv += optind;
 439
 440     fprintf(stderr, "Loading lexicon...");
 441     fflush(stderr);
 442     s_dict = new CSIMDict();
 443     s_tslm = new CThreadSlm();
 444     if (!s_dict->parseText(s_strDictFile)) {
 445         fprintf(stderr, "fail to open Lexicon file!\n");
 446         fflush(stderr);
 447         exit(11);
 448     }
 449     if (!s_tslm->load(s_strSlmFile, true)) {
 450         fprintf(stderr, "fail to open slm file!\n");
 451         fflush(stderr);
 452         exit(12);
 453     }
 454     fprintf(stderr, "done");
 455     fflush(stderr);
 456
 457     if (argc == 0) {
 458         fprintf(stderr, "\nProcessing from stdin...");
 459         fflush(stderr);
 460         processSingleFile(stdin, nWords, nAmbis);
 461         fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis);
 462         fflush(stderr);
 463     } else {
 464         for (int i = 0; i < argc; ++i) {
 465             fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
 466             FILE *fp = fopen(argv[i], "r");
 467             if (fp != NULL) {
 468                 processSingleFile(fp, nWords, nAmbis);
 469                 fprintf(stderr, "@Offset %ld, %d words, %d ambiguious. Done!\n",
 470                         ftell(fp), nWords, nAmbis);
 471                 fflush(stderr);
 472             } else {
 473                 fprintf(stderr, "Can not Open!!!!!!!\n");
 474                 fflush(stderr);
 475             }
 476             fclose(fp);
 477         }
 478     }
 479
 480     s_tslm->free();
 481     delete s_tslm;
 482     s_tslm = NULL;
 483     s_dict->close();
 484     delete s_dict;
 485     s_dict = NULL;
 486     return 0;
 487 }