2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
59 #include "../sim_slmbuilder.h"
// Long-option table passed to getopt_long() in getParameters().
// Each visible entry is {name, has_arg, flag, val}: has_arg is 1 (the option
// takes an argument), flag is 0, and val is the matching short-option letter.
61 static struct option long_options[] =
63 { "ngram", 1, 0, 'n' },
66 { "discount", 1, 0, 'd' },
67 { "wordcount", 1, 0, 'w' },
// NOTE(review): the usage text in this file advertises "--brk" and "--exclude".
// "--exclude" is a prefix of "excludeid" (GNU getopt_long accepts unambiguous
// abbreviations), but "--brk" is not a prefix of "breakid" -- confirm which
// spelling is intended and align the table and the usage text.
68 { "breakid", 1, 0, 'b' },
69 { "excludeid", 1, 0, 'e' },
80 slmbuild options idngram\n\
83 This program generate language model from idngram file.\n\
86 -n --ngram N # 1 for unigram, 2 for bigram, 3 for trigram...\n\
87 -o --out output # output file name\n\
88 -l --log # using -log(pr), default use pr directly\n\
89 -w --wordcount N # Lexicon size, number of different word\n\
90 -b --brk id[,id...] # set the ids which should be treat as breaker\n\
91 -e --exclude id[,id...] # set the ids which should not be put into LM\n\
92 -c --cut c1[,c2...] # k-gram whose freq <= c[k] are droped\n\
93 -d --discount method,param # the k-th -d parm specify the discount method \n\
94 for k-gram. Possible values for method/param:\n\
95 GT,R,dis : GT discount for r <= R, r is the freq of a ngram.\n\
96 Linear discount for those r > R, i.e. r'=r*dis\n\
97 0 << dis < 1.0, for example 0.999 \n\
98 ABS,[dis] : Absolute discount r'=r-dis. And dis is optional\n\
99 0 < dis < cut[k]+1.0, normally dis < 1.0.\n\
100 LIN,[dis] : Linear discount r'=r*dis. And dis is optional\n\
104 -n must be given before -c -b. And -c must give right number of cut-off,\n\
105 also -d must appear exactly N times specify discount for 1-gram, 2-gram..., \n\
107 BREAKER-IDs could be SentenceTokens or ParagraphTokens. Concepturally,\n\
108 these ids has no meaning when they appeared in the middle of n-gram.\n\
109 EXCLUDE-IDs could be ambiguious-ids. Concepturally, n-grams which\n\
110 contain those ids are meaningless.\n\
111 We can not erase ngrams according to BREAKER-IDS and EXCLUDE-IDs directly\n\
112 from IDNGRAM file, because some low-level information still useful in it.\n\
115 Following example read 'all.id3gram' and write trigram model 'all.slm'.\n\
116 At 1-gram level, use Good-Turing discount with cut-off 0, R=8, dis=0.9995. At\n\
117 2-gram level, use Absolute discount with cut-off 3, dis auto-calc. At 3-gram\n\
118 level, use Absolute discount with cut-off 2, dis auto-calc. Word id 10,11,12\n\
119 are breakers (sentence/para/paper breaker, etc). Exclude-ID is 9. Lexicon \n\
120 contains 200000 words. The result languagme model use -log(pr).\n\
122 slmbuild -l -n 3 -o all.slm -w 200000 -c 0,3,2 -d GT,8,0.9995\n\
123 -d ABS -d ABS -b 10,11,12 -e 9 all.id3gram\n\
// File-level state shared by getParameters() (which fills it in) and main()
// (which consumes it).

// The language-model builder; configured through its Set*() calls in
// getParameters() and fed n-grams through AddNGram() in main().
129 static CSlmBuilder builder;
// strdup()'d copy of the single positional argument; opened for reading in main().
130 static char* inputfilename = NULL;
// strdup()'d copy of an option argument (presumably -o; the case label is not
// visible here); opened for writing in main().
131 static char* outfilename = NULL;
// One heap-allocated discounter per -d option, in command-line order. The
// address of element 0 is handed to builder.SetDiscounter(); no matching
// delete is visible in this part of the file.
132 static std::vector<CSlmDiscounter *> discounter;
// Parses the command line with getopt_long() and configures the file-level
// `builder`, `outfilename`, `inputfilename` and `discounter` accordingly.
//
// argc / argv: the program arguments as received by main().
//
// Comma-separated option arguments are duplicated with strdup() and split in
// place with strtok(); none of those duplicates is freed in the code visible
// here.
135 getParameters(int argc, char* argv[])
138 char *ac = NULL, *cuts = NULL, *idstring = NULL, *dis_str = NULL;
// NOTE(review): `ids` is reused by both the breaker-id and exclude-id parsing
// below; confirm it is cleared between uses (no clear() is visible here).
139 std::vector<TSIMWordId> ids;
140 std::vector<CSlmBuilder::FREQ_TYPE> threshold;
141 bool bUseLogPr = false;
// Short options: -l takes no argument; w, n, c, d, o, b, e each take one.
144 getopt_long(argc, argv, "lw:n:c:d:o:b:e:", long_options,
// Output file name (presumably the -o case; the label is not visible here).
153 outfilename = strdup(optarg);
// Lexicon size (presumably the -w case; `n` is set on a line not visible here).
161 builder.SetNumberOfWord(n);
// Cut-off list: split optarg on ',' and collect one threshold per token.
164 cuts = strdup(optarg);
165 ac = strtok(cuts, ",");
168 threshold.push_back(CSlmBuilder::FREQ_TYPE(cut));
169 ac = strtok(NULL, ",");
// NOTE(review): &(threshold[0]) is undefined if no token was parsed, and
// `threshold` is a local -- confirm SetCut() copies the values rather than
// keeping the pointer.
171 builder.SetCut(&(threshold[0]));
// Breaker ids: split optarg on ',' and pass the collected ids to the builder.
175 idstring = strdup(optarg);
176 ac = strtok(idstring, ",");
180 ac = strtok(NULL, ",");
// NOTE(review): &(ids[0]) is undefined when `ids` is empty; also confirm
// SetBreakerIds() copies the ids, since `ids` is a local vector.
182 builder.SetBreakerIds(ids.size(), &(ids[0]));
// Exclude ids: same parsing scheme as the breaker ids above.
186 idstring = strdup(optarg);
187 ac = strtok(idstring, ",");
191 ac = strtok(NULL, ",");
// NOTE(review): same empty-vector / pointer-lifetime caveats as SetBreakerIds.
193 builder.SetExcludeIds(ids.size(), &(ids[0]));
// Discount spec "METHOD[,param...]": the first token selects the discounter
// class, the remaining tokens are its parameters.
197 dis_str = strdup(optarg);
198 ac = strtok(dis_str, ",");
// NOTE(review): strtok() returns NULL for an empty argument, which would make
// this strcmp() dereference NULL -- confirm whether a guard exists on a line
// not visible here.
199 if (strcmp(ac, "GT") == 0) {
// GT consumes two further tokens (used for `rmax` and `dis`, which are
// assigned on lines not visible here).
200 ac = strtok(NULL, ",");
202 ac = strtok(NULL, ",");
204 discounter.push_back(new CSlmGTDiscounter(rmax, dis));
205 } else if (strcmp(ac, "ABS") == 0) {
// The parameter after ABS is optional: it is only consumed when a token follows.
206 if ((ac = strtok(NULL, ",")) != NULL)
208 discounter.push_back(new CSlmAbsoluteDiscounter(dis));
209 } else if (strcmp(ac, "LIN") == 0) {
// The parameter after LIN is optional in the same way.
210 if ((ac = strtok(NULL, ",")) != NULL)
212 discounter.push_back(new CSlmLinearDiscounter(dis));
// Forward the log-probability flag to the builder as 1 or 0.
220 builder.SetUseLogPr(((bUseLogPr) ? 1 : 0));
// Exactly one positional argument (the input file) must remain after the options.
221 if (optind == argc - 1) {
222 inputfilename = strdup(argv[optind]);
// NOTE(review): &(discounter[0]) is undefined if no -d option was given;
// confirm the option count is validated somewhere.
223 builder.SetDiscounter(&(discounter[0]));
// Otherwise report the problem and echo the leftover arguments to stderr.
225 fprintf(stderr, "Parameter input_file error\n");
226 for (int i = optind; i < argc; ++i)
227 fprintf(stderr, "%s ", argv[i]);
228 fprintf(stderr, "\n");
// Program entry point: parses the options, streams the binary idngram file
// into `builder`, then opens the output file for the resulting model.
234 main(int argc, char* argv[])
236 getParameters(argc, argv);
// Read buffer for one n-gram record. `N` is defined on a line not visible here.
// NOTE(review): the buffer holds N + 1 ids although only N are read per
// record below -- confirm why the extra slot is needed. No delete[] is
// visible in this part of the file.
238 TSIMWordId * ngram = new TSIMWordId[N + 1];
239 CSlmBuilder::FREQ_TYPE freq;
241 printf("Reading and Processing raw idngram..."); fflush(stdout);
// NOTE(review): no NULL check on `fp` is visible here -- confirm one exists
// before the read loop.
242 FILE *fp = fopen(inputfilename, "rb");
// Each record is N word ids followed by one frequency value, read as raw
// binary; the loop ends at the first short read.
244 while (fread(ngram, sizeof(TSIMWordId), N, fp) == (size_t) N
245 && fread(&freq, sizeof(freq), 1, fp) == 1) {
246 builder.AddNGram(ngram, freq);
// `nItems` is maintained on lines not visible here.
251 printf("%d ngrams.\n", nItems); fflush(stdout);
255 printf("\nWriting result file..."); fflush(stdout);
// NOTE(review): no NULL check on `out` is visible here, and `outfilename` is
// still NULL if no output option was given -- confirm both are handled.
256 FILE *out = fopen(outfilename, "wb");
259 printf("\n"); fflush(stdout);