2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
57 #include "../sim_slm.h"
// Help text for the slminfo tool: synopsis, description and option list,
// emitted with one printf per paragraph/line.
// NOTE(review): the enclosing function header is not visible in this view;
// presumably this is the body of ShowUsage(), which getParameters() calls
// when the command line is invalid -- confirm.
63 printf(" slminfo [options] slm_file\n");
64 printf("\nDescription:\n");
66 " slminfo tell information of back-off language model 'slm_file'. It can also print the model to ARPA format.\n");
68 " When no options given, slminfo will only print number of items in each level of the language model.\n");
// One line per supported option: -v, -p and -l.
69 printf("\nOptions:\n");
70 printf(" -v # Verbose mode, printing arpa format.\n");
72 " -p # Prefer normal probability than -log(Pr) which is default. Valid under -v option.\n");
74 " -l dict_file # Lexicon. Valid under -v option. Substitute the word-id with word-text in the output.\n");
// File-level command-line state, read by getParameters() and passed on by main().
// Verbose flag; the usage text describes -v as "printing arpa format".
// NOTE(review): the assignment that sets it is not visible here -- presumably the 'v' option.
79 static bool verbose = false;
// Lexicon path duplicated from optarg in getParameters(); NULL when none was given.
80 static char* lexicon_filename = NULL;
// Defaults to true; getParameters() sets it to false (presumably for -p -- confirm).
81 static bool output_log_pr = true;
// Long-option table handed to getopt_long(). Each entry is
// { name, has_arg, flag, val }: "verbose" takes no argument and yields 'v',
// "lexicon" takes a required argument and yields 'l'.
// NOTE(review): the short-option string "vpl:" also accepts 'p', but no
// matching long entry (nor the terminating all-zero entry) is visible in
// this view -- confirm they exist in the full file.
83 static struct option long_options[] =
85 { "verbose", 0, 0, 'v' },
87 { "lexicon", 1, 0, 'l' },
// Parses the command line with getopt_long() (short options "vpl:") and
// stores the result in the file-level variables lexicon_filename and
// output_log_pr (and presumably verbose -- that assignment is not visible).
// Calls ShowUsage() when the options are inconsistent or when there is not
// exactly one non-option argument left after the options.
//   argc, argv: the program's command line, as received by main().
92 getParameters(int argc, char* argv[])
94 int c, option_index = 0;
// Loop until getopt_long() reports the end of the options (-1).
96 getopt_long(argc, argv, "vpl:", long_options,
97 &option_index)) != -1) {
// Keep a private copy of the lexicon path given as the option argument.
// NOTE(review): the strdup() result is not checked for NULL and no free()
// of it is visible in this view.
103 lexicon_filename = strdup(optarg);
106 output_log_pr = false;
// Plain-probability output was requested without verbose mode: reject.
112 if (output_log_pr == false && !verbose) ShowUsage();
// A lexicon was given without verbose mode: reject.
113 if (lexicon_filename != NULL && !verbose) ShowUsage();
// Exactly one positional argument must remain (main() opens argv[argc - 1]).
114 if (optind != argc - 1) ShowUsage();
// Map from word id to word text; filled by PrintARPA() from the lexicon file
// and consulted by PrintARPALevel() when printing words instead of ids.
117 typedef std::map<TSIMWordId, std::string> TReverseLexicon;
// Prints every entry of one level of the language model stored in fp.
// For each entry it prints the words of the n-gram (levels 1..lvl), then the
// entry's pr value and, in some branches, its bow value.
//   lvl:           level to print; PrintARPA() calls this with 1..N.
//   fp:            open model file; the function seeks within it freely.
//   plexicon:      optional id -> text map; when NULL, numeric ids are printed.
//   output_log_pr: NOTE(review): presumably selects -log(pr) versus plain
//                  probability output -- the conditions that use it are not
//                  visible in this view.
// NOTE(review): none of the fread()/fseek() return values are checked.
120 PrintARPALevel(int lvl, FILE* fp, TReverseLexicon* plexicon, bool output_log_pr)
// File header: the model order N followed by the bLogPrFile flag.
124 fseek(fp, 0, SEEK_SET);
125 fread(&N, sizeof(int), 1, fp);
126 fread(&bLogPrFile, sizeof(bLogPrFile), 1, fp);
// NOTE(review): N comes straight from the file and is not checked against
// the fixed capacity of 16 used by the arrays below; N > 15 would overflow.
129 int sz[16]; // needs N+1 entries; fixed size because some compilers do not support int sz[N+1]
130 long level_offset[16]; // needs N+1 entries; fixed size for the same reason
// Read the N+1 per-level record counts, then derive each level's starting
// file offset from the position right after the counts.
132 fread(sz, sizeof(int), N + 1, fp);
133 long offset = ftell(fp);
134 for (int i = 0; i <= N; ++i) {
135 level_offset[i] = offset;
136 offset += sz[i] * sizeof(CSIMSlm::TNode);
139 // TSIMWordId ngram[16];
140 int idx[16]; // needs N+1 entries; fixed size, see sz above
141 CSIMSlm::TNode nodes[16][2]; // needs N+1 rows; fixed size, see sz above
// For every level up to lvl, load two consecutive records into nodes[i][0]
// and nodes[i][1]. One branch reads TLeaf-sized records, the other
// TNode-sized ones.
// NOTE(review): the condition choosing between the branches and the
// initialisation of idx[i] are not visible -- presumably the last level
// (i == N) stores TLeaf records; confirm.
142 for (int i = 0; i <= lvl; ++i) {
146 level_offset[i] + sizeof(CSIMSlm::TLeaf) * idx[i],
148 fread(&(nodes[i][0]), sizeof(CSIMSlm::TLeaf), 1, fp);
149 fread(&(nodes[i][1]), sizeof(CSIMSlm::TLeaf), 1, fp);
152 level_offset[i] + sizeof(CSIMSlm::TNode) * idx[i],
154 fread(&(nodes[i][0]), sizeof(CSIMSlm::TNode), 2, fp);
// Section header: the level and the number of entries printed (sz[lvl] - 1).
158 printf("/%d-gram:%d/\n", lvl, sz[lvl] - 1);
// Main loop over the entries of level lvl; it stops at sz[lvl] - 1.
159 while (idx[lvl] < sz[lvl] - 1) {
// Bring the ancestor nodes up to date, from level lvl-1 towards level 1:
// at level i, step forward while the next node's child index is not beyond
// the current index of level i+1.
160 for (int i = lvl - 1; i > 0; --i) {
162 while (nodes[i][1].child <= idx[i + 1]) {
// Slide the two-record window: next becomes current, then re-read next.
166 nodes[i][0] = nodes[i][1];
167 fseek(fp, level_offset[i] + sizeof(CSIMSlm::TNode) *
168 (idx[i] + 1), SEEK_SET);
169 fread(&(nodes[i][1]), sizeof(CSIMSlm::TNode), 1, fp);
// Stop climbing once a level did not move.
// NOTE(review): the statements that set `change` are not visible.
171 if (change == false) break;
// Print the words of the current n-gram, level 1 first.
174 for (int i = 1; i <= lvl; ++i) {
175 TSIMWordId word_id = nodes[i][0].id;
176 if (plexicon != NULL)
// NOTE(review): std::map::operator[] inserts an empty string for an id that
// is missing from the lexicon, so such a word prints as empty text.
177 printf("%s ", (*plexicon)[word_id].c_str());
179 printf("%d ", int(word_id));
// Four output variants follow for pr (and bow): the stored value as is,
// exp(-value), -log(value), and the stored value as is again.
// NOTE(review): the conditions selecting a variant are not visible;
// presumably they depend on bLogPrFile and output_log_pr -- confirm.
183 printf("%20.17lf ", double(nodes[lvl][0].pr));
185 printf("%20.17lf ", exp(-double(nodes[lvl][0].pr)));
188 printf("%20.17lf", double(nodes[lvl][0].bow));
190 printf("%20.17lf", exp(-double(nodes[lvl][0].bow)));
194 printf("%20.17lf ", -log(double(nodes[lvl][0].pr)));
196 printf("%20.17lf ", double(nodes[lvl][0].pr));
199 printf("%20.17lf", -log(double(nodes[lvl][0].bow)));
201 printf("%20.17lf", double(nodes[lvl][0].bow));
// Advance at level lvl: next becomes current, then load the record at
// idx[lvl] + 1 as the new next, as either a TLeaf or a TNode record
// (the selecting condition is not visible).
207 nodes[lvl][0] = nodes[lvl][1];
209 fseek(fp, level_offset[lvl] + sizeof(CSIMSlm::TLeaf) *
210 (idx[lvl] + 1), SEEK_SET);
211 fread(&(nodes[lvl][1]), sizeof(CSIMSlm::TLeaf), 1, fp);
213 fseek(fp, level_offset[lvl] + sizeof(CSIMSlm::TNode) *
214 (idx[lvl] + 1), SEEK_SET);
215 fread(&(nodes[lvl][1]), sizeof(CSIMSlm::TNode), 1, fp);
// Prints the whole model: optionally loads a lexicon into an id -> word map,
// then reads the model order N from fp and calls PrintARPALevel() for each
// level from 1 to N.
//   fp:               open model file.
//   lexicon_filename: text lexicon path, or NULL to print numeric word ids.
//   output_log_pr:    forwarded unchanged to PrintARPALevel().
221 PrintARPA(FILE* fp, const char* lexicon_filename, bool output_log_pr)
// Static line buffer for the lexicon reader; lines are read in chunks of at
// most 10240 bytes including the terminator.
224 static char word[10240];
227 TReverseLexicon* plexicon = NULL;
228 if (lexicon_filename != NULL) {
// NOTE(review): no matching delete for this allocation is visible here.
229 plexicon = new TReverseLexicon();
// NOTE(review): the fopen() result is not checked in the visible code, so a
// missing lexicon file would pass NULL to fgets(); no fclose() is visible.
230 FILE* f_lex = fopen(lexicon_filename, "r");
// Each lexicon line is parsed as: optional blanks, a word, blanks, a decimal id.
231 while (fgets(word, 10240, f_lex) != NULL) {
232 if (strlen(word) > 0) {
// Skip leading spaces and tabs.
234 while (*p == ' ' || *p == '\t')
// Skip over the word itself, up to the next blank or the end of the line.
236 while (*p != 0 && *p != ' ' && *p != '\t')
// A line with nothing after the word carries no id: ignore it.
238 if (*p == 0) continue;
// Skip the blanks between the word and its id.
// NOTE(review): the statement that terminates the word text inside the
// buffer is not visible -- presumably the separator is overwritten with 0.
240 while (*p == ' ' || *p == '\t')
// Ignore lines whose second field does not start with a digit.
242 if (!(*p >= '0' && *p <= '9')) continue;
// Accumulate the decimal id digit by digit.
243 for (id = 0; *p >= '0' && *p <= '9'; ++p)
244 id = 10 * id + (*p - '0');
// Register the mapping; a later line with the same id overwrites an earlier one.
245 (*plexicon)[TSIMWordId(id)] = std::string(word);
// Read the model order from the start of the file, then print each level.
// NOTE(review): the fread() result is not checked.
250 fseek(fp, 0, SEEK_SET);
251 fread(&N, sizeof(N), 1, fp);
252 for (int lvl = 1; lvl <= N; ++lvl)
253 PrintARPALevel(lvl, fp, plexicon, output_log_pr);
// Prints a short summary of the model in fp: the order N, whether the file
// stores -log(pr) or plain probabilities, and the item count of each level.
//   fp: open model file; it is read from offset 0.
// NOTE(review): the fread() return values are not checked, so a truncated
// file would print uninitialised values.
257 PrintSimple(FILE* fp)
// File header: N followed by the bLogPrFile flag.
263 fseek(fp, 0, SEEK_SET);
264 fread(&N, sizeof(N), 1, fp);
265 fread(&bLogPrFile, sizeof(bLogPrFile), 1, fp);
266 printf("This is a %d-gram back-off model, ", N);
267 printf("%s\n", (bLogPrFile) ? ("using -log(pr)") : ("using direct pr"));
// N+1 per-level counts follow (levels 0..N); each is reported minus one,
// the same adjustment PrintARPALevel() applies with sz[lvl] - 1.
268 for (int i = 0; i <= N; ++i) {
269 fread(&nItem, sizeof(nItem), 1, fp);
270 printf(" %d items in %d-level\n", nItem - 1, i);
// Program entry point: parses the options, opens the model file named by the
// last command-line argument and prints its contents.
// NOTE(review): only part of this function is visible; the branch choosing
// between PrintARPA() and (presumably) PrintSimple(), and any fclose()/return,
// are outside this view.
275 main(int argc, char* argv[])
// May not return: getParameters() calls ShowUsage() on an invalid command line.
279 getParameters(argc, argv);
// NOTE(review): "rb+" opens the file for reading and writing although the
// visible code only reads from it; a read-only model file would fail to
// open. "rb" looks sufficient -- confirm.
281 if ((fp = fopen(argv[argc - 1], "rb+")) == NULL) {
282 printf("Can not open back-off language model file %s\n", argv[argc - 1]);
// Dump of the model, with the optional lexicon and the probability format
// chosen on the command line.
289 PrintARPA(fp, lexicon_filename, output_log_pr);