2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
// Extends CThreadSlm with helpers for walking a threaded language model one
// level at a time. An "iterator" is a vector of TState values; beginLevel()
// fills it with one state per level from 0 up to the level being walked.
61 class CIterateThreadSlm : public CThreadSlm {
63 typedef std::vector<TState> iterator;
// Returns the entry of m_LevelSizes for `lvl`. Presumably the body of
// getLevelSize(), which is called elsewhere in this file; its signature is
// not visible in this view.
68 return m_LevelSizes[lvl];
// Initialises `it` for a walk over level `lvl` (defined out of line below).
78 beginLevel(int lvl, iterator& it);
// Advances the last (deepest) state and then re-synchronises the shallower
// states via adjustIterator(). Presumably the body of next().
83 ++(it.back()); adjustIterator(it);
// True when the deepest state's index + 1 equals the size of its level.
// Presumably the body of isEnd().
89 return (int) ((it.back().getIdx()) + 1)
90 == getLevelSize(it.back().getLevel());
// Looks up entry `pr_idx` of m_prTable and converts it between
// representations. The branch that selects between the two returns is not
// visible here; presumably it tests `log_format`.
97 mapPr(unsigned int pr_idx, bool log_format) const
99 double val = m_prTable[pr_idx];
// Yields `val` unchanged when m_UseLogPr is set, otherwise -log(val).
101 return (m_UseLogPr) ? (val) : (-log(val));
// Yields exp(-val) when m_UseLogPr is set, otherwise `val` unchanged.
103 return (m_UseLogPr) ? (exp(-val)) : (val);
// Same conversion as mapPr(), applied to entry `bow_idx` of m_bowTable.
108 mapBow(unsigned int bow_idx, bool log_format) const
110 double val = m_bowTable[bow_idx];
112 return (m_UseLogPr) ? (val) : (-log(val));
114 return (m_UseLogPr) ? (exp(-val)) : (val);
// Re-aligns the shallower states of `it` with its deepest state (defined out
// of line below).
120 adjustIterator(iterator& it);
// Prepares `it` for iterating over level `lvl`.
// Returns false when `lvl` is greater than m_N; otherwise appends one
// TState(i, 0) for every level i in [0, lvl]. The remainder of the body is
// not visible in this view.
124 CIterateThreadSlm::beginLevel(int lvl, iterator& it)
127 if (lvl > (int) m_N) return false;
128 for (int i = 0; i <= lvl; ++i)
129 it.push_back(TState(i, 0));
// Returns the address of the element that state `s` refers to: the
// s.getIdx()-th element of m_Levels[s.getLevel()].
// The array is treated as TLeaf in one branch and as TNode in the other; the
// condition choosing between them is not visible here (PrintARPA treats the
// level equal to getN() as the TLeaf level).
135 CIterateThreadSlm::getNodePtr(TState s)
137 unsigned int lvl = s.getLevel();
139 return(((TLeaf*)m_Levels[lvl]) + s.getIdx());
141 return(((TNode*)m_Levels[lvl]) + s.getIdx());
// Walks from the second-deepest state of `it` up to level 0, comparing each
// state against the index held by the state one level deeper.
146 CIterateThreadSlm::adjustIterator(iterator& it)
149 for (int lvl = it.size() - 2; lvl >= 0; --lvl) {
150 int sz = getLevelSize(lvl);
151 unsigned child = (it[lvl + 1]).getIdx();
// Loop while the state at `lvl` is not yet the last element of its level and
// the ch() value of the *following* node does not exceed `child`. The loop
// body is not visible in this view; presumably it advances it[lvl].
152 while ((int) it[lvl].getIdx() < (sz - 1) &&
153 (((TNode*)getNodePtr(it[lvl])) + 1)->ch() <= child) {
// Command-line help text: synopsis, description and the -v / -p / -l options.
// Presumably the body of ShowUsage(), which getParameters() calls; the
// function header is not visible in this view.
164 printf(" tslminfo [options] threaded_slm_file\n");
165 printf("\nDescription:\n");
// NOTE(review): unlike the following string, this one has no trailing "\n",
// so the two description sentences are emitted on the same output line.
167 " tslminfo tell information of a threaded back-off language model 'threaded_slm_file'. It can also print the model to ARPA format.");
// NOTE(review): the text says "slminfo" while the synopsis says "tslminfo".
169 " When no options given, slminfo will only print number of items in each level of the language model.\n");
170 printf("\nOptions:\n");
171 printf(" -v # Verbose mode, printing arpa format.\n");
173 " -p # Prefer normal probability instead of -log(Pr) which is default. Valid under -v option.\n");
175 " -l dict_file # Lexicon. Valid under -v option. Substitute the word-id with word-text in the output.\n");
// Command-line settings and their defaults; getParameters() updates them.
// verbose: validated together with -p / -l in getParameters().
180 static bool verbose = false;
// lexicon_filename: set from the -l argument (a strdup'ed copy of optarg).
181 static char *lexicon_filename = NULL;
// use_log_pr: passed to PrintARPA() to select the probability format.
182 static bool use_log_pr = true;
// Long-option table for getopt_long(); each entry is
// { name, has_arg, flag, val }. Only part of the table is visible here.
184 static struct option long_options[] =
// --verbose takes no argument and maps to 'v'.
186 { "verbose", 0, 0, 'v' },
// --lexicon requires an argument and maps to 'l'.
188 { "lexicon", 1, 0, 'l' },
// Parses the command line with getopt_long() (short options "vpl:") and then
// validates the combination of options. The per-option switch is mostly not
// visible in this view.
193 getParameters(int argc, char* argv[])
195 int c, option_index = 0;
197 getopt_long(argc, argv, "vpl:", long_options,
198 &option_index)) != -1) {
// Keep a private copy of the option argument as the lexicon file name.
204 lexicon_filename = strdup(optarg);
// Requesting plain probabilities or a lexicon without verbose mode is
// rejected by calling ShowUsage().
213 if (use_log_pr == false && !verbose) ShowUsage();
214 if (lexicon_filename != NULL && !verbose) ShowUsage();
// Exactly one non-option argument (the model file) must remain.
215 if (optind != argc - 1) ShowUsage();
// Maps a numeric word id to its word text; filled from the lexicon file in
// PrintARPA().
218 typedef std::map<unsigned int, std::string> TReverseLexicon;
// Prints every level of `itslm` to stdout, one n-gram per line.
//   itslm            - the loaded model to walk.
//   lexicon_filename - optional file mapping word text to word id; when
//                      non-NULL, word text is printed instead of numeric ids.
// A further parameter is not visible in this view; the body passes a value
// named use_log_pr to mapPr()/mapBow().
222 PrintARPA(CIterateThreadSlm& itslm,
223 const char* lexicon_filename,
// Function-local static scratch storage for the lexicon parser.
226 static unsigned int id;
227 static char word[10240];
229 TReverseLexicon* plexicon = NULL;
230 if (lexicon_filename != NULL) {
231 plexicon = new TReverseLexicon();
// NOTE(review): the fopen() result is handed straight to fgets() on the next
// line with no NULL check, so an unreadable lexicon file is not handled.
232 FILE* f_lex = fopen(lexicon_filename, "r");
// Each lexicon line is read into `word` (at most 10240 bytes per fgets call).
233 while (fgets(word, 10240, f_lex) != NULL) {
234 if (strlen(word) > 0) {
// Line layout expected by the scans below: optional blanks/tabs, a word
// token, blanks/tabs, then a decimal id. The statements that declare and
// advance `p` are not visible in this view.
236 while (*p == ' ' || *p == '\t')
238 while (*p != 0 && *p != ' ' && *p != '\t')
// Lines that end right after the word token are skipped.
240 if (*p == 0) continue;
242 while (*p == ' ' || *p == '\t')
// Lines whose second field does not start with a digit are skipped.
244 if (!(*p >= '0' && *p <= '9')) continue;
// Accumulate the decimal word id digit by digit.
245 for (id = 0; *p >= '0' && *p <= '9'; ++p)
246 id = 10 * id + (*p - '0');
// Record id -> text of `word` (presumably terminated after the word token by
// a statement not visible here -- TODO confirm).
247 (*plexicon)[id] = std::string(word);
253 CIterateThreadSlm::iterator it;
// Levels 0 through getN() inclusive are printed.
254 for (int lvl = 0; lvl <= itslm.getN(); ++lvl) {
// Section header: level number and the level size minus one.
255 printf("\\%d-gram\\%d\n", lvl, itslm.getLevelSize(lvl) - 1);
256 for (itslm.beginLevel(lvl, it); !itslm.isEnd(it); itslm.next(it)) {
// Print the words of the states at levels 1 .. lvl-1 first.
257 for (int i = 1; i < lvl; ++i) {
258 CIterateThreadSlm::TNode*pn =
259 (CIterateThreadSlm::TNode*)itslm.getNodePtr(it[i]);
// With a lexicon the word text is printed. Note that std::map::operator[]
// inserts an empty string for an id that is missing from the lexicon.
// The numeric-id printf below is presumably the `else` branch (not visible).
260 if (plexicon != NULL)
261 printf("%s ", (*plexicon)[pn->wid()].c_str());
263 printf("%9d ", pn->wid());
// Levels below getN() are read as TNode entries: word, probability, back-off
// weight, then the (bol, bon) pair.
265 if (lvl < itslm.getN()) {
266 CIterateThreadSlm::TNode*pn =
267 (CIterateThreadSlm::TNode*)itslm.getNodePtr(it[lvl]);
269 if (plexicon != NULL)
270 printf("%s ", ((*plexicon)[pn->wid()]).c_str());
272 printf("%9d ", pn->wid());
275 double pr = itslm.mapPr(pn->pr(), use_log_pr);
276 double bow = itslm.mapBow(pn->bow(), use_log_pr);
277 printf("%16.12lf %16.12lf ", pr, bow);
278 printf("(%1u,%u)\n", pn->bol(), pn->bon());
// Otherwise the entry is read as a TLeaf: word, probability and the
// (bol, bon) pair; no back-off weight is printed.
280 CIterateThreadSlm::TLeaf*pn =
281 (CIterateThreadSlm::TLeaf*)itslm.getNodePtr(it[lvl]);
283 if (plexicon != NULL)
284 printf("%s ", ((*plexicon)[pn->wid()]).c_str());
286 printf("%9d ", pn->wid());
289 double pr = itslm.mapPr(pn->pr(), use_log_pr);
290 printf("%16.12lf ", pr);
291 printf("(%1u,%u)\n", pn->bol(), pn->bon());
301 * tslminfo [-v [-p] [-l dict_file]] threaded_slm_file
// Entry point: parses the options, loads the model named by the last
// command-line argument and prints a summary of it.
304 main(int argc, char* argv[])
// Fills the file-level option variables (verbose, lexicon_filename,
// use_log_pr) and validates the argument count.
306 getParameters(argc, argv);
308 CIterateThreadSlm itslm;
// The meaning of the second load() argument is not visible in this file.
310 if (itslm.load(argv[argc - 1], true)) {
// Summary line: the model order, then the size minus one of each level
// 1 .. getN().
312 printf("Total %d level ngram: ", itslm.getN());
313 for (int lvl = 1; lvl <= itslm.getN(); ++lvl)
314 printf("%d ", itslm.getLevelSize(lvl) - 1);
// Reports which probability representation the loaded model stores.
316 (itslm.isUseLogPr()) ? " using -log(pr)\n" :
317 " using direct pr\n");
// Full dump of the model; presumably guarded by the verbose flag on a line
// not visible in this view.
319 PrintARPA(itslm, lexicon_filename, use_log_pr);