1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 2009 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
38 * \file sphinx_lm_convert.c
39 * Language model conversion tool.
41 #include <sphinxbase/logmath.h>
42 #include <sphinxbase/ngram_model.h>
43 #include <sphinxbase/cmd_ln.h>
44 #include <sphinxbase/ckd_alloc.h>
45 #include <sphinxbase/err.h>
46 #include <sphinxbase/pio.h>
47 #include <sphinxbase/strfuncs.h>
/*
 * Command-line option table passed to cmd_ln_parse_r() in main().
 * The last string of each entry is that option's human-readable
 * description; the table ends with an all-NULL sentinel entry.
 */
53 static const arg_t defn[] = {
57 "Shows the usage of the tool"},
62 "Base in which all log-likelihoods calculated" },
67 "Input language model file (required)"},
72 "Output language model file (required)"},
77 "Input language model format (will guess if not specified)"},
/* Describes the output *format* option; main() guesses the type from the
 * output file name when it is not given. */
82 "Output language model format (will guess if not specified)"},
87 "Input language model text encoding (no conversion done if not specified)"},
92 "Output language model text encoding"},
97 "Either 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" },
102 "Use memory-mapped I/O for reading binary LM files"},
107 "Verbosity level for debugging messages"
110 { NULL, 0, NULL, NULL }
/* Log a short usage summary: the program name `pgm` followed by the
 * -i input, the optional -ifmt/-ofmt format flags, and the -o output. */
116 E_INFO("Usage: %s -i <input.lm> \\\n", pgm);
117 E_INFOCONT("\t[-ifmt txt] [-ofmt dmp]\n");
118 E_INFOCONT("\t-o <output.lm.DMP>\n");
/*
 * Entry point of the language model conversion tool.
 *
 * Parses the command line against `defn`, loads the model named by -i,
 * optionally recodes its text (-ienc to -oenc) and case-folds it (-case),
 * then writes it to the file named by -o in the type given by -ofmt or
 * guessed from the -o file name.
 */
125 main(int argc, char *argv[])
128 ngram_model_t *lm = NULL;
133 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
136 if (cmd_ln_boolean_r(config, "-help")) {
140 err_set_debug_level(cmd_ln_int32_r(config, "-debug"));
142 /* Create log math object. */
143 if ((lmath = logmath_init
144 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
145 E_FATAL("Failed to initialize log math\n");
/* Both paths are needed: -i is read below, and -o is used both to guess
 * the output type and as the write destination.  (The second test used to
 * re-check -i, so a missing -o went undetected here.) */
148 if (cmd_ln_str_r(config, "-i") == NULL || cmd_ln_str_r(config, "-o") == NULL) {
149 E_ERROR("Please specify both input and output models\n");
154 /* Load the input language model. */
155 if (cmd_ln_str_r(config, "-ifmt")) {
156 if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt")))
158 E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt"));
161 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
165 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
169 /* Guess or set the output language model type. */
170 if (cmd_ln_str_r(config, "-ofmt")) {
171 if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt")))
173 E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt"));
/* Derive the output type from the output file name. */
178 otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o"));
181 /* Recode the language model if desired. */
182 if (cmd_ln_str_r(config, "-ienc")) {
183 if (ngram_model_recode(lm, cmd_ln_str_r(config, "-ienc"),
184 cmd_ln_str_r(config, "-oenc")) != 0) {
185 E_ERROR("Failed to recode language model from %s to %s\n",
186 cmd_ln_str_r(config, "-ienc"),
187 cmd_ln_str_r(config, "-oenc"));
192 /* Case fold if requested. */
193 if ((kase = cmd_ln_str_r(config, "-case"))) {
194 if (0 == strcmp(kase, "lower")) {
195 ngram_model_casefold(lm, NGRAM_LOWER);
197 else if (0 == strcmp(kase, "upper")) {
198 ngram_model_casefold(lm, NGRAM_UPPER);
201 E_ERROR("Unknown value for -case: %s\n", kase);
206 /* Write the output language model. */
207 if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) {
208 E_ERROR("Failed to write language model in format %s to %s\n",
209 ngram_type_to_str(otype), cmd_ln_str_r(config, "-o"));
213 /* That's all folks! */
214 ngram_model_free(lm);
218 ngram_model_free(lm);