1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
38 * \file ngram_model_arpa.c ARPA format language models
40 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
43 #include "sphinxbase/ckd_alloc.h"
48 #include "sphinxbase/err.h"
49 #include "sphinxbase/pio.h"
50 #include "sphinxbase/listelem_alloc.h"
51 #include "sphinxbase/strfuncs.h"
53 #include "ngram_model_arpa.h"
/* Forward declaration of the ARPA-model vtable; the definition is at the
 * bottom of this file. */
55 static ngram_funcs_t ngram_model_arpa_funcs;

/* Trigram-segment base for bigram index b: bigrams are grouped into
 * segments of 2^LOG_BG_SEG_SZ entries, each with its own base trigram
 * index, so per-bigram trigram offsets can stay small. */
57 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram follower of unigram u. */
58 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* Absolute index of the first trigram follower of bigram b
 * (segment base plus the bigram's stored trigram offset). */
59 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
62 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
/* NOTE(review): this listing is elided — the function's return type, braces
 * and several statements are not visible here; comments annotate only the
 * visible lines.  Presumably returns 0 on success, -1 on error (the caller
 * at the bottom of this file compares against -1) — confirm against the
 * full source. */
65 ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
67 int32 ngram, ngram_cnt;
69 /* skip file until past the '\data\' marker */
/* Check whether the current line already is the '\data\' marker;
 * otherwise advance through the file looking for it. */
71 string_trim((*li)->buf, STRING_BOTH);
72 if (strcmp((*li)->buf, "\\data\\") == 0)
74 *li = lineiter_next(*li);
/* EOF or no marker found: not a valid ARPA file. */
76 if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
77 E_INFO("No \\data\\ mark in LM file\n");
/* Parse the "ngram N=count" header lines that follow '\data\'. */
81 *n_ug = *n_bg = *n_tg = 0;
82 while ((*li = lineiter_next(*li))) {
83 if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
/* Only orders 1..3 are handled by this reader. */
96 E_ERROR("Unknown ngram (%d)\n", ngram);
101 E_ERROR("EOF while reading ngram counts\n");
105 /* Position iterator to the unigrams header '\1-grams:\' */
106 while ((*li = lineiter_next(*li))) {
107 string_trim((*li)->buf, STRING_BOTH);
108 if (strcmp((*li)->buf, "\\1-grams:") == 0)
112 E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
/* Sanity check: at least one unigram required, no negative counts. */
116 if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
117 E_ERROR("Bad or missing ngram count\n");
124 * Read in the unigrams from given file into the LM structure model.
125 * On entry to this procedure, the iterator is positioned to the
126 * header line '\1-grams:'.
/* NOTE(review): elided listing — return type, braces, the declarations of
 * n/wcnt/p1 and some control flow are not visible here. */
129 ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
131 ngram_model_t *base = &model->base;
135 E_INFO("Reading unigrams\n");
/* One unigram per line: "<log10 prob> <word> [<log10 backoff>]". */
138 while ((*li = lineiter_next(*li))) {
139 char *wptr[3], *name;
/* Back-off weight defaults to 0.0 (log10) when the third field is absent. */
140 float32 bo_wt = 0.0f;
143 string_trim((*li)->buf, STRING_BOTH);
/* Stop at the next section header or the end marker. */
144 if (strcmp((*li)->buf, "\\2-grams:") == 0
145 || strcmp((*li)->buf, "\\end\\") == 0)
/* Malformed lines (fewer than 2 fields) are warned about and skipped;
 * blank lines are silently ignored. */
148 if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
149 if ((*li)->buf[0] != '\0')
150 E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
/* Field 0: log10 probability; field 2 (if present): log10 back-off. */
154 p1 = (float)atof_c(wptr[0]);
157 bo_wt = (float)atof_c(wptr[2]);
/* Guard against more unigrams than the '\data\' header declared. */
160 if (wcnt >= base->n_counts[0]) {
161 E_ERROR("Too many unigrams\n");
165 /* Associate name with word id */
166 base->word_str[wcnt] = ckd_salloc(name);
/* hash_table_enter returns the existing value on collision, so a
 * different return signals a duplicate word entry. */
167 if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt))
168 != (void *)(long)wcnt) {
169 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
/* Convert log10 values into the model's logmath base. */
171 model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
172 model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
/* If fewer unigrams were read than declared, shrink the counts to match. */
176 if (base->n_counts[0] != wcnt) {
177 E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
178 base->n_counts[0], wcnt);
179 base->n_counts[0] = wcnt;
180 base->n_words = wcnt;
186 * Read bigrams from given file into given model structure.
/* NOTE(review): elided listing — return type, braces, declarations of
 * bgptr/p2/bo_wt2/n and several statements (loop continues, error returns,
 * the bgptr/bgcount increments) are not visible here. */
189 ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
191 ngram_model_t *base = &model->base;
192 int32 w1, w2, prev_w1, bgcount;
195 E_INFO("Reading bigrams\n");
198 bgptr = model->lm3g.bigrams;
/* One bigram per line: "<log10 prob> <w1> <w2> [<log10 backoff>]". */
201 while ((*li = lineiter_next(*li))) {
202 float32 p, bo_wt = 0.0f;
204 char *wptr[4], *word1, *word2;
207 string_trim((*li)->buf, STRING_BOTH);
/* Fewer than 3 fields: section header, blank line, or format error. */
209 if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
210 if ((*li)->buf[0] != '\0')
215 p = (float32)atof_c(wptr[0]);
/* Optional fourth field is the bigram's back-off weight. */
219 bo_wt = (float32)atof_c(wptr[3]);
/* Both words must already exist in the unigram vocabulary. */
222 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
223 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
224 word1, word1, word2);
227 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
228 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
229 word2, word1, word2);
233 /* FIXME: Should use logmath_t quantization here. */
234 /* HACK!! to quantize probs to 4 decimal digits */
235 p = (float32)((int32)(p * 10000)) / 10000;
236 bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
238 p2 = logmath_log10_to_log(base->lmath, p);
239 bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);
/* Guard against more bigrams than the '\data\' header declared. */
241 if (bgcount >= base->n_counts[1]) {
242 E_ERROR("Too many bigrams\n");
/* Store indices into the shared sorted prob/back-off value tables
 * rather than the values themselves (lm3g quantization scheme).
 * Back-off weights are only needed when trigrams exist. */
247 bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
248 if (base->n_counts[2] > 0)
249 bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
/* Bigrams must arrive sorted by first-word id; when w1 advances,
 * point every intervening unigram's 'bigrams' field at the current
 * bigram index so follower ranges stay contiguous. */
253 E_ERROR("Bigrams not in unigram order\n");
257 for (prev_w1++; prev_w1 <= w1; prev_w1++)
258 model->lm3g.unigrams[prev_w1].bigrams = bgcount;
/* Periodic progress marker every 64K bigrams. */
264 if ((bgcount & 0x0000ffff) == 0) {
/* After the loop, the iterator must be on the next section header. */
268 if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
269 && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
270 E_ERROR("Bad bigram: %s\n", (*li)->buf);
/* Terminate follower ranges for all remaining unigrams (including the
 * sentinel entry at index n_counts[0]). */
274 for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
275 model->lm3g.unigrams[prev_w1].bigrams = bgcount;
281 * Very similar to ReadBigrams.
/* NOTE(review): elided listing — return type, braces, declarations of
 * tgptr/bgptr/p/p3/tgoff and a number of statements are not visible here;
 * comments annotate only the visible lines. */
284 ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
286 ngram_model_t *base = &model->base;
287 int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
288 int32 seg, prev_seg, prev_seg_lastbg;
292 E_INFO("Reading trigrams\n");
295 tgptr = model->lm3g.trigrams;
/* One trigram per line: "<log10 prob> <w1> <w2> <w3>" (exactly 4 fields). */
301 while ((*li = lineiter_next(*li))) {
304 char *wptr[4], *word1, *word2, *word3;
306 string_trim((*li)->buf, STRING_BOTH);
307 if (str2words((*li)->buf, wptr, 4) != 4) {
308 if ((*li)->buf[0] != '\0')
313 p = (float32)atof_c(wptr[0]);
/* All three words must already exist in the unigram vocabulary. */
319 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
320 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
321 word1, word1, word2, word3);
324 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
325 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
326 word2, word1, word2, word3);
329 if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
330 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
331 word3, word1, word2, word3);
335 /* FIXME: Should use logmath_t quantization here. */
336 /* HACK!! to quantize probs to 4 decimal digits */
337 p = (float32)((int32)(p * 10000)) / 10000;
338 p3 = logmath_log10_to_log(base->lmath, p);
/* Guard against more trigrams than the '\data\' header declared. */
340 if (tgcount >= base->n_counts[2]) {
341 E_ERROR("Too many trigrams\n");
/* Store an index into the shared sorted-probability table. */
346 tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);
348 if ((w1 != prev_w1) || (w2 != prev_w2)) {
349 /* Trigram for a new bigram; update tg info for all previous bigrams */
/* Input must be sorted by (w1, w2); anything else is a fatal error. */
350 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
351 E_ERROR("Trigrams not in bigram order\n");
/* Locate the bigram entry <w1,w2>: scan w1's follower range, starting
 * just past the previous match when w1 is unchanged. */
356 prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
357 endbg = model->lm3g.unigrams[w1 + 1].bigrams;
358 bgptr = model->lm3g.bigrams + bg;
359 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
361 E_ERROR("Missing bigram for trigram: %s", (*li)->buf);
365 /* bg = bigram entry index for <w1,w2>. Update tseg_base */
366 seg = bg >> LOG_BG_SEG_SZ;
/* Newly entered segments start their trigram numbering at tgcount. */
367 for (i = prev_seg + 1; i <= seg; i++)
368 model->lm3g.tseg_base[i] = tgcount;
370 /* Update trigrams pointers for all bigrams until bg */
371 if (prev_seg < seg) {
/* Crossed a segment boundary: first close out the remainder of the
 * previous segment with the current trigram offset... */
375 tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
/* Per-bigram trigram offsets are 16-bit, hence the 65535 limit. */
377 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
382 prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
383 bgptr = model->lm3g.bigrams + prev_bg;
384 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
386 bgptr->trigrams = tgoff;
/* ...then fill the bigrams from the segment start up to bg. */
388 for (; prev_bg <= bg; prev_bg++, bgptr++)
/* Same segment: just propagate the offset from prev_bg+1 through bg. */
394 tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
396 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
400 bgptr = model->lm3g.bigrams + prev_bg;
401 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
402 bgptr->trigrams = tgoff;
/* Periodic progress marker every 64K trigrams. */
414 if ((tgcount & 0x0000ffff) == 0) {
/* After the loop, the iterator must be on the '\end\' marker. */
418 if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
419 E_ERROR("Bad trigram: %s\n", (*li)->buf);
/* Terminate follower ranges for all remaining bigrams (including the
 * sentinel at index n_counts[1]), seeding new segment bases as needed. */
423 for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
424 if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
425 model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
426 if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
427 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
430 model->lm3g.bigrams[prev_bg].trigrams =
431 tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
/* Allocate a zeroed table of n_ug unigrams with probabilities and back-off
 * weights initialized to INT_MIN (i.e. "log zero" sentinels).
 * NOTE(review): the return type and return statement are elided from this
 * listing; presumably it returns the allocated unigram_t* — confirm. */
437 new_unigram_table(int32 n_ug)
442 table = ckd_calloc(n_ug, sizeof(unigram_t));
443 for (i = 0; i < n_ug; i++) {
444 table[i].prob1.l = INT_MIN;
445 table[i].bo_wt1.l = INT_MIN;
/* Read an ARPA-format language model from file_name into a newly allocated
 * ngram_model_t.  Orchestrates ReadNgramCounts / ReadUnigrams / ReadBigrams
 * / ReadTrigrams and builds the lm3g quantized representation.
 * NOTE(review): elided listing — the return type/statements, several error
 * paths, `base`/`n` setup and the lineiter cleanup are not visible here. */
451 ngram_model_arpa_read(cmd_ln_t *config,
452 const char *file_name,
462 ngram_model_arpa_t *model;
/* fopen_comp transparently handles compressed files / pipes. */
465 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
466 E_ERROR("File %s not found\n", file_name);
469 li = lineiter_start(fp);
471 /* Read #unigrams, #bigrams, #trigrams from file */
472 if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
474 fclose_comp(fp, is_pipe);
477 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
479 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
480 model = ckd_calloc(1, sizeof(*model));
/* Model order n is derived from which counts are nonzero. */
484 else if (n_bigram > 0)
488 /* Initialize base model. */
489 ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
490 base->n_counts[0] = n_unigram;
491 base->n_counts[1] = n_bigram;
492 base->n_counts[2] = n_trigram;
493 base->writable = TRUE;
496 * Allocate one extra unigram and bigram entry: sentinels to terminate
497 * followers (bigrams and trigrams, respectively) of previous entry.
499 model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
500 model->lm3g.bigrams =
501 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
503 model->lm3g.trigrams =
504 ckd_calloc(n_trigram, sizeof(trigram_t));
/* One tseg_base entry per BG_SEG_SZ bigrams (plus sentinel coverage). */
507 model->lm3g.tseg_base =
508 ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
/* On any section-read failure: close the file, free the model, bail out. */
511 if (ReadUnigrams(&li, model) == -1) {
512 fclose_comp(fp, is_pipe);
513 ngram_model_free(base);
516 E_INFO("%8d = #unigrams created\n", base->n_counts[0]);
/* Sorted lists deduplicate bigram prob/back-off values; bo_wt2 is only
 * needed when trigrams are present. */
518 init_sorted_list(&model->sorted_prob2);
519 if (base->n_counts[2] > 0)
520 init_sorted_list(&model->sorted_bo_wt2);
522 if (base->n_counts[1] > 0) {
523 if (ReadBigrams(&li, model) == -1) {
524 fclose_comp(fp, is_pipe);
525 ngram_model_free(base);
/* Recompute the actual bigram count from the sentinel unigram, then
 * freeze the deduplicated prob2 table. */
529 base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
530 model->lm3g.n_prob2 = model->sorted_prob2.free;
531 model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
532 free_sorted_list(&model->sorted_prob2);
533 E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
534 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
537 if (base->n_counts[2] > 0) {
538 /* Create trigram bo-wts array */
539 model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
540 model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
541 free_sorted_list(&model->sorted_bo_wt2);
542 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
544 init_sorted_list(&model->sorted_prob3);
546 if (ReadTrigrams(&li, model) == -1) {
547 fclose_comp(fp, is_pipe);
548 ngram_model_free(base);
/* Recompute the actual trigram count from the sentinel bigram, then
 * freeze the deduplicated prob3 table. */
552 base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
553 model->lm3g.n_prob3 = model->sorted_prob3.free;
554 model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
555 E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
556 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
558 free_sorted_list(&model->sorted_prob3);
560 /* Initialize tginfo */
/* Per-unigram trigram-follower caches, allocated lazily from 'le'. */
561 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
562 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
566 fclose_comp(fp, is_pipe);
/* Write the model back out in ARPA text format using the generic N-gram
 * iterator API.
 * NOTE(review): elided listing — the return type/value, braces, and the
 * bowt-presence test around line 609 are not visible here. */
571 ngram_model_arpa_write(ngram_model_t *model,
572 const char *file_name)
578 if ((fh = fopen(file_name, "w")) == NULL) {
579 E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
582 fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
584 /* The ARPA format doesn't require any extra information that
585 * N-Gram iterators can't give us, so this is very
586 * straightforward compared with DMP writing. */
588 /* Write N-gram counts. */
589 fprintf(fh, "\\data\\\n");
590 for (i = 0; i < model->n; ++i) {
591 fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
/* Write each order's section: "\k-grams:" followed by one line per
 * N-gram: log10 score, the words, and (where present) the back-off. */
595 for (i = 0; i < model->n; ++i) {
596 fprintf(fh, "\n\\%d-grams:\n", i + 1);
597 for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
602 wids = ngram_iter_get(itor, &score, &bowt);
/* Scores are stored in logmath base; convert back to log10 for ARPA. */
603 fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
604 for (j = 0; j <= i; ++j) {
605 assert(wids[j] < model->n_counts[0]);
606 fprintf(fh, "%s ", model->word_str[wids[j]]);
609 fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
613 fprintf(fh, "\n\\end\\\n");
/* Apply language weight (lw), word insertion penalty (wip) and unigram
 * weight (uw) by delegating to the shared lm3g implementation. */
618 ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
619 float32 wip, float32 uw)
621 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
622 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
626 /* Lousy "templating" for things that are largely the same in DMP and
627 * ARPA models, except for the bigram and trigram types and some
/* The included file expands its generic code against NGRAM_MODEL_TYPE,
 * defining the lm3g_template_* functions referenced in the vtable below. */
629 #define NGRAM_MODEL_TYPE ngram_model_arpa_t
630 #include "lm3g_templates.c"
/* Release all lm3g storage owned by an ARPA model.  The base structure
 * itself is freed by the generic ngram_model_free machinery, which calls
 * this via the vtable.
 * NOTE(review): the return type and closing brace are elided from this
 * listing; word strings/hash table are presumably freed by the base. */
633 ngram_model_arpa_free(ngram_model_t *base)
635 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
636 ckd_free(model->lm3g.unigrams);
637 ckd_free(model->lm3g.bigrams);
638 ckd_free(model->lm3g.trigrams);
639 ckd_free(model->lm3g.prob2);
640 ckd_free(model->lm3g.bo_wt2);
641 ckd_free(model->lm3g.prob3);
/* Frees the per-unigram trigram caches and their listelem allocator. */
642 lm3g_tginfo_free(base, &model->lm3g);
643 ckd_free(model->lm3g.tseg_base);
/* Vtable binding the ARPA reader to the generic ngram_model_t interface.
 * Query/iteration operations come from the lm3g_templates.c expansion above.
 * NOTE(review): the closing "};" is elided from this listing. */
646 static ngram_funcs_t ngram_model_arpa_funcs = {
647 ngram_model_arpa_free, /* free */
648 ngram_model_arpa_apply_weights, /* apply_weights */
649 lm3g_template_score, /* score */
650 lm3g_template_raw_score, /* raw_score */
651 lm3g_template_add_ug, /* add_ug */
652 lm3g_template_flush, /* flush */
653 lm3g_template_iter, /* iter */
654 lm3g_template_mgrams, /* mgrams */
655 lm3g_template_successors, /* successors */
656 lm3g_template_iter_get, /* iter_get */
657 lm3g_template_iter_next, /* iter_next */
658 lm3g_template_iter_free /* iter_free */