1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
38 * \file ngram_model_dmp.c DMP format language models
40 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
49 #include "sphinxbase/ckd_alloc.h"
50 #include "sphinxbase/pio.h"
51 #include "sphinxbase/err.h"
52 #include "sphinxbase/byteorder.h"
53 #include "sphinxbase/listelem_alloc.h"
55 #include "ngram_model_dmp.h"
/* Magic string of a dump file: the reader compares the header it reads
 * against it and the writer emits it. */
57 static const char darpa_hdr[] = "Darpa Trigram LM";
/* Forward declaration; the table is defined later in this file. */
58 static ngram_funcs_t ngram_model_dmp_funcs;
/* Trigram segment base for bigram index b; tseg_base is indexed by
 * b >> LOG_BG_SEG_SZ. */
60 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram belonging to unigram u. */
61 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* Index of the first trigram of bigram b: the segment base plus the
 * offset stored in the bigram record. */
62 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
/*
 * Allocate a table of n_ug unigram_t entries with ckd_calloc() and preset
 * the float view of prob1 and bo_wt1 in every entry to -99.0.
 * The declarations and the return statement are on lines not shown here.
 */
65 new_unigram_table(int32 n_ug)
70 table = ckd_calloc(n_ug, sizeof(unigram_t));
71 for (i = 0; i < n_ug; i++) {
/* NOTE(review): -99.0 looks like a log10 "practically impossible" default
 * (the reader treats the .f fields as log10 values) -- confirm. */
72 table[i].prob1.f = -99.0;
73 table[i].bo_wt1.f = -99.0;
/*
 * Load a DMP dump file ("Darpa Trigram LM" format) into a new model.
 *
 * config    - consulted for the boolean "-mmap" option.
 * file_name - dump file to open; it goes through fopen_comp(), which reports
 *             through is_pipe whether the file is read via a pipe.
 *
 * The remaining parameters and the return statements are on lines that are
 * not part of this excerpt.  lmath is used to convert the log10 values stored
 * in the file with logmath_log10_to_log().
 *
 * Unigrams and the prob2 / bo_wt2 / prob3 tables are always read into
 * allocated memory.  Bigrams, trigrams, tseg_base and the word strings are
 * either addressed inside the memory map (map_base + offset) or read with
 * fread() into ckd_calloc()ed buffers.
 */
79 ngram_model_dmp_read(cmd_ln_t *config,
80 const char *file_name,
84 ngram_model_dmp_t *model;
88 int32 i, j, k, vn, n, ts;
97 char *map_base = NULL;
98 size_t offset = 0, filesize;
103 do_mmap = cmd_ln_boolean_r(config, "-mmap");
105 if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
106 E_ERROR("Dump file %s not found\n", file_name);
/* A file read through a pipe cannot be memory-mapped. */
110 if (is_pipe && do_mmap) {
111 E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
/* The file starts with the int32 length of the magic string (NUL included). */
116 if (fread(&k, sizeof(k), 1, fp) != 1)
/* NOTE(review): the same test appears twice; the step between the two
 * (presumably a byte swap of k) is on a line not shown here -- confirm. */
118 if (k != strlen(darpa_hdr)+1) {
120 if (k != strlen(darpa_hdr)+1) {
121 E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
126 if (fread(str, 1, k, fp) != (size_t) k) {
127 E_ERROR("Cannot read header\n");
130 if (strncmp(str, darpa_hdr, k) != 0) {
/* The format has two %s conversions, so both the expected header and the
 * file name must be supplied; a missing argument is undefined behaviour. */
131 E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr, file_name);
138 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
142 E_INFO("Will use memory-mapped I/O for LM file\n");
143 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
144 E_FATAL("memory mapping is not supported at the moment.");
/* Original LM file name: int32 length followed by the string. */
150 if (fread(&k, sizeof(k), 1, fp) != 1)
152 if (do_swap) SWAP_INT32(&k);
153 if (fread(str, 1, k, fp) != (size_t) k) {
154 E_ERROR("Cannot read LM filename in header\n");
158 /* read version#, if present (must be <= 0) */
159 if (fread(&vn, sizeof(vn), 1, fp) != 1)
161 if (do_swap) SWAP_INT32(&vn);
163 /* read and don't compare timestamps (we don't care) */
164 if (fread(&ts, sizeof(ts), 1, fp) != 1)
166 if (do_swap) SWAP_INT32(&ts);
168 /* read and skip format description */
170 if (fread(&k, sizeof(k), 1, fp) != 1)
172 if (do_swap) SWAP_INT32(&k);
175 if (fread(str, 1, k, fp) != (size_t) k) {
176 E_ERROR("Failed to read word\n");
180 /* read model->ucount */
181 if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
183 if (do_swap) SWAP_INT32(&n_unigram);
189 /* read model->bcount, tcount */
190 if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
192 if (do_swap) SWAP_INT32(&n_bigram);
193 if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
195 if (do_swap) SWAP_INT32(&n_trigram);
196 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
198 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
199 model = ckd_calloc(1, sizeof(*model));
203 else if (n_bigram > 0)
207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
208 base->n_counts[0] = n_unigram;
209 base->n_counts[1] = n_bigram;
210 base->n_counts[2] = n_trigram;
212 /* read unigrams (always in memory, as they contain dictionary
213 * mappings that can't be precomputed, and also could have OOVs added) */
/* One entry more than n_unigram is allocated and read (loop runs i <= n_unigram). */
214 model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
215 ugptr = model->lm3g.unigrams;
216 for (i = 0; i <= n_unigram; ++i) {
217 /* Skip over the mapping ID, we don't care about it. */
/* The id is read into *ugptr and then overwritten by the record read below. */
218 if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
219 E_ERROR("Failed to read maping id %d\n", i);
222 /* Read the actual unigram structure. */
223 if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) {
224 E_ERROR("Failed to read unigrams data\n");
225 ngram_model_free(base);
226 fclose_comp(fp, is_pipe);
229 /* Byte swap if necessary. */
231 SWAP_INT32(&ugptr->prob1.l);
232 SWAP_INT32(&ugptr->bo_wt1.l);
233 SWAP_INT32(&ugptr->bigrams);
235 /* Convert values to log. */
/* The float view (.f) holds the file's log10 value; the integer view (.l)
 * receives the converted value in place. */
236 ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
237 ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
238 E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
239 i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
242 E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
244 /* Now mmap() the file and read in the rest of the (read-only) stuff. */
/* Measure the file, then seek back to offset. */
247 fseek(fp, 0, SEEK_END);
248 filesize = ftell(fp);
249 fseek(fp, offset, SEEK_SET);
251 /* Check for improper word alignment. */
253 E_WARN("-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n");
257 model->dump_mmap = mmio_file_read(file_name);
258 if (model->dump_mmap == NULL) {
262 map_base = mmio_file_ptr(model->dump_mmap);
/* Bigrams: either addressed inside the map ... */
270 model->lm3g.bigrams = (bigram_t *) (map_base + offset);
271 offset += (n_bigram + 1) * sizeof(bigram_t);
/* ... or read into allocated memory (n_bigram + 1 records). */
274 model->lm3g.bigrams =
275 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
276 if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
277 != (size_t) n_bigram + 1) {
278 E_ERROR("Failed to read bigrams data\n");
282 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
284 SWAP_INT16(&bgptr->wid);
285 SWAP_INT16(&bgptr->prob2);
286 SWAP_INT16(&bgptr->bo_wt2);
287 SWAP_INT16(&bgptr->trigrams);
291 E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
/* Trigrams: same two variants, n_trigram records (no extra record). */
297 model->lm3g.trigrams = (trigram_t *) (map_base + offset);
298 offset += n_trigram * sizeof(trigram_t);
301 model->lm3g.trigrams =
302 ckd_calloc(n_trigram, sizeof(trigram_t));
304 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
305 != (size_t) n_trigram) {
306 E_ERROR("Failed to read trigrams data\n");
310 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
312 SWAP_INT16(&tgptr->wid);
313 SWAP_INT16(&tgptr->prob3);
317 E_INFO("%8d = LM.trigrams read\n", n_trigram);
318 /* Initialize tginfo */
319 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
320 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
324 /* read n_prob2 and prob2 array (in memory) */
326 fseek(fp, offset, SEEK_SET);
327 if (fread(&k, sizeof(k), 1, fp) != 1)
329 if (do_swap) SWAP_INT32(&k);
/* NOTE(review): k comes straight from the file and is used as an
 * allocation count without a range check visible here. */
330 model->lm3g.n_prob2 = k;
331 model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
332 if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
333 E_ERROR("fread(prob2) failed\n");
336 for (i = 0; i < k; i++) {
338 SWAP_INT32(&model->lm3g.prob2[i].l);
339 /* Convert values to log. */
340 model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
342 E_INFO("%8d = LM.prob2 entries read\n", k);
345 /* read n_bo_wt2 and bo_wt2 array (in memory) */
347 if (fread(&k, sizeof(k), 1, fp) != 1)
349 if (do_swap) SWAP_INT32(&k);
350 model->lm3g.n_bo_wt2 = k;
351 model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
352 if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
353 E_ERROR("Failed to read backoff weights\n");
356 for (i = 0; i < k; i++) {
358 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
359 /* Convert values to log. */
360 model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
362 E_INFO("%8d = LM.bo_wt2 entries read\n", k);
365 /* read n_prob3 and prob3 array (in memory) */
367 if (fread(&k, sizeof(k), 1, fp) != 1)
369 if (do_swap) SWAP_INT32(&k);
370 model->lm3g.n_prob3 = k;
371 model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
372 if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
373 E_ERROR("Failed to read trigram probability\n");
376 for (i = 0; i < k; i++) {
378 SWAP_INT32(&model->lm3g.prob3[i].l);
379 /* Convert values to log. */
380 model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
382 E_INFO("%8d = LM.prob3 entries read\n", k);
385 /* read tseg_base size and tseg_base */
/* Mapped variant: memcpy() is used for the count, so it is read without
 * dereferencing map_base + offset as an int32 pointer. */
390 memcpy(&k, map_base + offset, sizeof(k));
391 offset += sizeof(int32);
392 model->lm3g.tseg_base = (int32 *) (map_base + offset);
393 offset += k * sizeof(int32);
/* NOTE(review): the value computed here is overwritten by the fread() on
 * the next line. */
396 k = (n_bigram + 1) / BG_SEG_SZ + 1;
397 if (fread(&k, sizeof(k), 1, fp) != 1)
399 if (do_swap) SWAP_INT32(&k);
400 model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
401 if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
403 E_ERROR("Failed to read trigram index\n");
407 for (i = 0; i < k; i++)
408 SWAP_INT32(&model->lm3g.tseg_base[i]);
410 E_INFO("%8d = LM.tseg_base entries read\n", k);
413 /* read ascii word strings */
415 memcpy(&k, map_base + offset, sizeof(k));
416 offset += sizeof(int32);
417 tmp_word_str = (char *) (map_base + offset);
/* NOTE(review): presumably set because the word strings are copied with
 * ckd_salloc() further down in this variant -- confirm. */
421 base->writable = TRUE;
422 if (fread(&k, sizeof(k), 1, fp) != 1)
424 if (do_swap) SWAP_INT32(&k);
425 tmp_word_str = ckd_calloc(k, 1);
426 if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
427 E_ERROR("Failed to read words\n");
432 /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
433 for (i = 0, j = 0; i < k; i++)
434 if (tmp_word_str[i] == '\0')
436 if (j != n_unigram) {
437 E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
442 /* Break up string just read into words */
/* Variant 1: word_str[] points directly into tmp_word_str. */
445 for (i = 0; i < n_unigram; i++) {
446 base->word_str[i] = tmp_word_str + j;
447 if (hash_table_enter(base->wid, base->word_str[i],
448 (void *)(long)i) != (void *)(long)i) {
449 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
451 j += strlen(base->word_str[i]) + 1;
/* Variant 2: every word is copied with ckd_salloc(). */
456 for (i = 0; i < n_unigram; i++) {
457 base->word_str[i] = ckd_salloc(tmp_word_str + j);
458 if (hash_table_enter(base->wid, base->word_str[i],
459 (void *)(long)i) != (void *)(long)i) {
460 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
462 j += strlen(base->word_str[i]) + 1;
466 E_INFO("%8d = ascii word strings read\n", i);
468 fclose_comp(fp, is_pipe);
/* Presumably the shared failure exit: close the file and release the
 * partially built model. */
473 fclose_comp(fp, is_pipe);
474 ngram_model_free(base);
/*
 * Build a DMP-format model from an arbitrary n-gram model.
 *
 * If base already uses ngram_model_dmp_funcs it is retained and returned
 * unchanged.  Otherwise a new model is allocated and initialized from base's
 * parameters, unigrams and word strings are copied, the bigram and trigram
 * arrays are filled by iterating over base, and bigram/trigram values are
 * mapped to table indices through sorted lists (sorted_id() while filling,
 * vals_in_sorted_list() to produce the tables).
 *
 * The return statement for the newly built model is on a line not shown here.
 */
479 ngram_model_dmp_build(ngram_model_t *base)
481 ngram_model_dmp_t *model;
482 ngram_model_t *newbase;
484 sorted_list_t sorted_prob2;
485 sorted_list_t sorted_bo_wt2;
486 sorted_list_t sorted_prob3;
489 int i, bgcount, tgcount, seg;
/* Nothing to convert when base is already a DMP model; hand back a new
 * reference to it. */
491 if (base->funcs == &ngram_model_dmp_funcs) {
492 E_INFO("Using existing DMP model.\n");
493 return (ngram_model_dmp_t *)ngram_model_retain(base);
496 /* Initialize new base model structure with params from base. */
497 E_INFO("Building DMP model...\n");
498 model = ckd_calloc(1, sizeof(*model));
499 newbase = &model->base;
500 ngram_model_init(newbase, &ngram_model_dmp_funcs,
501 logmath_retain(base->lmath),
502 base->n, base->n_counts[0]);
503 /* Copy N-gram counts over. */
504 memcpy(newbase->n_counts, base->n_counts,
505 base->n * sizeof(*base->n_counts));
506 /* Make sure word strings are freed. */
507 newbase->writable = TRUE;
508 /* Initialize unigram table and string table. */
/* One extra entry is allocated; it is filled in after the main loop below. */
509 model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
510 for (itor = ngram_model_mgrams(base, 0); itor;
511 itor = ngram_iter_next(itor)) {
515 /* Can't guarantee they will go in unigram order, so just to
516 * be correct, we do this... */
/* Entries are stored at the word id reported by the iterator, not at the
 * iteration position. */
517 wids = ngram_iter_get(itor, &prob1, &bo_wt1);
518 model->lm3g.unigrams[wids[0]].prob1.l = prob1;
519 model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
520 newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
521 if ((hash_table_enter_int32(newbase->wid,
522 newbase->word_str[wids[0]], wids[0]))
524 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
527 E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
532 /* Construct quantized probability table for bigrams and
533 * (optionally) trigrams. Hesitate to use the "sorted list" thing
534 * since it isn't so useful, but it's there already. */
535 init_sorted_list(&sorted_prob2);
536 if (newbase->n > 2) {
537 init_sorted_list(&sorted_bo_wt2);
538 init_sorted_list(&sorted_prob3);
540 /* Construct bigram and trigram arrays. */
/* The bigram array has one record more than n_counts[1]. */
541 bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
542 if (newbase->n > 2) {
543 tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
544 model->lm3g.tseg_base =
545 ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
549 /* Since bigrams and trigrams have to be contiguous with others
550 * with the same N-1-gram, we traverse them in depth-first order
551 * to build the bigram and trigram arrays. */
552 for (i = 0; i < newbase->n_counts[0]; ++i) {
554 bgcount = bgptr - model->lm3g.bigrams;
555 /* First bigram index (same as next if no bigrams...) */
556 model->lm3g.unigrams[i].bigrams = bgcount;
557 E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
558 /* All bigrams corresponding to unigram i */
559 uitor = ngram_ng_iter(base, i, NULL, 0);
/* bgptr advances together with the successor iterator. */
560 for (itor = ngram_iter_successors(uitor);
561 itor; ++bgptr, itor = ngram_iter_next(itor)) {
566 wids = ngram_iter_get(itor, &prob2, &bo_wt2);
568 assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]);
570 bgptr->wid = wids[1];
/* Store the index of the value in the sorted list, not the value itself. */
571 bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
572 if (newbase->n > 2) {
573 tgcount = (tgptr - model->lm3g.trigrams);
574 bgcount = (bgptr - model->lm3g.bigrams);
576 /* Backoff weight (only if there are trigrams...) */
577 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
579 /* Find bigram segment for this bigram (this isn't
580 * used unless there are trigrams) */
581 seg = bgcount >> LOG_BG_SEG_SZ;
582 /* If we just crossed a bigram segment boundary, then
583 * point tseg_base for the new segment to the current
584 * trigram pointer. */
/* NOTE(review): for bgcount == 0 this right-shifts -1, whose result is
 * implementation-defined in C -- confirm the first segment is handled as
 * intended on the target compilers. */
585 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
586 model->lm3g.tseg_base[seg] = tgcount;
587 /* Now calculate the trigram offset. */
588 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
589 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
591 newbase->word_str[wids[0]],
592 newbase->word_str[wids[1]],
593 seg, bgptr->trigrams));
595 /* And fill in successors' trigram info. */
596 for (titor = ngram_iter_successors(itor);
597 titor; ++tgptr, titor = ngram_iter_next(titor)) {
600 assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
/* Trigrams carry no backoff weight here; it is read into dummy. */
601 wids = ngram_iter_get(titor, &prob3, &dummy);
602 tgptr->wid = wids[2];
603 tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
604 E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
606 newbase->word_str[wids[0]],
607 newbase->word_str[wids[1]],
608 newbase->word_str[wids[2]],
613 ngram_iter_free(uitor);
615 /* Add sentinel unigram and bigram records. */
616 bgcount = bgptr - model->lm3g.bigrams;
617 tgcount = tgptr - model->lm3g.trigrams;
618 seg = bgcount >> LOG_BG_SEG_SZ;
619 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
620 model->lm3g.tseg_base[seg] = tgcount;
/* i equals n_counts[0] after the loop above, i.e. the extra unigram entry. */
621 model->lm3g.unigrams[i].bigrams = bgcount;
623 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
625 /* Now create probability tables. */
/* The list's .free field is used as the number of table entries. */
626 model->lm3g.n_prob2 = sorted_prob2.free;
627 model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
628 E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
629 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
630 free_sorted_list(&sorted_prob2);
631 if (newbase->n > 2) {
632 /* Create trigram bo-wts array. */
633 model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
634 model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
635 free_sorted_list(&sorted_bo_wt2);
636 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
637 /* Create trigram probability table. */
638 model->lm3g.n_prob3 = sorted_prob3.free;
639 model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
640 E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
641 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
642 free_sorted_list(&sorted_prob3);
643 /* Initialize tginfo */
644 model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
645 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
/*
 * Write val to fh as 4 raw bytes, exactly as stored in memory (no byte
 * swapping).  The result of fwrite() is not checked.
 */
652 fwrite_int32(FILE *fh, int32 val)
654 fwrite(&val, 4, 1, fh);
/*
 * Write one unigram record to fh: a placeholder mapping word, prob1 and
 * bo_wt1 converted to log10 through lmath, and the first-bigram index.
 * Each field is written as 4 bytes; fwrite() results are not checked.
 */
658 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
663 /* Bogus dictionary mapping field. */
664 fwrite(&bogus, 4, 1, fh);
665 /* Convert values to log10. */
666 log10val = logmath_log_to_log10(lmath, ug->prob1.l);
667 fwrite(&log10val, 4, 1, fh);
668 log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
669 fwrite(&log10val, 4, 1, fh);
670 fwrite_int32(fh, ug->bigrams);
/* Write one bigram record to fh as a raw copy of the struct (sizeof(*bg)
 * bytes). */
674 fwrite_bg(FILE *fh, bigram_t *bg)
676 fwrite(bg, sizeof(*bg), 1, fh);
/* Write one trigram record to fh as a raw copy of the struct (sizeof(*tg)
 * bytes). */
680 fwrite_tg(FILE *fh, trigram_t *tg)
682 fwrite(tg, sizeof(*tg), 1, fh);
685 /** Please look at the definition of the DMP file format given by the strings below. */
/* Human-readable description of the dump file layout, one string per line.
 * ngram_model_dmp_write_fmtdesc() writes these strings into the file and
 * stops at a NULL entry; the terminating entry itself is on a line not
 * shown here. */
687 static char const *fmtdesc[] = {
688 "BEGIN FILE FORMAT DESCRIPTION",
689 "Header string length (int32) and string (including trailing 0)",
690 "Original LM filename string-length (int32) and filename (including trailing 0)",
691 "(int32) version number (present iff value <= 0)",
692 "(int32) original LM file modification timestamp (iff version# present)",
693 "(int32) string-length and string (including trailing 0) (iff version# present)",
694 "... previous entry continued any number of times (iff version# present)",
695 "(int32) 0 (terminating sequence of strings) (iff version# present)",
696 "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
697 "(int32) lm_t.ucount (must be > 0)",
698 "(int32) lm_t.bcount",
699 "(int32) lm_t.tcount",
700 "lm_t.ucount+1 unigrams (including sentinel)",
701 "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
702 "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
703 "(int32) lm_t.n_prob2",
704 "(int32) lm_t.prob2[]",
705 "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
706 "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
707 "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
708 "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
709 "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
710 "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
711 "(int32) Sum(all word string-lengths, including trailing 0 for each)",
712 "All word strings (including trailing 0 for each)",
713 "END FILE FORMAT DESCRIPTION",
/*
 * Write the magic header string darpa_hdr to fh, including its terminating
 * NUL (k = strlen + 1).
 * NOTE(review): the int32 length prefix described in fmtdesc is presumably
 * written on a line not shown here -- confirm.
 */
718 ngram_model_dmp_write_header(FILE * fh)
721 k = strlen(darpa_hdr) + 1;
723 fwrite(darpa_hdr, 1, k, fh);
/*
 * Write the LM file name lmfile to fh, including its terminating NUL
 * (k = strlen + 1).  lmfile must not be NULL, since strlen() is applied to it.
 * NOTE(review): the int32 length prefix described in fmtdesc is presumably
 * written on a line not shown here -- confirm.
 */
727 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
731 k = strlen(lmfile) + 1;
733 fwrite(lmfile, 1, k, fh);
736 #define LMDMP_VERSION_TG_16BIT -1 /**< Version -1 is the simplest DMP file format: a trigram
737 (or lower-order) model whose bigram and trigram
738 records use 16-bit fields. */
/* Write the version number (always LMDMP_VERSION_TG_16BIT) followed by the
 * timestamp mtime, each as an int32. */
741 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
743 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */
744 fwrite_int32(fh, mtime);
/* Write the unigram, bigram and trigram counts (n_counts[0..2]) as three
 * int32 values.  All three are written regardless of model->n. */
748 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
750 fwrite_int32(fh, model->n_counts[0]);
751 fwrite_int32(fh, model->n_counts[1]);
752 fwrite_int32(fh, model->n_counts[2]);
/*
 * Write the format description strings from fmtdesc[] to fh, then a padding
 * entry.
 */
756 ngram_model_dmp_write_fmtdesc(FILE * fh)
761 /* Write file format description into header */
/* Stop at the NULL entry; each string is written with its NUL (k = strlen + 1). */
762 for (i = 0; fmtdesc[i] != NULL; i++) {
763 k = strlen(fmtdesc[i]) + 1;
765 fwrite(fmtdesc[i], 1, k, fh);
767 /* Pad it out in order to achieve 32-bit alignment */
/* NOTE(review): how k is recomputed before the padding is on lines not shown
 * here.  The pad is written like one more entry: the int32 value 4-k followed
 * by that many '!' bytes. */
771 fwrite_int32(fh, 4-k);
772 fwrite("!!!!", 1, 4-k, fh);
/* Write all unigram records.  The loop runs up to and including index
 * n_counts[0], so n_counts[0] + 1 records are written.  model is cast to
 * ngram_model_dmp_t, so it must really be a DMP model. */
778 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
780 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
783 for (i = 0; i <= model->n_counts[0]; i++) {
784 fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
/* Write all bigram records.  The loop runs up to and including index
 * n_counts[1], so n_counts[1] + 1 records are written. */
790 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
792 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
795 for (i = 0; i <= model->n_counts[1]; i++) {
796 fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
/* Write the n_counts[2] trigram records (no extra trailing record). */
802 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
804 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
807 for (i = 0; i < model->n_counts[2]; i++) {
808 fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
/* Write the bigram probability table: the entry count n_prob2, then every
 * entry converted to log10 and written as a 4-byte float32. */
813 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
815 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
818 fwrite_int32(fh, lm->lm3g.n_prob2);
819 for (i = 0; i < lm->lm3g.n_prob2; i++) {
820 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
821 fwrite(&log10val, 4, 1, fh);
/* Write the bigram backoff-weight table: the entry count n_bo_wt2, then
 * every entry converted to log10 and written as a 4-byte float32. */
826 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
828 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
831 fwrite_int32(fh, lm->lm3g.n_bo_wt2);
832 for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
833 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
834 fwrite(&log10val, 4, 1, fh);
/* Write the trigram probability table: the entry count n_prob3, then every
 * entry converted to log10 and written as a 4-byte float32. */
839 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
841 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
844 fwrite_int32(fh, lm->lm3g.n_prob3);
845 for (i = 0; i < lm->lm3g.n_prob3; i++) {
846 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
847 fwrite(&log10val, 4, 1, fh);
/*
 * Write the tseg_base array.  Its length is derived from the bigram count
 * with the same formula used when the array is allocated:
 * (n_counts[1] + 1) / BG_SEG_SZ + 1.
 * NOTE(review): the count itself is presumably written on a line not shown
 * here (fmtdesc lists it before the array) -- confirm.
 */
852 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
854 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
857 k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
859 for (i = 0; i < k; i++)
860 fwrite_int32(fh, lm->lm3g.tseg_base[i]);
/*
 * Write the word strings of the first n_counts[0] words, each including its
 * terminating NUL.  The first loop accumulates the total byte length in k;
 * the initialization of k and the write of the total are on lines not shown
 * here.
 */
864 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
869 for (i = 0; i < model->n_counts[0]; i++)
870 k += strlen(model->word_str[i]) + 1;
872 for (i = 0; i < model->n_counts[0]; i++)
873 fwrite(model->word_str[i], 1,
874 strlen(model->word_str[i]) + 1, fh);
/*
 * Write base to file_name in DMP format.
 *
 * base is first converted with ngram_model_dmp_build(), which returns base
 * itself (retained) when it already is a DMP model.  The sections are then
 * written in file order, and the reference obtained from the build step is
 * released with ngram_model_free().
 *
 * Closing fh and the return values are on lines not shown here.
 */
878 ngram_model_dmp_write(ngram_model_t *base,
879 const char *file_name)
881 ngram_model_dmp_t *model;
882 ngram_model_t *newbase;
885 /* First, construct a DMP model from the base model. */
886 model = ngram_model_dmp_build(base);
887 newbase = &model->base;
889 /* Now write it, confident in the knowledge that it's the right
890 * kind of language model internally. */
/* NOTE(review): what follows the E_ERROR on this failure path is not visible
 * here; confirm that the model obtained from ngram_model_dmp_build() is
 * released before returning. */
891 if ((fh = fopen(file_name, "wb")) == NULL) {
892 E_ERROR("Cannot create file %s\n", file_name);
895 ngram_model_dmp_write_header(fh);
/* The output file's own name is recorded as the LM file name, and the
 * timestamp is written as 0. */
896 ngram_model_dmp_write_lm_filename(fh, file_name);
897 ngram_model_dmp_write_version(fh, 0);
898 ngram_model_dmp_write_fmtdesc(fh);
899 ngram_model_dmp_write_ngram_counts(fh, newbase);
900 ngram_model_dmp_write_unigram(fh, newbase);
/* Bigram data only for n > 1; trigram data only for n > 2. */
901 if (newbase->n > 1) {
902 ngram_model_dmp_write_bigram(fh, newbase);
903 if (newbase->n > 2) {
904 ngram_model_dmp_write_trigram(fh, newbase);
906 ngram_model_dmp_write_bgprob(fh, newbase);
907 if (newbase->n > 2) {
908 ngram_model_dmp_write_tgbowt(fh, newbase);
909 ngram_model_dmp_write_tgprob(fh, newbase);
910 ngram_model_dmp_write_tg_segbase(fh, newbase);
913 ngram_model_dmp_write_wordstr(fh, newbase);
914 ngram_model_free(newbase);
/* apply_weights entry of the DMP function table: forwards lw, wip and uw to
 * lm3g_apply_weights() together with this model's lm3g data. */
920 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
921 float32 wip, float32 uw)
923 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
924 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
928 /* Lousy "templating" for things that are largely the same in DMP and
929 * ARPA models, except for the bigram and trigram types and some
 * names. */
/* NGRAM_MODEL_TYPE is defined immediately before the template source is
 * included.  NOTE(review): presumably the template uses it as the concrete
 * model type and provides the lm3g_template_* functions referenced in
 * ngram_model_dmp_funcs -- confirm against lm3g_templates.c. */
931 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
932 #include "lm3g_templates.c"
/*
 * free entry of the DMP function table: releases the DMP-specific storage of
 * the model.
 *
 * unigrams and prob2 are freed first.  When a memory map is present it is
 * unmapped; the conditions guarding the remaining ckd_free() calls (bigrams,
 * trigrams, tseg_base, bo_wt2, prob3) are on lines not shown here.
 */
935 ngram_model_dmp_free(ngram_model_t *base)
937 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
939 ckd_free(model->lm3g.unigrams);
940 ckd_free(model->lm3g.prob2);
941 if (model->dump_mmap) {
942 mmio_file_unmap(model->dump_mmap);
945 ckd_free(model->lm3g.bigrams);
947 ckd_free(model->lm3g.trigrams);
948 ckd_free(model->lm3g.tseg_base);
952 ckd_free(model->lm3g.bo_wt2);
953 ckd_free(model->lm3g.prob3);
/* Release the tginfo data held in lm3g. */
956 lm3g_tginfo_free(base, &model->lm3g);
/* Function table for DMP models.  It is passed to ngram_model_init() and
 * compared against base->funcs to recognize DMP models.  free and
 * apply_weights are defined in this file; the other entries are
 * lm3g_template_* functions.  The slot name is given after each entry. */
959 static ngram_funcs_t ngram_model_dmp_funcs = {
960 ngram_model_dmp_free, /* free */
961 ngram_model_dmp_apply_weights, /* apply_weights */
962 lm3g_template_score, /* score */
963 lm3g_template_raw_score, /* raw_score */
964 lm3g_template_add_ug, /* add_ug */
965 lm3g_template_flush, /* flush */
966 lm3g_template_iter, /* iter */
967 lm3g_template_mgrams, /* mgrams */
968 lm3g_template_successors, /* successors */
969 lm3g_template_iter_get, /* iter_get */
970 lm3g_template_iter_next, /* iter_next */
971 lm3g_template_iter_free /* iter_free */