src/libsphinxbase/lm/ngram_model_dmp.c

   1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* ====================================================================
   3  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
   4  * reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  *
  18  * This work was supported in part by funding from the Defense Advanced
  19  * Research Projects Agency and the National Science Foundation of the
  20  * United States of America, and the CMU Sphinx Speech Consortium.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33  *
  34  * ====================================================================
  35  *
  36  */
  37 /*
  38  * \file ngram_model_dmp.c DMP format language models
  39  *
  40  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
  41  */
  42
  43 #include <assert.h>
  44 #include <stdio.h>
  45 #include <string.h>
  46 #include <stdlib.h>
  47 #include <limits.h>
  48
  49 #include "sphinxbase/ckd_alloc.h"
  50 #include "sphinxbase/pio.h"
  51 #include "sphinxbase/err.h"
  52 #include "sphinxbase/byteorder.h"
  53 #include "sphinxbase/listelem_alloc.h"
  54
  55 #include "ngram_model_dmp.h"
  56
/* Magic string stored (length-prefixed, including the trailing NUL) at the
 * very start of every DMP file; the reader compares against it and the
 * writer emits it. */
static const char darpa_hdr[] = "Darpa Trigram LM";
/* Function table for DMP models; declared here so that the reader and
 * builder can pass its address to ngram_model_init(). */
static ngram_funcs_t ngram_model_dmp_funcs;

/* tseg_base entry for the bigram segment containing bigram index b. */
#define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram belonging to unigram u. */
#define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
/* Index of the first trigram of bigram b: the segment base plus the
 * per-bigram offset stored in the bigram record. */
#define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
  63
  64 static unigram_t *
  65 new_unigram_table(int32 n_ug)
  66 {
  67     unigram_t *table;
  68     int32 i;
  69
  70     table = ckd_calloc(n_ug, sizeof(unigram_t));
  71     for (i = 0; i < n_ug; i++) {
  72         table[i].prob1.f = -99.0;
  73         table[i].bo_wt1.f = -99.0;
  74     }
  75     return table;
  76 }
  77
  78 ngram_model_t *
  79 ngram_model_dmp_read(cmd_ln_t *config,
  80                      const char *file_name,
  81                      logmath_t *lmath)
  82 {
  83     ngram_model_t *base;
  84     ngram_model_dmp_t *model;
  85     FILE *fp;
  86     int do_mmap, do_swap;
  87     int32 is_pipe;
  88     int32 i, j, k, vn, n, ts;
  89     int32 n_unigram;
  90     int32 n_bigram;
  91     int32 n_trigram;
  92     char str[1024];
  93     unigram_t *ugptr;
  94     bigram_t *bgptr;
  95     trigram_t *tgptr;
  96     char *tmp_word_str;
  97     char *map_base = NULL;
  98     size_t offset = 0, filesize;
  99
 100     base = NULL;
 101     do_mmap = FALSE;
 102     if (config)
 103         do_mmap = cmd_ln_boolean_r(config, "-mmap");
 104
 105     if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
 106         E_ERROR("Dump file %s not found\n", file_name);
 107         goto error_out;
 108     }
 109
 110     if (is_pipe && do_mmap) {
 111         E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
 112         do_mmap = 0;
 113     }
 114
 115     do_swap = FALSE;
 116     if (fread(&k, sizeof(k), 1, fp) != 1)
 117         goto error_out;
 118     if (k != strlen(darpa_hdr)+1) {
 119         SWAP_INT32(&k);
 120         if (k != strlen(darpa_hdr)+1) {
 121             E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
 122             goto error_out;
 123         }
 124         do_swap = 1;
 125     }
 126     if (fread(str, 1, k, fp) != (size_t) k) {
 127         E_ERROR("Cannot read header\n");
 128         goto error_out;
 129     }
 130     if (strncmp(str, darpa_hdr, k) != 0) {
 131         E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
 132         goto error_out;
 133     }
 134
 135     if (do_mmap) {
 136         if (do_swap) {
 137             E_INFO
 138                 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
 139             do_mmap = 0;
 140         }
 141         else {
 142             E_INFO("Will use memory-mapped I/O for LM file\n");
 143 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
 144             E_FATAL("memory mapping is not supported at the moment.");
 145 #else
 146 #endif
 147         }
 148     }
 149
 150     if (fread(&k, sizeof(k), 1, fp) != 1)
 151         goto error_out;
 152     if (do_swap) SWAP_INT32(&k);
 153     if (fread(str, 1, k, fp) != (size_t) k) {
 154         E_ERROR("Cannot read LM filename in header\n");
 155         goto error_out;
 156     }
 157
 158     /* read version#, if present (must be <= 0) */
 159     if (fread(&vn, sizeof(vn), 1, fp) != 1)
 160         goto error_out;
 161     if (do_swap) SWAP_INT32(&vn);
 162     if (vn <= 0) {
 163         /* read and don't compare timestamps (we don't care) */
 164         if (fread(&ts, sizeof(ts), 1, fp) != 1)
 165             goto error_out;
 166         if (do_swap) SWAP_INT32(&ts);
 167
 168         /* read and skip format description */
 169         for (;;) {
 170             if (fread(&k, sizeof(k), 1, fp) != 1)
 171                 goto error_out;
 172             if (do_swap) SWAP_INT32(&k);
 173             if (k == 0)
 174                 break;
 175             if (fread(str, 1, k, fp) != (size_t) k) {
 176                 E_ERROR("Failed to read word\n");
 177                 goto error_out;
 178             }
 179         }
 180         /* read model->ucount */
 181         if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
 182             goto error_out;
 183         if (do_swap) SWAP_INT32(&n_unigram);
 184     }
 185     else {
 186         n_unigram = vn;
 187     }
 188
 189     /* read model->bcount, tcount */
 190     if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
 191         goto error_out;
 192     if (do_swap) SWAP_INT32(&n_bigram);
 193     if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
 194         goto error_out;
 195     if (do_swap) SWAP_INT32(&n_trigram);
 196     E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
 197
 198     /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
 199     model = ckd_calloc(1, sizeof(*model));
 200     base = &model->base;
 201     if (n_trigram > 0)
 202         n = 3;
 203     else if (n_bigram > 0)
 204         n = 2;
 205     else
 206         n = 1;
 207     ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
 208     base->n_counts[0] = n_unigram;
 209     base->n_counts[1] = n_bigram;
 210     base->n_counts[2] = n_trigram;
 211
 212     /* read unigrams (always in memory, as they contain dictionary
 213      * mappings that can't be precomputed, and also could have OOVs added) */
 214     model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
 215     ugptr = model->lm3g.unigrams;
 216     for (i = 0; i <= n_unigram; ++i) {
 217         /* Skip over the mapping ID, we don't care about it. */
 218         if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
 219             E_ERROR("Failed to read maping id %d\n", i);
 220             goto error_out;
 221         }
 222         /* Read the actual unigram structure. */
 223         if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1)  {
 224             E_ERROR("Failed to read unigrams data\n");
 225             ngram_model_free(base);
 226             fclose_comp(fp, is_pipe);
 227             return NULL;
 228         }
 229         /* Byte swap if necessary. */
 230         if (do_swap) {
 231             SWAP_INT32(&ugptr->prob1.l);
 232             SWAP_INT32(&ugptr->bo_wt1.l);
 233             SWAP_INT32(&ugptr->bigrams);
 234         }
 235         /* Convert values to log. */
 236         ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
 237         ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
 238         E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
 239                     i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
 240         ++ugptr;
 241     }
 242     E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
 243
 244     /* Now mmap() the file and read in the rest of the (read-only) stuff. */
 245     if (do_mmap) {
 246         offset = ftell(fp);
 247         fseek(fp, 0, SEEK_END);
 248         filesize = ftell(fp);
 249         fseek(fp, offset, SEEK_SET);
 250
 251         /* Check for improper word alignment. */
 252         if (offset & 0x3) {
 253             E_WARN("-mmap specified, but trigram index is not word-aligned.  Will not memory-map.\n");
 254             do_mmap = FALSE;
 255         }
 256         else {
 257             model->dump_mmap = mmio_file_read(file_name);
 258             if (model->dump_mmap == NULL) {
 259                 do_mmap = FALSE;
 260             }
 261             else {
 262                 map_base = mmio_file_ptr(model->dump_mmap);
 263             }
 264         }
 265     }
 266
 267     if (n_bigram > 0) {
 268         /* read bigrams */
 269         if (do_mmap) {
 270             model->lm3g.bigrams = (bigram_t *) (map_base + offset);
 271             offset += (n_bigram + 1) * sizeof(bigram_t);
 272         }
 273         else {
 274             model->lm3g.bigrams =
 275                 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
 276             if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
 277                 != (size_t) n_bigram + 1) {
 278                 E_ERROR("Failed to read bigrams data\n");
 279                 goto error_out;
 280             }
 281             if (do_swap) {
 282                 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
 283                      i++, bgptr++) {
 284                     SWAP_INT16(&bgptr->wid);
 285                     SWAP_INT16(&bgptr->prob2);
 286                     SWAP_INT16(&bgptr->bo_wt2);
 287                     SWAP_INT16(&bgptr->trigrams);
 288                 }
 289             }
 290         }
 291         E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
 292     }
 293
 294     /* read trigrams */
 295     if (n_trigram > 0) {
 296         if (do_mmap) {
 297             model->lm3g.trigrams = (trigram_t *) (map_base + offset);
 298             offset += n_trigram * sizeof(trigram_t);
 299         }
 300         else {
 301             model->lm3g.trigrams =
 302                 ckd_calloc(n_trigram, sizeof(trigram_t));
 303             if (fread
 304                 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
 305                 != (size_t) n_trigram) {
 306                 E_ERROR("Failed to read trigrams data\n");
 307                 goto error_out;
 308             }
 309             if (do_swap) {
 310                 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
 311                      i++, tgptr++) {
 312                     SWAP_INT16(&tgptr->wid);
 313                     SWAP_INT16(&tgptr->prob3);
 314                 }
 315             }
 316         }
 317         E_INFO("%8d = LM.trigrams read\n", n_trigram);
 318         /* Initialize tginfo */
 319         model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
 320         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
 321     }
 322
 323     if (n_bigram > 0) {
 324         /* read n_prob2 and prob2 array (in memory) */
 325         if (do_mmap)
 326             fseek(fp, offset, SEEK_SET);
 327         if (fread(&k, sizeof(k), 1, fp) != 1)
 328             goto error_out;
 329         if (do_swap) SWAP_INT32(&k);
 330         model->lm3g.n_prob2 = k;
 331         model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
 332         if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
 333             E_ERROR("fread(prob2) failed\n");
 334             goto error_out;
 335         }
 336         for (i = 0; i < k; i++) {
 337             if (do_swap)
 338                 SWAP_INT32(&model->lm3g.prob2[i].l);
 339             /* Convert values to log. */
 340             model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
 341         }
 342         E_INFO("%8d = LM.prob2 entries read\n", k);
 343     }
 344
 345     /* read n_bo_wt2 and bo_wt2 array (in memory) */
 346     if (base->n > 2) {
 347         if (fread(&k, sizeof(k), 1, fp) != 1)
 348             goto error_out;
 349         if (do_swap) SWAP_INT32(&k);
 350         model->lm3g.n_bo_wt2 = k;
 351         model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
 352         if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
 353             E_ERROR("Failed to read backoff weights\n");
 354             goto error_out;
 355         }
 356         for (i = 0; i < k; i++) {
 357             if (do_swap)
 358                 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
 359             /* Convert values to log. */
 360             model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
 361         }
 362         E_INFO("%8d = LM.bo_wt2 entries read\n", k);
 363     }
 364
 365     /* read n_prob3 and prob3 array (in memory) */
 366     if (base->n > 2) {
 367         if (fread(&k, sizeof(k), 1, fp) != 1)
 368                 goto error_out;
 369         if (do_swap) SWAP_INT32(&k);
 370         model->lm3g.n_prob3 = k;
 371         model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
 372         if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
 373             E_ERROR("Failed to read trigram probability\n");
 374             goto error_out;
 375         }
 376         for (i = 0; i < k; i++) {
 377             if (do_swap)
 378                 SWAP_INT32(&model->lm3g.prob3[i].l);
 379             /* Convert values to log. */
 380             model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
 381         }
 382         E_INFO("%8d = LM.prob3 entries read\n", k);
 383     }
 384
 385     /* read tseg_base size and tseg_base */
 386     if (do_mmap)
 387         offset = ftell(fp);
 388     if (n_trigram > 0) {
 389         if (do_mmap) {
 390             memcpy(&k, map_base + offset, sizeof(k));
 391             offset += sizeof(int32);
 392             model->lm3g.tseg_base = (int32 *) (map_base + offset);
 393             offset += k * sizeof(int32);
 394         }
 395         else {
 396             k = (n_bigram + 1) / BG_SEG_SZ + 1;
 397             if (fread(&k, sizeof(k), 1, fp) != 1)
 398                 goto error_out;
 399             if (do_swap) SWAP_INT32(&k);
 400             model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
 401             if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
 402                 (size_t) k) {
 403                 E_ERROR("Failed to read trigram index\n");
 404                 goto error_out;
 405             }
 406             if (do_swap)
 407                 for (i = 0; i < k; i++)
 408                     SWAP_INT32(&model->lm3g.tseg_base[i]);
 409         }
 410         E_INFO("%8d = LM.tseg_base entries read\n", k);
 411     }
 412
 413     /* read ascii word strings */
 414     if (do_mmap) {
 415         memcpy(&k, map_base + offset, sizeof(k));
 416         offset += sizeof(int32);
 417         tmp_word_str = (char *) (map_base + offset);
 418         offset += k;
 419     }
 420     else {
 421         base->writable = TRUE;
 422         if (fread(&k, sizeof(k), 1, fp) != 1)
 423             goto error_out;
 424         if (do_swap) SWAP_INT32(&k);
 425         tmp_word_str = ckd_calloc(k, 1);
 426         if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
 427             E_ERROR("Failed to read words\n");
 428             goto error_out;
 429         }
 430     }
 431
 432     /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
 433     for (i = 0, j = 0; i < k; i++)
 434         if (tmp_word_str[i] == '\0')
 435             j++;
 436     if (j != n_unigram) {
 437         E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
 438                 j, n_unigram);
 439         goto error_out;
 440     }
 441
 442     /* Break up string just read into words */
 443     if (do_mmap) {
 444         j = 0;
 445         for (i = 0; i < n_unigram; i++) {
 446             base->word_str[i] = tmp_word_str + j;
 447             if (hash_table_enter(base->wid, base->word_str[i],
 448                                  (void *)(long)i) != (void *)(long)i) {
 449                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
 450             }
 451             j += strlen(base->word_str[i]) + 1;
 452         }
 453     }
 454     else {
 455         j = 0;
 456         for (i = 0; i < n_unigram; i++) {
 457             base->word_str[i] = ckd_salloc(tmp_word_str + j);
 458             if (hash_table_enter(base->wid, base->word_str[i],
 459                                  (void *)(long)i) != (void *)(long)i) {
 460                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
 461             }
 462             j += strlen(base->word_str[i]) + 1;
 463         }
 464         free(tmp_word_str);
 465     }
 466     E_INFO("%8d = ascii word strings read\n", i);
 467
 468     fclose_comp(fp, is_pipe);
 469     return base;
 470
 471 error_out:
 472     if (fp)
 473         fclose_comp(fp, is_pipe);
 474     ngram_model_free(base);
 475     return NULL;
 476 }
 477
 478 ngram_model_dmp_t *
 479 ngram_model_dmp_build(ngram_model_t *base)
 480 {
 481     ngram_model_dmp_t *model;
 482     ngram_model_t *newbase;
 483     ngram_iter_t *itor;
 484     sorted_list_t sorted_prob2;
 485     sorted_list_t sorted_bo_wt2;
 486     sorted_list_t sorted_prob3;
 487     bigram_t *bgptr;
 488     trigram_t *tgptr;
 489     int i, bgcount, tgcount, seg;
 490
 491     if (base->funcs == &ngram_model_dmp_funcs) {
 492         E_INFO("Using existing DMP model.\n");
 493         return (ngram_model_dmp_t *)ngram_model_retain(base);
 494     }
 495
 496     /* Initialize new base model structure with params from base. */
 497     E_INFO("Building DMP model...\n");
 498     model = ckd_calloc(1, sizeof(*model));
 499     newbase = &model->base;
 500     ngram_model_init(newbase, &ngram_model_dmp_funcs,
 501                      logmath_retain(base->lmath),
 502                      base->n, base->n_counts[0]);
 503     /* Copy N-gram counts over. */
 504     memcpy(newbase->n_counts, base->n_counts,
 505            base->n * sizeof(*base->n_counts));
 506     /* Make sure word strings are freed. */
 507     newbase->writable = TRUE;
 508     /* Initialize unigram table and string table. */
 509     model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
 510     for (itor = ngram_model_mgrams(base, 0); itor;
 511          itor = ngram_iter_next(itor)) {
 512         int32 prob1, bo_wt1;
 513         int32 const *wids;
 514
 515         /* Can't guarantee they will go in unigram order, so just to
 516          * be correct, we do this... */
 517         wids = ngram_iter_get(itor, &prob1, &bo_wt1);
 518         model->lm3g.unigrams[wids[0]].prob1.l = prob1;
 519         model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
 520         newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
 521         if ((hash_table_enter_int32(newbase->wid,
 522                                     newbase->word_str[wids[0]], wids[0]))
 523             != wids[0]) {
 524                 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
 525         }
 526     }
 527     E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
 528
 529     if (newbase->n < 2)
 530         return model;
 531
 532     /* Construct quantized probability table for bigrams and
 533      * (optionally) trigrams.  Hesitate to use the "sorted list" thing
 534      * since it isn't so useful, but it's there already. */
 535     init_sorted_list(&sorted_prob2);
 536     if (newbase->n > 2) {
 537         init_sorted_list(&sorted_bo_wt2);
 538         init_sorted_list(&sorted_prob3);
 539     }
 540     /* Construct bigram and trigram arrays. */
 541     bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
 542     if (newbase->n > 2) {
 543         tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
 544         model->lm3g.tseg_base =
 545             ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
 546     }
 547     else
 548         tgptr = NULL;
 549     /* Since bigrams and trigrams have to be contiguous with others
 550      * with the same N-1-gram, we traverse them in depth-first order
 551      * to build the bigram and trigram arrays. */
 552     for (i = 0; i < newbase->n_counts[0]; ++i) {
 553         ngram_iter_t *uitor;
 554         bgcount = bgptr - model->lm3g.bigrams;
 555         /* First bigram index (same as next if no bigrams...) */
 556         model->lm3g.unigrams[i].bigrams = bgcount;
 557         E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
 558         /* All bigrams corresponding to unigram i */
 559         uitor = ngram_ng_iter(base, i, NULL, 0);
 560         for (itor = ngram_iter_successors(uitor);
 561              itor; ++bgptr, itor = ngram_iter_next(itor)) {
 562             int32 prob2, bo_wt2;
 563             int32 const *wids;
 564             ngram_iter_t *titor;
 565
 566             wids = ngram_iter_get(itor, &prob2, &bo_wt2);
 567
 568             assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]);
 569
 570             bgptr->wid = wids[1];
 571             bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
 572             if (newbase->n > 2) {
 573                 tgcount = (tgptr - model->lm3g.trigrams);
 574                 bgcount = (bgptr - model->lm3g.bigrams);
 575
 576                 /* Backoff weight (only if there are trigrams...) */
 577                 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
 578
 579                 /* Find bigram segment for this bigram (this isn't
 580                  * used unless there are trigrams) */
 581                 seg = bgcount >> LOG_BG_SEG_SZ;
 582                 /* If we just crossed a bigram segment boundary, then
 583                  * point tseg_base for the new segment to the current
 584                  * trigram pointer. */
 585                 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
 586                     model->lm3g.tseg_base[seg] = tgcount;
 587                 /* Now calculate the trigram offset. */
 588                 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
 589                 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
 590                             bgcount,
 591                             newbase->word_str[wids[0]],
 592                             newbase->word_str[wids[1]],
 593                             seg, bgptr->trigrams));
 594
 595                 /* And fill in successors' trigram info. */
 596                 for (titor = ngram_iter_successors(itor);
 597                      titor; ++tgptr, titor = ngram_iter_next(titor)) {
 598                     int32 prob3, dummy;
 599
 600                     assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
 601                     wids = ngram_iter_get(titor, &prob3, &dummy);
 602                     tgptr->wid = wids[2];
 603                     tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
 604                     E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
 605                                 tgcount,
 606                                 newbase->word_str[wids[0]],
 607                                 newbase->word_str[wids[1]],
 608                                 newbase->word_str[wids[2]],
 609                                 tgptr->prob3));
 610                 }
 611             }
 612         }
 613         ngram_iter_free(uitor);
 614     }
 615     /* Add sentinal unigram and bigram records. */
 616     bgcount = bgptr - model->lm3g.bigrams;
 617     tgcount = tgptr - model->lm3g.trigrams;
 618     seg = bgcount >> LOG_BG_SEG_SZ;
 619     if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
 620         model->lm3g.tseg_base[seg] = tgcount;
 621     model->lm3g.unigrams[i].bigrams = bgcount;
 622     if (newbase->n > 2)
 623         bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
 624
 625     /* Now create probability tables. */
 626     model->lm3g.n_prob2 = sorted_prob2.free;
 627     model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
 628     E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
 629     E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
 630     free_sorted_list(&sorted_prob2);
 631     if (newbase->n > 2) {
 632         /* Create trigram bo-wts array. */
 633         model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
 634         model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
 635         free_sorted_list(&sorted_bo_wt2);
 636         E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
 637         /* Create trigram probability table. */
 638         model->lm3g.n_prob3 = sorted_prob3.free;
 639         model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
 640         E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
 641         E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
 642         free_sorted_list(&sorted_prob3);
 643         /* Initialize tginfo */
 644         model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
 645         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
 646     }
 647
 648     return model;
 649 }
 650
 651 static void
 652 fwrite_int32(FILE *fh, int32 val)
 653 {
 654     fwrite(&val, 4, 1, fh);
 655 }
 656
 657 static void
 658 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
 659 {
 660     int32 bogus = -1;
 661     float32 log10val;
 662
 663     /* Bogus dictionary mapping field. */
 664     fwrite(&bogus, 4, 1, fh);
 665     /* Convert values to log10. */
 666     log10val = logmath_log_to_log10(lmath, ug->prob1.l);
 667     fwrite(&log10val, 4, 1, fh);
 668     log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
 669     fwrite(&log10val, 4, 1, fh);
 670     fwrite_int32(fh, ug->bigrams);
 671 }
 672
 673 static void
 674 fwrite_bg(FILE *fh, bigram_t *bg)
 675 {
 676     fwrite(bg, sizeof(*bg), 1, fh);
 677 }
 678
 679 static void
 680 fwrite_tg(FILE *fh, trigram_t *tg)
 681 {
 682     fwrite(tg, sizeof(*tg), 1, fh);
 683 }
 684
/**
 * Human-readable description of the DMP file layout, one string per
 * entry, terminated by a NULL pointer.  These strings are written
 * verbatim (each prefixed by its length) into the header of a dump
 * file by ngram_model_dmp_write_fmtdesc(), so they are part of the
 * file contents.
 */
static char const *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL,
};
 716
 717 static void
 718 ngram_model_dmp_write_header(FILE * fh)
 719 {
 720     int32 k;
 721     k = strlen(darpa_hdr) + 1;
 722     fwrite_int32(fh, k);
 723     fwrite(darpa_hdr, 1, k, fh);
 724 }
 725
 726 static void
 727 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
 728 {
 729     int32 k;
 730
 731     k = strlen(lmfile) + 1;
 732     fwrite_int32(fh, k);
 733     fwrite(lmfile, 1, k, fh);
 734 }
 735
 736 #define LMDMP_VERSION_TG_16BIT -1 /**< VERSION 1 is the simplest DMP file which
 737                                      is trigram or lower which used 16 bits in
 738                                      bigram and trigram.*/
 739
 740 static void
 741 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
 742 {
 743     fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);   /* version # */
 744     fwrite_int32(fh, mtime);
 745 }
 746
 747 static void
 748 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
 749 {
 750     fwrite_int32(fh, model->n_counts[0]);
 751     fwrite_int32(fh, model->n_counts[1]);
 752     fwrite_int32(fh, model->n_counts[2]);
 753 }
 754
 755 static void
 756 ngram_model_dmp_write_fmtdesc(FILE * fh)
 757 {
 758     int32 i, k;
 759     long pos;
 760
 761     /* Write file format description into header */
 762     for (i = 0; fmtdesc[i] != NULL; i++) {
 763         k = strlen(fmtdesc[i]) + 1;
 764         fwrite_int32(fh, k);
 765         fwrite(fmtdesc[i], 1, k, fh);
 766     }
 767     /* Pad it out in order to achieve 32-bit alignment */
 768     pos = ftell(fh);
 769     k = pos & 3;
 770     if (k) {
 771         fwrite_int32(fh, 4-k);
 772         fwrite("!!!!", 1, 4-k, fh);
 773     }
 774     fwrite_int32(fh, 0);
 775 }
 776
 777 static void
 778 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
 779 {
 780     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 781     int32 i;
 782
 783     for (i = 0; i <= model->n_counts[0]; i++) {
 784         fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
 785     }
 786 }
 787
 788
 789 static void
 790 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
 791 {
 792     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 793     int32 i;
 794
 795     for (i = 0; i <= model->n_counts[1]; i++) {
 796         fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
 797     }
 798
 799 }
 800
 801 static void
 802 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
 803 {
 804     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 805     int32 i;
 806
 807     for (i = 0; i < model->n_counts[2]; i++) {
 808         fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
 809     }
 810 }
 811
 812 static void
 813 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
 814 {
 815     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 816     int32 i;
 817
 818     fwrite_int32(fh, lm->lm3g.n_prob2);
 819     for (i = 0; i < lm->lm3g.n_prob2; i++) {
 820         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
 821         fwrite(&log10val, 4, 1, fh);
 822     }
 823 }
 824
 825 static void
 826 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
 827 {
 828     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 829     int32 i;
 830
 831     fwrite_int32(fh, lm->lm3g.n_bo_wt2);
 832     for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
 833         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
 834         fwrite(&log10val, 4, 1, fh);
 835     }
 836 }
 837
 838 static void
 839 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
 840 {
 841     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 842     int32 i;
 843
 844     fwrite_int32(fh, lm->lm3g.n_prob3);
 845     for (i = 0; i < lm->lm3g.n_prob3; i++) {
 846         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
 847         fwrite(&log10val, 4, 1, fh);
 848     }
 849 }
 850
 851 static void
 852 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
 853 {
 854     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
 855     int32 i, k;
 856
 857     k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
 858     fwrite_int32(fh, k);
 859     for (i = 0; i < k; i++)
 860         fwrite_int32(fh, lm->lm3g.tseg_base[i]);
 861 }
 862
 863 static void
 864 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
 865 {
 866     int32 i, k;
 867
 868     k = 0;
 869     for (i = 0; i < model->n_counts[0]; i++)
 870         k += strlen(model->word_str[i]) + 1;
 871     fwrite_int32(fh, k);
 872     for (i = 0; i < model->n_counts[0]; i++)
 873         fwrite(model->word_str[i], 1,
 874                strlen(model->word_str[i]) + 1, fh);
 875 }
 876
 877 int
 878 ngram_model_dmp_write(ngram_model_t *base,
 879                       const char *file_name)
 880 {
 881     ngram_model_dmp_t *model;
 882     ngram_model_t *newbase;
 883     FILE *fh;
 884
 885     /* First, construct a DMP model from the base model. */
 886     model = ngram_model_dmp_build(base);
 887     newbase = &model->base;
 888
 889     /* Now write it, confident in the knowledge that it's the right
 890      * kind of language model internally. */
 891     if ((fh = fopen(file_name, "wb")) == NULL) {
 892         E_ERROR("Cannot create file %s\n", file_name);
 893         return -1;
 894     }
 895     ngram_model_dmp_write_header(fh);
 896     ngram_model_dmp_write_lm_filename(fh, file_name);
 897     ngram_model_dmp_write_version(fh, 0);
 898     ngram_model_dmp_write_fmtdesc(fh);
 899     ngram_model_dmp_write_ngram_counts(fh, newbase);
 900     ngram_model_dmp_write_unigram(fh, newbase);
 901     if (newbase->n > 1) {
 902         ngram_model_dmp_write_bigram(fh, newbase);
 903         if (newbase->n > 2) {
 904             ngram_model_dmp_write_trigram(fh, newbase);
 905         }
 906         ngram_model_dmp_write_bgprob(fh, newbase);
 907         if (newbase->n > 2) {
 908                 ngram_model_dmp_write_tgbowt(fh, newbase);
 909                 ngram_model_dmp_write_tgprob(fh, newbase);
 910                 ngram_model_dmp_write_tg_segbase(fh, newbase);
 911         }
 912     }
 913     ngram_model_dmp_write_wordstr(fh, newbase);
 914     ngram_model_free(newbase);
 915
 916     return fclose(fh);
 917 }
 918
 919 static int
 920 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
 921                               float32 wip, float32 uw)
 922 {
 923     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
 924     lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
 925     return 0;
 926 }
 927
 928 /* Lousy "templating" for things that are largely the same in DMP and
 929  * ARPA models, except for the bigram and trigram types and some
 930  * names. */
 931 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
 932 #include "lm3g_templates.c"
 933
 934 static void
 935 ngram_model_dmp_free(ngram_model_t *base)
 936 {
 937     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
 938
 939     ckd_free(model->lm3g.unigrams);
 940     ckd_free(model->lm3g.prob2);
 941     if (model->dump_mmap) {
 942         mmio_file_unmap(model->dump_mmap);
 943     }
 944     else {
 945         ckd_free(model->lm3g.bigrams);
 946         if (base->n > 2) {
 947             ckd_free(model->lm3g.trigrams);
 948             ckd_free(model->lm3g.tseg_base);
 949         }
 950     }
 951     if (base->n > 2) {
 952         ckd_free(model->lm3g.bo_wt2);
 953         ckd_free(model->lm3g.prob3);
 954     }
 955
 956     lm3g_tginfo_free(base, &model->lm3g);
 957 }
 958
/**
 * Function table for DMP models.
 *
 * free and apply_weights are the DMP-specific functions defined in
 * this file; the remaining entries are the lm3g_template_* functions.
 * The initializer is positional, so the entries must stay in the same
 * order as the members of ngram_funcs_t (the trailing comments name the
 * intended slot of each entry).
 */
static ngram_funcs_t ngram_model_dmp_funcs = {
    ngram_model_dmp_free,          /* free */
    ngram_model_dmp_apply_weights, /* apply_weights */
    lm3g_template_score,           /* score */
    lm3g_template_raw_score,       /* raw_score */
    lm3g_template_add_ug,          /* add_ug */
    lm3g_template_flush,           /* flush */
    lm3g_template_iter,            /* iter */
    lm3g_template_mgrams,          /* mgrams */
    lm3g_template_successors,      /* successors */
    lm3g_template_iter_get,        /* iter_get */
    lm3g_template_iter_next,       /* iter_next */
    lm3g_template_iter_free        /* iter_free */
};