1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
38 * \file ngram_model_arpa.c ARPA format language models
40 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
43 #include "sphinxbase/ckd_alloc.h"
48 #include "sphinxbase/err.h"
49 #include "sphinxbase/pio.h"
50 #include "sphinxbase/listelem_alloc.h"
51 #include "sphinxbase/strfuncs.h"
53 #include "ngram_model_arpa.h"
/* Forward declaration of the ARPA-model vtable; the definition is at the
 * bottom of this file. */
55 static ngram_funcs_t ngram_model_arpa_funcs;

/* Trigram-segment base for bigram index b: bigrams are grouped into
 * segments of 2^LOG_BG_SEG_SZ entries, each with its own base trigram
 * index, so per-bigram trigram offsets can stay small. */
57 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram follower of unigram u. */
58 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* Absolute index of the first trigram follower of bigram b
 * (segment base plus the bigram's stored trigram offset). */
59 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
62 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
/* NOTE(review): this listing is elided — the function's return type, braces
 * and several statements are not visible here; comments annotate only the
 * visible lines.  Presumably returns 0 on success, -1 on error (the caller
 * at the bottom of this file compares against -1) — confirm against the
 * full source. */
65 ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
67 int32 ngram, ngram_cnt;
69 /* skip file until past the '\data\' marker */
/* Check whether the current line already is the '\data\' marker;
 * otherwise advance through the file looking for it. */
71 string_trim((*li)->buf, STRING_BOTH);
72 if (strcmp((*li)->buf, "\\data\\") == 0)
74 *li = lineiter_next(*li);
/* EOF or no marker found: not a valid ARPA file. */
76 if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
77 E_INFO("No \\data\\ mark in LM file\n");
/* Parse the "ngram N=count" header lines that follow '\data\'. */
81 *n_ug = *n_bg = *n_tg = 0;
82 while ((*li = lineiter_next(*li))) {
83 if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
/* Only orders 1..3 are handled by this reader. */
96 E_ERROR("Unknown ngram (%d)\n", ngram);
101 E_ERROR("EOF while reading ngram counts\n");
105 /* Position iterator to the unigrams header '\1-grams:\' */
106 while ((*li = lineiter_next(*li))) {
107 string_trim((*li)->buf, STRING_BOTH);
108 if (strcmp((*li)->buf, "\\1-grams:") == 0)
112 E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
/* Sanity check: at least one unigram required, no negative counts. */
116 if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
117 E_ERROR("Bad or missing ngram count\n");
124 * Read in the unigrams from given file into the LM structure model.
125 * On entry to this procedure, the iterator is positioned to the
126 * header line '\1-grams:'.
/* NOTE(review): elided listing — return type, braces, the declarations of
 * n/wcnt/p1 and some control flow are not visible here. */
129 ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
131 ngram_model_t *base = &model->base;
135 E_INFO("Reading unigrams\n");
/* One unigram per line: "<log10 prob> <word> [<log10 backoff>]". */
138 while ((*li = lineiter_next(*li))) {
139 char *wptr[3], *name;
/* Back-off weight defaults to 0.0 (log10) when the third field is absent. */
140 float32 bo_wt = 0.0f;
143 string_trim((*li)->buf, STRING_BOTH);
/* Stop at the next section header or the end marker. */
144 if (strcmp((*li)->buf, "\\2-grams:") == 0
145 || strcmp((*li)->buf, "\\end\\") == 0)
/* Malformed lines (fewer than 2 fields) are warned about and skipped;
 * blank lines are silently ignored. */
148 if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
149 if ((*li)->buf[0] != '\0')
150 E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
/* Field 0: log10 probability; field 2 (if present): log10 back-off. */
154 p1 = (float)atof_c(wptr[0]);
157 bo_wt = (float)atof_c(wptr[2]);
/* Guard against more unigrams than the '\data\' header declared. */
160 if (wcnt >= base->n_counts[0]) {
161 E_ERROR("Too many unigrams\n");
165 /* Associate name with word id */
166 base->word_str[wcnt] = ckd_salloc(name);
/* hash_table_enter returns the existing value on collision, so a
 * different return signals a duplicate word entry. */
167 if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt))
168 != (void *)(long)wcnt) {
169 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
/* Convert log10 values into the model's logmath base. */
171 model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
172 model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
/* If fewer unigrams were read than declared, shrink the counts to match. */
176 if (base->n_counts[0] != wcnt) {
177 E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
178 base->n_counts[0], wcnt);
179 base->n_counts[0] = wcnt;
180 base->n_words = wcnt;
186 * Read bigrams from given file into given model structure.
/* NOTE(review): elided listing — return type, braces, declarations of
 * bgptr/p2/bo_wt2/n and several statements (loop continues, error returns,
 * the bgptr/bgcount increments) are not visible here. */
189 ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
191 ngram_model_t *base = &model->base;
192 int32 w1, w2, prev_w1, bgcount;
195 E_INFO("Reading bigrams\n");
198 bgptr = model->lm3g.bigrams;
/* One bigram per line: "<log10 prob> <w1> <w2> [<log10 backoff>]". */
201 while ((*li = lineiter_next(*li))) {
202 float32 p, bo_wt = 0.0f;
204 char *wptr[4], *word1, *word2;
207 string_trim((*li)->buf, STRING_BOTH);
/* Fewer than 3 fields: section header, blank line, or format error. */
209 if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
210 if ((*li)->buf[0] != '\0')
215 p = (float32)atof_c(wptr[0]);
/* Optional fourth field is the bigram's back-off weight. */
219 bo_wt = (float32)atof_c(wptr[3]);
/* Both words must already exist in the unigram vocabulary. */
222 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
223 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
224 word1, word1, word2);
227 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
228 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
229 word2, word1, word2);
233 /* FIXME: Should use logmath_t quantization here. */
234 /* HACK!! to quantize probs to 4 decimal digits */
235 p = (float32)((int32)(p * 10000)) / 10000;
236 bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
238 p2 = logmath_log10_to_log(base->lmath, p);
239 bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);
/* Guard against more bigrams than the '\data\' header declared. */
241 if (bgcount >= base->n_counts[1]) {
242 E_ERROR("Too many bigrams\n");
/* Store indices into the shared sorted prob/back-off value tables
 * rather than the values themselves (lm3g quantization scheme).
 * Back-off weights are only needed when trigrams exist. */
247 bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
248 if (base->n_counts[2] > 0)
249 bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
/* Bigrams must arrive sorted by first-word id; when w1 advances,
 * point every intervening unigram's 'bigrams' field at the current
 * bigram index so follower ranges stay contiguous. */
253 E_ERROR("Bigrams not in unigram order\n");
257 for (prev_w1++; prev_w1 <= w1; prev_w1++)
258 model->lm3g.unigrams[prev_w1].bigrams = bgcount;
/* Periodic progress marker every 64K bigrams. */
264 if ((bgcount & 0x0000ffff) == 0) {
/* After the loop, the iterator must be on the next section header. */
268 if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
269 && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
270 E_ERROR("Bad bigram: %s\n", (*li)->buf);
/* Terminate follower ranges for all remaining unigrams (including the
 * sentinel entry at index n_counts[0]). */
274 for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
275 model->lm3g.unigrams[prev_w1].bigrams = bgcount;
281 * Very similar to ReadBigrams.
/* NOTE(review): elided listing — return type, braces, declarations of
 * tgptr/bgptr/p/p3/tgoff and a number of statements are not visible here;
 * comments annotate only the visible lines. */
284 ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
286 ngram_model_t *base = &model->base;
287 int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
288 int32 seg, prev_seg, prev_seg_lastbg;
292 E_INFO("Reading trigrams\n");
295 tgptr = model->lm3g.trigrams;
/* One trigram per line: "<log10 prob> <w1> <w2> <w3>" (exactly 4 fields). */
301 while ((*li = lineiter_next(*li))) {
304 char *wptr[4], *word1, *word2, *word3;
306 string_trim((*li)->buf, STRING_BOTH);
307 if (str2words((*li)->buf, wptr, 4) != 4) {
308 if ((*li)->buf[0] != '\0')
313 p = (float32)atof_c(wptr[0]);
/* All three words must already exist in the unigram vocabulary. */
319 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
320 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
321 word1, word1, word2, word3);
324 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
325 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
326 word2, word1, word2, word3);
329 if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
330 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
331 word3, word1, word2, word3);
335 /* FIXME: Should use logmath_t quantization here. */
336 /* HACK!! to quantize probs to 4 decimal digits */
337 p = (float32)((int32)(p * 10000)) / 10000;
338 p3 = logmath_log10_to_log(base->lmath, p);
/* Guard against more trigrams than the '\data\' header declared. */
340 if (tgcount >= base->n_counts[2]) {
341 E_ERROR("Too many trigrams\n");
/* Store an index into the shared sorted-probability table. */
346 tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);
348 if ((w1 != prev_w1) || (w2 != prev_w2)) {
349 /* Trigram for a new bigram; update tg info for all previous bigrams */
/* Input must be sorted by (w1, w2); anything else is a fatal error. */
350 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
351 E_ERROR("Trigrams not in bigram order\n");
/* Locate the bigram entry <w1,w2>: scan w1's follower range, starting
 * just past the previous match when w1 is unchanged. */
356 prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
357 endbg = model->lm3g.unigrams[w1 + 1].bigrams;
358 bgptr = model->lm3g.bigrams + bg;
359 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
361 E_ERROR("Missing bigram for trigram: %s", (*li)->buf);
365 /* bg = bigram entry index for <w1,w2>. Update tseg_base */
366 seg = bg >> LOG_BG_SEG_SZ;
/* Newly entered segments start their trigram numbering at tgcount. */
367 for (i = prev_seg + 1; i <= seg; i++)
368 model->lm3g.tseg_base[i] = tgcount;
370 /* Update trigrams pointers for all bigrams until bg */
371 if (prev_seg < seg) {
/* Crossed a segment boundary: first close out the remainder of the
 * previous segment with the current trigram offset... */
375 tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
/* Per-bigram trigram offsets are 16-bit, hence the 65535 limit. */
377 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
382 prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
383 bgptr = model->lm3g.bigrams + prev_bg;
384 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
386 bgptr->trigrams = tgoff;
/* ...then fill the bigrams from the segment start up to bg. */
388 for (; prev_bg <= bg; prev_bg++, bgptr++)
/* Same segment: just propagate the offset from prev_bg+1 through bg. */
394 tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
396 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
400 bgptr = model->lm3g.bigrams + prev_bg;
401 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
402 bgptr->trigrams = tgoff;
/* Periodic progress marker every 64K trigrams. */
414 if ((tgcount & 0x0000ffff) == 0) {
/* After the loop, the iterator must be on the '\end\' marker. */
418 if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
419 E_ERROR("Bad trigram: %s\n", (*li)->buf);
/* Terminate follower ranges for all remaining bigrams (including the
 * sentinel at index n_counts[1]), seeding new segment bases as needed. */
423 for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
424 if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
425 model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
426 if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
427 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
430 model->lm3g.bigrams[prev_bg].trigrams =
431 tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
/* Allocate a zeroed table of n_ug unigrams with probabilities and back-off
 * weights initialized to INT_MIN (i.e. "log zero" sentinels).
 * NOTE(review): the return type and return statement are elided from this
 * listing; presumably it returns the allocated unigram_t* — confirm. */
437 new_unigram_table(int32 n_ug)
442 table = ckd_calloc(n_ug, sizeof(unigram_t));
443 for (i = 0; i < n_ug; i++) {
444 table[i].prob1.l = INT_MIN;
445 table[i].bo_wt1.l = INT_MIN;
/* Read an ARPA-format language model from file_name into a newly allocated
 * ngram_model_t.  Orchestrates ReadNgramCounts / ReadUnigrams / ReadBigrams
 * / ReadTrigrams and builds the lm3g quantized representation.
 * NOTE(review): elided listing — the return type/statements, several error
 * paths, `base`/`n` setup and the lineiter cleanup are not visible here. */
451 ngram_model_arpa_read(cmd_ln_t *config,
452 const char *file_name,
462 ngram_model_arpa_t *model;
/* fopen_comp transparently handles compressed files / pipes. */
465 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
466 E_ERROR("File %s not found\n", file_name);
469 li = lineiter_start(fp);
471 /* Read #unigrams, #bigrams, #trigrams from file */
472 if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
474 fclose_comp(fp, is_pipe);
477 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
479 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
480 model = ckd_calloc(1, sizeof(*model));
/* Model order n is derived from which counts are nonzero. */
484 else if (n_bigram > 0)
488 /* Initialize base model. */
489 ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
490 base->n_counts[0] = n_unigram;
491 base->n_counts[1] = n_bigram;
492 base->n_counts[2] = n_trigram;
493 base->writable = TRUE;
496 * Allocate one extra unigram and bigram entry: sentinels to terminate
497 * followers (bigrams and trigrams, respectively) of previous entry.
499 model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
500 model->lm3g.bigrams =
501 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
503 model->lm3g.trigrams =
504 ckd_calloc(n_trigram, sizeof(trigram_t));
/* One tseg_base entry per BG_SEG_SZ bigrams (plus sentinel coverage). */
507 model->lm3g.tseg_base =
508 ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
/* On any section-read failure: close the file, free the model, bail out. */
511 if (ReadUnigrams(&li, model) == -1) {
512 fclose_comp(fp, is_pipe);
513 ngram_model_free(base);
516 E_INFO("%8d = #unigrams created\n", base->n_counts[0]);
/* Sorted lists deduplicate bigram prob/back-off values; bo_wt2 is only
 * needed when trigrams are present. */
518 init_sorted_list(&model->sorted_prob2);
519 if (base->n_counts[2] > 0)
520 init_sorted_list(&model->sorted_bo_wt2);
522 if (base->n_counts[1] > 0) {
523 if (ReadBigrams(&li, model) == -1) {
524 fclose_comp(fp, is_pipe);
525 ngram_model_free(base);
/* Recompute the actual bigram count from the sentinel unigram, then
 * freeze the deduplicated prob2 table. */
529 base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
530 model->lm3g.n_prob2 = model->sorted_prob2.free;
531 model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
532 free_sorted_list(&model->sorted_prob2);
533 E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
534 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
537 if (base->n_counts[2] > 0) {
538 /* Create trigram bo-wts array */
539 model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
540 model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
541 free_sorted_list(&model->sorted_bo_wt2);
542 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
544 init_sorted_list(&model->sorted_prob3);
546 if (ReadTrigrams(&li, model) == -1) {
547 fclose_comp(fp, is_pipe);
548 ngram_model_free(base);
/* Recompute the actual trigram count from the sentinel bigram, then
 * freeze the deduplicated prob3 table. */
552 base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
553 model->lm3g.n_prob3 = model->sorted_prob3.free;
554 model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
555 E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
556 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
558 free_sorted_list(&model->sorted_prob3);
560 /* Initialize tginfo */
/* Per-unigram trigram-follower caches, allocated lazily from 'le'. */
561 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
562 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
566 fclose_comp(fp, is_pipe);
/* Write the model back out in ARPA text format using the generic N-gram
 * iterator API.
 * NOTE(review): elided listing — the return type/value, braces, and the
 * bowt-presence test around line 609 are not visible here. */
571 ngram_model_arpa_write(ngram_model_t *model,
572 const char *file_name)
578 if ((fh = fopen(file_name, "w")) == NULL) {
579 E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
582 fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
584 /* The ARPA format doesn't require any extra information that
585 * N-Gram iterators can't give us, so this is very
586 * straightforward compared with DMP writing. */
588 /* Write N-gram counts. */
589 fprintf(fh, "\\data\\\n");
590 for (i = 0; i < model->n; ++i) {
591 fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
/* Write each order's section: "\k-grams:" followed by one line per
 * N-gram: log10 score, the words, and (where present) the back-off. */
595 for (i = 0; i < model->n; ++i) {
596 fprintf(fh, "\n\\%d-grams:\n", i + 1);
597 for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
602 wids = ngram_iter_get(itor, &score, &bowt);
/* Scores are stored in logmath base; convert back to log10 for ARPA. */
603 fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
604 for (j = 0; j <= i; ++j) {
605 assert(wids[j] < model->n_counts[0]);
606 fprintf(fh, "%s ", model->word_str[wids[j]]);
609 fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
613 fprintf(fh, "\n\\end\\\n");
/* Apply language weight (lw), word insertion penalty (wip) and unigram
 * weight (uw) by delegating to the shared lm3g implementation. */
618 ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
619 float32 wip, float32 uw)
621 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
622 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
626 /* Lousy "templating" for things that are largely the same in DMP and
627 * ARPA models, except for the bigram and trigram types and some
/* The included file expands its generic code against NGRAM_MODEL_TYPE,
 * defining the lm3g_template_* functions referenced in the vtable below. */
629 #define NGRAM_MODEL_TYPE ngram_model_arpa_t
630 #include "lm3g_templates.c"
/* Release all lm3g storage owned by an ARPA model.  The base structure
 * itself is freed by the generic ngram_model_free machinery, which calls
 * this via the vtable.
 * NOTE(review): the return type and closing brace are elided from this
 * listing; word strings/hash table are presumably freed by the base. */
633 ngram_model_arpa_free(ngram_model_t *base)
635 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
636 ckd_free(model->lm3g.unigrams);
637 ckd_free(model->lm3g.bigrams);
638 ckd_free(model->lm3g.trigrams);
639 ckd_free(model->lm3g.prob2);
640 ckd_free(model->lm3g.bo_wt2);
641 ckd_free(model->lm3g.prob3);
/* Frees the per-unigram trigram caches and their listelem allocator. */
642 lm3g_tginfo_free(base, &model->lm3g);
643 ckd_free(model->lm3g.tseg_base);
/* Vtable binding the ARPA reader to the generic ngram_model_t interface.
 * Query/iteration operations come from the lm3g_templates.c expansion above.
 * NOTE(review): the closing "};" is elided from this listing. */
646 static ngram_funcs_t ngram_model_arpa_funcs = {
647 ngram_model_arpa_free, /* free */
648 ngram_model_arpa_apply_weights, /* apply_weights */
649 lm3g_template_score, /* score */
650 lm3g_template_raw_score, /* raw_score */
651 lm3g_template_add_ug, /* add_ug */
652 lm3g_template_flush, /* flush */
653 lm3g_template_iter, /* iter */
654 lm3g_template_mgrams, /* mgrams */
655 lm3g_template_successors, /* successors */
656 lm3g_template_iter_get, /* iter_get */
657 lm3g_template_iter_next, /* iter_next */
658 lm3g_template_iter_free /* iter_free */