src/sphinx_lmtools/sphinx_lm_eval.c

   1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* ====================================================================
   3  * Copyright (c) 2008 Carnegie Mellon University.  All rights
   4  * reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  *
  18  * This work was supported in part by funding from the Defense Advanced
  19  * Research Projects Agency and the National Science Foundation of the
  20  * United States of America, and the CMU Sphinx Speech Consortium.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33  *
  34  * ====================================================================
  35  *
  36  */
  37 /**
  38  * \file sphinx_lm_eval.c
  39  * Language model evaluation tool.
  40  */
  41 #include <sphinxbase/logmath.h>
  42 #include <sphinxbase/ngram_model.h>
  43 #include <sphinxbase/cmd_ln.h>
  44 #include <sphinxbase/ckd_alloc.h>
  45 #include <sphinxbase/err.h>
  46 #include <sphinxbase/pio.h>
  47 #include <sphinxbase/strfuncs.h>
  48
  49 #include <stdio.h>
  50 #include <string.h>
  51 #include <math.h>
  52
  53 static const arg_t defn[] = {
  54   { "-help",
  55     ARG_BOOLEAN,
  56     "no",
  57     "Shows the usage of the tool"},
  58
  59   { "-logbase",
  60     ARG_FLOAT64,
  61     "1.0001",
  62     "Base in which all log-likelihoods calculated" },
  63
  64   { "-lm",
  65     ARG_STRING,
  66     NULL,
  67     "Language model file"},
  68
  69   { "-probdef",
  70     ARG_STRING,
  71     NULL,
  72     "Probability definition file for classes in LM"},
  73
  74   { "-lmctlfn",
  75     ARG_STRING,
  76     NULL,
  77     "Control file listing a set of language models"},
  78
  79   { "-lmname",
  80     ARG_STRING,
  81     NULL,
  82     "Name of language model in -lmctlfn to use for all utterances" },
  83
  84   { "-lsn",
  85     ARG_STRING,
  86     NULL,
  87     "Transcription file to evaluate"},
  88
  89   { "-text",
  90     ARG_STRING,
  91     "Text string to evaluate"},
  92
  93   { "-mmap",
  94     ARG_BOOLEAN,
  95     "no",
  96     "Use memory-mapped I/O for reading binary LM files"},
  97
  98   { "-lw",
  99     ARG_FLOAT32,
 100     "1.0",
 101     "Language model weight" },
 102
 103   { "-wip",
 104     ARG_FLOAT32,
 105     "1.0",
 106     "Word insertion probability" },
 107
 108   { "-uw",
 109     ARG_FLOAT32,
 110     "1.0",
 111     "Unigram probability weight (interpolated with uniform distribution)"},
 112
 113   { "-verbose",
 114     ARG_BOOLEAN,
 115     "no",
 116     "Print details of perplexity calculation" },
 117
 118   /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
 119   { NULL, 0, NULL, NULL }
 120 };
 121
 122 static int verbose;
 123
 124 static int
 125 calc_entropy(ngram_model_t *lm, char **words, int32 n,
 126              int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
 127 {
 128         int32 *wids;
 129         int32 startwid;
 130         int32 i, ch, nccs, noovs, unk;
 131
 132         if (n == 0)
 133             return 0;
 134
 135         unk = ngram_unknown_wid(lm);
 136
 137         /* Reverse this array into an array of word IDs. */
 138         wids = ckd_calloc(n, sizeof(*wids));
 139         for (i = 0; i < n; ++i)
 140                 wids[n-i-1] = ngram_wid(lm, words[i]);
 141         /* Skip <s> as it's a context cue (HACK, this should be configurable). */
 142         startwid = ngram_wid(lm, "<s>");
 143
 144         /* Now evaluate the list of words in reverse using the
 145          * remainder of the array as the history. */
 146         ch = noovs = nccs = 0;
 147         for (i = 0; i < n; ++i) {
 148                 int32 n_used;
 149                 int32 prob;
 150
 151                 /* Skip <s> as it's a context cue (HACK, this should be configurable). */
 152                 if (wids[i] == startwid) {
 153                         ++nccs;
 154                         continue;
 155                 }
 156                 /* Skip and count OOVs. */
 157                 if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
 158                         ++noovs;
 159                         continue;
 160                 }
 161                 /* Sum up information for each N-gram */
 162                 prob = ngram_ng_score(lm,
 163                                       wids[i], wids + i + 1,
 164                                       n - i - 1, &n_used);
 165                 if (verbose) {
 166                     int m;
 167                     printf("log P(%s|", ngram_word(lm, wids[i]));
 168                     m = i + ngram_model_get_size(lm) - 1;
 169                     if (m >= n)
 170                         m = n - 1;
 171                     while (m > i) {
 172                         printf("%s ", ngram_word(lm, wids[m--]));
 173                     }
 174                     printf(") = %d\n", prob);
 175                 }
 176                 ch -= prob;
 177         }
 178
 179         if (out_n_ccs) *out_n_ccs = nccs;
 180         if (out_n_oovs) *out_n_oovs = noovs;
 181
 182         /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
 183         n -= (nccs + noovs);
 184         if (n <= 0)
 185             return 0;
 186         if (out_lm_score)
 187             *out_lm_score = -ch;
 188         return ch / n;
 189 }
 190
 191 static void
 192 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
 193 {
 194         FILE *fh;
 195         lineiter_t *litor;
 196         int32 nccs, noovs, nwords, lscr;
 197         float64 ch, log_to_log2;;
 198
 199         if ((fh = fopen(lsnfn, "r")) == NULL)
 200                 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
 201
 202         /* We have to keep ch in floating-point to avoid overflows, so
 203          * we might as well use log2. */
 204         log_to_log2 = log(logmath_get_base(lmath)) / log(2);
 205         nccs = noovs = nwords = 0;
 206         ch = 0.0;
 207         for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
 208                 char **words;
 209                 int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;
 210
 211                 n = str2words(litor->buf, NULL, 0);
 212                 if (n < 0)
 213                         E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
 214                 if (n == 0) /* Do nothing! */
 215                         continue;
 216                 words = ckd_calloc(n, sizeof(*words));
 217                 str2words(litor->buf, words, n);
 218
 219                 /* Remove any utterance ID (FIXME: has to be a single "word") */
 220                 if (words[n-1][0] == '('
 221                     && words[n-1][strlen(words[n-1])-1] == ')')
 222                         n = n - 1;
 223
 224                 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
 225                                       &tmp_noovs, &tmp_lscr);
 226
 227                 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
 228                 nccs += tmp_nccs;
 229                 noovs += tmp_noovs;
 230                 lscr += tmp_lscr;
 231                 nwords += n;
 232
 233                 ckd_free(words);
 234         }
 235
 236         ch /= (nwords - nccs - noovs);
 237         printf("cross-entropy: %f bits\n", ch);
 238
 239         /* Calculate perplexity pplx = exp CH */
 240         printf("perplexity: %f\n", pow(2.0, ch));
 241         printf("lm score: %d\n", lscr);
 242
 243         /* Report OOVs and CCs */
 244         printf("%d words evaluated\n", nwords);
 245         printf("%d OOVs (%.2f%%), %d context cues removed\n",
 246                noovs, (double)noovs / nwords * 100, nccs);
 247 }
 248
 249 static void
 250 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
 251 {
 252         char *textfoo;
 253         char **words;
 254         int32 n, ch, noovs, nccs, lscr;
 255
 256         /* Split it into an array of strings. */
 257         textfoo = ckd_salloc(text);
 258         n = str2words(textfoo, NULL, 0);
 259         if (n < 0)
 260                 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
 261         if (n == 0) /* Do nothing! */
 262                 return;
 263         words = ckd_calloc(n, sizeof(*words));
 264         str2words(textfoo, words, n);
 265
 266         ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);
 267
 268         printf("input: %s\n", text);
 269         printf("cross-entropy: %f bits\n",
 270                ch * log(logmath_get_base(lmath)) / log(2));
 271
 272         /* Calculate perplexity pplx = exp CH */
 273         printf("perplexity: %f\n", logmath_exp(lmath, ch));
 274         printf("lm score: %d\n", lscr);
 275
 276         /* Report OOVs and CCs */
 277         printf("%d words evaluated\n", n);
 278         printf("%d OOVs, %d context cues removed\n",
 279               noovs, nccs);
 280
 281         ckd_free(textfoo);
 282         ckd_free(words);
 283 }
 284
 285 int
 286 main(int argc, char *argv[])
 287 {
 288         cmd_ln_t *config;
 289         ngram_model_t *lm = NULL;
 290         logmath_t *lmath;
 291         const char *lmfn, *probdefn, *lsnfn, *text;
 292
 293         if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
 294                 return 1;
 295
 296         verbose = cmd_ln_boolean_r(config, "-verbose");
 297
 298         /* Create log math object. */
 299         if ((lmath = logmath_init
 300              (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
 301                 E_FATAL("Failed to initialize log math\n");
 302         }
 303
 304         /* Load the language model. */
 305         lmfn = cmd_ln_str_r(config, "-lm");
 306         if (lmfn == NULL
 307             || (lm = ngram_model_read(config, lmfn,
 308                                       NGRAM_AUTO, lmath)) == NULL) {
 309                 E_FATAL("Failed to load language model from %s\n",
 310                         cmd_ln_str_r(config, "-lm"));
 311         }
 312         if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
 313             ngram_model_read_classdef(lm, probdefn);
 314         ngram_model_apply_weights(lm,
 315                                   cmd_ln_float32_r(config, "-lw"),
 316                                   cmd_ln_float32_r(config, "-wip"),
 317                                   cmd_ln_float32_r(config, "-uw"));
 318
 319         /* Now evaluate some text. */
 320         lsnfn = cmd_ln_str_r(config, "-lsn");
 321         text = cmd_ln_str_r(config, "-text");
 322         if (lsnfn) {
 323                 evaluate_file(lm, lmath, lsnfn);
 324         }
 325         else if (text) {
 326                 evaluate_string(lm, lmath, text);
 327         }
 328
 329         return 0;
 330 }