1 # Copyright (c) 2008 Carnegie Mellon University. All rights
4 # You may copy, modify, and distribute this code under the same terms
5 # as PocketSphinx or Python, at your convenience, as long as this
6 # notice is not removed.
8 # Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
# NOTE(review): fragmentary listing — each line carries its original file
# line number, and several lines (including the `cdef class LogMath:`
# declaration that should precede this docstring) are missing from view.
# LogMath wraps a C logmath_t: fast log-domain math in base 1.0+epsilon.
14 This class provides fast logarithmic math functions in base
15 1.000+epsilon, useful for fixed point speech recognition.
17 @param base: The base B in which computation is to be done.
19 @param shift: Log values are shifted right by this many bits.
21 @param use_table Whether to use an add table or not
# Constructor: allocate the underlying logmath_t via logmath_init().
24 def __init__(self, base=1.0001, shift=0, use_table=1):
25 self.lmath = logmath_init(base, shift, use_table)
# Destructor: release the logmath_t.
27 def __dealloc__(self):
29 Destructor for LogMath class.
31 logmath_free(self.lmath)
# get_zero (header line missing from this view): smallest representable
# log value for this object, via logmath_get_zero().
35 Get the log-zero value.
37 @return: Smallest number representable by this object.
40 return logmath_get_zero(self.lmath)
# add (header line missing from this view): log-domain addition.
44 Add two numbers in log-space.
46 @param a: Logarithm A.
48 @param b: Logarithm B.
50 @return: log(exp(a)+exp(b))
53 return logmath_add(self.lmath, a, b)
# log (header line missing from this view): linear value -> this log base.
57 Return log-value of a number.
59 @param x: Number (in linear space)
61 @return: Log-value of x.
64 return logmath_log(self.lmath, x)
# exp (header line missing from this view): log value -> linear space.
68 Return linear of a log-value
70 @param x: Logarithm X (in this object's base)
72 @return: Exponent (linear value) of X.
75 return logmath_exp(self.lmath, x)
def log_to_ln(self, x):
    """
    Convert a value from this object's log base to a natural logarithm.

    @param x: Logarithm X (in this object's base)
    @return: Natural log equivalent of x.
    """
    return logmath_log_to_ln(self.lmath, x)
def log_to_log10(self, x):
    """
    Convert a value from this object's log base to a base-10 logarithm.

    @param x: Logarithm X (in this object's base)
    @return: log10 equivalent of x.
    """
    return logmath_log_to_log10(self.lmath, x)
def ln_to_log(self, x):
    """
    Convert a natural logarithm into this object's log base.

    @param x: Logarithm X (in base e)
    @return: Log-value equivalent of x.
    """
    return logmath_ln_to_log(self.lmath, x)
def log10_to_log(self, x):
    """
    Convert a base-10 logarithm into this object's log base.

    @param x: Logarithm X (in base 10)
    @return: Log-value equivalent of x.
    """
    return logmath_log10_to_log(self.lmath, x)
121 # Unfortunately, Cython doesn't actually export enums to Python...
128 cdef class NGramModel:
130 N-Gram language model class.
132 This class provides access to N-Gram language models stored on
133 disk. These can be in ARPABO text format or Sphinx DMP format.
134 Methods are provided for scoring N-Grams based on the model
135 and looking up words in the model.
137 @param file: Path to an N-Gram model file.
139 @param lw: Language weight to apply to model probabilities.
141 @param wip: Word insertion penalty to add to model probabilities
143 @param uw: Weight to give unigrams when interpolating with uniform distribution.
# NOTE(review): fragmentary — the branch structure around original lines
# 148/151 (presumably choosing between ngram_model_read for a single LM
# file and ngram_model_set_read for an lmctl control file) is missing.
146 def __init__(self, file=None, lw=1.0, wip=1.0, uw=1.0, lmctl=None):
# Private log-math context, used below to convert scores to natural log.
147 self.lmath = logmath_init(1.0001, 0, 0)
# Read a single LM from `file`, auto-detecting its on-disk format.
149 self.lm = ngram_model_read(NULL, file, NGRAM_AUTO, self.lmath)
150 ngram_model_apply_weights(self.lm, lw, wip, uw)
# Alternate path: read a set of LMs described by a control file.
152 self.lm = ngram_model_set_read(NULL, lmctl, self.lmath)
# Internal setter: take a new reference on `lm`, drop the old one.
# NOTE(review): the final assignment `self.lm = lm` (orig. line 162)
# is missing from this fragmentary view.
159 cdef set_lm(NGramModel self, ngram_model_t *lm):
160 ngram_model_retain(lm)
161 ngram_model_free(self.lm)
# Internal setter: same retain-then-release pattern for the log-math
# context; the final assignment line is likewise missing from view.
164 cdef set_lmath(NGramModel self, logmath_t *lmath):
165 logmath_retain(lmath)
166 logmath_free(self.lmath)
def __dealloc__(self):
    """
    Destructor for N-Gram model class.

    Releases the log-math context and the underlying language model.
    """
    logmath_free(self.lmath)
    ngram_model_free(self.lm)
def apply_weights(self, lw=1.0, wip=1.0, uw=1.0):
    """
    Change the language model weights applied in L{score}.

    @param lw: Language weight to apply to model probabilities.
    @param wip: Word insertion penalty to add to model probabilities
    @param uw: Weight to give unigrams when interpolating with uniform distribution.
    """
    ngram_model_apply_weights(self.lm, lw, wip, uw)
# get_size (header line missing from this view): the model order N.
194 Get the order of this model (i.e. the 'N' in 'N-gram')
196 @return: Order of this model
199 return ngram_model_get_size(self.lm)
201 def get_counts(self):
203 Get the counts of each size of N-gram.
205 @return: Counts of 1, 2, ..., N grams
# NOTE(review): the declaration of `counts` (orig. lines ~206-208,
# presumably a cdef pointer type) is missing from this view.
209 counts = ngram_model_get_counts(self.lm)
# One entry per order: 1-grams through N-grams.
210 return tuple([counts[i] for i in range(ngram_model_get_size(self.lm))])
def unknown_wid(self):
    """
    Get the ID for an unknown word.

    In the case of a closed-vocabulary language model this will be -1.

    @return: Word ID for the unknown word.
    """
    return ngram_unknown_wid(self.lm)
# get_zero (header line missing from this view): the model's "zero"
# probability, converted from internal log base to natural log.
225 Get the log-zero value for this language model.
227 @return: Log value used to represent zero.
230 return logmath_log_to_ln(self.lmath, ngram_zero(self.lm))
# wid (header line missing from this view): word string -> internal ID.
234 Get the internal ID for a word.
236 @param word: Word in question
238 @return: Internal ID for word, or -1 if not present
241 return ngram_wid(self.lm, word)
# word (header line missing from this view): internal ID -> word string.
245 Get the string corresponding to an internal word ID.
247 @param word: Word ID in question
249 @return: String for word, or None if not present
252 return ngram_word(self.lm, wid)
254 # Note that this and prob() are almost exactly the same...
255 def score(self, word, *args):
257 Get the score for an N-Gram.
259 The argument list consists of the history words (as
260 null-terminated strings) of the N-Gram, in reverse order.
261 Therefore, if you wanted to get the N-Gram score for 'a whole
262 joy', you would call::
264 score, n_used = model.score('joy', 'whole', 'a')
266 This function returns a tuple, consisting of the score and the
267 number of words used in computing it (i.e. the effective size
268 of the N-Gram). The score is returned in logarithmic form,
271 If one of the words is not in the LM's vocabulary, the result
272 will depend on whether this is an open or closed vocabulary
273 language model. For an open-vocabulary model, unknown words
274 are all mapped to the unigram <UNK> which has a non-zero
275 probability and also participates in higher-order N-Grams.
276 Therefore, you will get a score of some sort in this case.
278 For a closed-vocabulary model, unknown words are impossible
279 and thus have zero probability. Therefore, if C{word} is
280 unknown, this function will return a 'zero' log-probability,
281 i.e. a large negative number.
# NOTE(review): fragmentary — the cdef declarations and the assignment
# `n_hist = len(args)` (orig. ~282-287) are missing from this view, as
# are the loop-body line binding `spam` to the current history word
# (orig. 292) and the ckd_free(hist) cleanup (orig. 295).
288 wid = ngram_wid(self.lm, word)
# History word IDs, stored in a C array of n_hist int32s.
290 hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
291 for i from 0 <= i < n_hist:
293 hist[i] = ngram_wid(self.lm, spam)
# Weighted score; n_used reports the effective N-gram size actually used.
294 score = ngram_ng_score(self.lm, wid, hist, n_hist, &n_used)
# Convert from the model's internal log base to natural log.
296 return logmath_log_to_ln(self.lmath, score), n_used
298 def prob(self, word, *args):
300 Get the log-probability for an N-Gram.
302 This works effectively the same way as L{score}, except that
303 any weights (language weight, insertion penalty) applied to
304 the language model are ignored and the 'raw' probability value
# NOTE(review): fragmentary — the docstring tail, cdef declarations and
# `n_hist = len(args)` (orig. ~305-313) are missing from this view, as
# are the `spam` binding in the loop (orig. 316) and the ckd_free(hist)
# cleanup (orig. 319).
312 wid = ngram_wid(self.lm, word)
314 hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
315 for i from 0 <= i < n_hist:
317 hist[i] = ngram_wid(self.lm, spam)
# Unweighted ("raw") probability, unlike score() above.
318 score = ngram_ng_prob(self.lm, wid, hist, n_hist, &n_used)
320 return logmath_log_to_ln(self.lmath, score), n_used
# mgrams (header line missing from this view): build an NGramIter over
# all (m+1)-grams in the model.
324 Return an iterator over each N-gram of order m+1.
326 This allows Pythonic iteration over the parameters of an
329 @param m: Order of requested N-grams minus one
331 @return: Iterator over M+1-grams
335 itor = NGramIter(self, m)
# NOTE(review): the trailing `return itor` (orig. ~337) is missing here.
336 itor.itor = ngram_model_mgrams(self.lm, m)
339 def ngram(self, word, *args):
341 Return an N-Gram iterator pointing to a given N-gram.
343 This allows you to iterate over its successors among other
346 @param word: Head word of requested N-gram.
348 @param args: History words of requested N-gram
350 @return: Iterator pointing to N-gram.
# NOTE(review): fragmentary — cdef declarations and `n_hist = len(args)`
# (orig. ~351-357), the `spam` binding in the loop (orig. 360), and the
# trailing cleanup / `return itor` lines (orig. 366+) are missing.
356 wid = ngram_wid(self.lm, word)
358 hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
359 for i from 0 <= i < n_hist:
361 hist[i] = ngram_wid(self.lm, spam)
362 itor = NGramIter(self, n_hist)
363 # We do set_iter here, because we're returning something the
364 # user is immediately going to do stuff with.
365 itor.set_iter(ngram_ng_iter(self.lm, wid, hist, n_hist))
def add_word(self, word, weight=1.0):
    """
    Add a word to the language model.

    @param word: Word string to add.
    @param weight: Weight for the new word (passed straight through to
        ngram_model_add_word; presumably a relative unigram weight —
        confirm against the C API documentation).
    @return: Result of the underlying ngram_model_add_word() call.
    """
    return ngram_model_add_word(self.lm, word, weight)
# Recode the model's word strings from encoding `frum` to encoding `too`.
# NOTE(review): fragmentary — the `rv` declaration and the error-check
# `if` line guarding the raise (orig. ~373/375) are missing from view.
372 def recode(self, frum, too):
374 rv = ngram_model_recode(self.lm, frum, too)
376 raise ValueError, "Recode from %s to %s failed" % (frum, too)
# Case-fold all word strings in the model; the error-check `if` line
# before the raise is likewise missing from this view.
378 def casefold(self, kase):
380 rv = ngram_model_casefold(self.lm, kase)
382 raise ValueError, "Casefolding failed"
# Write the model to `file_name` in `format` (NGRAM_AUTO lets the C
# library pick); the error-check `if` line is missing from this view.
384 def write(self, file_name, format=NGRAM_AUTO):
386 rv = ngram_model_write(self.lm, file_name, format)
388 raise ValueError, "Write %s to file failed" % file_name
390 cdef class NGramIter:
392 N-Gram language model iterator class.
394 This class provides access to the individual N-grams stored in a
# NOTE(review): fragmentary — the attribute assignments in __cinit__
# (orig. ~398-400, presumably self.lm / self.itor / self.m) are missing
# from this view, as are orig. ~402-403 between the two lines below.
397 def __cinit__(self, NGramModel lm, int m):
401 self.first_item = True
404 self.first_item = True
# Advance to `itor` and refresh the Python-visible fields (log_prob,
# log_bowt, words).  The NULL-check / StopIteration lines that should
# appear here (orig. ~409-414) are missing from this view, as is the
# reset of self.words (orig. ~419).
407 cdef set_iter(NGramIter self, ngram_iter_t *itor):
408 cdef int32 prob, bowt
415 self.first_item = False
416 wids = ngram_iter_get(itor, &prob, &bowt)
# Convert probability and backoff weight to natural log for callers.
417 self.log_prob = logmath_log_to_ln(self.lm.lmath, prob)
418 self.log_bowt = logmath_log_to_ln(self.lm.lmath, bowt)
# Collect the m+1 word strings making up this N-gram.
420 for i in range(0, self.m+1):
421 self.words.append(ngram_word(self.lm.lm, wids[i]))
# __iter__/__next__ (header lines missing from this view): the first
# call re-seats the current iterator, later calls advance it.
425 self.set_iter(self.itor)
427 self.set_iter(ngram_iter_next(self.itor))
430 def successors(self):
432 Get an iterator over N+1-gram successors of an N-gram.
# NOTE(review): the trailing `return itor` (orig. ~437) is missing.
435 itor = NGramIter(self.lm, self.m + 1)
436 itor.itor = ngram_iter_successors(self.itor)
439 def binstr(str val, int nbits):
441 Silly function to format a string as a binary string
# NOTE(review): fragmentary — the setup lines (orig. ~442-447, presumably
# initializing outstr/cval/cnb from val and nbits) and the tail of the
# function (orig. 450+, remaining bits and the return) are missing.
448 for i in range(0,cnb):
# Emit bit 7-i of the current byte: most significant bit first.
449 outstr += "%d" % ((cval & (1 << 7-i)) != 0)
453 def bincw(int cw, int nbits):
455 Silly function to format an int as a binary string
# Setup (orig. ~456-458) and the return line (orig. 461+) are missing.
459 for i in range(0,nbits):
# Prepend the low bit each pass, so the result reads MSB-first.
460 outstr = "%s" % (cw & 1) + outstr
464 # FIXME: Due to the style of IO in huff_code API this part of the code
465 # is not compatible with Python 3. This needs to be converted to
466 # the new Python io module.
# NOTE(review): the `cdef class HuffCode:` declaration that should
# precede this docstring (orig. ~467-469) is missing from this view.
470 Huffman coding class.
472 You can either construct a Huffman code from an alphabet of
473 symbols with frequencies, or read one from a file. Either the
474 alphabet or infile argument (but not both) must be passed to the
477 @param alphabet: Alphabet of (symbol, frequency) pairs
478 @type alphabet: [(str, int)]
479 @param infile: File handle or filename to read from
480 @type infile: file | str
482 def __init__(self, alphabet=None, infile=None):
484 cdef int *frequencies
# NOTE(review): fragmentary — companion cdef declarations (orig. ~485-486),
# the infile branch and nsym setup (orig. ~491-497), the unpacking of each
# (symbol, frequency) pair (orig. ~504), and the ckd_free(symbols) cleanup
# (orig. ~510-511) are missing from this view.
487 if alphabet == None and infile == None:
488 raise ValueError, "One of alphabet or infile must be passed to constructor"
489 if alphabet != None and infile != None:
490 raise ValueError, "Only one of alphabet or infile must be passed to constructor"
498 frequencies = <int *>ckd_calloc(nsym, sizeof(int))
499 symbols = <char **>ckd_calloc(nsym, sizeof(char *))
500 # Need to create separate Python objects for each string,
501 # otherwise we get random duplicates of the codewords...
503 for i, spam in enumerate(alphabet):
# `bogus` keeps a live Python reference so the char* stays valid below.
505 bogus.append(repr(sym))
506 frequencies[i] = freq
507 symbols[i] = bogus[-1]
508 self.hc = huff_code_build_str(symbols, frequencies, nsym)
509 ckd_free(frequencies)
def read(self, infile):
    """
    Read a Huffman code from a file, replacing any existing code.

    @param infile: File handle, or filename to open in binary mode.
    """
    fp = infile if isinstance(infile, file) else file(infile, "rb")
    huff_code_free(self.hc)
    self.hc = huff_code_read(PyFile_AsFile(fp))
def write(self, outfile):
    """
    Write this Huffman code to a file in binary form.

    @param outfile: File handle, or filename to open in binary mode.
    """
    fp = outfile if isinstance(outfile, file) else file(outfile, "wb")
    huff_code_write(self.hc, PyFile_AsFile(fp))
def dump(self, outfile):
    """
    Dump a human-readable description of this code to a file.

    @param outfile: File handle, or filename to open in text mode.
    """
    fp = outfile if isinstance(outfile, file) else file(outfile, "w")
    huff_code_dump(self.hc, PyFile_AsFile(fp))
528 def encode(self, seq):
530 Encode a sequence of symbols to a byte array, returning that
531 array and the bit offset of the next codeword in the last
532 byte (i.e. 8 minutes the number of extra zero bits)
# NOTE(review): "8 minutes" above is presumably a typo for "8 minus";
# left untouched since this docstring text is only a fragment here.
535 cdef int cwlen, nbits = 0, nbytes, offset, i
536 cdef unsigned char buf = 0
# NOTE(review): heavily fragmentary — the first pass totalling nbits
# over seq, the main loop headers, several buffer-append / offset-update
# lines, and the final flush are missing from this view (gaps around
# orig. 537-596).  Only the visible skeleton is annotated below.
541 cwlen = huff_code_encode_str(self.hc, sss, &cw)
# Total bits rounded up to whole bytes.
543 nbytes = int((nbits + 7) / 8)
545 output = <char *>PyMem_Malloc(nbytes + 1)
551 cwlen = huff_code_encode_str(self.hc, sss, &cw)
552 #print "sym: %s cw: %s buf: %s output: %s" \
553 # % (sym, bincw(cw, cwlen), bincw(buf >> (8-offset), offset),
554 # binstr(output, nbits))
556 # Do one byte at a time while full bytes are available
558 # Fill low bits of buf with high bits of cw
559 buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
560 # Append buf to output
564 # Fill high bits of buf with low bits of this byte
566 buf = (cw >> cwlen) & ((1 << offset) - 1)
569 # Now cwlen will be less than 8, but it might still be
570 # more than the available space in buf.
571 if cwlen >= (8 - offset):
572 # Fill low bits of buf with (8-offset) highest bits of cw
573 buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
574 # Append buf to output
578 # cwlen is down to the remaining bits
579 cwlen -= (8 - offset)
580 # Offset is now zero since we just completed and emptied buf
582 # buf is zero, because we just emptied it without putting stuff in
585 # Any remaining bits will be taken care of below (we hope)
586 # Add remaining high bits of cw to low bits of buf
588 buf |= ((cw & ((1 << cwlen) - 1)) << (8 - offset - cwlen))
590 #print "after buf: %s output: %s" \
591 # % (bincw(buf >> (8-offset), offset), binstr(output, nbits))
593 # Append buf to output
597 #print "output:", binstr(output, nbits)
598 outstr = PyString_FromStringAndSize(output, nbytes)
# Caller gets (packed bytes, bit offset of the next codeword in the
# last byte).
600 return (outstr, offset)
602 def decode(self, data):
604 Decode a sequence of symbols from a string, returning the
605 sequence and the bit offset of the next codeword in the last
606 byte (i.e. 8 minutes the number of remaining bits)
# NOTE(review): fragmentary — the cdef declarations and loop setup
# (orig. ~607-617, presumably initializing dptr/dlen/offset/output from
# `data` and looping until the buffer is exhausted) are missing from
# this view, as is the branch structure around the raise (orig. ~619-622).
618 strval = huff_code_decode_str(self.hc, &dptr, &dlen, &offset)
621 output.append(strval)
# Reached when huff_code_decode_str() fails partway through the buffer.
623 raise ValueError, "Invalid data at position %d" % (len(data) - dlen)
624 return (output, offset)
# Attach the underlying code to a stdio FILE so that later
# encode_to_file()/decode_from_file() calls stream through it.
# NOTE(review): fragmentary — the line that wraps a filename argument
# into a file object (orig. ~628-629) is missing from this view.
626 def attach(self, fh, char *mode):
627 if not isinstance(fh, file):
630 huff_code_attach(self.hc, PyFile_AsFile(fh), mode)
# detach (header line missing from this view): detach any attached FILE.
633 huff_code_detach(self.hc)
# Encode symbols directly to the attached file; the attachment check's
# `if` line (orig. ~637) and the per-symbol loop header / repr() line
# (orig. ~639-640) are missing from this view.
636 def encode_to_file(self, seq):
638 raise RuntimeError, "No file is attached"
641 huff_code_encode_str(self.hc, strsym, NULL)
# Decode one symbol from the attached file; setup lines and the tail
# (orig. ~644-652) are missing from this view.
643 def decode_from_file(self):
646 raise RuntimeError, "No file is attached"
# NULL in/out pointers: read directly from the attached FILE.
647 sym = huff_code_decode_str(self.hc, NULL, NULL, NULL)
# Destructor: free the underlying huff_code_t.  The lines between the
# header and the free (orig. ~654-655, presumably a detach call and/or
# docstring) are missing from this view.
653 def __dealloc__(self):
656 huff_code_free(self.hc)