1 # Copyright (c) 2008 Carnegie Mellon University. All rights
4 # You may copy, modify, and distribute this code under the same terms
5 # as PocketSphinx or Python, at your convenience, as long as this
6 # notice is not removed.
8 # Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
# NOTE(review): fragmentary listing — each line carries its original file
# line number, and several lines (including the `cdef class LogMath:`
# declaration that should precede this docstring) are missing from view.
# LogMath wraps a C logmath_t: fast log-domain math in base 1.0+epsilon.
14 This class provides fast logarithmic math functions in base
15 1.000+epsilon, useful for fixed point speech recognition.
17 @param base: The base B in which computation is to be done.
19 @param shift: Log values are shifted right by this many bits.
21 @param use_table Whether to use an add table or not
# Constructor: allocate the underlying logmath_t via logmath_init().
24 def __init__(self, base=1.0001, shift=0, use_table=1):
25 self.lmath = logmath_init(base, shift, use_table)
# Destructor: release the logmath_t.
27 def __dealloc__(self):
29 Destructor for LogMath class.
31 logmath_free(self.lmath)
# get_zero (header line missing from this view): smallest representable
# log value for this object, via logmath_get_zero().
35 Get the log-zero value.
37 @return: Smallest number representable by this object.
40 return logmath_get_zero(self.lmath)
# add (header line missing from this view): log-domain addition.
44 Add two numbers in log-space.
46 @param a: Logarithm A.
48 @param b: Logarithm B.
50 @return: log(exp(a)+exp(b))
53 return logmath_add(self.lmath, a, b)
# log (header line missing from this view): linear value -> this log base.
57 Return log-value of a number.
59 @param x: Number (in linear space)
61 @return: Log-value of x.
64 return logmath_log(self.lmath, x)
# exp (header line missing from this view): log value -> linear space.
68 Return linear of a log-value
70 @param x: Logarithm X (in this object's base)
72 @return: Exponent (linear value) of X.
75 return logmath_exp(self.lmath, x)
def log_to_ln(self, x):
    """
    Convert a value from this object's log base to a natural logarithm.

    @param x: Logarithm X (in this object's base)
    @return: Natural log equivalent of x.
    """
    return logmath_log_to_ln(self.lmath, x)
def log_to_log10(self, x):
    """
    Convert a value from this object's log base to a base-10 logarithm.

    @param x: Logarithm X (in this object's base)
    @return: log10 equivalent of x.
    """
    return logmath_log_to_log10(self.lmath, x)
def ln_to_log(self, x):
    """
    Convert a natural logarithm into this object's log base.

    @param x: Logarithm X (in base e)
    @return: Log-value equivalent of x.
    """
    return logmath_ln_to_log(self.lmath, x)
def log10_to_log(self, x):
    """
    Convert a base-10 logarithm into this object's log base.

    @param x: Logarithm X (in base 10)
    @return: Log-value equivalent of x.
    """
    return logmath_log10_to_log(self.lmath, x)
121 # Unfortunately, Cython doesn't actually export enums to Python...
128 cdef class NGramModel:
130 N-Gram language model class.
132 This class provides access to N-Gram language models stored on
133 disk. These can be in ARPABO text format or Sphinx DMP format.
134 Methods are provided for scoring N-Grams based on the model
135 and looking up words in the model.
137 @param file: Path to an N-Gram model file.
139 @param lw: Language weight to apply to model probabilities.
141 @param wip: Word insertion penalty to add to model probabilities
143 @param uw: Weight to give unigrams when interpolating with uniform distribution.
# NOTE(review): fragmentary — the branch structure around original lines
# 148/151 (presumably choosing between ngram_model_read for a single LM
# file and ngram_model_set_read for an lmctl control file) is missing.
146 def __init__(self, file=None, lw=1.0, wip=1.0, uw=1.0, lmctl=None):
# Private log-math context, used below to convert scores to natural log.
147 self.lmath = logmath_init(1.0001, 0, 0)
# Read a single LM from `file`, auto-detecting its on-disk format.
149 self.lm = ngram_model_read(NULL, file, NGRAM_AUTO, self.lmath)
150 ngram_model_apply_weights(self.lm, lw, wip, uw)
# Alternate path: read a set of LMs described by a control file.
152 self.lm = ngram_model_set_read(NULL, lmctl, self.lmath)
# Internal setter: take a new reference on `lm`, drop the old one.
# NOTE(review): the final assignment `self.lm = lm` (orig. line 162)
# is missing from this fragmentary view.
159 cdef set_lm(NGramModel self, ngram_model_t *lm):
160 ngram_model_retain(lm)
161 ngram_model_free(self.lm)
# Internal setter: same retain-then-release pattern for the log-math
# context; the final assignment line is likewise missing from view.
164 cdef set_lmath(NGramModel self, logmath_t *lmath):
165 logmath_retain(lmath)
166 logmath_free(self.lmath)
def __dealloc__(self):
    """
    Destructor for N-Gram model class.

    Releases the log-math context and the underlying language model.
    """
    logmath_free(self.lmath)
    ngram_model_free(self.lm)
def apply_weights(self, lw=1.0, wip=1.0, uw=1.0):
    """
    Change the language model weights applied in L{score}.

    @param lw: Language weight to apply to model probabilities.
    @param wip: Word insertion penalty to add to model probabilities
    @param uw: Weight to give unigrams when interpolating with uniform distribution.
    """
    ngram_model_apply_weights(self.lm, lw, wip, uw)
# get_size (header line missing from this view): the model order N.
194 Get the order of this model (i.e. the 'N' in 'N-gram')
196 @return: Order of this model
199 return ngram_model_get_size(self.lm)
201 def get_counts(self):
203 Get the counts of each size of N-gram.
205 @return: Counts of 1, 2, ..., N grams
# NOTE(review): the declaration of `counts` (orig. lines ~206-208,
# presumably a cdef pointer type) is missing from this view.
209 counts = ngram_model_get_counts(self.lm)
# One entry per order: 1-grams through N-grams.
210 return tuple([counts[i] for i in range(ngram_model_get_size(self.lm))])
def unknown_wid(self):
    """
    Get the ID for an unknown word.

    In the case of a closed-vocabulary language model this will be -1.

    @return: Word ID for the unknown word.
    """
    return ngram_unknown_wid(self.lm)
# get_zero (header line missing from this view): the model's "zero"
# probability, converted from internal log base to natural log.
225 Get the log-zero value for this language model.
227 @return: Log value used to represent zero.
230 return logmath_log_to_ln(self.lmath, ngram_zero(self.lm))
# wid (header line missing from this view): word string -> internal ID.
234 Get the internal ID for a word.
236 @param word: Word in question
238 @return: Internal ID for word, or -1 if not present
241 return ngram_wid(self.lm, word)
# word (header line missing from this view): internal ID -> word string.
245 Get the string corresponding to an internal word ID.
247 @param word: Word ID in question
249 @return: String for word, or None if not present
252 return ngram_word(self.lm, wid)
254 # Note that this and prob() are almost exactly the same...
255 def score(self, word, *args):
257 Get the score for an N-Gram.
259 The argument list consists of the history words (as
260 null-terminated strings) of the N-Gram, in reverse order.
261 Therefore, if you wanted to get the N-Gram score for 'a whole
262 joy', you would call::
264 score, n_used = model.score('joy', 'whole', 'a')
266 This function returns a tuple, consisting of the score and the
267 number of words used in computing it (i.e. the effective size
268 of the N-Gram). The score is returned in logarithmic form,
271 If one of the words is not in the LM's vocabulary, the result
272 will depend on whether this is an open or closed vocabulary
273 language model. For an open-vocabulary model, unknown words
274 are all mapped to the unigram <UNK> which has a non-zero
275 probability and also participates in higher-order N-Grams.
276 Therefore, you will get a score of some sort in this case.
278 For a closed-vocabulary model, unknown words are impossible
279 and thus have zero probability. Therefore, if C{word} is
280 unknown, this function will return a 'zero' log-probability,
281 i.e. a large negative number.
# NOTE(review): fragmentary — the cdef declarations and the assignment
# `n_hist = len(args)` (orig. ~282-287) are missing from this view, as
# are the loop-body line binding `spam` to the current history word
# (orig. 292) and the ckd_free(hist) cleanup (orig. 295).
288 wid = ngram_wid(self.lm, word)
# History word IDs, stored in a C array of n_hist int32s.
290 hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
291 for i from 0 <= i < n_hist:
293 hist[i] = ngram_wid(self.lm, spam)
# Weighted score; n_used reports the effective N-gram size actually used.
294 score = ngram_ng_score(self.lm, wid, hist, n_hist, &n_used)
# Convert from the model's internal log base to natural log.
296 return logmath_log_to_ln(self.lmath, score), n_used
298 def prob(self, word, *args):
300 Get the log-probability for an N-Gram.
302 This works effectively the same way as L{score}, except that
303 any weights (language weight, insertion penalty) applied to
304 the language model are ignored and the 'raw' probability value
# NOTE(review): fragmentary — the docstring tail, cdef declarations and
# `n_hist = len(args)` (orig. ~305-313) are missing from this view, as
# are the `spam` binding in the loop (orig. 316) and the ckd_free(hist)
# cleanup (orig. 319).
312 wid = ngram_wid(self.lm, word)
314 hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
315 for i from 0 <= i < n_hist:
317 hist[i] = ngram_wid(self.lm, spam)
# Unweighted ("raw") probability, unlike score() above.
318 score = ngram_ng_prob(self.lm, wid, hist, n_hist, &n_used)
320 return logmath_log_to_ln(self.lmath, score), n_used
# mgrams (header line missing from this view): build an NGramIter over
# all (m+1)-grams in the model.
324 Return an iterator over each N-gram of order m+1.
326 This allows Pythonic iteration over the parameters of an
329 @param m: Order of requested N-grams minus one
331 @return: Iterator over M+1-grams
335 itor = NGramIter(self, m)
# NOTE(review): the trailing `return itor` (orig. ~337) is missing here.
336 itor.itor = ngram_model_mgrams(self.lm, m)
339 def ngram(self, word, *args):
341 Return an N-Gram iterator pointing to a given N-gram.
343 This allows you to iterate over its successors among other
346 @param word: Head word of requested N-gram.
348 @param args: History words of requested N-gram
350 @return: Iterator pointing to N-gram.
# NOTE(review): fragmentary — cdef declarations and `n_hist = len(args)`
# (orig. ~351-357), the `spam` binding in the loop (orig. 360), and the
# trailing cleanup / `return itor` lines (orig. 366+) are missing.
356 wid = ngram_wid(self.lm, word)
358 hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
359 for i from 0 <= i < n_hist:
361 hist[i] = ngram_wid(self.lm, spam)
362 itor = NGramIter(self, n_hist)
363 # We do set_iter here, because we're returning something the
364 # user is immediately going to do stuff with.
365 itor.set_iter(ngram_ng_iter(self.lm, wid, hist, n_hist))
def add_word(self, word, weight=1.0):
    """
    Add a word to the language model.

    @param word: Word string to add.
    @param weight: Weight for the new word (passed straight through to
        ngram_model_add_word; presumably a relative unigram weight —
        confirm against the C API documentation).
    @return: Result of the underlying ngram_model_add_word() call.
    """
    return ngram_model_add_word(self.lm, word, weight)
# Recode the model's word strings from encoding `frum` to encoding `too`.
# NOTE(review): fragmentary — the `rv` declaration and the error-check
# `if` line guarding the raise (orig. ~373/375) are missing from view.
372 def recode(self, frum, too):
374 rv = ngram_model_recode(self.lm, frum, too)
376 raise ValueError, "Recode from %s to %s failed" % (frum, too)
# Case-fold all word strings in the model; the error-check `if` line
# before the raise is likewise missing from this view.
378 def casefold(self, kase):
380 rv = ngram_model_casefold(self.lm, kase)
382 raise ValueError, "Casefolding failed"
# Write the model to `file_name` in `format` (NGRAM_AUTO lets the C
# library pick); the error-check `if` line is missing from this view.
384 def write(self, file_name, format=NGRAM_AUTO):
386 rv = ngram_model_write(self.lm, file_name, format)
388 raise ValueError, "Write %s to file failed" % file_name
390 cdef class NGramIter:
392 N-Gram language model iterator class.
394 This class provides access to the individual N-grams stored in a
# NOTE(review): fragmentary — the attribute assignments in __cinit__
# (orig. ~398-400, presumably self.lm / self.itor / self.m) are missing
# from this view, as are orig. ~402-403 between the two lines below.
397 def __cinit__(self, NGramModel lm, int m):
401 self.first_item = True
404 self.first_item = True
# Advance to `itor` and refresh the Python-visible fields (log_prob,
# log_bowt, words).  The NULL-check / StopIteration lines that should
# appear here (orig. ~409-414) are missing from this view, as is the
# reset of self.words (orig. ~419).
407 cdef set_iter(NGramIter self, ngram_iter_t *itor):
408 cdef int32 prob, bowt
415 self.first_item = False
416 wids = ngram_iter_get(itor, &prob, &bowt)
# Convert probability and backoff weight to natural log for callers.
417 self.log_prob = logmath_log_to_ln(self.lm.lmath, prob)
418 self.log_bowt = logmath_log_to_ln(self.lm.lmath, bowt)
# Collect the m+1 word strings making up this N-gram.
420 for i in range(0, self.m+1):
421 self.words.append(ngram_word(self.lm.lm, wids[i]))
# __iter__/__next__ (header lines missing from this view): the first
# call re-seats the current iterator, later calls advance it.
425 self.set_iter(self.itor)
427 self.set_iter(ngram_iter_next(self.itor))
430 def successors(self):
432 Get an iterator over N+1-gram successors of an N-gram.
# NOTE(review): the trailing `return itor` (orig. ~437) is missing.
435 itor = NGramIter(self.lm, self.m + 1)
436 itor.itor = ngram_iter_successors(self.itor)
439 def binstr(str val, int nbits):
441 Silly function to format a string as a binary string
# NOTE(review): fragmentary — the setup lines (orig. ~442-447, presumably
# initializing outstr/cval/cnb from val and nbits) and the tail of the
# function (orig. 450+, remaining bits and the return) are missing.
448 for i in range(0,cnb):
# Emit bit 7-i of the current byte: most significant bit first.
449 outstr += "%d" % ((cval & (1 << 7-i)) != 0)
453 def bincw(int cw, int nbits):
455 Silly function to format an int as a binary string
# Setup (orig. ~456-458) and the return line (orig. 461+) are missing.
459 for i in range(0,nbits):
# Prepend the low bit each pass, so the result reads MSB-first.
460 outstr = "%s" % (cw & 1) + outstr
464 # FIXME: Due to the style of IO in huff_code API this part of the code
465 # is not compatible with Python 3. This needs to be converted to
466 # the new Python io module.
# NOTE(review): the `cdef class HuffCode:` declaration that should
# precede this docstring (orig. ~467-469) is missing from this view.
470 Huffman coding class.
472 You can either construct a Huffman code from an alphabet of
473 symbols with frequencies, or read one from a file. Either the
474 alphabet or infile argument (but not both) must be passed to the
477 @param alphabet: Alphabet of (symbol, frequency) pairs
478 @type alphabet: [(str, int)]
479 @param infile: File handle or filename to read from
480 @type infile: file | str
482 def __init__(self, alphabet=None, infile=None):
484 cdef int *frequencies
# NOTE(review): fragmentary — companion cdef declarations (orig. ~485-486),
# the infile branch and nsym setup (orig. ~491-497), the unpacking of each
# (symbol, frequency) pair (orig. ~504), and the ckd_free(symbols) cleanup
# (orig. ~510-511) are missing from this view.
487 if alphabet == None and infile == None:
488 raise ValueError, "One of alphabet or infile must be passed to constructor"
489 if alphabet != None and infile != None:
490 raise ValueError, "Only one of alphabet or infile must be passed to constructor"
498 frequencies = <int *>ckd_calloc(nsym, sizeof(int))
499 symbols = <char **>ckd_calloc(nsym, sizeof(char *))
500 # Need to create separate Python objects for each string,
501 # otherwise we get random duplicates of the codewords...
503 for i, spam in enumerate(alphabet):
# `bogus` keeps a live Python reference so the char* stays valid below.
505 bogus.append(repr(sym))
506 frequencies[i] = freq
507 symbols[i] = bogus[-1]
508 self.hc = huff_code_build_str(symbols, frequencies, nsym)
509 ckd_free(frequencies)
def read(self, infile):
    """
    Read a Huffman code from a file, replacing any existing code.

    @param infile: File handle, or filename to open in binary mode.
    """
    fp = infile if isinstance(infile, file) else file(infile, "rb")
    huff_code_free(self.hc)
    self.hc = huff_code_read(PyFile_AsFile(fp))
def write(self, outfile):
    """
    Write this Huffman code to a file in binary form.

    @param outfile: File handle, or filename to open in binary mode.
    """
    fp = outfile if isinstance(outfile, file) else file(outfile, "wb")
    huff_code_write(self.hc, PyFile_AsFile(fp))
def dump(self, outfile):
    """
    Dump a human-readable description of this code to a file.

    @param outfile: File handle, or filename to open in text mode.
    """
    fp = outfile if isinstance(outfile, file) else file(outfile, "w")
    huff_code_dump(self.hc, PyFile_AsFile(fp))
528 def encode(self, seq):
530 Encode a sequence of symbols to a byte array, returning that
531 array and the bit offset of the next codeword in the last
532 byte (i.e. 8 minutes the number of extra zero bits)
# NOTE(review): "8 minutes" above is presumably a typo for "8 minus";
# left untouched since this docstring text is only a fragment here.
535 cdef int cwlen, nbits = 0, nbytes, offset, i
536 cdef unsigned char buf = 0
# NOTE(review): heavily fragmentary — the first pass totalling nbits
# over seq, the main loop headers, several buffer-append / offset-update
# lines, and the final flush are missing from this view (gaps around
# orig. 537-596).  Only the visible skeleton is annotated below.
541 cwlen = huff_code_encode_str(self.hc, sss, &cw)
# Total bits rounded up to whole bytes.
543 nbytes = int((nbits + 7) / 8)
545 output = <char *>PyMem_Malloc(nbytes + 1)
551 cwlen = huff_code_encode_str(self.hc, sss, &cw)
552 #print "sym: %s cw: %s buf: %s output: %s" \
553 # % (sym, bincw(cw, cwlen), bincw(buf >> (8-offset), offset),
554 # binstr(output, nbits))
556 # Do one byte at a time while full bytes are available
558 # Fill low bits of buf with high bits of cw
559 buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
560 # Append buf to output
564 # Fill high bits of buf with low bits of this byte
566 buf = (cw >> cwlen) & ((1 << offset) - 1)
569 # Now cwlen will be less than 8, but it might still be
570 # more than the available space in buf.
571 if cwlen >= (8 - offset):
572 # Fill low bits of buf with (8-offset) highest bits of cw
573 buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
574 # Append buf to output
578 # cwlen is down to the remaining bits
579 cwlen -= (8 - offset)
580 # Offset is now zero since we just completed and emptied buf
582 # buf is zero, because we just emptied it without putting stuff in
585 # Any remaining bits will be taken care of below (we hope)
586 # Add remaining high bits of cw to low bits of buf
588 buf |= ((cw & ((1 << cwlen) - 1)) << (8 - offset - cwlen))
590 #print "after buf: %s output: %s" \
591 # % (bincw(buf >> (8-offset), offset), binstr(output, nbits))
593 # Append buf to output
597 #print "output:", binstr(output, nbits)
598 outstr = PyString_FromStringAndSize(output, nbytes)
# Caller gets (packed bytes, bit offset of the next codeword in the
# last byte).
600 return (outstr, offset)
602 def decode(self, data):
604 Decode a sequence of symbols from a string, returning the
605 sequence and the bit offset of the next codeword in the last
606 byte (i.e. 8 minutes the number of remaining bits)
# NOTE(review): fragmentary — the cdef declarations and loop setup
# (orig. ~607-617, presumably initializing dptr/dlen/offset/output from
# `data` and looping until the buffer is exhausted) are missing from
# this view, as is the branch structure around the raise (orig. ~619-622).
618 strval = huff_code_decode_str(self.hc, &dptr, &dlen, &offset)
621 output.append(strval)
# Reached when huff_code_decode_str() fails partway through the buffer.
623 raise ValueError, "Invalid data at position %d" % (len(data) - dlen)
624 return (output, offset)
# Attach the underlying code to a stdio FILE so that later
# encode_to_file()/decode_from_file() calls stream through it.
# NOTE(review): fragmentary — the line that wraps a filename argument
# into a file object (orig. ~628-629) is missing from this view.
626 def attach(self, fh, char *mode):
627 if not isinstance(fh, file):
630 huff_code_attach(self.hc, PyFile_AsFile(fh), mode)
# detach (header line missing from this view): detach any attached FILE.
633 huff_code_detach(self.hc)
# Encode symbols directly to the attached file; the attachment check's
# `if` line (orig. ~637) and the per-symbol loop header / repr() line
# (orig. ~639-640) are missing from this view.
636 def encode_to_file(self, seq):
638 raise RuntimeError, "No file is attached"
641 huff_code_encode_str(self.hc, strsym, NULL)
# Decode one symbol from the attached file; setup lines and the tail
# (orig. ~644-652) are missing from this view.
643 def decode_from_file(self):
646 raise RuntimeError, "No file is attached"
# NULL in/out pointers: read directly from the attached FILE.
647 sym = huff_code_decode_str(self.hc, NULL, NULL, NULL)
# Destructor: free the underlying huff_code_t.  The lines between the
# header and the free (orig. ~654-655, presumably a detach call and/or
# docstring) are missing from this view.
653 def __dealloc__(self):
656 huff_code_free(self.hc)