1 # Copyright (c) 2008 Carnegie Mellon University. All rights
4 # You may copy, modify, and distribute this code under the same terms
5 # as PocketSphinx or Python, at your convenience, as long as this
6 # notice is not removed.
8 # Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
12 Node in a word lattice.
14 @ivar word: Word this node corresponds to (with pronunciation variant).
16 @ivar baseword: Base word (no pronunciation variant) this node corresponds to.
18 @ivar sf: Start frame for this node.
20 @ivar fef: First ending frame for this node.
22 @ivar lef: Last ending frame for this node.
24 @ivar best_exit: Best scoring exit link from this node
25 @type best_exit: LatLink
26 @ivar prob: Posterior probability for this node.
cdef set_node(LatNode self, ps_lattice_t *dag, ps_latnode_t *node):
    """
    Internal function - binds this to a PocketSphinx lattice node.

    Copies the word strings, frame times and posterior probability
    out of the C node and wraps its best exit link, if there is one.
    """
    # NOTE(review): assumes the out-parameters of ps_latnode_times()
    # are declared as 16-bit integers in the .pxd - confirm.
    cdef short fef, lef
    cdef ps_latlink_t *best_exit

    # Keep the raw pointers; the iterator-producing methods of this
    # class read self.node.
    # NOTE(review): assumes LatNode declares a `dag` attribute, as
    # LatLink does - confirm against the .pxd.
    self.dag = dag
    self.node = node
    self.word = ps_latnode_word(dag, node)
    self.baseword = ps_latnode_baseword(dag, node)
    # The start frame is the return value; the first and last ending
    # frames come back through the output arguments.
    self.sf = ps_latnode_times(node, &fef, &lef)
    self.fef = fef
    self.lef = lef
    # Start from NULL so that a node for which no best exit is
    # reported can be told apart from a real link below.
    best_exit = NULL
    self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                     ps_latnode_prob(dag, node, &best_exit))
    if best_exit == NULL:
        self.best_exit = None
    else:
        self.best_exit = LatLink()
        self.best_exit.set_link(dag, best_exit)
55 Obtain an iterator over arcs exiting this node.
57 @return: Iterator over arcs exiting this node
58 @rtype: LatLinkIterator
60 cdef LatLinkIterator itor
61 cdef ps_latlink_iter_t *citor
63 citor = ps_latnode_exits(self.node)
64 itor = LatLinkIterator()
71 Obtain an iterator over arcs entering this node.
73 @return: Iterator over arcs entering this node
74 @rtype: LatLinkIterator
76 cdef LatLinkIterator itor
77 cdef ps_latlink_iter_t *citor
79 citor = ps_latnode_entries(self.node)
80 itor = LatLinkIterator()
cdef class LatNodeIterator:
    """
    Iterator over word lattice nodes.

    Only nodes whose start frame C{sf} satisfies
    C{start <= sf < end} are yielded.  The C iterator (C{itor}) and
    the lattice (C{dag}) are attached by whoever creates the object.
    """
    def __init__(self, start, end):
        # No C iterator yet; __next__ treats NULL as "exhausted".
        self.itor = NULL
        self.first_node = True
        self.start = start
        self.end = end

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next node.

        @return: Next lattice node in this iterator.
        @rtype: LatNode
        @raise StopIteration: when no further node lies in the range.
        """
        cdef LatNode node
        cdef int start
        cdef ps_latnode_t *cnode

        # Make sure we keep raising exceptions at the end
        if self.itor == NULL:
            raise StopIteration
        # Advance the iterator if this isn't the first item
        if self.first_node:
            self.first_node = False
        else:
            self.itor = ps_latnode_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        # Look for the next node within the given time range
        cnode = ps_latnode_iter_node(self.itor)
        start = ps_latnode_times(cnode, NULL, NULL)
        while start < self.start or start >= self.end:
            self.itor = ps_latnode_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
            cnode = ps_latnode_iter_node(self.itor)
            start = ps_latnode_times(cnode, NULL, NULL)
        node = LatNode()
        node.set_node(self.dag, cnode)
        return node
134 Link (edge) in a word lattice, connecting two nodes.
136 @ivar word: Word (with pronunciation variant) for this link.
138 @ivar baseword: Base word (no pronunciation variant) for this link.
140 @ivar sf: Start frame for this link.
142 @ivar ef: Ending frame for this link.
144 @ivar prob: Posterior probability for this link.
cdef set_link(LatLink self, ps_lattice_t *dag, ps_latlink_t *link):
    """
    Internal function - binds this to a PocketSphinx lattice link.

    Copies the word strings, frame times and posterior probability
    out of the C link and remembers the raw pointers for later use.
    """
    # NOTE(review): assumes the out-parameter of ps_latlink_times()
    # is declared as a 16-bit integer in the .pxd - confirm.
    cdef short sf

    # Other methods of this class read self.link and self.dag.
    self.dag = dag
    self.link = link
    self.word = ps_latlink_word(dag, link)
    self.baseword = ps_latlink_baseword(dag, link)
    # The end frame is the return value; the start frame comes back
    # through the output argument.
    self.ef = ps_latlink_times(link, &sf)
    self.sf = sf
    self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                     ps_latlink_prob(dag, link, NULL))
166 Get source and destination nodes for this link.
168 @return: Source and destination nodes for this link
169 @rtype: (LatNode, LatNode)
171 cdef LatNode src, dest
172 cdef ps_latnode_t *csrc, *cdest
174 cdest = ps_latlink_nodes(self.link, &csrc)
176 src.set_node(self.dag, csrc)
178 dest.set_node(self.dag, cdest)
183 Get backpointer from this link.
185 @return: Backpointer from this link, set by bestpath search.
189 cdef ps_latlink_t *cpred
191 cpred = ps_latlink_pred(self.link)
195 pred.set_link(self.dag, cpred)
cdef class LatLinkIterator:
    """
    Iterator over word lattice links.

    The C iterator (C{itor}) and the lattice (C{dag}) are attached by
    whoever creates the object.
    """
    def __init__(self):
        # No C iterator yet; __next__ treats NULL as "exhausted".
        self.itor = NULL
        self.first_link = True

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next link.

        @return: Next lattice link in this iterator.
        @rtype: LatLink
        @raise StopIteration: when there are no more links.
        """
        cdef LatLink link

        # An empty or already exhausted iterator is NULL; stop here
        # instead of handing NULL to the C API (same guard as
        # LatNodeIterator).
        if self.itor == NULL:
            raise StopIteration
        # Advance the iterator if this isn't the first item
        if self.first_link:
            self.first_link = False
        else:
            self.itor = ps_latlink_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        link = LatLink()
        link.set_link(self.dag, ps_latlink_iter_link(self.itor))
        return link
231 The word lattice is a compact representation of the set of
232 hypotheses considered by the decoder when recognizing an
235 A lattice object can be constructed either from a lattice file
236 on disk or from a 'boxed' object passed in from GStreamer (or,
237 in theory, anything else that uses GLib). In the first case,
238 the C{ps} argument is required.
240 @param ps: PocketSphinx decoder.
242 @param latfile: Filename of lattice file to read.
244 @param boxed: Boxed pointer from GStreamer containing a lattice
245 @type boxed: PyGBoxed
247 @ivar n_frames: Number of frames of audio covered by this lattice
249 @ivar start: Start node
def __init__(self, ps=None, latfile=None, boxed=None):
    # Start with no lattice bound; set_boxed() releases whatever
    # self.dag holds before rebinding it.
    self.dag = NULL
    # Only load from disk when a filename was actually given.
    if latfile:
        self.read_dag(ps, latfile)
    # Only unwrap a boxed pointer when one was actually given.
    if boxed:
        self.set_boxed(boxed)
cdef read_dag(Lattice self, Decoder ps, latfile):
    """
    Internal function - loads a lattice from a file on disk.

    @param ps: Decoder to read the lattice for, or None to read it
    without a decoder.
    @param latfile: Filename of lattice file to read.
    @raise RuntimeError: if the lattice could not be read.
    """
    if ps is None:
        self.dag = ps_lattice_read(NULL, latfile)
    else:
        self.dag = ps_lattice_read(ps.ps, latfile)
    # Check for failure before asking the lattice for anything: the
    # frame count must not be queried through a NULL pointer.
    if self.dag == NULL:
        raise RuntimeError("Failed to read lattice from %s" % latfile)
    self.n_frames = ps_lattice_n_frames(self.dag)
cdef set_dag(Lattice self, ps_lattice_t *dag):
    """
    Internal function - binds this object to an existing C lattice.

    Takes a new reference on C{dag} and releases the one held on the
    previously bound lattice.
    """
    # Retain before freeing so that rebinding to the lattice that is
    # already bound cannot drop its reference count to zero.
    ps_lattice_retain(dag)
    ps_lattice_free(self.dag)
    # Store the new pointer; otherwise self.dag would keep pointing
    # at the lattice that was just released.
    self.dag = dag
    self.n_frames = ps_lattice_n_frames(dag)
cdef set_boxed(Lattice self, box):
    """
    Internal function - binds this object to the lattice carried by a
    boxed GLib object.

    @param box: Boxed pointer containing a lattice.
    @type box: PyGBoxed
    """
    cdef ps_lattice_t *dag

    # Pull the raw lattice pointer out of the PyGBoxed wrapper.
    dag = <ps_lattice_t *>(<PyGBoxed *>box).boxed
    # Retain before freeing so that rebinding to the lattice that is
    # already bound cannot drop its reference count to zero.
    ps_lattice_retain(dag)
    ps_lattice_free(self.dag)
    # Store the new pointer before reading through self.dag again.
    self.dag = dag
    self.n_frames = ps_lattice_n_frames(self.dag)
def __dealloc__(self):
    # Give back the reference this wrapper holds on the C lattice
    # (set_dag() and set_boxed() retain it before storing it).
    ps_lattice_free(self.dag)
def bestpath(self, NGramModel lmset, float lwf, float ascale):
    """
    Find the best path through the lattice, optionally using a
    language model.

    This function performs best-path search on the lattice, and
    returns the final link in the best path found.  The existing
    acoustic scores on the lattice links are used in conjunction
    with an optional language model.  A scaling factor can be
    applied to the acoustic scores to produce more useful
    posterior probabilities (in conjunction with C{posterior()}).

    @param lmset: Language model (set) to use for rescoring, or
    None to search without one.
    @type lmset: sphinxbase.NGramModel
    @param lwf: Weight to apply to language model scores (on top
    of any existing language model weight set in C{lmset}).
    @type lwf: float
    @param ascale: Weight to apply to acoustic model scores.
    @type ascale: float
    @return: Final link in best path.
    @rtype: LatLink
    @raise RuntimeError: if the search does not produce a final link.
    """
    cdef ps_latlink_t *end
    cdef ngram_model_t *clm
    cdef LatLink link

    # The language model is documented as optional: pass NULL to the
    # C API rather than reading a field of None.
    clm = NULL
    if lmset is not None:
        clm = lmset.lm
    end = ps_lattice_bestpath(self.dag, clm, lwf, ascale)
    # Never wrap a NULL link: set_link() reads through the pointer.
    if end == NULL:
        raise RuntimeError("Failed to find best path in lattice")
    link = LatLink()
    link.set_link(self.dag, end)
    return link
def posterior(self, NGramModel lmset, float ascale):
    """
    Calculate posterior probabilities of all links in a lattice.

    This function performs the backward part of forward-backward
    calculation of posterior probabilities for all links in the
    lattice.  It assumes that C{bestpath()} has already been
    called on the lattice.

    @param lmset: Language model (set) to use for rescoring
    @type lmset: sphinxbase.NGramModel
    @param ascale: Weight to apply to acoustic model scores.
    @type ascale: float
    @return: Log-probability of the lattice as a whole.
    @rtype: float
    """
    cdef logmath_t *lmath

    lmath = ps_lattice_get_logmath(self.dag)
    # The C call works in the lattice's own log base; convert the
    # result to natural log before handing it back.
    return sb.logmath_log_to_ln(
        lmath, ps_lattice_posterior(self.dag, lmset.lm, ascale))
def nodes(self, start=0, end=-1):
    """
    Get an iterator over all nodes in the lattice.

    @param start: First frame to iterate over.
    @type start: int
    @param end: Frame at which to stop (nodes starting at or after
    this frame are skipped), or -1 for all remaining frames.
    @type end: int
    @return: Iterator over nodes.
    @rtype: LatNodeIterator
    """
    cdef LatNodeIterator itor

    # -1 means "up to the end of the lattice".
    if end == -1:
        end = ps_lattice_n_frames(self.dag)
    itor = LatNodeIterator(start, end)
    # The iterator needs the lattice to bind the nodes it yields.
    itor.dag = self.dag
    itor.itor = ps_latnode_iter(self.dag)
    return itor
def write(self, outfile):
    """
    Write the lattice to an output file.

    @param outfile: Name of file to write to.
    @type outfile: str
    @raise RuntimeError: if the lattice could not be written.
    """
    cdef int rv

    rv = ps_lattice_write(self.dag, outfile)
    # A negative return value signals failure.
    if rv < 0:
        raise RuntimeError("Failed to write lattice to %s" % outfile)
373 PocketSphinx speech decoder.
375 To initialize the PocketSphinx decoder, pass a list of keyword
376 arguments to the constructor::
378 d = pocketsphinx.Decoder(hmm='/path/to/acoustic/model',
379 lm='/path/to/language/model',
380 dict='/path/to/dictionary',
383 If no arguments are passed, the default acoustic and language
384 models will be loaded, which may be acceptable for general English
385 speech. Any arguments supported by the PocketSphinx decoder are
386 allowed here. Only the most frequent ones are described below.
388 @param boxed: Boxed pointer from GStreamer containing a decoder
389 @type boxed: PyGBoxed
390 @param hmm: Path to acoustic model directory
392 @param dict: Path to dictionary file
394 @param lm: Path to language model file
396 @param jsgf: Path to JSGF grammar file
def __init__(self, **kwargs):
    cdef cmd_ln_t *config
    cdef int i

    # Construct from an existing GObject pointer if given
    if 'boxed' in kwargs:
        # Nothing is allocated in argv on this path, so make sure
        # __dealloc__ has nothing to walk.
        self.argc = 0
        self.set_boxed(kwargs['boxed'])
        return

    # Turn the keyword arguments into a flat "-name value" argument
    # vector and let the command-line parser build the configuration.
    self.argc = len(kwargs) * 2
    self.argv = <char **>sb.ckd_calloc(self.argc, sizeof(char *))
    i = 0
    for k, v in kwargs.iteritems():
        # Option names are written with a leading dash, as in the
        # "-lw"/"-wip" lookups elsewhere in this class.
        if k[0] != '-':
            k = '-' + k
        # Accept numbers and booleans as well as strings.  The result
        # is bound to a name first because a char * cannot be taken
        # from a temporary Python object.
        v = str(v)
        self.argv[i] = sb.ckd_salloc(k)
        self.argv[i+1] = sb.ckd_salloc(v)
        i = i + 2
    config = sb.cmd_ln_parse_r(NULL, ps_args(), self.argc, self.argv, 0)
    if config == NULL:
        raise RuntimeError("Failed to parse argument list")
    self.ps = ps_init(config)
    if self.ps == NULL:
        raise RuntimeError("Failed to initialize PocketSphinx")
cdef set_boxed(Decoder self, box):
    """
    Internal function - binds this object to the decoder carried by a
    boxed GLib object.

    @param box: Boxed pointer containing a decoder.
    @type box: PyGBoxed
    """
    cdef ps_decoder_t *ps

    # Pull the raw decoder pointer out of the PyGBoxed wrapper.
    ps = <ps_decoder_t *>(<PyGBoxed *>box).boxed
    # Same retain-then-free order as Lattice.set_boxed(), so that
    # rebinding to the decoder already held cannot release it.
    # NOTE(review): assumes ps_retain()/ps_free() are declared in the
    # .pxd - confirm.
    ps_retain(ps)
    ps_free(self.ps)
    self.ps = ps
def __dealloc__(self):
    cdef int i

    # Release the decoder itself.
    # NOTE(review): assumes ps_free() is declared in the .pxd and
    # tolerates a NULL decoder (construction may have failed) - confirm.
    ps_free(self.ps)
    # Free every argument string copied in __init__, then the vector.
    for i in range(self.argc):
        sb.ckd_free(self.argv[i])
    sb.ckd_free(self.argv)
    # Leave the object in a state that is safe to look at again.
    self.argv = NULL
    self.argc = 0
def decode_raw(self, fh, uttid=None, maxsamps=-1):
    """
    Decode raw audio from a file.

    @param fh: Filehandle to read audio from.
    @type fh: file
    @param uttid: Identifier to give to this utterance.
    @type uttid: str
    @param maxsamps: Maximum number of samples to read.  If not
    specified or -1, the rest of the file will be read.
    @type maxsamps: int
    @return: Result of the underlying C{ps_decode_raw()} call.
    @raise TypeError: if C{fh} does not provide a C file handle.
    """
    cdef FILE *cfh
    cdef char *cuttid

    cfh = PyFile_AsFile(fh)
    # Never hand a NULL FILE* to the C decoder.
    if cfh == NULL:
        raise TypeError("fh must be an open file object")
    # No utterance ID is passed to C as NULL.
    cuttid = NULL
    if uttid:
        cuttid = uttid
    return ps_decode_raw(self.ps, cfh, cuttid, maxsamps)
def decode_senscr(self, fh, uttid=None):
    """
    Decode senone scores from a file.

    @param fh: Filehandle to read senone scores from.
    @type fh: file
    @param uttid: Identifier to give to this utterance.
    @type uttid: str
    @return: Result of the underlying C{ps_decode_senscr()} call.
    @raise TypeError: if C{fh} does not provide a C file handle.
    """
    cdef FILE *cfh
    cdef char *cuttid

    cfh = PyFile_AsFile(fh)
    # Never hand a NULL FILE* to the C decoder.
    if cfh == NULL:
        raise TypeError("fh must be an open file object")
    # No utterance ID is passed to C as NULL.
    cuttid = NULL
    if uttid:
        cuttid = uttid
    return ps_decode_senscr(self.ps, cfh, cuttid)
def start_utt(self, uttid=None):
    """
    Prepare the decoder to recognize an utterance.

    @param uttid: Identifier to give to this utterance.
    @type uttid: str
    @raise RuntimeError: if the decoder could not start the utterance.
    """
    cdef char *cuttid

    # No utterance ID is passed to C as NULL.
    cuttid = NULL
    if uttid:
        cuttid = uttid
    if ps_start_utt(self.ps, cuttid) < 0:
        raise RuntimeError("Failed to start utterance processing")
def process_raw(self, data, no_search=False, full_utt=False):
    """
    Process (decode) some audio data.

    @param data: Audio data to process.  This is packed binary
    data, which consists of single-channel, 16-bit PCM audio, at
    the sample rate specified when the decoder was initialized.
    @type data: str
    @param no_search: Buffer the data without actually processing
    it (default is to process the data as it is received).
    @type no_search: bool
    @param full_utt: This block of data is an entire utterance.
    Processing an entire utterance at once may improve
    recognition, particularly for the first utterance passed to
    the decoder.
    @type full_utt: bool
    @raise TypeError: if C{data} is not a string.
    @raise RuntimeError: if the decoder fails to process the data.
    """
    cdef Py_ssize_t nbytes
    cdef Py_ssize_t nsamps
    cdef char *strdata
    cdef raw_data_ptr cdata

    # Do not touch strdata/nbytes unless the conversion succeeded.
    if PyString_AsStringAndSize(data, &strdata, &nbytes) < 0:
        raise TypeError("data must be a string of packed 16-bit samples")
    cdata = strdata
    # The decoder counts samples, not bytes, and each sample is two
    # bytes wide; passing the byte length would make it read past
    # the end of the buffer.
    nsamps = nbytes // 2
    if ps_process_raw(self.ps, cdata, nsamps, no_search, full_utt) < 0:
        raise RuntimeError(
            "Failed to process %d samples of audio data" % nsamps)
def ps_end_utt(self):
    """
    Finish processing an utterance.

    @raise RuntimeError: if the decoder reports a failure.
    """
    cdef int status

    # The call below resolves to the module-level C function, not to
    # this method.
    status = ps_end_utt(self.ps)
    if status < 0:
        raise RuntimeError("Failed to stop utterance processing")
534 Get a hypothesis string.
536 This function returns the text which has been recognized so
537 far, or, if C{ps_end_utt()} has been called, the final
540 @return: Hypothesis string, utterance ID, recognition score
541 @rtype: (str, str, int)
543 cdef const_char_ptr hyp
544 cdef const_char_ptr uttid
547 hyp = ps_get_hyp(self.ps, &score, &uttid)
551 return None, uttid, 0
553 return hyp, uttid, score
557 Get a posterior probability.
559 Returns the posterior in linear scale.
561 @return: posterior probability of the result
564 cdef logmath_t *lmath
565 cdef const_char_ptr uttid
566 lmath = ps_get_logmath(self.ps)
567 return sb.logmath_exp(lmath, ps_get_prob(self.ps, &uttid))
def get_lattice(self):
    """
    Get the word lattice.

    This function returns all hypotheses which have been
    considered so far, in the form of a word lattice.

    @return: Word lattice
    @rtype: Lattice
    @raise RuntimeError: if the decoder has no lattice to return.
    """
    cdef ps_lattice_t *dag
    cdef Lattice lat

    dag = ps_get_lattice(self.ps)
    if dag == NULL:
        raise RuntimeError("Failed to create word lattice")
    # set_dag() takes its own reference on the lattice.
    lat = Lattice()
    lat.set_dag(dag)
    return lat
591 Get the language model set.
593 This function returns the language model set, which allows you
594 to obtain language model scores or switch language models.
596 @return: Language model set
597 @rtype: sphinxbase.NGramModel
599 cdef ngram_model_t *clm
600 cdef logmath_t *lmath
601 cdef cmd_ln_t *config
604 clm = ps_get_lmset(self.ps)
607 lmath = sb.logmath_retain(ps_get_logmath(self.ps))
609 config = ps_get_config(self.ps)
610 # This is not necessarily true but it will have to do
611 lm.lw = sb.cmd_ln_float32_r(config, "-lw")
612 lm.wip = sb.cmd_ln_float32_r(config, "-wip")
613 lm.uw = sb.cmd_ln_float32_r(config, "-uw")
def add_word(self, word, phones, update=True):
    """
    Add a word to the dictionary and current language model.

    @param word: Name of the word to add.
    @type word: str
    @param phones: Pronunciation of the word, a space-separated
    list of phones.
    @type phones: str
    @param update: Update the decoder to recognize this new word.
    If adding a number of words at once you may wish to pass
    C{False} here.
    @type update: bool
    @return: Result of the underlying C{ps_add_word()} call.
    """
    result = ps_add_word(self.ps, word, phones, update)
    return result
def load_dict(self, dictfile, fdictfile=None, format=None):
    """
    Load a new pronunciation dictionary.

    @param dictfile: Dictionary filename.
    @type dictfile: str
    @param fdictfile: Filler dictionary filename, or None for none.
    @type fdictfile: str
    @param format: Dictionary format, currently unused.
    @type format: str
    @return: Result of the underlying C{ps_load_dict()} call.
    """
    cdef char *cfdict
    cdef char *cformat

    # None cannot be converted to a C string, so the optional
    # arguments are passed as NULL when they are left out.
    cfdict = NULL
    cformat = NULL
    if fdictfile is not None:
        cfdict = fdictfile
    if format is not None:
        cformat = format
    return ps_load_dict(self.ps, dictfile, cfdict, cformat)
def save_dict(self, dictfile, format=None):
    """
    Save current pronunciation dictionary to a file.

    @param dictfile: Dictionary filename.
    @type dictfile: str
    @param format: Dictionary format, currently unused.
    @type format: str
    @return: Result of the underlying C{ps_save_dict()} call.
    """
    cdef char *cformat

    # None cannot be converted to a C string, so a missing format is
    # passed as NULL.
    cformat = NULL
    if format is not None:
        cformat = format
    return ps_save_dict(self.ps, dictfile, cformat)