# Copyright (c) 2008 Carnegie Mellon University. All rights
# reserved.
#
# You may copy, modify, and distribute this code under the same terms
# as PocketSphinx or Python, at your convenience, as long as this
# notice is not removed.
#
# Author: David Huggins-Daines

cdef class LatNode:
    """
    Node in a word lattice.

    @ivar word: Word this node corresponds to (with pronunciation variant).
    @type word: str
    @ivar baseword: Base word (no pronunciation variant) this node corresponds to.
    @type baseword: str
    @ivar sf: Start frame for this node.
    @type sf: int
    @ivar fef: First ending frame for this node.
    @type fef: int
    @ivar lef: Last ending frame for this node.
    @type lef: int
    @ivar best_exit: Best scoring exit link from this node, or None if
                     the lattice reports no best exit.
    @type best_exit: LatLink
    @ivar prob: Posterior probability for this node, as converted by
                C{logmath_log_to_ln()}.
    @type prob: float
    """
    def __cinit__(self):
        self.node = NULL

    cdef set_node(LatNode self, ps_lattice_t *dag, ps_latnode_t *node):
        """
        Internal function - binds this to a PocketSphinx lattice node.

        Copies the node's words, frame times and posterior probability
        into Python-visible attributes.
        """
        cdef short fef, lef
        cdef ps_latlink_t *best_exit

        self.dag = dag
        self.node = node
        self.word = ps_latnode_word(dag, node)
        self.baseword = ps_latnode_baseword(dag, node)
        # ps_latnode_times() returns the start frame and fills in the
        # first/last ending frames through the output arguments.
        self.sf = ps_latnode_times(node, &fef, &lef)
        self.fef = fef
        self.lef = lef
        self.best_exit = None
        best_exit = NULL
        self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                         ps_latnode_prob(dag, node, &best_exit))
        # Only wrap the best exit link if the library actually gave us one.
        if best_exit != NULL:
            self.best_exit = LatLink()
            self.best_exit.set_link(dag, best_exit)

    def exits(self):
        """
        Obtain an iterator over arcs exiting this node.

        @return: Iterator over arcs exiting this node
        @rtype: LatLinkIterator
        """
        cdef LatLinkIterator itor
        cdef ps_latlink_iter_t *citor

        citor = ps_latnode_exits(self.node)
        itor = LatLinkIterator()
        itor.itor = citor
        itor.dag = self.dag
        return itor

    def entries(self):
        """
        Obtain an iterator over arcs entering this node.

        @return: Iterator over arcs entering this node
        @rtype: LatLinkIterator
        """
        cdef LatLinkIterator itor
        cdef ps_latlink_iter_t *citor

        citor = ps_latnode_entries(self.node)
        itor = LatLinkIterator()
        itor.itor = citor
        itor.dag = self.dag
        return itor


cdef class LatNodeIterator:
    """
    Iterator over word lattice nodes.

    Only nodes whose start frame C{sf} satisfies C{start <= sf < end}
    are returned.
    """
    def __init__(self, start, end):
        self.itor = NULL
        self.first_node = True
        self.start = start
        self.end = end

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next node.

        @return: Next lattice node in this iterator.
        @rtype: LatNode
        """
        cdef LatNode node
        cdef int start
        cdef ps_latnode_t *cnode

        # Make sure we keep raising exceptions at the end
        if self.itor == NULL:
            raise StopIteration
        # Advance the iterator if this isn't the first item
        if self.first_node:
            self.first_node = False
        else:
            self.itor = ps_latnode_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        # Look for the next node within the given time range
        cnode = ps_latnode_iter_node(self.itor)
        start = ps_latnode_times(cnode, NULL, NULL)
        while start < self.start or start >= self.end:
            self.itor = ps_latnode_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
            cnode = ps_latnode_iter_node(self.itor)
            start = ps_latnode_times(cnode, NULL, NULL)
        node = LatNode()
        node.set_node(self.dag, cnode)
        return node


cdef class LatLink:
    """
    Link (edge) in a word lattice, connecting two nodes.

    @ivar word: Word (with pronunciation variant) for this link.
    @type word: str
    @ivar baseword: Base word (no pronunciation variant) for this link.
    @type baseword: str
    @ivar sf: Start frame for this link.
    @type sf: int
    @ivar ef: Ending frame for this link.
    @type ef: int
    @ivar prob: Posterior probability for this link, as converted by
                C{logmath_log_to_ln()}.
    @type prob: float
    """
    def __cinit__(self):
        self.link = NULL

    cdef set_link(LatLink self, ps_lattice_t *dag, ps_latlink_t *link):
        """
        Internal function - binds this to a PocketSphinx lattice link.
        """
        cdef short sf

        self.dag = dag
        self.link = link
        self.word = ps_latlink_word(dag, link)
        self.baseword = ps_latlink_baseword(dag, link)
        # ps_latlink_times() returns the end frame and fills in the
        # start frame through the output argument.
        self.ef = ps_latlink_times(link, &sf)
        self.sf = sf
        self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                         ps_latlink_prob(dag, link, NULL))

    def nodes(self):
        """
        Get source and destination nodes for this link.

        @return: Source and destination nodes for this link
        @rtype: (LatNode, LatNode)
        """
        cdef LatNode src, dest
        cdef ps_latnode_t *csrc, *cdest

        # The destination node is the return value; the source node
        # comes back through the output argument.
        cdest = ps_latlink_nodes(self.link, &csrc)
        src = LatNode()
        src.set_node(self.dag, csrc)
        dest = LatNode()
        dest.set_node(self.dag, cdest)
        return src, dest

    def pred(self):
        """
        Get backpointer from this link.

        @return: Backpointer from this link, set by bestpath search,
                 or None if there is no backpointer.
        @rtype: LatLink
        """
        cdef LatLink pred
        cdef ps_latlink_t *cpred

        cpred = ps_latlink_pred(self.link)
        if cpred == NULL:
            return None
        pred = LatLink()
        pred.set_link(self.dag, cpred)
        return pred


cdef class LatLinkIterator:
    """
    Iterator over word lattice links.
    """
    def __cinit__(self):
        self.itor = NULL
        self.first_link = True

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next link.

        @return: Next lattice link in this iterator.
        @rtype: LatLink
        """
        cdef LatLink link

        # Make sure we keep raising exceptions at the end (and never
        # hand a NULL iterator back to ps_latlink_iter_next()), in the
        # same way LatNodeIterator does.
        if self.itor == NULL:
            raise StopIteration
        # Advance the iterator if this isn't the first item
        if self.first_link:
            self.first_link = False
        else:
            self.itor = ps_latlink_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        link = LatLink()
        link.set_link(self.dag, ps_latlink_iter_link(self.itor))
        return link


cdef class Lattice:
    """
    Word lattice.

    The word lattice is a compact representation of the set of
    hypotheses considered by the decoder when recognizing an
    utterance.

    A lattice object can be constructed either from a lattice file
    on disk or from a 'boxed' object passed in from GStreamer (or,
    in theory, anything else that uses GLib).  In the first case,
    the C{ps} argument is required.

    @param ps: PocketSphinx decoder.
    @type ps: Decoder
    @param latfile: Filename of lattice file to read.
    @type latfile: str
    @param boxed: Boxed pointer from GStreamer containing a lattice
    @type boxed: PyGBoxed

    @ivar n_frames: Number of frames of audio covered by this lattice
    @type n_frames: int
    @ivar start: Start node
    @type start: LatNode
    @ivar end: End node
    @type end: LatNode
    """
    def __init__(self, ps=None, latfile=None, boxed=None):
        self.dag = NULL
        if latfile:
            self.read_dag(ps, latfile)
        if boxed:
            self.set_boxed(boxed)

    cdef read_dag(Lattice self, Decoder ps, latfile):
        """
        Internal function - reads a lattice from C{latfile}.

        @raise RuntimeError: If the lattice could not be read.
        """
        if ps:
            self.dag = ps_lattice_read(ps.ps, latfile)
        else:
            self.dag = ps_lattice_read(NULL, latfile)
        # Check for failure *before* touching the lattice, otherwise a
        # failed read would pass NULL to ps_lattice_n_frames().
        if self.dag == NULL:
            raise RuntimeError("Failed to read lattice from %s" % latfile)
        self.n_frames = ps_lattice_n_frames(self.dag)

    cdef set_dag(Lattice self, ps_lattice_t *dag):
        """
        Internal function - binds this object to an existing lattice.
        """
        # Retain the new lattice before releasing the old one so that
        # re-binding to the same lattice does not free it.
        ps_lattice_retain(dag)
        ps_lattice_free(self.dag)
        self.dag = dag
        self.n_frames = ps_lattice_n_frames(dag)

    cdef set_boxed(Lattice self, box):
        """
        Internal function - binds this object to a boxed lattice pointer.
        """
        cdef ps_lattice_t *dag
        # NOTE(review): this expression looks like it has lost its
        # pointer casts (presumably to PyGBoxed and ps_lattice_t) -
        # confirm against the original source before building.
        dag = (box).boxed
        ps_lattice_retain(dag)
        ps_lattice_free(self.dag)
        self.dag = dag
        self.n_frames = ps_lattice_n_frames(self.dag)

    def __dealloc__(self):
        ps_lattice_free(self.dag)

    def bestpath(self, NGramModel lmset, float lwf, float ascale):
        """
        Find the best path through the lattice, optionally using a
        language model.

        This function performs best-path search on the lattice, and
        returns the final link in the best path found.  The existing
        acoustic scores on the lattice links are used in conjunction
        with an optional language model.  A scaling factor can be
        applied to the acoustic scores to produce more useful
        posterior probabilities (in conjunction with C{posterior()},
        below).

        @param lmset: Language model (set) to use for rescoring
        @type lmset: sphinxbase.NGramModel
        @param lwf: Weight to apply to language model scores (on top
                    of any existing language model weight set in C{lmset}).
        @type lwf: float
        @param ascale: Weight to apply to acoustic model scores.
        @type ascale: float
        @return: Final link in best path.
        @rtype: LatLink
        @raise RuntimeError: If the search did not produce a final link.
        """
        cdef ps_latlink_t *end
        cdef LatLink link

        end = ps_lattice_bestpath(self.dag, lmset.lm, lwf, ascale)
        # set_link() dereferences the link, so a NULL result must not
        # be wrapped.
        if end == NULL:
            raise RuntimeError("Failed to find best path in lattice")
        link = LatLink()
        link.set_link(self.dag, end)
        return link

    def posterior(self, NGramModel lmset, float ascale):
        """
        Calculate posterior probabilities of all links in a lattice.

        This function performs the backward part of forward-backward
        calculation of posterior probabilities for all links in the
        lattice.  It assumes that C{bestpath()} has already been
        called on the lattice.

        @param lmset: Language model (set) to use for rescoring
        @type lmset: sphinxbase.NGramModel
        @param ascale: Weight to apply to acoustic model scores.
        @type ascale: float
        @return: Log-probability of the lattice as a whole.
        @rtype: float
        """
        cdef logmath_t *lmath

        lmath = ps_lattice_get_logmath(self.dag)
        return sb.logmath_log_to_ln(lmath,
                                    ps_lattice_posterior(self.dag, lmset.lm, ascale))

    def nodes(self, start=0, end=-1):
        """
        Get an iterator over all nodes in the lattice.

        @param start: First frame to iterate over.
        @type start: int
        @param end: Frame at which to stop, or -1 for all remaining.
                    Nodes starting at or after this frame are skipped.
        @type end: int
        @return: Iterator over nodes.
        @rtype: LatNodeIterator
        """
        cdef LatNodeIterator itor

        if end == -1:
            end = ps_lattice_n_frames(self.dag)
        itor = LatNodeIterator(start, end)
        itor.dag = self.dag
        itor.itor = ps_latnode_iter(self.dag)
        return itor

    def write(self, outfile):
        """
        Write the lattice to an output file.

        @param outfile: Name of file to write to.
        @type outfile: str
        @raise RuntimeError: If the lattice could not be written.
        """
        cdef int rv

        rv = ps_lattice_write(self.dag, outfile)
        if rv < 0:
            raise RuntimeError("Failed to write lattice to %s" % outfile)


cdef class Decoder:
    """
    PocketSphinx speech decoder.

    To initialize the PocketSphinx decoder, pass a list of keyword
    arguments to the constructor::

        d = pocketsphinx.Decoder(hmm='/path/to/acoustic/model',
                                 lm='/path/to/language/model',
                                 dict='/path/to/dictionary',
                                 beam='1e-80')

    If no arguments are passed, the default acoustic and language
    models will be loaded, which may be acceptable for general English
    speech.  Any arguments supported by the PocketSphinx decoder are
    allowed here.  Only the most frequent ones are described below.

    @param boxed: Boxed pointer from GStreamer containing a decoder
    @type boxed: PyGBoxed
    @param hmm: Path to acoustic model directory
    @type hmm: str
    @param dict: Path to dictionary file
    @type dict: str
    @param lm: Path to language model file
    @type lm: str
    @param jsgf: Path to JSGF grammar file
    @type jsgf: str
    """
    def __init__(self, **kwargs):
        cdef cmd_ln_t *config
        cdef int i

        # Construct from an existing GObject pointer if given
        if 'boxed' in kwargs:
            self.argc = 0
            self.set_boxed(kwargs['boxed'])
            return

        # A much more concise version of what pocketsphinx_parse_argdict used to do
        self.argc = len(kwargs) * 2
        # NOTE(review): a pointer cast appears to have been lost on the
        # next line - confirm against the original source.
        self.argv = sb.ckd_calloc(self.argc, sizeof(char *))
        i = 0
        for k, v in kwargs.iteritems():
            # Command-line style names need a leading dash.
            if k[0] != '-':
                k = '-' + k
            # Both name and value are copied as C strings, so values
            # are expected to be passed as strings (e.g. beam='1e-80').
            self.argv[i] = sb.ckd_salloc(k)
            self.argv[i+1] = sb.ckd_salloc(v)
            i = i + 2
        config = sb.cmd_ln_parse_r(NULL, ps_args(), self.argc, self.argv, 0)
        if config == NULL:
            raise RuntimeError("Failed to parse argument list")
        self.ps = ps_init(config)
        if self.ps == NULL:
            raise RuntimeError("Failed to initialize PocketSphinx")

    cdef set_boxed(Decoder self, box):
        """
        Internal function - binds this object to a boxed decoder pointer.
        """
        cdef ps_decoder_t *ps
        # NOTE(review): this expression looks like it has lost its
        # pointer casts (presumably to PyGBoxed and ps_decoder_t) -
        # confirm against the original source before building.
        ps = (box).boxed
        ps_retain(ps)
        ps_free(self.ps)
        self.ps = ps

    def __dealloc__(self):
        ps_free(self.ps)
        # Release the argument strings copied in __init__.
        for i from 0 <= i < self.argc:
            sb.ckd_free(self.argv[i])
        sb.ckd_free(self.argv)
        self.argv = NULL
        self.argc = 0

    def decode_raw(self, fh, uttid=None, maxsamps=-1):
        """
        Decode raw audio from a file.

        @param fh: Filehandle to read audio from.
        @type fh: file
        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @param maxsamps: Maximum number of samples to read.  If not
                         specified or -1, the rest of the file will be read.
        @type maxsamps: int
        @return: Return value of C{ps_decode_raw()}.
        @rtype: int
        @raise TypeError: If C{fh} is not a real file object.
        """
        cdef FILE *cfh
        cdef char *cuttid

        cfh = PyFile_AsFile(fh)
        # PyFile_AsFile() gives NULL for anything that is not a file
        # object; do not hand that to the decoder.
        if cfh == NULL:
            raise TypeError("fh must be a file object")
        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        return ps_decode_raw(self.ps, cfh, cuttid, maxsamps)

    def decode_senscr(self, fh, uttid=None):
        """
        Decode senone scores from a file.

        @param fh: Filehandle to read senone scores from.
        @type fh: file
        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @return: Return value of C{ps_decode_senscr()}.
        @rtype: int
        @raise TypeError: If C{fh} is not a real file object.
        """
        cdef FILE *cfh
        cdef char *cuttid

        cfh = PyFile_AsFile(fh)
        # See decode_raw(): reject non-file objects instead of passing NULL.
        if cfh == NULL:
            raise TypeError("fh must be a file object")
        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        return ps_decode_senscr(self.ps, cfh, cuttid)

    def start_utt(self, uttid=None):
        """
        Prepare the decoder to recognize an utterance.

        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @raise RuntimeError: If utterance processing could not be started.
        """
        cdef char *cuttid

        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        if ps_start_utt(self.ps, cuttid) < 0:
            raise RuntimeError("Failed to start utterance processing")

    def process_raw(self, data, no_search=False, full_utt=False):
        """
        Process (decode) some audio data.

        @param data: Audio data to process.  This is packed binary
                     data, which consists of single-channel, 16-bit PCM audio, at
                     the sample rate specified when the decoder was initialized.
        @type data: str
        @param no_search: Buffer the data without actually processing it
                          (default is to process the data as it is received).
        @type no_search: bool
        @param full_utt: This block of data is an entire utterance.
                         Processing an entire utterance at once may improve
                         recognition, particularly for the first utterance passed
                         to the decoder.
        @type full_utt: bool
        @raise RuntimeError: If the decoder failed to process the data.
        """
        cdef Py_ssize_t nbytes
        cdef Py_ssize_t nsamp
        cdef char* strdata
        cdef raw_data_ptr cdata

        PyString_AsStringAndSize(data, &strdata, &nbytes)
        # NOTE(review): a pointer cast appears to have been lost on the
        # next line - confirm against the original source.
        cdata = strdata
        # The buffer length is in bytes, but the decoder counts 16-bit
        # samples: two bytes each.  Passing the byte count would make
        # it read past the end of the buffer.
        nsamp = nbytes // 2
        if ps_process_raw(self.ps, cdata, nsamp, no_search, full_utt) < 0:
            raise RuntimeError("Failed to process %d samples of audio data" % nsamp)

    def ps_end_utt(self):
        """
        Finish processing an utterance.

        @raise RuntimeError: If utterance processing could not be stopped.
        """
        if ps_end_utt(self.ps) < 0:
            raise RuntimeError("Failed to stop utterance processing")

    def end_utt(self):
        """
        Finish processing an utterance.

        Same as C{ps_end_utt()}, provided under a name that matches
        C{start_utt()}.

        @raise RuntimeError: If utterance processing could not be stopped.
        """
        return self.ps_end_utt()

    def get_hyp(self):
        """
        Get a hypothesis string.

        This function returns the text which has been recognized so
        far, or, if C{ps_end_utt()} has been called, the final
        recognition result.

        @return: Hypothesis string, utterance ID, recognition score.
                 The hypothesis is None and the score 0 if there is
                 no result.
        @rtype: (str, str, int)
        """
        cdef const_char_ptr hyp
        cdef const_char_ptr uttid
        cdef int score

        hyp = ps_get_hyp(self.ps, &score, &uttid)

        # No result
        if hyp == NULL:
            return None, uttid, 0

        return hyp, uttid, score

    def get_prob(self):
        """
        Get a posterior probability.

        Returns the posterior in linear scale.

        @return: posterior probability of the result
        @rtype: float
        """
        cdef logmath_t *lmath
        cdef const_char_ptr uttid

        lmath = ps_get_logmath(self.ps)
        return sb.logmath_exp(lmath, ps_get_prob(self.ps, &uttid))

    def get_lattice(self):
        """
        Get the word lattice.

        This function returns all hypotheses which have been
        considered so far, in the form of a word lattice.

        @return: Word lattice
        @rtype: Lattice
        @raise RuntimeError: If no lattice could be created.
        """
        cdef ps_lattice_t *dag
        cdef Lattice lat

        dag = ps_get_lattice(self.ps)
        if dag == NULL:
            raise RuntimeError("Failed to create word lattice")
        lat = Lattice()
        lat.set_dag(dag)
        return lat

    def get_lmset(self):
        """
        Get the language model set.

        This function returns the language model set, which allows you
        to obtain language model scores or switch language models.

        @return: Language model set
        @rtype: sphinxbase.NGramModel
        """
        cdef ngram_model_t *clm
        cdef logmath_t *lmath
        cdef cmd_ln_t *config
        cdef NGramModel lm

        clm = ps_get_lmset(self.ps)
        lm = NGramModel()
        lm.set_lm(clm)
        lmath = sb.logmath_retain(ps_get_logmath(self.ps))
        lm.set_lmath(lmath)
        config = ps_get_config(self.ps)

        # This is not necessarily true but it will have to do
        lm.lw = sb.cmd_ln_float32_r(config, "-lw")
        lm.wip = sb.cmd_ln_float32_r(config, "-wip")
        lm.uw = sb.cmd_ln_float32_r(config, "-uw")
        return lm

    def add_word(self, word, phones, update=True):
        """
        Add a word to the dictionary and current language model.

        @param word: Name of the word to add.
        @type word: str
        @param phones: Pronunciation of the word, a space-separated list of phones.
        @type phones: str
        @param update: Update the decoder to recognize this new word.
                       If adding a number of words at once you may wish to pass
                       C{False} here.
        @type update: bool
        @return: Return value of C{ps_add_word()}.
        @rtype: int
        """
        return ps_add_word(self.ps, word, phones, update)

    def load_dict(self, dictfile, fdictfile=None, format=None):
        """
        Load a new pronunciation dictionary.

        @param dictfile: Dictionary filename.
        @type dictfile: str
        @param fdictfile: Filler dictionary filename.
        @type fdictfile: str
        @param format: Dictionary format, currently unused.
        @type format: str
        @return: Return value of C{ps_load_dict()}.
        @rtype: int
        """
        cdef char *cfdictfile
        cdef char *cformat

        # None cannot be converted to a C string, so map the optional
        # arguments to NULL explicitly (as start_utt() does for uttid).
        if fdictfile is None:
            cfdictfile = NULL
        else:
            cfdictfile = fdictfile
        if format is None:
            cformat = NULL
        else:
            cformat = format
        return ps_load_dict(self.ps, dictfile, cfdictfile, cformat)

    def save_dict(self, dictfile, format=None):
        """
        Save current pronunciation dictionary to a file.

        @param dictfile: Dictionary filename.
        @type dictfile: str
        @param format: Dictionary format, currently unused.
        @type format: str
        @return: Return value of C{ps_save_dict()}.
        @rtype: int
        """
        cdef char *cformat

        # None cannot be converted to a C string; pass NULL instead.
        if format is None:
            cformat = NULL
        else:
            cformat = format
        return ps_save_dict(self.ps, dictfile, cformat)