python/pocketsphinx.pyx

   1 # Copyright (c) 2008 Carnegie Mellon University. All rights
   2 # reserved.
   3 #
   4 # You may copy, modify, and distribute this code under the same terms
   5 # as PocketSphinx or Python, at your convenience, as long as this
   6 # notice is not removed.
   7 #
   8 # Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
   9
  10 cdef class LatNode:
  11     """
  12     Node in a word lattice.
  13
  14     @ivar word: Word this node corresponds to (with pronunciation variant).
  15     @type word: str
  16     @ivar baseword: Base word (no pronunciation variant) this node corresponds to.
  17     @type baseword: str
  18     @ivar sf: Start frame for this node.
  19     @type sf: int
  20     @ivar fef: First ending frame for this node.
  21     @type fef: int
  22     @ivar lef: Last ending frame for this node.
  23     @type lef: int
  24     @ivar best_exit: Best scoring exit link from this node
  25     @type best_exit: LatLink
  26     @ivar prob: Posterior probability for this node.
  27     @type prob: float
  28     """
  29     def __cinit__(self):
  30         self.node = NULL
  31
  32     cdef set_node(LatNode self, ps_lattice_t *dag, ps_latnode_t *node):
  33         """
  34         Internal function - binds this to a PocketSphinx lattice node.
  35         """
  36         cdef short fef, lef
  37         cdef ps_latlink_t *best_exit
  38         self.dag = dag
  39         self.node = node
  40         self.word = ps_latnode_word(dag, node)
  41         self.baseword = ps_latnode_baseword(dag, node)
  42         self.sf = ps_latnode_times(node, &fef, &lef)
  43         self.fef = fef
  44         self.lef = lef
  45         self.best_exit = None
  46         best_exit = NULL
  47         self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
  48                                          ps_latnode_prob(dag, node, &best_exit))
  49         if best_exit != NULL:
  50             self.best_exit = LatLink()
  51             self.best_exit.set_link(dag, best_exit)
  52
  53     def exits(self):
  54         """
  55         Obtain an iterator over arcs exiting this node.
  56
  57         @return: Iterator over arcs exiting this node
  58         @rtype: LatLinkIterator
  59         """
  60         cdef LatLinkIterator itor
  61         cdef ps_latlink_iter_t *citor
  62
  63         citor = ps_latnode_exits(self.node)
  64         itor = LatLinkIterator()
  65         itor.itor = citor
  66         itor.dag = self.dag
  67         return itor
  68
  69     def entries(self):
  70         """
  71         Obtain an iterator over arcs entering this node.
  72
  73         @return: Iterator over arcs entering this node
  74         @rtype: LatLinkIterator
  75         """
  76         cdef LatLinkIterator itor
  77         cdef ps_latlink_iter_t *citor
  78
  79         citor = ps_latnode_entries(self.node)
  80         itor = LatLinkIterator()
  81         itor.itor = citor
  82         itor.dag = self.dag
  83         return itor
  84
  85 cdef class LatNodeIterator:
  86     """
  87     Iterator over word lattice nodes.
  88     """
  89     def __init__(self, start, end):
  90         self.itor = NULL
  91         self.first_node = True
  92         self.start = start
  93         self.end = end
  94
  95     def __iter__(self):
  96         return self
  97
  98     def __next__(self):
  99         """
 100         Advance iterator and return the next node.
 101
 102         @return: Next lattice node in this iterator.
 103         @rtype: LatNode
 104         """
 105         cdef LatNode node
 106         cdef int start
 107         cdef ps_latnode_t *cnode
 108
 109         # Make sure we keep raising exceptions at the end
 110         if self.itor == NULL:
 111             raise StopIteration
 112         # Advance the iterator if this isn't the first item
 113         if self.first_node:
 114             self.first_node = False
 115         else:
 116             self.itor = ps_latnode_iter_next(self.itor)
 117             if self.itor == NULL:
 118                 raise StopIteration
 119         # Look for the next node within the given time range
 120         cnode = ps_latnode_iter_node(self.itor)
 121         start = ps_latnode_times(cnode, NULL, NULL)
 122         while start < self.start or start >= self.end:
 123             self.itor = ps_latnode_iter_next(self.itor)
 124             if self.itor == NULL:
 125                 raise StopIteration
 126             cnode = ps_latnode_iter_node(self.itor)
 127             start = ps_latnode_times(cnode, NULL, NULL)
 128         node = LatNode()
 129         node.set_node(self.dag, cnode)
 130         return node
 131
 132 cdef class LatLink:
 133     """
 134     Link (edge) in a word lattice, connecting two nodes.
 135
 136     @ivar word: Word (with pronunciation variant) for this link.
 137     @type word: str
 138     @ivar baseword: Base word (no pronunciation variant) for this link.
 139     @type baseword: str
 140     @ivar sf: Start frame for this link.
 141     @type sf: int
 142     @ivar fef: Ending frame for this link.
 143     @type fef: int
 144     @ivar prob: Posterior probability for this link.
 145     @type prob: float
 146     """
 147     def __cinit__(self):
 148         self.link = NULL
 149
 150     cdef set_link(LatLink self, ps_lattice_t *dag, ps_latlink_t *link):
 151         """
 152         Internal function - binds this to a PocketSphinx lattice link.
 153         """
 154         cdef short sf
 155         self.dag = dag
 156         self.link = link
 157         self.word = ps_latlink_word(dag, link)
 158         self.baseword = ps_latlink_baseword(dag, link)
 159         self.ef = ps_latlink_times(link, &sf)
 160         self.sf = sf
 161         self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
 162                                          ps_latlink_prob(dag, link, NULL))
 163
 164     def nodes(self):
 165         """
 166         Get source and destination nodes for this link.
 167
 168         @return: Source and destination nodes for this link
 169         @rtype: (LatNode, LatNode)
 170         """
 171         cdef LatNode src, dest
 172         cdef ps_latnode_t *csrc, *cdest
 173
 174         cdest = ps_latlink_nodes(self.link, &csrc)
 175         src = LatNode()
 176         src.set_node(self.dag, csrc)
 177         dest = LatNode()
 178         dest.set_node(self.dag, cdest)
 179         return src, dest
 180
 181     def pred(self):
 182         """
 183         Get backpointer from this link.
 184
 185         @return: Backpointer from this link, set by bestpath search.
 186         @rtype: LatLink
 187         """
 188         cdef LatLink pred
 189         cdef ps_latlink_t *cpred
 190
 191         cpred = ps_latlink_pred(self.link)
 192         if cpred == NULL:
 193             return None
 194         pred = LatLink()
 195         pred.set_link(self.dag, cpred)
 196         return pred
 197
 198 cdef class LatLinkIterator:
 199     """
 200     Iterator over word lattice links.
 201     """
 202     def __cinit__(self):
 203         self.itor = NULL
 204         self.first_link = True
 205
 206     def __iter__(self):
 207         return self
 208
 209     def __next__(self):
 210         """
 211         Advance iterator and return the next link.
 212
 213         @return: Next lattice link in this iterator.
 214         @rtype: LatLink
 215         """
 216         cdef LatLink link
 217         if self.first_link:
 218             self.first_link = False
 219         else:
 220             self.itor = ps_latlink_iter_next(self.itor)
 221         if self.itor == NULL:
 222             raise StopIteration
 223         link = LatLink()
 224         link.set_link(self.dag, ps_latlink_iter_link(self.itor))
 225         return link
 226
 227 cdef class Lattice:
 228     """
 229     Word lattice.
 230
 231     The word lattice is a compact representation of the set of
 232     hypotheses considered by the decoder when recognizing an
 233     utterance.
 234
 235     A lattice object can be constructed either from a lattice file
 236     on disk or from a 'boxed' object passed in from GStreamer (or,
 237     in theory, anything else that uses GLib).  In the first case,
 238     the C{ps} argument is required.
 239
 240     @param ps: PocketSphinx decoder.
 241     @type ps: Decoder
 242     @param latfile: Filename of lattice file to read.
 243     @type latfile: str
 244     @param boxed: Boxed pointer from GStreamer containing a lattice
 245     @type boxed: PyGBoxed
 246
 247     @ivar n_frames: Number of frames of audio covered by this lattice
 248     @type n_frames: int
 249     @ivar start: Start node
 250     @type start: LatNode
 251     @ivar end: End node
 252     @type end: LatNode
 253     """
 254     def __init__(self, ps=None, latfile=None, boxed=None):
 255         self.dag = NULL
 256         if latfile:
 257             self.read_dag(ps, latfile)
 258         if boxed:
 259             self.set_boxed(boxed)
 260
 261     cdef read_dag(Lattice self, Decoder ps, latfile):
 262         if ps:
 263             self.dag = ps_lattice_read(ps.ps, latfile)
 264         else:
 265             self.dag = ps_lattice_read(NULL, latfile)
 266         self.n_frames = ps_lattice_n_frames(self.dag)
 267         if self.dag == NULL:
 268             raise RuntimeError, "Failed to read lattice from %s" % latfile
 269
 270     cdef set_dag(Lattice self, ps_lattice_t *dag):
 271         ps_lattice_retain(dag)
 272         ps_lattice_free(self.dag)
 273         self.dag = dag
 274         self.n_frames = ps_lattice_n_frames(dag)
 275
 276     cdef set_boxed(Lattice self, box):
 277         cdef ps_lattice_t *dag
 278         dag = <ps_lattice_t *>(<PyGBoxed *>box).boxed
 279         ps_lattice_retain(dag)
 280         ps_lattice_free(self.dag)
 281         self.dag = dag
 282         self.n_frames = ps_lattice_n_frames(self.dag)
 283
 284     def __dealloc__(self):
 285         ps_lattice_free(self.dag)
 286
 287     def bestpath(self, NGramModel lmset, float lwf, float ascale):
 288         """
 289         Find the best path through the lattice, optionally using a
 290         language model.
 291
 292         This function performs best-path search on the lattice, and
 293         returns the final link in the best path found.  The existing
 294         acoustic scores on the lattice links are used in conjunction
 295         with an optional language model.  A scaling factor can be
 296         applied to the acoustic scores to produce more useful
 297         posterior probabilities (in conjunction with C{posterior()},
 298         below).
 299
 300         @param lmset: Language model (set) to use for rescoring
 301         @type lmset: sphinxbase.NGramModel
 302         @param lwf: Weight to apply to language model scores (on top
 303         of any existing language model weight set in C{lmset}).
 304         @type lwf: float
 305         @param ascale: Weight to apply to acoustic model scores.
 306         @type ascale: float
 307         @return: Final link in best path.
 308         @rtype: LatLink
 309         """
 310         cdef ps_latlink_t *end
 311         cdef LatLink link
 312         end = ps_lattice_bestpath(self.dag, lmset.lm, lwf, ascale)
 313         link = LatLink()
 314         link.set_link(self.dag, end)
 315         return link
 316
 317     def posterior(self, NGramModel lmset, float ascale):
 318         """
 319         Calculate posterior probabilities of all links in a lattice.
 320
 321         This function performs the backward part of forward-backward
 322         calculation of posterior probabilities for all links in the
 323         lattice.  It assumes that C{bestpath()} has already been
 324         called on the lattice.
 325
 326         @param lmset: Language model (set) to use for rescoring
 327         @type lmset: sphinxbase.NGramModel
 328         @param ascale: Weight to apply to acoustic model scores.
 329         @type ascale: float
 330         @return: Log-probability of the lattice as a whole.
 331         @rtype: float
 332         """
 333         cdef logmath_t *lmath
 334         lmath = ps_lattice_get_logmath(self.dag)
 335         return sb.logmath_log_to_ln(lmath,
 336                                     ps_lattice_posterior(self.dag, lmset.lm, ascale))
 337
 338     def nodes(self, start=0, end=-1):
 339         """
 340         Get an iterator over all nodes in the lattice.
 341
 342         @param start: First frame to iterate over.
 343         @type start: int
 344         @param end: Last frame to iterate over, or -1 for all remaining
 345         @type end: int
 346         @return: Iterator over nodes.
 347         @rtype: LatNodeIterator
 348         """
 349         cdef LatNodeIterator itor
 350
 351         if end == -1:
 352             end = ps_lattice_n_frames(self.dag)
 353         itor = LatNodeIterator(start, end)
 354         itor.dag = self.dag
 355         itor.itor = ps_latnode_iter(self.dag)
 356         return itor
 357
 358     def write(self, outfile):
 359         """
 360         Write the lattice to an output file.
 361
 362         @param outfile: Name of file to write to.
 363         @type outfile: str
 364         """
 365         cdef int rv
 366
 367         rv = ps_lattice_write(self.dag, outfile)
 368         if rv < 0:
 369             raise RuntimeError, "Failed to write lattice to %s" % outfile
 370
 371 cdef class Decoder:
 372     """
 373     PocketSphinx speech decoder.
 374
 375     To initialize the PocketSphinx decoder, pass a list of keyword
 376     arguments to the constructor::
 377
 378      d = pocketsphinx.Decoder(hmm='/path/to/acoustic/model',
 379                               lm='/path/to/language/model',
 380                               dict='/path/to/dictionary',
 381                               beam='1e-80')
 382
 383     If no arguments are passed, the default acoustic and language
 384     models will be loaded, which may be acceptable for general English
 385     speech.  Any arguments supported by the PocketSphinx decoder are
 386     allowed here.  Only the most frequent ones are described below.
 387
 388     @param boxed: Boxed pointer from GStreamer containing a decoder
 389     @type boxed: PyGBoxed
 390     @param hmm: Path to acoustic model directory
 391     @type hmm: str
 392     @param dict: Path to dictionary file
 393     @type dict: str
 394     @param lm: Path to language model file
 395     @type lm: str
 396     @param jsgf: Path to JSGF grammar file
  397     @type jsgf: str
 398     """
 399     def __init__(self, **kwargs):
 400         cdef cmd_ln_t *config
 401         cdef int i
 402
 403         # Construct from an existing GObject pointer if given
 404         if 'boxed' in kwargs:
 405             self.argc = 0
 406             self.set_boxed(kwargs['boxed'])
 407             return
 408
 409         # A much more concise version of what pocketsphinx_parse_argdict used to do
 410         self.argc = len(kwargs) * 2
 411         self.argv = <char **>sb.ckd_calloc(self.argc, sizeof(char *))
 412         i = 0
 413         for k, v in kwargs.iteritems():
 414             if k[0] != '-':
 415                 k = '-' + k
 416             self.argv[i] = sb.ckd_salloc(k)
 417             self.argv[i+1] = sb.ckd_salloc(v)
 418             i = i + 2
 419         config = sb.cmd_ln_parse_r(NULL, ps_args(), self.argc, self.argv, 0)
 420         if config == NULL:
 421             raise RuntimeError, "Failed to parse argument list"
 422         self.ps = ps_init(config)
 423         if self.ps == NULL:
 424             raise RuntimeError, "Failed to initialize PocketSphinx"
 425
 426     cdef set_boxed(Decoder self, box):
 427         cdef ps_decoder_t *ps
 428         ps = <ps_decoder_t *>(<PyGBoxed *>box).boxed
 429         ps_retain(ps)
 430         ps_free(self.ps)
 431         self.ps = ps
 432
 433     def __dealloc__(self):
 434         ps_free(self.ps)
 435         for i from 0 <= i < self.argc:
 436             sb.ckd_free(self.argv[i])
 437         sb.ckd_free(self.argv)
 438         self.argv = NULL
 439         self.argc = 0
 440
 441     def decode_raw(self, fh, uttid=None, maxsamps=-1):
 442         """
 443         Decode raw audio from a file.
 444
 445         @param fh: Filehandle to read audio from.
 446         @type fh: file
 447         @param uttid: Identifier to give to this utterance.
 448         @type uttid: str
 449         @param maxsamps: Maximum number of samples to read.  If not
 450         specified or -1, the rest of the file will be read.
 451         @type maxsamps: int
 452         """
 453         cdef FILE *cfh
 454         cdef int nsamp
 455         cdef char *cuttid
 456
 457         cfh = PyFile_AsFile(fh)
 458         if uttid == None:
 459             cuttid = NULL
 460         else:
 461             cuttid = uttid
 462         return ps_decode_raw(self.ps, cfh, cuttid, maxsamps)
 463
 464     def decode_senscr(self, fh, uttid=None):
 465         """
 466         Decode senone scores from a file.
 467
 468         @param fh: Filehandle to read senone scores from.
 469         @type fh: file
 470         @param uttid: Identifier to give to this utterance.
 471         @type uttid: str
 472         """
 473         cdef FILE *cfh
 474         cdef char *cuttid
 475
 476         cfh = PyFile_AsFile(fh)
 477         if uttid == None:
 478             cuttid = NULL
 479         else:
 480             cuttid = uttid
 481         return ps_decode_senscr(self.ps, cfh, cuttid)
 482
 483     def start_utt(self, uttid=None):
 484         """
 485         Prepare the decoder to recognize an utterance.
 486
 487         @param uttid: Identifier to give to this utterance.
 488         @type uttid: str
 489         """
 490         cdef char *cuttid
 491
 492         if uttid == None:
 493             cuttid = NULL
 494         else:
 495             cuttid = uttid
 496         if ps_start_utt(self.ps, cuttid) < 0:
 497             raise RuntimeError, "Failed to start utterance processing"
 498
 499     def process_raw(self, data, no_search=False, full_utt=False):
 500         """
 501         Process (decode) some audio data.
 502
 503         @param data: Audio data to process.  This is packed binary
 504         data, which consists of single-channel, 16-bit PCM audio, at
 505         the sample rate specified when the decoder was initialized.
 506         @type data: str
 507         @param no_search: Buffer the data without actually processing it (default is to process the
 508         data as it is received).
 509         @type no_search: bool
 510         @param full_utt: This block of data is an entire utterance.
 511         Processing an entire utterance at once may improve
 512         recognition, particularly for the first utterance passed to
 513         the decoder.
 514         @type full_utt: bool
 515         """
 516         cdef Py_ssize_t len
 517         cdef char* strdata
 518         cdef raw_data_ptr cdata
 519
 520         PyString_AsStringAndSize(data, &strdata, &len)
 521         cdata = strdata
 522         if ps_process_raw(self.ps, cdata, len, no_search, full_utt) < 0:
 523             raise RuntimeError, "Failed to process %d samples of audio data" % len
 524
 525     def ps_end_utt(self):
 526         """
 527         Finish processing an utterance.
 528         """
 529         if ps_end_utt(self.ps) < 0:
 530             raise RuntimeError, "Failed to stop utterance processing"
 531
 532     def get_hyp(self):
 533         """
 534         Get a hypothesis string.
 535
 536         This function returns the text which has been recognized so
 537         far, or, if C{ps_end_utt()} has been called, the final
 538         recognition result.
 539
 540         @return: Hypothesis string, utterance ID, recognition score
 541         @rtype: (str, str, int)
 542         """
 543         cdef const_char_ptr hyp
 544         cdef const_char_ptr uttid
 545         cdef int score
 546
 547         hyp = ps_get_hyp(self.ps, &score, &uttid)
 548
 549         # No result
 550         if hyp == NULL:
 551              return None, uttid, 0
 552
 553         return hyp, uttid, score
 554
 555     def get_prob(self):
 556         """
 557         Get a posterior probability.
 558
 559         Returns the posterior in linear scale.
 560
 561         @return: posterior probability of the result
 562         @rtype: float
 563         """
 564         cdef logmath_t *lmath
 565         cdef const_char_ptr uttid
 566         lmath = ps_get_logmath(self.ps)
 567         return sb.logmath_exp(lmath, ps_get_prob(self.ps, &uttid))
 568
 569     def get_lattice(self):
 570         """
 571         Get the word lattice.
 572
 573         This function returns all hypotheses which have been
 574         considered so far, in the form of a word lattice.
 575
 576         @return: Word lattice
 577         @rtype: Lattice
 578         """
 579         cdef ps_lattice_t *dag
 580         cdef Lattice lat
 581
 582         dag = ps_get_lattice(self.ps)
 583         if dag == NULL:
 584             raise RuntimeError, "Failed to create word lattice"
 585         lat = Lattice()
 586         lat.set_dag(dag)
 587         return lat
 588
 589     def get_lmset(self):
 590         """
 591         Get the language model set.
 592
 593         This function returns the language model set, which allows you
 594         to obtain language model scores or switch language models.
 595
 596         @return: Language model set
 597         @rtype: sphinxbase.NGramModel
 598         """
 599         cdef ngram_model_t *clm
 600         cdef logmath_t *lmath
 601         cdef cmd_ln_t *config
 602         cdef NGramModel lm
 603
 604         clm = ps_get_lmset(self.ps)
 605         lm = NGramModel()
 606         lm.set_lm(clm)
 607         lmath = sb.logmath_retain(ps_get_logmath(self.ps))
 608         lm.set_lmath(lmath)
 609         config = ps_get_config(self.ps)
 610         # This is not necessarily true but it will have to do
 611         lm.lw = sb.cmd_ln_float32_r(config, "-lw")
 612         lm.wip = sb.cmd_ln_float32_r(config, "-wip")
 613         lm.uw = sb.cmd_ln_float32_r(config, "-uw")
 614         return lm
 615
 616     def add_word(self, word, phones, update=True):
 617         """
 618         Add a word to the dictionary and current language model.
 619
 620         @param word: Name of the word to add.
 621         @type word: str
 622         @param phones: Pronunciation of the word, a space-separated list of phones.
 623         @type phones: str
 624         @param update: Update the decoder to recognize this new word.
 625         If adding a number of words at once you may wish to pass
 626         C{False} here.
 627         @type update: bool
 628         """
 629         return ps_add_word(self.ps, word, phones, update)
 630
 631     def load_dict(self, dictfile, fdictfile=None, format=None):
 632         """
 633         Load a new pronunciation dictionary.
 634
 635         @param dictfile: Dictionary filename.
 636         @type dictfile: str
 637         @param fdictfile: Filler dictionary filename.
 638         @type fdictfile: str
 639         @param format: Dictionary format, currently unused.
 640         @type format: str
 641         """
 642         return ps_load_dict(self.ps, dictfile, fdictfile, format)
 643
 644     def save_dict(self, dictfile, format=None):
 645         """
 646         Save current pronunciation dictionary to a file.
 647
 648         @param dictfile: Dictionary filename.
 649         @type dictfile: str
 650         @param format: Dictionary format, currently unused.
 651         @type format: str
 652         """
 653         return ps_save_dict(self.ps, dictfile, format)