1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 2007 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
36 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
46 #include <sphinxbase/strfuncs.h>
48 #include "gstpocketsphinx.h"
50 #include "psmarshal.h"
52 GST_DEBUG_CATEGORY_STATIC(pocketsphinx_debug);
53 #define GST_CAT_DEFAULT pocketsphinx_debug
56 * Forward declarations.
59 static void gst_pocketsphinx_set_property(GObject * object, guint prop_id,
60 const GValue * value, GParamSpec * pspec);
61 static void gst_pocketsphinx_get_property(GObject * object, guint prop_id,
62 GValue * value, GParamSpec * pspec);
63 static GstFlowReturn gst_pocketsphinx_chain(GstPad * pad, GstBuffer * buffer);
64 static gboolean gst_pocketsphinx_event(GstPad *pad, GstEvent *event);
68 SIGNAL_PARTIAL_RESULT,
98 /* Default command line. (will go away soon and be constructed using properties) */
/* Argument vector that gst_pocketsphinx_init() passes to cmd_ln_parse_r()
 * to build the element's initial decoder configuration. */
99 static char *default_argv[] = {
/* Number of entries in default_argv, derived from the array size at compile time. */
109 static const int default_argc = sizeof(default_argv)/sizeof(default_argv[0]);
/* Template for the "sink" pad. The caps fields visible here restrict input
 * to audio/x-raw-int that is signed, uses BYTE_ORDER endianness and has a
 * single channel. */
111 static GstStaticPadTemplate sink_factory =
112 GST_STATIC_PAD_TEMPLATE("sink",
115 GST_STATIC_CAPS("audio/x-raw-int, "
118 "signed = (boolean) true, "
119 "endianness = (int) BYTE_ORDER, "
120 "channels = (int) 1, "
/* Template for the "src" pad, which carries text/plain. */
124 static GstStaticPadTemplate src_factory =
125 GST_STATIC_PAD_TEMPLATE("src",
128 GST_STATIC_CAPS("text/plain")
/* Signal ids filled in by g_signal_new() in gst_pocketsphinx_class_init(),
 * indexed by the SIGNAL_* constants. */
130 static guint gst_pocketsphinx_signals[LAST_SIGNAL];
133 * Boxing of ps_lattice_t.
/* Return the GType under which ps_lattice_t is exposed as a GLib boxed type.
 * The type is registered on the first call and cached in a function-local
 * static; boxed copies go through ps_lattice_retain and boxed frees through
 * ps_lattice_free. */
137 ps_lattice_get_type(void)
139 static GType ps_lattice_type = 0;
141 if (G_UNLIKELY(ps_lattice_type == 0)) {
142 ps_lattice_type = g_boxed_type_register_static
144 /* Conveniently, these should just work. */
145 (GBoxedCopyFunc) ps_lattice_retain,
146 (GBoxedFreeFunc) ps_lattice_free);
149 return ps_lattice_type;
153 * Boxing of ps_decoder_t.
/* Return the GType under which ps_decoder_t is exposed as a GLib boxed type.
 * The type is registered on the first call and cached in a function-local
 * static; boxed copies go through ps_retain and boxed frees through ps_free. */
157 ps_decoder_get_type(void)
159 static GType ps_decoder_type = 0;
161 if (G_UNLIKELY(ps_decoder_type == 0)) {
162 ps_decoder_type = g_boxed_type_register_static
164 /* Conveniently, these should just work. */
165 (GBoxedCopyFunc) ps_retain,
166 (GBoxedFreeFunc) ps_free);
169 return ps_decoder_type;
174 * gst_pocketsphinx element.
176 GST_BOILERPLATE (GstPocketSphinx, gst_pocketsphinx, GstElement, GST_TYPE_ELEMENT);
/* Class base initializer: registers the sink and src pad templates on the
 * element class and attaches the static element details (description and
 * author strings). */
179 gst_pocketsphinx_base_init(gpointer gclass)
181 static const GstElementDetails element_details = {
184 "Convert speech to text",
185 "David Huggins-Daines <dhuggins@cs.cmu.edu>"
187 GstElementClass *element_class = GST_ELEMENT_CLASS(gclass);
189 gst_element_class_add_pad_template(element_class,
190 gst_static_pad_template_get(&sink_factory));
191 gst_element_class_add_pad_template(element_class,
192 gst_static_pad_template_get(&src_factory));
193 gst_element_class_set_details(element_class, &element_details);
/* Per-entry callback that gst_pocketsphinx_finalize() hands to
 * g_hash_table_foreach() when walking ps->arghash; it receives each entry's
 * key and value plus the (NULL) user_data. */
197 string_disposal(gpointer key, gpointer value, gpointer user_data)
/* GObject finalize handler for the element. Runs string_disposal over every
 * entry of the argument-string table, destroys the table, frees the cached
 * last_result string, and then chains up to the parent class's finalize. */
203 gst_pocketsphinx_finalize(GObject * gobject)
205 GstPocketSphinx *ps = GST_POCKETSPHINX(gobject);
207 g_hash_table_foreach(ps->arghash, string_disposal, NULL);
208 g_hash_table_destroy(ps->arghash);
209 g_free(ps->last_result);
211 GST_CALL_PARENT(G_OBJECT_CLASS, finalize,(gobject));
/* Class initializer. Hooks up the GObject property accessors and finalize
 * handler, installs one GObject property per supported decoder option,
 * creates the "partial_result" and "result" signals, and initializes the
 * element's debug category. */
215 gst_pocketsphinx_class_init(GstPocketSphinxClass * klass)
217 GObjectClass *gobject_class;
219 gobject_class =(GObjectClass *) klass;
221 gobject_class->set_property = gst_pocketsphinx_set_property;
222 gobject_class->get_property = gst_pocketsphinx_get_property;
223 gobject_class->finalize = GST_DEBUG_FUNCPTR(gst_pocketsphinx_finalize);
225 /* TODO: We will bridge cmd_ln.h properties to GObject
226 * properties here somehow eventually. */
227 g_object_class_install_property
228 (gobject_class, PROP_HMM_DIR,
229 g_param_spec_string("hmm", "HMM Directory",
230 "Directory containing acoustic model parameters",
233 g_object_class_install_property
234 (gobject_class, PROP_LM_FILE,
235 g_param_spec_string("lm", "LM File",
236 "Language model file",
239 g_object_class_install_property
240 (gobject_class, PROP_LMCTL_FILE,
241 g_param_spec_string("lmctl", "LM Control File",
242 "Language model control file (for class LMs)",
245 g_object_class_install_property
246 (gobject_class, PROP_LM_NAME,
247 g_param_spec_string("lmname", "LM Name",
248 "Language model name (to select LMs from lmctl)",
251 g_object_class_install_property
252 (gobject_class, PROP_FSG_FILE,
253 g_param_spec_string("fsg", "FSG File",
254 "Finite state grammar file",
257 g_object_class_install_property
258 (gobject_class, PROP_FSG_MODEL,
259 g_param_spec_pointer("fsg_model", "FSG Model",
260 "Finite state grammar object (fsg_model_t *)",
262 g_object_class_install_property
263 (gobject_class, PROP_DICT_FILE,
264 g_param_spec_string("dict", "Dictionary File",
269 g_object_class_install_property
270 (gobject_class, PROP_FWDFLAT,
271 g_param_spec_boolean("fwdflat", "Flat Lexicon Search",
272 "Enable Flat Lexicon Search",
275 g_object_class_install_property
276 (gobject_class, PROP_BESTPATH,
277 g_param_spec_boolean("bestpath", "Graph Search",
278 "Enable Graph Search",
282 g_object_class_install_property
283 (gobject_class, PROP_LATDIR,
284 g_param_spec_string("latdir", "Lattice Directory",
285 "Output Directory for Lattices",
288 g_object_class_install_property
289 (gobject_class, PROP_LATTICE,
290 g_param_spec_boxed("lattice", "Word Lattice",
291 "Word lattice object for most recent result",
295 g_object_class_install_property
296 (gobject_class, PROP_MAXHMMPF,
297 g_param_spec_int("maxhmmpf", "Maximum HMMs per frame",
298 "Maximum number of HMMs searched per frame",
301 g_object_class_install_property
302 (gobject_class, PROP_MAXWPF,
303 g_param_spec_int("maxwpf", "Maximum words per frame",
304 "Maximum number of words searched per frame",
307 g_object_class_install_property
308 (gobject_class, PROP_DSRATIO,
309 g_param_spec_int("dsratio", "Frame downsampling ratio",
310 "Evaluate acoustic model every N frames",
314 g_object_class_install_property
315 (gobject_class, PROP_DECODER,
316 g_param_spec_boxed("decoder", "Decoder object",
317 "The underlying decoder",
320 g_object_class_install_property
321 (gobject_class, PROP_CONFIGURED,
322 g_param_spec_boolean("configured", "Finalize configuration",
323 "Set this to finalize configuration",
/* Both signals carry two string arguments and use the
 * ps_marshal_VOID__STRING_STRING marshaller; the returned ids are stored in
 * gst_pocketsphinx_signals for later g_signal_emit() calls. */
327 gst_pocketsphinx_signals[SIGNAL_PARTIAL_RESULT] =
328 g_signal_new("partial_result",
329 G_TYPE_FROM_CLASS(klass),
331 G_STRUCT_OFFSET(GstPocketSphinxClass, partial_result),
333 ps_marshal_VOID__STRING_STRING,
335 2, G_TYPE_STRING, G_TYPE_STRING
338 gst_pocketsphinx_signals[SIGNAL_RESULT] =
339 g_signal_new("result",
340 G_TYPE_FROM_CLASS(klass),
342 G_STRUCT_OFFSET(GstPocketSphinxClass, result),
344 ps_marshal_VOID__STRING_STRING,
346 2, G_TYPE_STRING, G_TYPE_STRING
/* Register the debug category that GST_CAT_DEFAULT refers to in this file. */
349 GST_DEBUG_CATEGORY_INIT(pocketsphinx_debug, "pocketsphinx", 0,
350 "Automatic Speech Recognition");
354 gst_pocketsphinx_set_string(GstPocketSphinx *ps,
355 const gchar *key, const GValue *value)
357 gchar *oldstr, *newstr;
360 newstr = g_strdup(g_value_get_string(value));
363 if ((oldstr = g_hash_table_lookup(ps->arghash, key)))
365 cmd_ln_set_str_r(ps->config, key, newstr);
366 g_hash_table_foreach(ps->arghash, (gpointer)key, newstr);
370 gst_pocketsphinx_set_int(GstPocketSphinx *ps,
371 const gchar *key, const GValue *value)
373 cmd_ln_set_int32_r(ps->config, key, g_value_get_int(value));
377 gst_pocketsphinx_set_boolean(GstPocketSphinx *ps,
378 const gchar *key, const GValue *value)
380 cmd_ln_set_boolean_r(ps->config, key, g_value_get_boolean(value));
/* GObject set_property handler for the element. String, integer and boolean
 * options are written into ps->config through the gst_pocketsphinx_set_*
 * helpers; several branches additionally push the change into the decoder
 * (ps_reinit, ps_update_lmset, fsg_set_add/fsg_set_select). Unknown ids are
 * reported with G_OBJECT_WARN_INVALID_PROPERTY_ID. */
384 gst_pocketsphinx_set_property(GObject * object, guint prop_id,
385 const GValue * value, GParamSpec * pspec)
387 GstPocketSphinx *ps = GST_POCKETSPHINX(object);
390 case PROP_CONFIGURED:
392 ps_reinit(ps->ps, NULL);
394 ps->ps = ps_init(ps->config);
397 gst_pocketsphinx_set_string(ps, "-hmm", value);
399 /* Reinitialize the decoder with the new acoustic model. */
400 ps_reinit(ps->ps, NULL);
404 /* FSG and LM are mutually exclusive. */
405 gst_pocketsphinx_set_string(ps, "-fsg", NULL);
406 gst_pocketsphinx_set_string(ps, "-lmctl", NULL);
407 gst_pocketsphinx_set_string(ps, "-lm", value);
409 ngram_model_t *lm, *lmset;
411 /* Switch to this new LM. */
412 lm = ngram_model_read(ps->config,
413 g_value_get_string(value),
415 ps_get_logmath(ps->ps));
416 lmset = ps_get_lmset(ps->ps);
417 ngram_model_set_add(lmset, lm, g_value_get_string(value),
419 ps_update_lmset(ps->ps, lmset);
422 case PROP_LMCTL_FILE:
423 /* FSG and LM are mutually exclusive. */
424 gst_pocketsphinx_set_string(ps, "-fsg", NULL);
425 gst_pocketsphinx_set_string(ps, "-lmctl", value);
426 gst_pocketsphinx_set_string(ps, "-lm", NULL);
428 ngram_model_t *lmset;
429 lmset = ngram_model_set_read(ps->config,
430 g_value_get_string(value),
431 ps_get_logmath(ps->ps));
432 ps_update_lmset(ps->ps, lmset);
436 gst_pocketsphinx_set_string(ps, "-fsg", NULL);
437 gst_pocketsphinx_set_string(ps, "-lmname", value);
439 ngram_model_t *lm, *lmset;
441 lmset = ps_get_lmset(ps->ps);
442 lm = ngram_model_set_select(lmset, g_value_get_string(value));
443 ps_update_lmset(ps->ps, lmset);
447 gst_pocketsphinx_set_string(ps, "-dict", value);
449 /* Reinitialize the decoder with the new dictionary. */
450 ps_reinit(ps->ps, NULL);
455 fsg_set_t *fsgs = ps_get_fsgset(ps->ps);
456 fsg_model_t *fsg = g_value_get_pointer(value);
/* Drop any grammar already registered under this model's name, add the
 * supplied model, and make it the selected grammar. */
458 fsg_set_remove_byname(fsgs, fsg_model_name(fsg));
459 fsg_set_add(fsgs, fsg_model_name(fsg), fsg);
460 fsg_set_select(fsgs, fsg_model_name(fsg));
464 /* FSG and LM are mutually exclusive */
465 gst_pocketsphinx_set_string(ps, "-lm", NULL);
466 gst_pocketsphinx_set_string(ps, "-fsg", value);
469 /* Switch to this new FSG. */
470 fsg_set_t *fsgs = ps_get_fsgset(ps->ps);
473 fsg = fsg_model_readfile(g_value_get_string(value),
474 ps_get_logmath(ps->ps),
475 cmd_ln_float32_r(ps->config, "-lw"));
477 fsg_set_add(fsgs, fsg_model_name(fsg), fsg);
478 fsg_set_select(fsgs, fsg_model_name(fsg));
483 gst_pocketsphinx_set_boolean(ps, "-fwdflat", value);
486 gst_pocketsphinx_set_boolean(ps, "-bestpath", value);
/* The lattice directory is kept on the element itself (as a private copy of
 * the string) rather than in ps->config. */
491 ps->latdir = g_strdup(g_value_get_string(value));
494 gst_pocketsphinx_set_int(ps, "-maxhmmpf", value);
497 gst_pocketsphinx_set_int(ps, "-maxwpf", value);
500 gst_pocketsphinx_set_int(ps, "-ds", value);
503 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
/* GObject get_property handler for the element. Most values are read back
 * from ps->config with the cmd_ln_*_r accessors. The decoder handle (ps->ps)
 * and the current lattice are returned as boxed values, "configured" reports
 * whether a decoder exists, and the lattice directory comes from ps->latdir.
 * Unknown ids are reported with G_OBJECT_WARN_INVALID_PROPERTY_ID. */
509 gst_pocketsphinx_get_property(GObject * object, guint prop_id,
510 GValue * value, GParamSpec * pspec)
512 GstPocketSphinx *ps = GST_POCKETSPHINX(object);
516 g_value_set_boxed(value, ps->ps);
518 case PROP_CONFIGURED:
519 g_value_set_boolean(value, ps->ps != NULL);
522 g_value_set_string(value, cmd_ln_str_r(ps->config, "-hmm"));
525 g_value_set_string(value, cmd_ln_str_r(ps->config, "-lm"));
527 case PROP_LMCTL_FILE:
528 g_value_set_string(value, cmd_ln_str_r(ps->config, "-lmctl"));
531 g_value_set_string(value, cmd_ln_str_r(ps->config, "-lmname"));
534 g_value_set_string(value, cmd_ln_str_r(ps->config, "-dict"));
537 g_value_set_string(value, cmd_ln_str_r(ps->config, "-fsg"));
540 g_value_set_boolean(value, cmd_ln_boolean_r(ps->config, "-fwdflat"));
543 g_value_set_boolean(value, cmd_ln_boolean_r(ps->config, "-bestpath"));
546 g_value_set_string(value, ps->latdir);
/* A lattice is only reported when a decoder exists and it has one. */
551 if (ps->ps && (dag = ps_get_lattice(ps->ps)))
552 g_value_set_boxed(value, dag);
554 g_value_set_boxed(value, NULL);
558 g_value_set_int(value, cmd_ln_int32_r(ps->config, "-maxhmmpf"));
561 g_value_set_int(value, cmd_ln_int32_r(ps->config, "-maxwpf"));
564 g_value_set_int(value, cmd_ln_int32_r(ps->config, "-ds"));
567 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
/* Instance initializer. Creates pads from the sink and src templates,
 * allocates the string-keyed table used to track argument strings, parses
 * default_argv into ps->config, attaches both pads to the element (the sink
 * pad gets the chain and event handlers; both pads use fixed caps), and
 * resets the partial-result bookkeeping. */
573 gst_pocketsphinx_init(GstPocketSphinx * ps,
574 GstPocketSphinxClass * gclass)
577 gst_pad_new_from_static_template(&sink_factory, "sink");
579 gst_pad_new_from_static_template(&src_factory, "src");
581 /* Create the hash table to store argument strings. */
582 ps->arghash = g_hash_table_new(g_str_hash, g_str_equal);
584 /* Parse default command-line options. */
585 ps->config = cmd_ln_parse_r(NULL, ps_args(), default_argc, default_argv, FALSE);
588 gst_element_add_pad(GST_ELEMENT(ps), ps->sinkpad);
589 gst_pad_set_chain_function(ps->sinkpad, gst_pocketsphinx_chain);
590 gst_pad_set_event_function(ps->sinkpad, gst_pocketsphinx_event);
591 gst_pad_use_fixed_caps(ps->sinkpad);
593 gst_element_add_pad(GST_ELEMENT(ps), ps->srcpad);
594 gst_pad_use_fixed_caps(ps->srcpad);
596 /* Initialize time. */
597 ps->last_result_time = 0;
598 ps->last_result = NULL;
/* Chain function for the sink pad. Starts an utterance if the element is not
 * already listening, feeds the buffer's samples to the decoder, and from time
 * to time fetches the current hypothesis, emitting the partial_result signal
 * when its text has changed since the last one reported. */
602 gst_pocketsphinx_chain(GstPad * pad, GstBuffer * buffer)
606 ps = GST_POCKETSPHINX(GST_OBJECT_PARENT(pad));
608 /* Start an utterance for the first buffer we get (i.e. we assume
609 * that the VADER is "leaky") */
610 if (!ps->listening) {
611 ps->listening = TRUE;
612 ps_start_utt(ps->ps, NULL);
/* The buffer payload is handed over as 16-bit samples, so the sample count
 * is the byte size divided by sizeof(short). */
614 ps_process_raw(ps->ps,
615 (short *)GST_BUFFER_DATA(buffer),
616 GST_BUFFER_SIZE(buffer) / sizeof(short),
619 /* Get a partial result every now and then, see if it is different. */
/* NOTE(review): the comment below says 100 milliseconds, but 100*10*1000 is
 * 1,000,000; if buffer timestamps are in nanoseconds that is 1 ms, not
 * 100 ms -- confirm which interval is intended. */
620 if (ps->last_result_time == 0
621 /* Check every 100 milliseconds. */
622 || (GST_BUFFER_TIMESTAMP(buffer) - ps->last_result_time) > 100*10*1000) {
627 hyp = ps_get_hyp(ps->ps, &score, &uttid);
628 ps->last_result_time = GST_BUFFER_TIMESTAMP(buffer);
629 if (hyp && strlen(hyp) > 0) {
/* Only notify when the text differs from the last hypothesis reported; the
 * element keeps its own copy of that text in ps->last_result. */
630 if (ps->last_result == NULL || 0 != strcmp(ps->last_result, hyp)) {
631 g_free(ps->last_result);
632 ps->last_result = g_strdup(hyp);
633 /* Emit a signal for applications. */
634 g_signal_emit(ps, gst_pocketsphinx_signals[SIGNAL_PARTIAL_RESULT],
/* Event function for the sink pad. On NEWSEGMENT it creates the decoder from
 * ps->config if none exists yet and posts an element error when that fails.
 * On VADER_START it begins an utterance. On VADER_STOP it fetches the
 * hypothesis, can write the lattice to a file under ps->latdir, emits the
 * result signal and pushes the text downstream in a buffer. Every event is
 * passed on to gst_pad_event_default(). */
643 gst_pocketsphinx_event(GstPad *pad, GstEvent *event)
647 ps = GST_POCKETSPHINX(GST_OBJECT_PARENT(pad));
649 /* Pick out VAD events. */
650 switch (event->type) {
651 case GST_EVENT_NEWSEGMENT:
652 /* Initialize the decoder once the audio starts, if it's not
654 if (ps->ps == NULL) {
655 ps->ps = ps_init(ps->config);
656 if (ps->ps == NULL) {
657 GST_ELEMENT_ERROR(GST_ELEMENT(ps), LIBRARY, INIT,
658 ("Failed to initialize PocketSphinx"),
659 ("Failed to initialize PocketSphinx"));
663 return gst_pad_event_default(pad, event);
664 case GST_EVENT_VADER_START:
665 ps->listening = TRUE;
666 ps_start_utt(ps->ps, NULL);
667 /* Forward this event. */
668 return gst_pad_event_default(pad, event);
670 case GST_EVENT_VADER_STOP: {
678 ps->listening = FALSE;
680 hyp = ps_get_hyp(ps->ps, &score, &uttid);
681 /* Dump the lattice if requested. */
/* The lattice file name is "<latdir>/<uttid>.lat". */
683 char *latfile = string_join(ps->latdir, "/", uttid, ".lat", NULL);
686 if ((dag = ps_get_lattice(ps->ps)))
687 ps_lattice_write(dag, latfile);
692 /* Emit a signal for applications. */
693 g_signal_emit(ps, gst_pocketsphinx_signals[SIGNAL_RESULT],
695 /* Forward this result in a buffer. */
/* Two extra bytes: one for the trailing newline and one for the terminating
 * NUL written below. The buffer is stamped with the event's timestamp and
 * tagged with the src pad's caps before being pushed. */
696 buffer = gst_buffer_new_and_alloc(strlen(hyp) + 2);
697 strcpy((char *)GST_BUFFER_DATA(buffer), hyp);
698 GST_BUFFER_DATA(buffer)[strlen(hyp)] = '\n';
699 GST_BUFFER_DATA(buffer)[strlen(hyp)+1] = '\0';
700 GST_BUFFER_TIMESTAMP(buffer) = GST_EVENT_TIMESTAMP(event);
701 gst_buffer_set_caps(buffer, GST_PAD_CAPS(ps->srcpad));
702 gst_pad_push(ps->srcpad, buffer);
705 /* Forward this event. */
706 return gst_pad_event_default(pad, event);
709 /* Don't bother with other events. */
710 return gst_pad_event_default(pad, event);
/* Plugin entry point referenced by GST_PLUGIN_DEFINE. Registers the
 * "pocketsphinx" and "vader" element types with the plugin, both at
 * GST_RANK_NONE, checking the result of each registration. */
715 plugin_init(GstPlugin * plugin)
717 if (!gst_element_register(plugin, "pocketsphinx",
718 GST_RANK_NONE, GST_TYPE_POCKETSPHINX))
720 if (!gst_element_register(plugin, "vader",
721 GST_RANK_NONE, GST_TYPE_VADER))
/* Alias VERSION and PACKAGE to the PACKAGE_VERSION and PACKAGE_NAME macros;
 * VERSION is passed to GST_PLUGIN_DEFINE below. */
726 #define VERSION PACKAGE_VERSION
727 #define PACKAGE PACKAGE_NAME
/* Plugin descriptor: names plugin_init as the initialization function and
 * supplies the plugin's description, version, package name and origin URL. */
728 GST_PLUGIN_DEFINE(GST_VERSION_MAJOR,
731 "PocketSphinx plugin",
732 plugin_init, VERSION,
733 #if (GST_VERSION_MINOR == 10 && GST_VERSION_MICRO < 15) /* Nokia's bogus old GStreamer */
738 "PocketSphinx", "http://cmusphinx.sourceforge.net/")