1 /* ====================================================================
2 * Copyright (c) 1999-2010 Carnegie Mellon University. All rights
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
17 * This work was supported in part by funding from the Defense Advanced
18 * Research Projects Agency and the National Science Foundation of the
19 * United States of America, and the CMU Sphinx Speech Consortium.
21 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * ====================================================================
37 * Modified version of the "cutter" element to do better at VAD.
45 #include <gst/audio/audio.h>
/* File-local debug category; GST_CAT_DEFAULT selects it as the default
 * category for this file.  It is registered under the name "vader" by the
 * GST_DEBUG_CATEGORY_INIT() call further down. */
49 GST_DEBUG_CATEGORY_STATIC(vader_debug);
50 #define GST_CAT_DEFAULT vader_debug
/* Descriptive strings for the element (name, classification, description,
 * authors).  Registered on the element class by
 * gst_element_class_set_details() in gst_vader_base_init(). */
52 static const GstElementDetails vader_details =
53 GST_ELEMENT_DETAILS("VAD element",
54 "Filter/Editor/Audio",
55 "Voice Activity DEtectoR to split audio into non-silent bits",
56 "Thomas <thomas@apestaart.org>, David Huggins-Daines <dhuggins@cs.cmu.edu>");
/* Template for the "src" pad.  The caps listed here are signed
 * audio/x-raw-int in native byte order, with any rate and any channel
 * count of at least 1. */
58 static GstStaticPadTemplate vader_src_factory =
59 GST_STATIC_PAD_TEMPLATE("src",
62 GST_STATIC_CAPS("audio/x-raw-int, "
63 "rate = (int) [ 1, MAX ], "
64 "channels = (int) [ 1, MAX ], "
65 "endianness = (int) BYTE_ORDER, "
68 "signed = (boolean) true")
/* Template for the "sink" pad: signed audio/x-raw-int in native byte
 * order, restricted to a single channel. */
71 static GstStaticPadTemplate vader_sink_factory =
72 GST_STATIC_PAD_TEMPLATE("sink",
75 GST_STATIC_CAPS("audio/x-raw-int, "
76 /* FIXME: Actually we want this to be negotiable... */
78 "channels = (int) 1, "
79 "endianness = (int) BYTE_ORDER, "
82 "signed = (boolean) true")
/* Signal ids, indexed by the SIGNAL_* constants; the entries are filled
 * in where the signals are created with g_signal_new(). */
92 static guint gst_vader_signals[LAST_SIGNAL];
/* Type boilerplate for GstVader with GstElement as the parent type. */
105 GST_BOILERPLATE(GstVader, gst_vader, GstElement, GST_TYPE_ELEMENT);
/* Forward declarations of the GObject property/finalize handlers and of
 * the sink-pad chain function, all defined later in this file. */
107 static void gst_vader_set_property(GObject * object, guint prop_id,
108 const GValue * value, GParamSpec * pspec);
109 static void gst_vader_get_property(GObject * object, guint prop_id,
110 GValue * value, GParamSpec * pspec);
111 static void gst_vader_finalize(GObject *gobject);
113 static GstFlowReturn gst_vader_chain(GstPad * pad, GstBuffer * buffer);
/*
 * Base class initialiser.
 *
 * g_class: the class structure, used here as a GstElementClass.
 *
 * Adds the src and sink pad templates to the element class and attaches
 * the element details (vader_details).
 */
116 gst_vader_base_init(gpointer g_class)
118 GstElementClass *element_class = GST_ELEMENT_CLASS(g_class);
120 gst_element_class_add_pad_template(element_class,
121 gst_static_pad_template_get(&vader_src_factory));
122 gst_element_class_add_pad_template(element_class,
123 gst_static_pad_template_get(&vader_sink_factory));
124 gst_element_class_set_details(element_class, &vader_details);
/*
 * Class initialiser.
 *
 * klass: the GstVader class structure being set up.
 *
 * Hooks up the property accessors and the finalize handler, installs the
 * element's properties (threshold, auto-threshold, run-length, pre-length,
 * silent, dump-dir), creates the "vader_start" and "vader_stop" signals
 * and initialises the "vader" debug category.
 */
128 gst_vader_class_init(GstVaderClass * klass)
130 GObjectClass *gobject_class;
131 GstElementClass *gstelement_class;
133 gobject_class = (GObjectClass *) klass;
134 gstelement_class = (GstElementClass *) klass;
136 gobject_class->set_property = gst_vader_set_property;
137 gobject_class->get_property = gst_vader_get_property;
138 gobject_class->finalize = gst_vader_finalize;
/* "threshold" is exposed as a double; gst_vader_set_property() and
 * gst_vader_get_property() convert it to and from the integer
 * threshold_level by a factor of 32768. */
140 g_object_class_install_property
141 (G_OBJECT_CLASS(klass), PROP_THRESHOLD,
142 g_param_spec_double("threshold", "Threshold",
143 "Volume threshold for speech/silence decision",
144 -1.0, 1.0, 256.0/32768.0, G_PARAM_READWRITE));
145 g_object_class_install_property
146 (G_OBJECT_CLASS(klass), PROP_AUTO_THRESHOLD,
147 g_param_spec_boolean("auto-threshold", "Automatic Threshold",
148 "Set speech/silence threshold automatically",
149 FALSE, G_PARAM_READWRITE));
/* Both length properties are in nanoseconds and default to half a second. */
150 g_object_class_install_property
151 (G_OBJECT_CLASS(klass), PROP_RUN_LENGTH,
152 g_param_spec_uint64("run-length", "Run length",
153 "Length of drop below threshold before cut_stop (in nanoseconds)",
154 0, G_MAXUINT64, (guint64)(0.5 * GST_SECOND), G_PARAM_READWRITE));
155 g_object_class_install_property
156 (G_OBJECT_CLASS(klass), PROP_PRE_LENGTH,
157 g_param_spec_uint64("pre-length", "Pre-recording buffer length",
158 "Length of pre-recording buffer (in nanoseconds)",
159 0, G_MAXUINT64, (guint64)(0.5 * GST_SECOND), G_PARAM_READWRITE));
/* "silent" is writable: setting it forces a transition, see
 * gst_vader_set_property(). */
160 g_object_class_install_property
161 (G_OBJECT_CLASS(klass), PROP_SILENT,
162 g_param_spec_boolean("silent", "Silent",
163 "Whether the VADER is currently in a silence region",
164 TRUE, G_PARAM_READWRITE));
165 g_object_class_install_property
166 (gobject_class, PROP_DUMPDIR,
167 g_param_spec_string("dump-dir", "Audio dump directory",
168 "Directory in which to write audio segments for debugging",
/* Signals; both use the VOID__INT64 marshaller and are emitted from
 * gst_vader_transition() with a timestamp argument. */
172 gst_vader_signals[SIGNAL_VADER_START] =
173 g_signal_new("vader_start",
174 G_TYPE_FROM_CLASS(klass),
176 G_STRUCT_OFFSET(GstVaderClass, vader_start),
178 gst_marshal_VOID__INT64,
183 gst_vader_signals[SIGNAL_VADER_STOP] =
184 g_signal_new("vader_stop",
185 G_TYPE_FROM_CLASS(klass),
187 G_STRUCT_OFFSET(GstVaderClass, vader_stop),
189 gst_marshal_VOID__INT64,
194 GST_DEBUG_CATEGORY_INIT(vader_debug, "vader", 0, "Voice Activity Detection");
/*
 * Instance initialiser.
 *
 * filter:  the new GstVader instance.
 * g_class: its class structure (not referenced in the lines below).
 *
 * Creates the sink and src pads from their static templates, initialises
 * the recursive mutex, sets every detector field to its default and adds
 * both pads to the element.  The element starts out in the silent state
 * with an empty pre-recording buffer.
 */
198 gst_vader_init(GstVader * filter, GstVaderClass * g_class)
201 gst_pad_new_from_static_template(&vader_sink_factory, "sink");
203 gst_pad_new_from_static_template(&vader_src_factory, "src");
205 g_static_rec_mutex_init(&filter->mtx);
/* 256 matches the "threshold" property default of 256.0/32768.0 after the
 * scaling by 32768 done in gst_vader_set_property(). */
207 filter->threshold_level = 256;
208 filter->threshold_length = (guint64)(0.5 * GST_SECOND);
209 filter->prior_sample = 0;
210 filter->auto_threshold = FALSE;
211 filter->silence_mean = 0;
212 filter->silence_stddev = 0;
213 filter->silence_frames = 0;
214 filter->dumpdir = NULL;
215 filter->dumpfile = NULL;
/* Clear the voting window used by gst_vader_chain(). */
218 memset(filter->window, 0, VADER_WINDOW * sizeof(*filter->window));
219 filter->silent = TRUE;
220 filter->silent_prev = TRUE;
221 filter->silent_run_length = 0;
223 filter->pre_buffer = NULL;
224 filter->pre_length = (guint64)(0.5 * GST_SECOND);
225 filter->pre_run_length = 0;
/* Only the sink pad gets a chain function; both pads use fixed caps. */
227 gst_element_add_pad(GST_ELEMENT(filter), filter->sinkpad);
228 gst_pad_set_chain_function(filter->sinkpad, gst_vader_chain);
229 gst_pad_use_fixed_caps(filter->sinkpad);
231 gst_element_add_pad(GST_ELEMENT(filter), filter->srcpad);
232 gst_pad_use_fixed_caps(filter->srcpad);
/*
 * GObject finalize handler.
 *
 * gobject: the GstVader instance being destroyed.
 *
 * Frees the recursive mutex, closes the audio dump file, frees the dump
 * directory string and then chains up to the parent class's finalize.
 *
 * NOTE(review): dumpfile is NULL unless a dump is in progress, and
 * fclose(NULL) is undefined -- confirm that the fclose() call below is
 * guarded by a NULL check.
 */
236 gst_vader_finalize(GObject *gobject)
238 GstVader *vader = GST_VADER(gobject);
240 g_static_rec_mutex_free(&vader->mtx);
242 fclose(vader->dumpfile);
244 g_free(vader->dumpdir);
245 GST_CALL_PARENT(G_OBJECT_CLASS, finalize, (gobject));
/*
 * Build the element message posted on a silence/speech transition.
 *
 * c:         the element, used as the message source.
 * above:     stored in the structure's boolean "above" field; callers pass
 *            TRUE for a silence-to-sound transition and FALSE for
 *            sound-to-silence.
 * timestamp: stored in the structure's "timestamp" field.
 *
 * Returns a new element message wrapping a structure named "vader".
 */
249 gst_vader_message_new(GstVader * c, gboolean above, GstClockTime timestamp)
253 s = gst_structure_new("vader",
254 "above", G_TYPE_BOOLEAN, above,
255 "timestamp", GST_TYPE_CLOCK_TIME, timestamp, NULL);
257 return gst_message_new_element(GST_OBJECT(c), s);
/*
 * Build a custom in-stream event marking the start or end of a cut.
 *
 * c:         the element (not referenced in the lines below).
 * type:      event type; callers pass GST_EVENT_VADER_START or
 *            GST_EVENT_VADER_STOP.
 * timestamp: written into the event's timestamp field.
 *
 * The event is created without an attached structure.
 */
261 gst_vader_event_new(GstVader *c, GstEventType type, GstClockTime timestamp)
265 e = gst_event_new_custom(type, NULL);
266 GST_EVENT_TIMESTAMP(e) = timestamp;
/*
 * Compute the mean power of a frame of 16-bit samples after first-order
 * pre-emphasis, normalised to Q15.
 *
 * in_data:     the samples of the frame.
 * num_samples: number of samples; used as a divisor, so it must be
 *              non-zero.
 * inout_prior: in/out; the sample preceding the frame.  It is read before
 *              the loop and written back afterwards, so the pre-emphasis
 *              carries across consecutive calls.
 *
 * Squared values are accumulated right-shifted by `shift`, and the
 * accumulator is rescaled whenever it exceeds 0x10000 so that it cannot
 * overflow.  The two return statements undo that shift relative to 15 in
 * either direction (presumably selected by comparing shift with 15 --
 * TODO confirm).
 */
272 compute_normed_power(gint16 *in_data, guint num_samples, gint *inout_prior)
274 guint i, shift, sumsq, prior;
278 prior = *inout_prior;
279 for (i = 0; i < num_samples; ++i) {
283 /* Do pre-emphasis to remove low-frequency noise (this should
284 * be sufficient, although ideally we'd band-pass filter the
285 * data from about 200 to 6000Hz) */
286 x = in_data[i] - prior;
289 sumsq += (sq >> shift);
290 /* Prevent overflows. */
291 while (sumsq > 0x10000) {
296 *inout_prior = prior;
298 /* Normalize it to Q15 (this is equivalent to dividing by (1<<30)
299 * then multiplying by (1<<15)). */
301 return (sumsq << (shift - 15)) / num_samples;
303 return (sumsq / num_samples) >> (15 - shift);
307 * Calculate Q15 square root z of a Q15 number x.
309 * This is equal to 32768 \sqrt(\frac{x}{32768}), which is equal to
310 * 2^{7.5} x^{0.5}, so:
313 * z = 2^{7.5 + 0.5y} = 2^{7.5 + 0.5y_{odd}} 2^{0.5y_{remainder}}
314 * = 2^{7.5 + 0.5y_{odd}} + 2^{7.5 + 0.5y_{odd}} (2^{0.5y_{remainder}} - 1)
316 * Therefore the factor 2^{0.5y_{remainder}} - 1 can be stored in a
317 * table. Since 0 <= y_{remainder} < 2, this table has size 2^N -
318 * 2^{N-2} for some value of N (7 is a pretty good one...)
/* Lookup table of REMTAB_SIZE fractional correction factors used by
 * fixpoint_sqrt_q15().  Entries are applied as Q15 fractions: the result
 * there is z + ((z * remtab[idx]) >> 15). */
320 #define REMTAB_SIZE 96
321 static const guint16 remtab[REMTAB_SIZE] = {
322 0, 508, 1008, 1501, 1987, 2467, 2940, 3406, 3867, 4322, 4772, 5216, 5655, 6090, 6519, 6944, 7364, 7780, 8191, 8599, 9003, 9402, 9798, 10191, 10579, 10965, 11347, 11725, 12101, 12473, 12843, 13209, 13572, 13933, 14291, 14646, 14999, 15349, 15696, 16041, 16384, 16724, 17061, 17397, 17730, 18062, 18391, 18717, 19042, 19365, 19686, 20005, 20322, 20637, 20950, 21261, 21571, 21879, 22185, 22490, 22792, 23093, 23393, 23691, 23987, 24282, 24576, 24867, 25158, 25447, 25734, 26020, 26305, 26588, 26870, 27151, 27430, 27708, 27985, 28261, 28535, 28808, 29080, 29350, 29620, 29888, 30155, 30422, 30686, 30950, 31213, 31475, 31735, 31995, 32253, 32511
/*
 * Table-driven Q15 square root.
 *
 * x: the Q15 input value.
 *
 * Small inputs are special-cased (an input of 1 yields 181).  Otherwise
 * the routine takes an approximate log2 of x, adjusts it to an odd value,
 * and uses the remaining part of x, rescaled to the range of remtab, to
 * pick a correction factor.  The result is the power-of-two base
 * z = 1 << (8 + log2 / 2) plus z times that Q15 correction.
 */
325 fixpoint_sqrt_q15(guint x)
328 int log2, scale, idx;
330 /* 0 and one are special cases since they have no closest odd
335 return 181; /* 32768 * sqrt(1.0/32768) */
337 /* Compute nearest log2. */
338 for (log2 = 31; log2 > 0; --log2)
341 /* Find nearest odd log2. */
344 /* Find index into remtab. */
346 /* Scale it to fit remtab. */
347 scale = (1<<(log2 + 2)) - (1<<log2);
348 idx = idx * REMTAB_SIZE / scale;
349 /* Base of square root. */
350 z = 1<<(8 + log2 / 2);
352 return z + ((z * remtab[idx]) >> 15);
356 * Very approximate fixed-point square root (for big numbers only!)
358 * Really simple, sqrt(x) = 2^{\frac{\log_2 x}{2}}. So approximate
359 * \log_2 x, then divide it by two, and exponentiate :)
/*
 * Coarse integer square root: returns a power of two.
 *
 * x: the value to take the root of.
 *
 * An approximate log2 of x is found by scanning downwards from 31, and
 * the result is 1 << (log2 / 2 + 1).  gst_vader_chain() uses this for the
 * silence standard deviation during auto-threshold calibration.
 */
362 fixpoint_bogus_sqrt(guint x)
366 /* Compute nearest log2. */
367 for (log2 = 31; log2 > 0; --log2)
370 /* Return "square root" */
371 return 1<<(log2/2+1);
/*
 * Announce a change of the silent/non-silent state.
 *
 * filter: the element; filter->silent already holds the NEW state.
 * ts:     timestamp to report for the transition.
 *
 * Sound -> silence (filter->silent is TRUE): posts a "vader" message with
 * above = FALSE, pushes a GST_EVENT_VADER_STOP event on the src pad, emits
 * the vader_stop signal and closes the audio dump file if one is open.
 *
 * Silence -> sound: replaces ts with the timestamp of the first buffer in
 * pre_buffer when there is one, emits the vader_start signal, posts a
 * "vader" message with above = TRUE, pushes a GST_EVENT_VADER_START event
 * and, if dumpdir is set, opens a new "<dumpdir>/<number>.raw" dump file.
 * It then drains pre_buffer: each queued buffer is written to the dump
 * file (when open) and pushed on the src pad.  pre_run_length is reset to
 * zero afterwards.
 *
 * Locking: filter->mtx must be held on entry.  It is released around every
 * gst_pad_push_event() / gst_pad_push() call and re-acquired afterwards.
 * NOTE(review): other threads that take mtx (e.g. the property setters)
 * can therefore run in the middle of a transition -- confirm this is safe.
 * NOTE(review): the fopen() result is stored unchecked; later writes are
 * guarded by `if (filter->dumpfile)`, so a failed open appears to disable
 * dumping silently -- confirm, and confirm that `filename` is freed.
 */
375 gst_vader_transition(GstVader *filter, GstClockTime ts)
377 /* NOTE: This function MUST be called with filter->mtx held! */
378 /* has the silent status changed ? if so, send right signal
379 * and, if from silent -> not silent, flush pre_record buffer
381 if (filter->silent) {
382 /* Sound to silence transition. */
384 gst_vader_message_new(filter, FALSE, ts);
386 gst_vader_event_new(filter, GST_EVENT_VADER_STOP, ts);
387 GST_DEBUG_OBJECT(filter, "signaling CUT_STOP");
388 gst_element_post_message(GST_ELEMENT(filter), m);
389 /* Insert a custom event in the stream to mark the end of a cut. */
390 /* This will block if the pipeline is paused so we have to unlock. */
391 g_static_rec_mutex_unlock(&filter->mtx);
392 gst_pad_push_event(filter->srcpad, e);
393 g_static_rec_mutex_lock(&filter->mtx);
394 /* FIXME: That event's timestamp is wrong... as is this one. */
395 g_signal_emit(filter, gst_vader_signals[SIGNAL_VADER_STOP], 0, ts);
396 /* Stop dumping audio */
397 if (filter->dumpfile) {
398 fclose(filter->dumpfile);
399 filter->dumpfile = NULL;
403 /* Silence to sound transition. */
408 GST_DEBUG_OBJECT(filter, "signaling CUT_START");
409 /* Use the first pre_buffer's timestamp for the signal if possible. */
410 if (filter->pre_buffer) {
413 prebuf = (g_list_first(filter->pre_buffer))->data;
414 ts = GST_BUFFER_TIMESTAMP(prebuf);
417 g_signal_emit(filter, gst_vader_signals[SIGNAL_VADER_START],
419 m = gst_vader_message_new(filter, TRUE, ts);
420 e = gst_vader_event_new(filter, GST_EVENT_VADER_START, ts);
421 gst_element_post_message(GST_ELEMENT(filter), m);
423 /* Insert a custom event in the stream to mark the beginning of a cut. */
424 /* This will block if the pipeline is paused so we have to unlock. */
425 g_static_rec_mutex_unlock(&filter->mtx);
426 gst_pad_push_event(filter->srcpad, e);
427 g_static_rec_mutex_lock(&filter->mtx);
429 /* Start dumping audio */
430 if (filter->dumpdir) {
431 gchar *filename = g_strdup_printf("%s/%08d.raw", filter->dumpdir,
433 filter->dumpfile = fopen(filename, "wb");
437 /* first of all, flush current buffer */
438 GST_DEBUG_OBJECT(filter, "flushing buffer of length %" GST_TIME_FORMAT,
439 GST_TIME_ARGS(filter->pre_run_length));
440 while (filter->pre_buffer) {
443 prebuf = (g_list_first(filter->pre_buffer))->data;
444 filter->pre_buffer = g_list_remove(filter->pre_buffer, prebuf);
445 if (filter->dumpfile)
446 fwrite(GST_BUFFER_DATA(prebuf), 1, GST_BUFFER_SIZE(prebuf),
448 /* This will block if the pipeline is paused so we have to unlock. */
449 g_static_rec_mutex_unlock(&filter->mtx);
450 gst_pad_push(filter->srcpad, prebuf);
451 g_static_rec_mutex_lock(&filter->mtx);
454 GST_DEBUG_OBJECT(filter, "flushed %d buffers", count);
455 filter->pre_run_length = 0;
/*
 * Chain function of the sink pad: runs voice-activity detection on one
 * incoming buffer and then either queues the buffer or forwards it.
 *
 * pad: the pad the buffer arrived on; its parent must be a GstVader.
 * buf: the incoming buffer, read as gint16 samples (size in bytes / 2).
 *
 * Returns GST_FLOW_ERROR when one of the argument checks fails.
 *
 * Outline:
 *  - threshold_level == -1 marks auto-threshold calibration (it is set by
 *    gst_vader_set_property()).  Once more than 5 frames have been
 *    accumulated, the threshold becomes
 *    silence_mean + 3 * silence_stddev, where the deviation is derived
 *    from the accumulated sum of squares with fixpoint_bogus_sqrt().
 *  - The buffer is processed in frames of at most VADER_FRAME samples; the
 *    RMS of a frame is fixpoint_sqrt_q15(compute_normed_power(...)).
 *  - While calibrating, rms and rms * rms are only accumulated.
 *    Otherwise the frame's above-threshold flag is shifted into
 *    filter->window and a majority of set flags (> VADER_WINDOW / 2) marks
 *    the stream non-silent and resets silent_run_length.  Without a
 *    majority silent_run_length grows, and once it exceeds
 *    threshold_length the stream is marked silent.
 *  - A change of filter->silent relative to silent_prev is handled by
 *    gst_vader_transition().
 *  - When silent, the buffer is appended to pre_buffer and the oldest
 *    queued buffers are dropped until pre_run_length no longer exceeds
 *    pre_length.  Otherwise the buffer is written to the dump file (when
 *    open) and pushed on the src pad.
 *
 * NOTE(review): silent_run_length is increased by the duration of the
 * whole buffer; if that statement sits inside the per-frame loop the run
 * length is over-counted for buffers longer than one frame -- confirm.
 * NOTE(review): filter->silent and filter->dumpfile are read after the
 * mutex has been released at the end of the detection step -- confirm
 * that this is intentional.
 */
461 gst_vader_chain(GstPad * pad, GstBuffer * buf)
469 g_return_val_if_fail(pad != NULL, GST_FLOW_ERROR);
470 g_return_val_if_fail(GST_IS_PAD(pad), GST_FLOW_ERROR);
471 g_return_val_if_fail(buf != NULL, GST_FLOW_ERROR);
473 filter = GST_VADER(GST_OBJECT_PARENT(pad));
474 g_return_val_if_fail(filter != NULL, GST_FLOW_ERROR);
475 g_return_val_if_fail(GST_IS_VADER(filter), GST_FLOW_ERROR);
477 in_data = (gint16 *) GST_BUFFER_DATA(buf);
478 num_samples = GST_BUFFER_SIZE(buf) / 2;
480 /* Enter a critical section. */
481 g_static_rec_mutex_lock(&filter->mtx);
482 filter->silent_prev = filter->silent;
483 /* If we are in auto-threshold mode, check to see if we have
484 * enough data to estimate a threshold. (FIXME: we should be
485 * estimating at the sample level rather than the frame level,
487 if (filter->threshold_level == -1) {
488 if (filter->silence_frames > 5) {
489 filter->silence_mean /= filter->silence_frames;
490 filter->silence_stddev /= filter->silence_frames;
491 filter->silence_stddev -= filter->silence_mean * filter->silence_mean;
492 filter->silence_stddev = fixpoint_bogus_sqrt(filter->silence_stddev);
493 /* Set threshold three standard deviations from the mean. */
494 filter->threshold_level = filter->silence_mean + 3 * filter->silence_stddev;
495 GST_DEBUG_OBJECT(filter, "silence_mean %d stddev %d auto_threshold %d\n",
496 filter->silence_mean, filter->silence_stddev,
497 filter->threshold_level);
501 /* Divide buffer into reasonably sized parts. */
502 for (i = 0; i < num_samples; i += VADER_FRAME) {
505 frame_len = MIN(num_samples - i, VADER_FRAME);
506 power = compute_normed_power(in_data + i, frame_len, &filter->prior_sample);
507 rms = fixpoint_sqrt_q15(power);
509 /* If we are in auto-threshold mode, don't do any voting etc. */
510 if (filter->threshold_level == -1) {
511 filter->silence_mean += rms;
512 filter->silence_stddev += rms * rms;
513 filter->silence_frames += 1;
514 GST_DEBUG_OBJECT(filter, "silence_mean_acc %d silence_stddev_acc %d frames %d\n",
515 filter->silence_mean, filter->silence_stddev, filter->silence_frames);
518 /* Shift back window values. */
519 memmove(filter->window, filter->window + 1,
520 (VADER_WINDOW - 1) * sizeof(*filter->window));
522 /* Decide if this buffer is silence or not. */
523 if (rms > filter->threshold_level)
524 filter->window[VADER_WINDOW-1] = TRUE;
526 filter->window[VADER_WINDOW-1] = FALSE;
528 /* Vote on whether we have entered a region of non-silence. */
530 for (j = 0; j < VADER_WINDOW; ++j)
531 vote += filter->window[j];
533 GST_DEBUG_OBJECT(filter, "frame_len %d rms power %d threshold %d vote %d\n",
534 frame_len, rms, filter->threshold_level, vote);
536 if (vote > VADER_WINDOW / 2) {
537 filter->silent_run_length = 0;
538 filter->silent = FALSE;
541 filter->silent_run_length
542 += gst_audio_duration_from_pad_buffer(filter->sinkpad, buf);
545 if (filter->silent_run_length > filter->threshold_length)
546 /* it has been silent long enough, flag it */
547 filter->silent = TRUE;
550 /* Handle transitions between silence and non-silence. */
551 if (filter->silent != filter->silent_prev) {
552 gst_vader_transition(filter, GST_BUFFER_TIMESTAMP(buf));
554 /* Handling of silence detection is done. */
555 g_static_rec_mutex_unlock(&filter->mtx);
557 /* now check if we have to send the new buffer to the internal buffer cache
558 * or to the srcpad */
559 if (filter->silent) {
560 /* Claim the lock while manipulating the queue. */
561 g_static_rec_mutex_lock(&filter->mtx);
562 filter->pre_buffer = g_list_append(filter->pre_buffer, buf);
563 filter->pre_run_length +=
564 gst_audio_duration_from_pad_buffer(filter->sinkpad, buf);
565 while (filter->pre_run_length > filter->pre_length) {
568 prebuf = (g_list_first(filter->pre_buffer))->data;
569 g_assert(GST_IS_BUFFER(prebuf));
570 filter->pre_buffer = g_list_remove(filter->pre_buffer, prebuf);
571 filter->pre_run_length -=
572 gst_audio_duration_from_pad_buffer(filter->sinkpad, prebuf);
573 gst_buffer_unref(prebuf);
575 g_static_rec_mutex_unlock(&filter->mtx);
577 if (filter->dumpfile)
578 fwrite(GST_BUFFER_DATA(buf), 1, GST_BUFFER_SIZE(buf),
580 gst_pad_push(filter->srcpad, buf);
/*
 * GObject set_property handler.
 *
 * object:  the GstVader instance.
 * prop_id: id of the property being set.
 * value:   the new value.
 * pspec:   the property's spec, used only for the invalid-id warning.
 *
 * - threshold: the double is multiplied by 32768 and stored as the integer
 *   threshold_level.
 * - auto-threshold: under the mutex.  Setting it to TRUE forces the
 *   element into the silent state (running gst_vader_transition() if that
 *   is a change) and restarts calibration: threshold_level becomes -1 and
 *   the voting window and silence statistics are cleared.
 * - silent: under the mutex.  Stores the new state and, if it changed,
 *   runs gst_vader_transition() and clears the voting window.
 * - run-length / pre-length: stored directly, without taking the mutex.
 * - dump-dir: frees the previous string and stores a copy of the new one.
 * - any other id: G_OBJECT_WARN_INVALID_PROPERTY_ID.
 */
587 gst_vader_set_property(GObject * object, guint prop_id,
588 const GValue * value, GParamSpec * pspec)
592 g_return_if_fail(GST_IS_VADER(object));
593 filter = GST_VADER(object);
597 filter->threshold_level = (gint)(g_value_get_double(value) * 32768.0);
599 case PROP_AUTO_THRESHOLD:
600 /* We are going to muck around with things... */
601 g_static_rec_mutex_lock(&filter->mtx);
602 filter->auto_threshold = g_value_get_boolean(value);
603 /* Setting this to TRUE re-initializes auto calibration. */
604 if (filter->auto_threshold) {
605 /* We have to be in silence mode to calibrate. */
606 filter->silent_prev = filter->silent;
607 filter->silent = TRUE;
608 /* Do "artificial" sil-speech or speech-sil transitions. */
609 if (filter->silent != filter->silent_prev) {
610 gst_vader_transition(filter, gst_clock_get_time(GST_ELEMENT_CLOCK(filter)));
612 /* Reset counters and such. */
613 filter->threshold_level = -1;
614 memset(filter->window, 0, sizeof(*filter->window) * VADER_WINDOW);
615 filter->silence_mean = 0;
616 filter->silence_stddev = 0;
617 filter->silence_frames = 0;
619 g_static_rec_mutex_unlock(&filter->mtx);
622 /* We are going to muck around with things... */
623 g_static_rec_mutex_lock(&filter->mtx);
624 filter->silent_prev = filter->silent;
625 filter->silent = g_value_get_boolean(value);
626 /* Do "artificial" sil-speech or speech-sil transitions. */
627 if (filter->silent != filter->silent_prev) {
628 gst_vader_transition(filter, gst_clock_get_time(GST_ELEMENT_CLOCK(filter)));
629 /* Also flush the voting window so we don't go right back into speech. */
630 memset(filter->window, 0, sizeof(*filter->window) * VADER_WINDOW);
632 g_static_rec_mutex_unlock(&filter->mtx);
634 case PROP_RUN_LENGTH:
635 filter->threshold_length = g_value_get_uint64(value);
637 case PROP_PRE_LENGTH:
638 filter->pre_length = g_value_get_uint64(value);
641 g_free(filter->dumpdir);
642 filter->dumpdir = g_strdup(g_value_get_string(value));
645 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
/*
 * GObject get_property handler.
 *
 * object:  the GstVader instance.
 * prop_id: id of the property being read.
 * value:   receives the property value.
 * pspec:   the property's spec, used only for the invalid-id warning.
 *
 * Copies the matching field of the element into value.  The threshold is
 * reported as threshold_level / 32768.0, the inverse of the scaling done
 * in gst_vader_set_property().  No lock is taken in the lines below.
 */
651 gst_vader_get_property(GObject * object, guint prop_id,
652 GValue * value, GParamSpec * pspec)
656 g_return_if_fail(GST_IS_VADER(object));
657 filter = GST_VADER(object);
660 case PROP_RUN_LENGTH:
661 g_value_set_uint64(value, filter->threshold_length);
663 case PROP_PRE_LENGTH:
664 g_value_set_uint64(value, filter->pre_length);
667 g_value_set_double(value, (gdouble)filter->threshold_level / 32768.0);
669 case PROP_AUTO_THRESHOLD:
670 g_value_set_boolean(value, filter->auto_threshold);
673 g_value_set_boolean(value, filter->silent);
676 g_value_set_string(value, filter->dumpdir);
679 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);