include/sphinxbase/cont_ad.h

   1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* ====================================================================
   3  * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights
   4  * reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  *
  18  * This work was supported in part by funding from the Defense Advanced
  19  * Research Projects Agency and the National Science Foundation of the
  20  * United States of America, and the CMU Sphinx Speech Consortium.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33  *
  34  * ====================================================================
  35  *
  36  */
  37 /*
  38  * cont_ad.h -- Continuous A/D listening and silence filtering module.
  39  *
  40  * **********************************************
  41  * CMU ARPA Speech Project
  42  *
  43  * Copyright (c) 1996 Carnegie Mellon University.
  44  * ALL RIGHTS RESERVED.
  45  * **********************************************
  46  *
  47  * HISTORY
  48  *
  49  * 13-Jul-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  50  *              Added spf and adbufsize to cont_ad_t in order to support variable
  51  *              frame sizes depending on audio sampling rate.
  52  *
  53  * 30-Jun-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  54  *              Added FILE* argument to cont_ad_powhist_dump().
  55  *
  56  * 16-Jan-98    Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University
  57  *              Changed to use dB instead of the weird power measure.
  58  *              Added most system parameters to cont_ad_t instead of hardwiring
  59  *              them in cont_ad.c.
  60  *              Added cont_ad_set_params() and cont_ad_get_params().
  61  *
  62  * 28-Jul-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  63  *              Added cont_ad_t.siglvl.
  64  *
  65  * 27-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  66  *              Added the option for cont_ad_read to return -1 on EOF.
  67  *
  68  * 21-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  69  *              Added cont_ad_set_thresh().
  70  *
  71  * 20-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  72  *              Separated thresholds for speech and silence.
  73  *
  74  * 17-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  75  *              Created, based loosely on Steve Reed's original implementation.
  76  */
  77
  78
  79 #ifndef _CONT_AD_H_
  80 #define _CONT_AD_H_
  81
  82 /* Win32/WinCE DLL gunk */
  83 #include <sphinxbase/sphinxbase_export.h>
  84 #include <sphinxbase/prim_type.h>
  85 #include <sphinxbase/ad.h>
  86
  87 /**
  88  * \file cont_ad.h
  89  * \brief Continuous A/D listening and silence filtering module.
  90  *
  91  * This module is intended to be interposed as a filter between any
  92  * raw A/D source and the application to remove silence regions.  Its
  93  * main purpose is to remove regions of silence from the raw input
  94  * speech.  It is initialized with a raw A/D source function (during
  95  * the cont_ad_init call).  The application is responsible for setting
  96  * up the A/D source, turning recording on and off as it desires.
  97  * Filtered A/D data can be read by the application using the
  98  * cont_ad_read function.
  99  *
 100  * In other words, the application calls cont_ad_read instead of the
 101  * raw A/D source function (e.g., ad_read in libad) to obtain filtered
 102  * A/D data with silence regions removed.  This module itself does not
 103  * enforce any other structural changes to the application.
 104  *
 105  * The cont_ad_read function also updates an "absolute" timestamp (see
 106  * cont_ad_t.read_ts) at the end of each invocation.  The timestamp
 107  * indicates the total number of samples of A/D data read until this
  108  * point, including data discarded as silence frames.  The application
  109  * is responsible for using this timestamp to make any policy
  110  * decisions of its own, e.g. regarding utterance boundaries.
 111  */
 112
 113
 114 #include <stdio.h>
 115
 116
 117 #ifdef __cplusplus
 118 extern "C" {
 119 #endif
 120 #if 0
 121 /* Fool Emacs. */
 122 }
 123 #endif
 124
  125 /* States of the continuous listening module (see cont_ad_t.state and cont_ad_t.tail_state) */
  126 #define CONT_AD_STATE_SIL       0
  127 #define CONT_AD_STATE_SPEECH    1
 128
 129
  130 /**
  131  * \struct spseg_t
  132  * \brief  (FOR INTERNAL USE) Data structure for maintaining speech (non-silence) segments not yet
  133  * consumed by the application.
  134  */
  135 typedef struct spseg_s {
  136     int32 startfrm;     /**< Frame-id in cont_ad_t.adbuf (see below) of start of this segment */
  137     int32 nfrm;         /**< Number of frames in segment (may wrap around adbuf) */
  138     struct spseg_s *next;       /**< Next speech segment (with some intervening silence) */
  139 } spseg_t;
 140
 141
  142 /**
  143  * \struct cont_ad_t
  144  * \brief Continuous listening module or object.
  145  * An application can open and maintain several such objects,
  146  * if necessary.
  147  * FYI: The module is always in one of two states: SILENCE or SPEECH.  Transitions between
  148  * the two are detected by sliding a window spanning several frames and looking for some
  149  * minimum number of frames of the other type.
  150  */
  151 typedef struct {
  152     /** Function to be called for obtaining A/D data (see prototype for ad_read in ad.h) */
  153     int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max);
  154     ad_rec_t *ad;       /**< A/D device argument for adfunc.  Also, ad->sps is used to
  155                            determine frame size (spf, see below) */
  156     int32 rawmode;      /**< Pass all input data through, without filtering silence */
  157
  158     int16 *adbuf;       /**< Circular buffer for maintaining A/D data read until consumed */
  159
  160     /* **************************************************************************
  161      * state, read_ts, seglen, and siglvl are provided for READ-ONLY use by client
  162      * applications, and are updated by calls to cont_ad_read() (see below).  All
  163      * other variables should be left alone.
  164      */
  165     int32 state;        /**< State of data returned by most recent cont_ad_read call;
  166                            CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH. */
  167     int32 read_ts;      /**< Absolute timestamp (total no. of raw samples consumed
  168                            up to the most recent cont_ad_read call, starting from
  169                            the very beginning).  Note that this is a 32-bit
  170                            integer; applications should guard against overflow. */
  171     int32 seglen;       /**< Total no. of raw samples consumed in the segment
  172                            returned by the most recent cont_ad_read call.  Can be
  173                            used to detect silence segments that have stretched long
  174                            enough to terminate an utterance */
  175     int32 siglvl;       /**< Max signal level for the data consumed by the most recent
  176                            cont_ad_read call (dB range: 0-99).  Can be used to
  177                            update a V-U meter, for example. */
  178     /* ************************************************************************ */
  179
  180     int32 sps;          /**< Samples/sec; moved from ad->sps to break dependence on
  181                            ad by N. Roy. */
  182
  183     int32 eof;          /**< Whether the source ad device has encountered EOF */
  184
  185     int32 spf;          /**< Samples/frame; audio level is analyzed within frames */
  186     int32 adbufsize;    /**< Buffer size (Number of samples) */
  187     int32 prev_sample;  /**< For pre-emphasis filter */
  188     int32 headfrm;      /**< Frame number in adbuf with unconsumed A/D data */
  189     int32 n_frm;        /**< Number of complete frames of unconsumed A/D data in adbuf */
  190     int32 n_sample;     /**< Number of samples of unconsumed data in adbuf */
  191     int32 tot_frm;      /**< Total number of frames of A/D data read, including consumed ones */
  192     int32 noise_level;  /**< PWP: what we claim as the "current" noise level */
  193
  194     int32 *pow_hist;    /**< Histogram of frame power, moving window, decayed */
  195     char *frm_pow;      /**< Frame power */
  196
  197     int32 auto_thresh;  /**< Do automatic threshold adjustment or not */
  198     int32 delta_sil;    /**< Max silence power/frame ABOVE noise level */
  199     int32 delta_speech; /**< Min speech power/frame ABOVE noise level */
  200     int32 min_noise;    /**< Noise lower than this we ignore */
  201     int32 max_noise;    /**< Noise higher than this signals an error */
  202     int32 winsize;      /**< Number of frames to look at for speech detection */
  203     int32 speech_onset; /**< Start speech on >= these many frames out of winsize, of >= delta_speech */
  204     int32 sil_onset;    /**< End speech on >= these many frames out of winsize, of <= delta_sil */
  205     int32 leader;       /**< Pad beginning of speech with this many extra frames */
  206     int32 trailer;      /**< Pad end of speech with this many extra frames */
  207
  208     int32 thresh_speech;/**< Frame considered to be speech if power >= thresh_speech
  209                            (for transitioning from SILENCE to SPEECH state) */
  210     int32 thresh_sil;   /**< Frame considered to be silence if power <= thresh_sil
  211                            (for transitioning from SPEECH to SILENCE state) */
  212     int32 thresh_update;/**< Number of frames before next update to pow_hist/thresholds */
  213     float32 adapt_rate; /**< Linear interpolation constant for rate at which noise level is
  214                            adapted to each estimate;
  215                            range: 0-1; 0=> no adaptation, 1=> instant adaptation */
  216
  217     int32 tail_state;   /**< State at the end of its internal buffer (internal use):
  218                            CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.  Note: This is
  219                            different from cont_ad_t.state. */
  220     int32 win_startfrm; /**< Where next analysis window begins */
  221     int32 win_validfrm; /**< Number of frames currently available from win_startfrm for analysis */
  222     int32 n_other;      /**< If in SILENCE state, number of frames in analysis window considered to
  223                            be speech; otherwise number of frames considered to be silence */
  224     spseg_t *spseg_head;/**< First of unconsumed speech segments */
  225     spseg_t *spseg_tail;/**< Last of unconsumed speech segments */
  226
  227     FILE *rawfp;        /**< If non-NULL, raw audio input data processed by cont_ad
  228                            is dumped to this file.  Controlled by user application
  229                            via cont_ad_set_rawfp().  NULL when cont_ad object is
  230                            initially created. */
  231     FILE *logfp;        /**< If non-NULL, write detailed logs of this object's
  232                            progress to the file.  Controlled by user application
  233                            via cont_ad_set_logfp().  NULL when cont_ad object is
  234                            initially created. */
  235
  236     int32 n_calib_frame; /**< Number of frames of calibration data seen so far. */
  237 } cont_ad_t;
 238
 239
  240 /**
  241  * Initialize a continuous listening/silence filtering object.
  242  *
  243  * One-time initialization of a continuous listening/silence filtering
  244  * object/module.  This can work in either "stream mode", where it
  245  * reads data from an audio device represented by
  246  * <code>ad_rec_t</code>, or in "block mode", where it filters out
  247  * silence regions from blocks of data passed into cont_ad_read().
  248  *
  249  * @param ad An audio device to read from, or NULL to operate in block mode.
  250  * @param adfunc The function used to read audio from <code>ad</code>,
  251  * or NULL to operate in block mode.  This is usually ad_read().
  252  * @return A pointer to a READ-ONLY structure used in other calls to
  253  * the object.  If any error occurs, the return value is NULL.
  254  */
  255 SPHINXBASE_EXPORT
  256 cont_ad_t *cont_ad_init (ad_rec_t *ad,  /**< In: The A/D source object to be filtered */
  257                          int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max)
  258                          /**< In: adfunc = source function to be invoked
  259                                            to obtain raw A/D data.  See ad.h for the
  260                                            required prototype definition. */
  261                          );
 262
  263 /**
  264  * Initialize a continuous listening object which simply passes data through (!)
  265  *
  266  * Like cont_ad_init(), but puts the module in raw mode; i.e., all data is passed
  267  * through, unfiltered.  (By special request.)
  268  */
  269 SPHINXBASE_EXPORT
  270 cont_ad_t *cont_ad_init_rawmode (ad_rec_t *ad,
  271                                  int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max));
 272
 273
  274 /**
  275  * Read raw audio data into the silence filter.
  276  *
  277  * The main read routine for reading speech/silence segmented audio data.  Audio
  278  * data is copied into the caller-provided buffer, much like a file read routine.
  279  *
  280  * In "block mode", i.e. if NULL was passed as a read function to
  281  * <code>cont_ad_init</code>, the data in <code>buf</code> is taken as
  282  * input, and any non-silence data is written back to <code>buf</code>
  283  * on exit.  In this case, you must take care that <code>max</code>
  284  * does not overflow the internal buffer of the silence filter.  The
  285  * available number of samples can be obtained by calling
  286  * cont_ad_buffer_space().  Any excess data will be discarded.
  287  *
  288  * In normal mode, only speech segments are copied; silence segments are dropped.
  289  * In rawmode (cont_ad module initialized using cont_ad_init_rawmode()), all data
  290  * are passed through to the caller.  But, in either case, any single call to
  291  * cont_ad_read will never return data that crosses a speech/silence segment
  292  * boundary.
  293  *
  294  * The following variables are updated for use by the caller (see cont_ad_t above):
  295  *   cont_ad_t.state,
  296  *   cont_ad_t.read_ts,
  297  *   cont_ad_t.seglen,
  298  *   cont_ad_t.siglvl.
  299  *
  300  * @return Number of samples actually read, possibly 0; <0 if EOF on A/D source (see also the note on max).
  301  */
  302 SPHINXBASE_EXPORT
  303 int32 cont_ad_read (cont_ad_t *r,       /**< In: Object pointer returned by cont_ad_init */
  304                     int16 *buf,         /**< In/Out: In block mode, contains input data.
  305                                            On return, buf contains A/D data returned
  306                                            by this function, if any. */
  307                     int32 max           /**< In: Maximum number of samples to be filled into buf.
  308                                            NOTE: max must be at least 256; otherwise
  309                                            the function returns -1. */
  310         );
 311
  312 /**
  313  * Get the maximum number of samples which can be passed into cont_ad_read() in block mode.
  314  */
  315 SPHINXBASE_EXPORT
  316 int32 cont_ad_buffer_space(cont_ad_t *r);
 317
  318 /**
  319  * Calibrate the silence filter.
  320  *
  321  * Calibration to determine an initial silence threshold.  This function can be called
  322  * any number of times.  It should be called at least once immediately after cont_ad_init().
  323  * The silence threshold is also updated internally once in a while, so this function
  324  * only needs to be called in the middle if there is a definite change in the recording
  325  * environment.
  326  * The application is responsible for making sure that the raw audio source is turned on
  327  * before the calibration.
  328  * @return 0 if successful, <0 otherwise.
  329  */
  330 SPHINXBASE_EXPORT
  331 int32 cont_ad_calib (cont_ad_t *cont    /**< In: object pointer returned by cont_ad_init */
  332                      );
 333
  334 /**
  335  * Calibrate the silence filter without an audio device.
  336  *
  337  * If the application has not passed an audio device into the silence filter
  338  * at initialisation, this routine can be used to calibrate the filter.  The
  339  * buf (of length max samples) should contain audio data for calibration.  This
  340  * data is assumed to be completely consumed.  More than one call may be
  341  * necessary to fully calibrate.
  342  * @return 0 if successful, <0 on failure, >0 if calibration is not
  343  * complete.
  344  */
  345 SPHINXBASE_EXPORT
  346 int32 cont_ad_calib_loop (cont_ad_t *r, int16 *buf, int32 max);
 347
  348 /**
  349  * Get the number of samples required to calibrate the silence filter.
  350  *
  351  * Since, as noted for cont_ad_calib_loop(), the calibration data is assumed
  352  * to be fully consumed, it may be desirable to "hold onto" this data in
  353  * case it contains useful speech.  This function returns the number
  354  * of samples required to calibrate the silence filter, which is
  355  * useful in allocating a buffer to store this data.
  356  *
  357  * @return Number of samples required for successful calibration.
  358  */
  359 SPHINXBASE_EXPORT
  360 int32 cont_ad_calib_size(cont_ad_t *r);
 361
  362 /**
  363  * Set silence and speech threshold parameters.
  364  *
  365  * The silence threshold is the max power
  366  * level, RELATIVE to the peak background noise level, in any silence frame.  Similarly,
  367  * the speech threshold is the min power level, RELATIVE to the peak background noise
  368  * level, in any speech frame.  In general, silence threshold <= speech threshold.
  369  * Increasing the thresholds (say, from the default value of 2 to 3 or 4) reduces the
  370  * sensitivity to background noise, but may also increase the chances of clipping actual
  371  * speech.
  372  * @return 0 if successful, <0 otherwise.
  373  */
  374 SPHINXBASE_EXPORT
  375 int32 cont_ad_set_thresh (cont_ad_t *cont,      /**< In: Object ptr from cont_ad_init */
  376                           int32 sil,    /**< In: silence threshold (default 2) */
  377                           int32 sp      /**< In: speech threshold (default 2) */
  378                           );
 379
 380
  381 /**
  382  * Set the changeable parameters.
  383  *
  384  *   delta_sil, delta_speech, min_noise, and max_noise are in dB;
  385  *   winsize, speech_onset, sil_onset, leader and trailer are in frames of
  386  *   16 ms length (256 samples @ 16kHz sampling).
  387  */
  388 SPHINXBASE_EXPORT
  389 int32 cont_ad_set_params (cont_ad_t *r, int32 delta_sil, int32 delta_speech,
  390                           int32 min_noise, int32 max_noise,
  391                           int32 winsize, int32 speech_onset, int32 sil_onset,
  392                           int32 leader, int32 trailer,
  393                           float32 adapt_rate);
 394
  395 /**
  396  * Get the changeable parameters (PWP 1/14/98).
  397  *
  398  *   delta_sil, delta_speech, min_noise, and max_noise are in dB;
  399  *   winsize, speech_onset, sil_onset, leader and trailer are in frames of
  400  *   16 ms length (256 samples @ 16kHz sampling).
  401  */
  402 SPHINXBASE_EXPORT
  403 int32 cont_ad_get_params (cont_ad_t *r, int32 *delta_sil, int32 *delta_speech,
  404                           int32 *min_noise, int32 *max_noise,
  405                           int32 *winsize, int32 *speech_onset, int32 *sil_onset,
  406                           int32 *leader, int32 *trailer,
  407                           float32 *adapt_rate);
 408
  409 /**
  410  * Reset the object, discarding any accumulated speech segments.
  411  * @return 0 if successful, <0 otherwise.
  412  */
  413 SPHINXBASE_EXPORT
  414 int32 cont_ad_reset (cont_ad_t *cont);  /* In: Object pointer returned by cont_ad_init */
 415
 416
  417 /**
  418  * Close the continuous listening object.
  419  */
  420 SPHINXBASE_EXPORT
  421 int32 cont_ad_close (cont_ad_t *cont);  /* In: Object pointer returned by cont_ad_init */
 422
 423
  424 /**
  425  * Dump the power histogram (see cont_ad_t.pow_hist).  For debugging...
  426  */
  427 SPHINXBASE_EXPORT
  428 void cont_ad_powhist_dump (FILE *fp, cont_ad_t *cont);
 429
 430
  431 /**
  432  * Detach the given continuous listening module from its associated audio device.
  433  * @return 0 if successful, -1 otherwise.
  434  */
  435 SPHINXBASE_EXPORT
  436 int32 cont_ad_detach (cont_ad_t *c);
 437
 438
  439 /**
  440  * Attach the continuous listening module to the given audio device/function.
  441  * (Like cont_ad_init(), but without the calibration.)
  442  * @return 0 if successful, -1 otherwise.
  443  */
  444 SPHINXBASE_EXPORT
  445 int32 cont_ad_attach (cont_ad_t *c, ad_rec_t *a, int32 (*func)(ad_rec_t *, int16 *, int32));
 446
 447
  448 /**
  449  * Set a file for dumping raw audio input.
  450  *
  451  * The application can ask cont_ad to dump the raw audio input that cont_ad
  452  * processes to a file.  Use this function to give the FILE* to the cont_ad
  453  * object.  If invoked with fp == NULL, dumping is turned off.  The application
  454  * is responsible for opening and closing the file.  If fp is non-NULL, cont_ad
  455  * assumes the file pointer is valid and opened for writing.
  456  *
  457  * @return 0 if successful, -1 otherwise.
  458  */
  459 SPHINXBASE_EXPORT
  460 int32 cont_ad_set_rawfp (cont_ad_t *c,  /* In: The cont_ad object being addressed */
  461                          FILE *fp);     /* In: File to which raw audio data is to
  462                                            be dumped; NULL to stop dumping. */
 463
  464 /**
  465  * Set the file to which cont_ad logs its progress.
  466  *
  467  * Mainly for debugging.  If <code>fp</code> is NULL, logging is turned off.
  468  *
  469  * @return 0 if successful, -1 otherwise.
  470  */
  471 SPHINXBASE_EXPORT
  472 int32 cont_ad_set_logfp (cont_ad_t *c,  /* In: The cont_ad object being addressed */
  473                          FILE *fp);     /* In: File to which logs are written;
  474                                            NULL to stop logging. */
 475
  476 /**
  477  * Set the silence and speech thresholds.
  478  *
  479  * For this to remain permanently in effect, the auto_thresh field of
  480  * the continuous listening module should be set to FALSE or 0.
  481  * Otherwise the thresholds may be modified by the noise-level
  482  * adaptation.
  483  */
  484 SPHINXBASE_EXPORT
  485 int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech);
 486
 487 #ifdef __cplusplus
 488 }
 489 #endif
 490
 491
 492 #endif