include/sphinxbase/fe.h

   1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* ====================================================================
   3  * Copyright (c) 1996-2004 Carnegie Mellon University.  All rights
   4  * reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  *
  18  * This work was supported in part by funding from the Defense Advanced
  19  * Research Projects Agency and the National Science Foundation of the
  20  * United States of America, and the CMU Sphinx Speech Consortium.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33  *
  34  * ====================================================================
  35  *
  36  */
  37
  38 /*
  39  * fe.h
  40  *
  41  * $Log: fe.h,v $
  42  * Revision 1.11  2005/02/05 02:15:02  egouvea
  43  * Removed fe_process(), never used
  44  *
  45  * Revision 1.10  2004/12/10 16:48:55  rkm
  46  * Added continuous density acoustic model handling
  47  *
  48  *
  49  */
  50
  51 #if defined(WIN32) && !defined(GNUWINCE) /* WIN32 (except GNUWINCE): map srand48()/lrand48() onto srand()/rand() */
  52 #define srand48(x) srand(x)
  53 #define lrand48() rand()
  54 #endif /* WIN32 && !GNUWINCE */
  55
  56 #ifndef _NEW_FE_H_
  57 #define _NEW_FE_H_
  58
  59 /* Win32/WinCE DLL gunk */
  60 #include <sphinxbase/sphinxbase_export.h>
  61
  62 #include <sphinxbase/cmd_ln.h>
  63 #include <sphinxbase/fixpoint.h>
  64
  65 #ifdef __cplusplus
  66 extern "C" {
  67 #endif
  68 #if 0
  69 /* Fool Emacs. */
  70 }
  71 #endif
  72
  73 #ifdef WORDS_BIGENDIAN /* NATIVE_ENDIAN is used below as the default value of -input_endian */
  74 #define NATIVE_ENDIAN "big"
  75 #else
  76 #define NATIVE_ENDIAN "little"
  77 #endif /* WORDS_BIGENDIAN */
  78
  79 /** Default number of samples per second. */
  80 #define DEFAULT_SAMPLING_RATE 16000
  81 /** Default number of frames per second. */
  82 #define DEFAULT_FRAME_RATE 100
  83 /** Default spacing between frame starts, in samples (equal to
  84  * DEFAULT_SAMPLING_RATE/DEFAULT_FRAME_RATE) */
  85 #define DEFAULT_FRAME_SHIFT 160
  86 /** Default length of each frame, in seconds (410 samples @ 16000Hz). */
  87 #define DEFAULT_WINDOW_LENGTH 0.025625
  88 /** Default number of FFT points. */
  89 #define DEFAULT_FFT_SIZE 512
  90 /** Default number of MFCC coefficients in output. */
  91 #define DEFAULT_NUM_CEPSTRA 13
  92 /** Default number of filter bands used to generate MFCCs. */
  93 #define DEFAULT_NUM_FILTERS 40
  94 /** Default lower edge of mel filter bank. */
  95 #define DEFAULT_LOWER_FILT_FREQ 133.33334
  96 /** Default upper edge of mel filter bank. */
  97 #define DEFAULT_UPPER_FILT_FREQ 6855.4976
  98 /** Default pre-emphasis filter coefficient. */
  99 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
 100 /** Default type of frequency warping to use for VTLN. */
 101 #define DEFAULT_WARP_TYPE "inverse_linear"
 102 /** Default random number seed to use for dithering.  NOTE(review): passed to ARG_STRINGIFY() as the -seed default, so presumably must stay unparenthesized. */
 103 #define SEED  -1
 104
 105 #define waveform_to_cepstral_command_line_macro() /* Brace-initializer entries (name, type, default, help text) for the front-end options; last entry has no trailing comma */ \
 106   { "-logspec", \
 107     ARG_BOOLEAN, \
 108     "no", \
 109     "Write out logspectral files instead of cepstra" }, \
 110    \
 111   { "-smoothspec", \
 112     ARG_BOOLEAN, \
 113     "no", \
 114     "Write out cepstral-smoothed logspectral files" }, \
 115    \
 116   { "-transform", \
 117     ARG_STRING, \
 118     "legacy", \
 119     "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
 120    \
 121   { "-alpha", \
 122     ARG_FLOAT32, \
 123     ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
 124     "Preemphasis parameter" }, \
 125    \
 126   { "-samprate", \
 127     ARG_FLOAT32, \
 128     ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
 129     "Sampling rate" }, \
 130    \
 131   { "-frate", \
 132     ARG_INT32, \
 133     ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
 134     "Frame rate" }, \
 135    \
 136   { "-wlen", \
 137     ARG_FLOAT32, \
 138     ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
 139     "Hamming window length" }, \
 140    \
 141   { "-nfft", \
 142     ARG_INT32, \
 143     ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
 144     "Size of FFT" }, \
 145    \
 146   { "-nfilt", \
 147     ARG_INT32, \
 148     ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
 149     "Number of filter banks" }, \
 150    \
 151   { "-lowerf", \
 152     ARG_FLOAT32, \
 153     ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
 154     "Lower edge of filters" }, \
 155    \
 156   { "-upperf", \
 157     ARG_FLOAT32, \
 158     ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
 159     "Upper edge of filters" }, \
 160    \
 161   { "-unit_area", \
 162     ARG_BOOLEAN, \
 163     "yes", \
 164     "Normalize mel filters to unit area" }, \
 165    \
 166   { "-round_filters", \
 167     ARG_BOOLEAN, \
 168     "yes", \
 169     "Round mel filter frequencies to DFT points" }, \
 170    \
 171   { "-ncep", \
 172     ARG_INT32, \
 173     ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
 174     "Number of cep coefficients" }, \
 175    \
 176   { "-doublebw", \
 177     ARG_BOOLEAN, \
 178     "no", \
 179     "Use double bandwidth filters (same center freq)" }, \
 180    \
 181   { "-lifter", \
 182     ARG_INT32, \
 183     "0", \
 184     "Length of sin-curve for liftering, or 0 for no liftering." }, \
 185    \
 186   { "-input_endian", \
 187     ARG_STRING, \
 188     NATIVE_ENDIAN, \
 189     "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
 190    \
 191   { "-warp_type", \
 192     ARG_STRING, \
 193     DEFAULT_WARP_TYPE, \
 194     "Warping function type (or shape)" }, \
 195    \
 196   { "-warp_params", \
 197     ARG_STRING, \
 198     NULL, \
 199     "Parameters defining the warping function" }, \
 200    \
 201   { "-dither", \
 202     ARG_BOOLEAN, \
 203     "no", \
 204     "Add 1/2-bit noise" }, \
 205    \
 206   { "-seed", \
 207     ARG_INT32, \
 208     ARG_STRINGIFY(SEED), \
 209     "Seed for random number generator; if less than zero, pick our own" }, \
 210    \
 211   { "-remove_dc", \
 212     ARG_BOOLEAN, \
 213     "no", \
 214     "Remove DC offset from each frame" }, \
 215                                           \
 216   { "-verbose", \
 217     ARG_BOOLEAN, \
 218     "no", \
 219     "Show input filenames" } \
 220
 221
 222 #ifdef FIXED_POINT
 223 /** MFCC computation type (fixed-point build: fixed32). */
 224 typedef fixed32 mfcc_t;
 225
 226 /** Convert a floating-point value to mfcc_t. */
 227 #define FLOAT2MFCC(x) FLOAT2FIX(x)
 228 /** Convert an mfcc_t value to floating-point. */
 229 #define MFCC2FLOAT(x) FIX2FLOAT(x)
 230 /** Multiply two mfcc_t values. */
 231 #define MFCCMUL(a,b) FIXMUL(a,b)
 232 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out) /* Logarithm, delegated to FIXLN_ANY(); meaning of in/out not visible here -- TODO confirm */
 233 #else /* !FIXED_POINT */
 234
 235 /** MFCC computation type (floating-point build: float32). */
 236 typedef float32 mfcc_t;
 237 /** Convert a floating-point value to mfcc_t. */
 238 #define FLOAT2MFCC(x) (x)
 239 /** Convert an mfcc_t value to floating-point. */
 240 #define MFCC2FLOAT(x) (x)
 241 /** Multiply two mfcc_t values. */
 242 #define MFCCMUL(a,b) ((a)*(b))
 243 #define MFCCLN(x,in,out) log(x) /* Natural logarithm; in and out are ignored in the floating-point build */
 244 #endif /* !FIXED_POINT */
 245
 246 /**
 247  * Opaque structure for the front-end computation (only forward-declared here).
 248  */
 249 typedef struct fe_s fe_t;
 250
 251 /**
 252  * Error codes returned by front-end functions (0 on success, negative on error).
 253  */
 254 enum fe_error_e {
 255         FE_SUCCESS = 0,
 256         FE_OUTPUT_FILE_SUCCESS  = 0, /* same value as FE_SUCCESS */
 257         FE_CONTROL_FILE_ERROR = -1,
 258         FE_START_ERROR = -2,
 259         FE_UNKNOWN_SINGLE_OR_BATCH = -3,
 260         FE_INPUT_FILE_OPEN_ERROR = -4,
 261         FE_INPUT_FILE_READ_ERROR = -5,
 262         FE_MEM_ALLOC_ERROR = -6,
 263         FE_OUTPUT_FILE_WRITE_ERROR = -7,
 264         FE_OUTPUT_FILE_OPEN_ERROR = -8,
 265         FE_ZERO_ENERGY_ERROR = -9,
 266         FE_INVALID_PARAM_ERROR =  -10
 267 };
 268
 269 /**
 270  * Initialize a front-end object from global command-line.
 271  *
 272  * This is equivalent to calling fe_init_auto_r(cmd_ln_get()).
 273  *
 274  * @return Newly created front-end object.
 275  */
 276 SPHINXBASE_EXPORT
 277 fe_t* fe_init_auto(void);
 278
 279 /**
 280  * Get the default set of arguments for fe_init_auto_r().
 281  *
 282  * @return Pointer to an argument structure which can be passed to
 283  * cmd_ln_init() and friends to create argument structures for
 284  * fe_init_auto_r().
 285  */
 286 SPHINXBASE_EXPORT
 287 arg_t const *fe_get_args(void);
 288
 289 /**
 290  * Initialize a front-end object from a command-line parse.
 291  *
 292  * @param config Command-line object, as returned by cmd_ln_parse_r()
 293  *               or cmd_ln_parse_file().  Ownership of this object is
 294  *               claimed by the fe_t, so you must not attempt to free
 295  *               it manually.  Use cmd_ln_retain() if you wish to
 296  *               reuse it.
 297  * @return Newly created front-end object.
 298  */
 299 SPHINXBASE_EXPORT
 300 fe_t *fe_init_auto_r(cmd_ln_t *config);
 301
 302 /**
 303  * Retrieve the command-line object used to initialize this front-end.
 304  *
 305  * @return command-line object for this front-end.  This pointer is
 306  *         retained by the fe_t, so you should not attempt to free it
 307  *         manually.
 308  */
 309 SPHINXBASE_EXPORT
 310 cmd_ln_t *fe_get_config(fe_t *fe);
 311
 312 /**
 313  * Start processing an utterance.
 314  * @return 0 for success, <0 for error (see enum fe_error_e)
 315  */
 316 SPHINXBASE_EXPORT
 317 int fe_start_utt(fe_t *fe);
 318
 319 /**
 320  * Get the dimensionality of the output of this front-end object.
 321  *
 322  * This is guaranteed to be the number of values in one frame of
 323  * output from fe_end_utt(), fe_process_frame(), and
 324  * fe_process_frames().  It is usually the number of MFCC
 325  * coefficients, but it might be the number of log-spectrum bins, if
 326  * the <tt>-logspec</tt> or <tt>-smoothspec</tt> options to
 327  * fe_init_auto() were true.
 328  *
 329  * @return Dimensionality of front-end output.
 330  */
 331 SPHINXBASE_EXPORT
 332 int fe_get_output_size(fe_t *fe);
 333
 334 /**
 335  * Get the dimensionality of the input to this front-end object.
 336  *
 337  * This function retrieves the number of input samples consumed by one
 338  * frame of processing.  To obtain one frame of output, you must have
 339  * at least <code>*out_frame_size</code> samples.  To obtain <i>N</i>
 340  * frames of output, you must have at least <code>(N-1) *
 341  * *out_frame_shift + *out_frame_size</code> input samples.
 342  *
 343  * @param out_frame_shift Output: Number of samples between each frame start.
 344  * @param out_frame_size Output: Number of samples in each frame.
 345  */
 346 SPHINXBASE_EXPORT
 347 void fe_get_input_size(fe_t *fe, int *out_frame_shift,
 348                        int *out_frame_size);
 349
 350 /**
 351  * Finish processing an utterance.
 352  *
 353  * This function also collects any remaining samples and calculates a
 354  * final cepstral vector.  If there are overflow samples remaining, it
 355  * will pad with zeros to make a complete frame.
 356  *
 357  * @param fe Front-end object.
 358  * @param out_cepvector Buffer of at least fe_get_output_size() values to hold
 359  *                      a residual cepstral vector, or NULL to ignore it.
 360  * @param out_nframes Output: Number of frames of residual cepstra created
 361  *                    (either 0 or 1).
 362  * @return 0 for success, <0 for error (see enum fe_error_e)
 363  */
 364 SPHINXBASE_EXPORT
 365 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
 366
 367 /**
 368  * Retain ownership of a front end object.
 369  *
 370  * @return pointer to the retained front end.
 371  */
 372 SPHINXBASE_EXPORT
 373 fe_t *fe_retain(fe_t *fe);
 374
 375 /**
 376  * Free the front end.
 377  *
 378  * Releases resources associated with the front-end object.
 379  *
 380  * @return new reference count (0 if freed completely)
 381  */
 382 SPHINXBASE_EXPORT
 383 int fe_free(fe_t *fe);
 384
 385 /**
 386  * Process one frame of samples.
 387  *
 388  * @param spch Speech samples (signed 16-bit linear PCM)
 389  * @param nsamps Number of samples in <code>spch</code>
 390  * @param out_cep Buffer which will receive one frame of features.
 391  * @return 0 for success, <0 for error (see enum fe_error_e)
 392  */
 393 SPHINXBASE_EXPORT
 394 int fe_process_frame(fe_t *fe, int16 const *spch,
 395                      int32 nsamps, mfcc_t *out_cep);
 396
 397 /**
 398  * Process a block of samples.
 399  *
 400  * This function generates up to <code>*inout_nframes</code> frames of
 401  * features, or as many as can be generated from
 402  * <code>*inout_nsamps</code> samples.
 403  *
 404  * On exit, the <code>inout_spch</code>, <code>inout_nsamps</code>,
 405  * and <code>inout_nframes</code> parameters are updated to point to
 406  * the remaining sample data, the number of remaining samples, and the
 407  * number of frames processed, respectively.  This allows you to call
 408  * this repeatedly to process a large block of audio in small (say,
 409  * 5-frame) chunks:
 410  *
 411  *  int16 const *bigbuf, *p;
 412  *  mfcc_t **cepstra;
 413  *  size_t nsamps;
 414  *  int32 nframes = 5;
 415  *
 416  *  cepstra = (mfcc_t **)
 417  *      ckd_calloc_2d(nframes, fe_get_output_size(fe), sizeof(**cepstra));
 418  *  p = bigbuf;
 419  *  while (nsamps) {
 420  *      nframes = 5;
 421  *      fe_process_frames(fe, &p, &nsamps, cepstra, &nframes);
 422  *      // Now do something with these frames...
 423  *      if (nframes)
 424  *          do_some_stuff(cepstra, nframes);
 425  *  }
 426  *
 427  * @param inout_spch Input: Pointer to pointer to speech samples
 428  *                   (signed 16-bit linear PCM).
 429  *                   Output: Pointer to remaining samples.
 430  * @param inout_nsamps Input: Pointer to maximum number of samples to
 431  *                     process.
 432  *                     Output: Number of samples remaining in input buffer.
 433  * @param buf_cep Two-dimensional buffer (allocated with
 434  *                ckd_calloc_2d()) which will receive frames of output
 435  *                data.  If NULL, no actual processing will be done,
 436  *                and the maximum number of output frames which would
 437  *                be generated is returned in
 438  *                <code>*inout_nframes</code>.
 439  * @param inout_nframes Input: Pointer to maximum number of frames to
 440  *                      generate.
 441  *                      Output: Number of frames actually generated.
 442  * @return 0 for success, <0 for failure (see enum fe_error_e)
 443  */
 444 SPHINXBASE_EXPORT
 445 int fe_process_frames(fe_t *fe,
 446                       int16 const **inout_spch,
 447                       size_t *inout_nsamps,
 448                       mfcc_t **buf_cep,
 449                       int32 *inout_nframes);
 450
 451 /**
 452  * Process a block of samples, returning as many frames as possible.
 453  *
 454  * This function processes all the samples in a block of data and
 455  * returns a newly allocated block of feature vectors.  This block
 456  * needs to be freed with fe_free_2d() after use.
 457  *
 458  * It is possible for there to be some left-over data which could not
 459  * fit in a complete frame.  This data can be processed with
 460  * fe_end_utt().
 461  *
 462  * This function is deprecated in favor of fe_process_frames().
 463  *
 464  * @return 0 for success, <0 for failure (see enum fe_error_e)
 465  */
 466 SPHINXBASE_EXPORT
 467 int fe_process_utt(fe_t *fe,  /**< A front end object */
 468                    int16 const *spch, /**< The speech samples */
 469                    size_t nsamps, /**< number of samples*/
 470                    mfcc_t ***cep_block, /**< Output pointer to cepstra */
 471                    int32 *nframes /**< Number of frames processed */
 472         );
 473
 474 /**
 475  * Free the output pointer returned by fe_process_utt().
 476  **/
 477 SPHINXBASE_EXPORT
 478 void fe_free_2d(void *arr);
 479
 480 /**
 481  * Convert a block of mfcc_t to float32 (can be done in-place)
 482  **/
 483 SPHINXBASE_EXPORT
 484 int fe_mfcc_to_float(fe_t *fe,
 485                      mfcc_t **input,
 486                      float32 **output,
 487                      int32 nframes);
 488
 489 /**
 490  * Convert a block of float32 to mfcc_t (can be done in-place)
 491  **/
 492 SPHINXBASE_EXPORT
 493 int fe_float_to_mfcc(fe_t *fe,
 494                      float32 **input,
 495                      mfcc_t **output,
 496                      int32 nframes);
 497
 498 /**
 499  * Process one frame of log spectra into MFCC using discrete cosine
 500  * transform.
 501  *
 502  * This uses a variant of the DCT-II where the first frequency bin is
 503  * scaled by 0.5.  Unless somebody misunderstood the DCT-III equations
 504  * and thought that's what they were implementing here, this is
 505  * ostensibly done to account for the symmetry properties of the
 506  * DCT-II versus the DFT - the first coefficient of the input is
 507  * assumed to be repeated in the negative frequencies, which is not
 508  * the case for the DFT.  (This raises the question: why not just use
 509  * the DCT-I, since it has the appropriate symmetry properties...)
 510  * Moreover, this is bogus since the mel-frequency bins on which we
 511  * are doing the DCT don't extend to the edge of the DFT anyway.
 512  *
 513  * This also means that the matrix used in computing this DCT can not
 514  * be made orthogonal, and thus inverting the transform is difficult.
 515  * Therefore if you want to do cepstral smoothing or have some other
 516  * reason to invert your MFCCs, use fe_logspec_dct2() and its inverse
 517  * fe_logspec_dct3() instead.
 518  *
 519  * Also, it normalizes by 1/nfilt rather than 2/nfilt, for some reason.
 520  **/
 521 SPHINXBASE_EXPORT
 522 int fe_logspec_to_mfcc(fe_t *fe,  /**< A fe structure */
 523                        const mfcc_t *fr_spec, /**< One frame of spectrum */
 524                        mfcc_t *fr_cep /**< One frame of cepstrum */
 525         );
 526
 527 /**
 528  * Convert log spectra to MFCC using DCT-II.
 529  *
 530  * This uses the "unitary" form of the DCT-II, i.e. with a scaling
 531  * factor of sqrt(2/N) and a "beta" factor of sqrt(1/2) applied to the
 532  * cos(0) basis vector (i.e. the one corresponding to the DC
 533  * coefficient in the output).
 534  **/
 535 SPHINXBASE_EXPORT
 536 int fe_logspec_dct2(fe_t *fe,  /**< A fe structure */
 537                     const mfcc_t *fr_spec, /**< One frame of spectrum */
 538                     mfcc_t *fr_cep /**< One frame of cepstrum */
 539         );
 540
 541 /**
 542  * Convert MFCC to log spectra using DCT-III.
 543  *
 544  * This uses the "unitary" form of the DCT-III, i.e. with a scaling
 545  * factor of sqrt(2/N) and a "beta" factor of sqrt(1/2) applied to the
 546  * cos(0) basis vector (i.e. the one corresponding to the DC
 547  * coefficient in the input).
 548  **/
 549 SPHINXBASE_EXPORT
 550 int fe_mfcc_dct3(fe_t *fe,  /**< A fe structure */
 551                  const mfcc_t *fr_cep, /**< One frame of cepstrum */
 552                  mfcc_t *fr_spec /**< One frame of spectrum */
 553         );
 554
 555 #ifdef __cplusplus
 556 }
 557 #endif
 558
 559
 560 #endif