1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
42 * Revision 1.11 2005/02/05 02:15:02 egouvea
43 * Removed fe_process(), never used
45 * Revision 1.10 2004/12/10 16:48:55 rkm
46 * Added continuous density acoustic model handling
/* On WIN32 builds that are not GNUWINCE, provide srand48()/lrand48() as
 * thin aliases for the C library's srand()/rand(). */
51 #if defined(WIN32) && !defined(GNUWINCE)
52 #define srand48(x) srand(x)
53 #define lrand48() rand()
59 /* Win32/WinCE DLL gunk */
60 #include <sphinxbase/sphinxbase_export.h>
62 #include <sphinxbase/cmd_ln.h>
63 #include <sphinxbase/fixpoint.h>
/* NATIVE_ENDIAN is a string naming the host byte order; it is "big" when
 * WORDS_BIGENDIAN is defined.
 * NOTE(review): the #else/#endif that should separate the two definitions
 * are not visible in this listing -- confirm against the real header. */
73 #ifdef WORDS_BIGENDIAN
74 #define NATIVE_ENDIAN "big"
76 #define NATIVE_ENDIAN "little"
79 /** Default number of samples per second. */
80 #define DEFAULT_SAMPLING_RATE 16000
81 /** Default number of frames per second. */
82 #define DEFAULT_FRAME_RATE 100
83 /** Default spacing between frame starts, in samples (equal to
84 * DEFAULT_SAMPLING_RATE/DEFAULT_FRAME_RATE, i.e. 16000/100) */
85 #define DEFAULT_FRAME_SHIFT 160
86 /** Default analysis window length in seconds (0.025625 s * 16000 Hz = 410 samples). */
87 #define DEFAULT_WINDOW_LENGTH 0.025625
88 /** Default number of FFT points. */
89 #define DEFAULT_FFT_SIZE 512
90 /** Default number of MFCC coefficients in output. */
91 #define DEFAULT_NUM_CEPSTRA 13
92 /** Default number of filter bands used to generate MFCCs. */
93 #define DEFAULT_NUM_FILTERS 40
94 /** Default lower edge of mel filter bank. */
95 #define DEFAULT_LOWER_FILT_FREQ 133.33334
96 /** Default upper edge of mel filter bank. */
97 #define DEFAULT_UPPER_FILT_FREQ 6855.4976
98 /** Default pre-emphasis filter coefficient. */
99 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
100 /** Default type of frequency warping to use for VTLN. */
101 #define DEFAULT_WARP_TYPE "inverse_linear"
102 /** Default random number seed to use for dithering. */
105 #define waveform_to_cepstral_command_line_macro() \
109 "Write out logspectral files instead of cepstra" }, \
114 "Write out cepstral-smoothed logspectral files" }, \
119 "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
123 ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
124 "Preemphasis parameter" }, \
128 ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
133 ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
138 ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
139 "Hamming window length" }, \
143 ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
148 ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
149 "Number of filter banks" }, \
153 ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
154 "Lower edge of filters" }, \
158 ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
159 "Upper edge of filters" }, \
164 "Normalize mel filters to unit area" }, \
166 { "-round_filters", \
169 "Round mel filter frequencies to DFT points" }, \
173 ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
174 "Number of cep coefficients" }, \
179 "Use double bandwidth filters (same center freq)" }, \
184 "Length of sin-curve for liftering, or 0 for no liftering." }, \
189 "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
194 "Warping function type (or shape)" }, \
199 "Parameters defining the warping function" }, \
204 "Add 1/2-bit noise" }, \
208 ARG_STRINGIFY(SEED), \
209 "Seed for random number generator; if less than zero, pick our own" }, \
214 "Remove DC offset from each frame" }, \
219 "Show input filenames" } \
223 /** MFCC computation type (fixed-point build: fixed32). */
224 typedef fixed32 mfcc_t;
226 /** Convert a floating-point value to mfcc_t. */
227 #define FLOAT2MFCC(x) FLOAT2FIX(x)
228 /** Convert an mfcc_t value to floating-point. */
229 #define MFCC2FLOAT(x) FIX2FLOAT(x)
230 /** Multiply two mfcc_t values. */
231 #define MFCCMUL(a,b) FIXMUL(a,b)
/** Logarithm of an mfcc_t value, delegated to FIXLN_ANY().
 * NOTE(review): the meaning of in/out is defined by FIXLN_ANY --
 * presumably the input and output fixed-point formats; confirm. */
232 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
233 #else /* !FIXED_POINT */
235 /** MFCC computation type (floating-point build: float32). */
236 typedef float32 mfcc_t;
237 /** Convert a floating-point value to mfcc_t (identity in this build). */
238 #define FLOAT2MFCC(x) (x)
239 /** Convert an mfcc_t value to floating-point (identity in this build). */
240 #define MFCC2FLOAT(x) (x)
241 /** Multiply two mfcc_t values. */
242 #define MFCCMUL(a,b) ((a)*(b))
/** Logarithm of x via log(); the in and out arguments are accepted so the
 * call sites match the fixed-point variant, and are ignored here. */
243 #define MFCCLN(x,in,out) log(x)
244 #endif /* !FIXED_POINT */
247 * Structure for the front-end computation.
249 typedef struct fe_s fe_t;
252 * Error codes returned by the front-end functions declared below.
256 FE_OUTPUT_FILE_SUCCESS = 0,
257 FE_CONTROL_FILE_ERROR = -1,
259 FE_UNKNOWN_SINGLE_OR_BATCH = -3,
260 FE_INPUT_FILE_OPEN_ERROR = -4,
261 FE_INPUT_FILE_READ_ERROR = -5,
262 FE_MEM_ALLOC_ERROR = -6,
263 FE_OUTPUT_FILE_WRITE_ERROR = -7,
264 FE_OUTPUT_FILE_OPEN_ERROR = -8,
265 FE_ZERO_ENERGY_ERROR = -9,
266 FE_INVALID_PARAM_ERROR = -10
270 * Initialize a front-end object from global command-line.
272 * This is equivalent to calling fe_init_auto_r(cmd_ln_get()).
274 * @return Newly created front-end object.
277 fe_t* fe_init_auto(void);
280 * Get the default set of arguments for fe_init_auto_r().
282 * @return Pointer to an argument structure which can be passed to
283 * cmd_ln_init() and friends to create argument structures for
287 arg_t const *fe_get_args(void);
290 * Initialize a front-end object from a command-line parse.
292 * @param config Command-line object, as returned by cmd_ln_parse_r()
293 * or cmd_ln_parse_file(). Ownership of this object is
294 * claimed by the fe_t, so you must not attempt to free
295 * it manually. Use cmd_ln_retain() if you wish to
297 * @return Newly created front-end object.
300 fe_t *fe_init_auto_r(cmd_ln_t *config);
303 * Retrieve the command-line object used to initialize this front-end.
305 * @return command-line object for this front-end. This pointer is
306 * retained by the fe_t, so you should not attempt to free it
310 cmd_ln_t *fe_get_config(fe_t *fe);
313 * Start processing an utterance.
314 * @return 0 for success, <0 for error (see enum fe_error_e)
317 int fe_start_utt(fe_t *fe);
320 * Get the dimensionality of the output of this front-end object.
322 * This is guaranteed to be the number of values in one frame of
323 * output from fe_end_utt(), fe_process_frame(), and
324 * fe_process_frames(). It is usually the number of MFCC
325 * coefficients, but it might be the number of log-spectrum bins, if
326 * the <tt>-logspec</tt> or <tt>-smoothspec</tt> options to
327 * fe_init_auto() were true.
329 * @return Dimensionality of front-end output.
332 int fe_get_output_size(fe_t *fe);
335 * Get the dimensionality of the input to this front-end object.
337 * This function retrieves the number of input samples consumed by one
338 * frame of processing. To obtain one frame of output, you must have
339 * at least <code>*out_frame_size</code> samples. To obtain <i>N</i>
340 * frames of output, you must have at least <code>(N-1) *
341 * *out_frame_shift + *out_frame_size</code> input samples.
343 * @param out_frame_shift Output: Number of samples between each frame start.
344 * @param out_frame_size Output: Number of samples in each frame.
347 void fe_get_input_size(fe_t *fe, int *out_frame_shift,
348 int *out_frame_size);
351 * Finish processing an utterance.
353 * This function also collects any remaining samples and calculates a
354 * final cepstral vector. If there are overflow samples remaining, it
355 * will pad with zeros to make a complete frame.
357 * @param fe Front-end object.
358 * @param out_cepvector Buffer to hold a residual cepstral vector, or NULL
359 * if you wish to ignore it. Must be large enough
360 * @param out_nframes Number of frames of residual cepstra created
362 * @return 0 for success, <0 for error (see enum fe_error_e)
365 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
368 * Retain ownership of a front end object.
370 * @return pointer to the retained front end.
373 fe_t *fe_retain(fe_t *fe);
376 * Free the front end.
378 * Releases resources associated with the front-end object.
380 * @return new reference count (0 if freed completely)
383 int fe_free(fe_t *fe);
386 * Process one frame of samples.
388 * @param spch Speech samples (signed 16-bit linear PCM)
389 * @param nsamps Number of samples in <code>spch</code>
390 * @param out_cep Buffer which will receive one frame of features.
391 * @return 0 for success, <0 for error (see enum fe_error_e)
394 int fe_process_frame(fe_t *fe, int16 const *spch,
395 int32 nsamps, mfcc_t *out_cep);
398 * Process a block of samples.
400 * This function generates up to <code>*inout_nframes</code> frames of
401 * features, or as many as can be generated from
402 * <code>*inout_nsamps</code> samples.
404 * On exit, the <code>inout_spch</code>, <code>inout_nsamps</code>,
405 * and <code>inout_nframes</code> parameters are updated to point to
406 * the remaining sample data, the number of remaining samples, and the
407 * number of frames processed, respectively. This allows you to call
408 * this repeatedly to process a large block of audio in small (say,
416 * cepstra = (mfcc_t **)
417 * ckd_calloc_2d(nframes, fe_get_output_size(fe), sizeof(**cepstra));
421 * fe_process_frames(fe, &p, &nsamps, cepstra, &nframes);
422 * // Now do something with these frames...
424 * do_some_stuff(cepstra, nframes);
427 * @param inout_spch Input: Pointer to pointer to speech samples
428 * (signed 16-bit linear PCM).
429 * Output: Pointer to remaining samples.
430 * @param inout_nsamps Input: Pointer to maximum number of samples to
432 * Output: Number of samples remaining in input buffer.
433 * @param buf_cep Two-dimensional buffer (allocated with
434 * ckd_calloc_2d()) which will receive frames of output
435 * data. If NULL, no actual processing will be done,
436 * and the maximum number of output frames which would
437 * be generated is returned in
438 * <code>*inout_nframes</code>.
439 * @param inout_nframes Input: Pointer to maximum number of frames to
441 * Output: Number of frames actually generated.
442 * @return 0 for success, <0 for failure (see enum fe_error_e)
445 int fe_process_frames(fe_t *fe,
446 int16 const **inout_spch,
447 size_t *inout_nsamps,
449 int32 *inout_nframes);
452 * Process a block of samples, returning as many frames as possible.
454 * This function processes all the samples in a block of data and
455 * returns a newly allocated block of feature vectors. This block
456 * needs to be freed with fe_free_2d() after use.
458 * It is possible for there to be some left-over data which could not
459 * fit in a complete frame. This data can be processed with
462 * This function is deprecated in favor of fe_process_frames().
464 * @return 0 for success, <0 for failure (see enum fe_error_e)
467 int fe_process_utt(fe_t *fe, /**< A front end object */
468 int16 const *spch, /**< The speech samples */
469 size_t nsamps, /**< number of samples*/
470 mfcc_t ***cep_block, /**< Output pointer to cepstra */
471 int32 *nframes /**< Number of frames processed */
475 * Free the output pointer returned by fe_process_utt().
478 void fe_free_2d(void *arr);
481 * Convert a block of mfcc_t to float32 (can be done in-place)
484 int fe_mfcc_to_float(fe_t *fe,
490 * Convert a block of float32 to mfcc_t (can be done in-place)
493 int fe_float_to_mfcc(fe_t *fe,
499 * Process one frame of log spectra into MFCC using discrete cosine
502 * This uses a variant of the DCT-II where the first frequency bin is
503 * scaled by 0.5. Unless somebody misunderstood the DCT-III equations
504 * and thought that's what they were implementing here, this is
505 * ostensibly done to account for the symmetry properties of the
506 * DCT-II versus the DFT - the first coefficient of the input is
507 * assumed to be repeated in the negative frequencies, which is not
508 * the case for the DFT. (This raises the question, why not just use
509 * the DCT-I, since it has the appropriate symmetry properties...)
510 * Moreover, this is bogus since the mel-frequency bins on which we
511 * are doing the DCT don't extend to the edge of the DFT anyway.
513 * This also means that the matrix used in computing this DCT can not
514 * be made orthogonal, and thus inverting the transform is difficult.
515 * Therefore if you want to do cepstral smoothing or have some other
516 * reason to invert your MFCCs, use fe_logspec_dct2() and its inverse
517 * fe_logspec_dct3() instead.
519 * Also, it normalizes by 1/nfilt rather than 2/nfilt, for some reason.
522 int fe_logspec_to_mfcc(fe_t *fe, /**< A fe structure */
523 const mfcc_t *fr_spec, /**< One frame of spectrum */
524 mfcc_t *fr_cep /**< One frame of cepstrum */
528 * Convert log spectra to MFCC using DCT-II.
530 * This uses the "unitary" form of the DCT-II, i.e. with a scaling
531 * factor of sqrt(2/N) and a "beta" factor of sqrt(1/2) applied to the
532 * cos(0) basis vector (i.e. the one corresponding to the DC
533 * coefficient in the output).
536 int fe_logspec_dct2(fe_t *fe, /**< A fe structure */
537 const mfcc_t *fr_spec, /**< One frame of spectrum */
538 mfcc_t *fr_cep /**< One frame of cepstrum */
542 * Convert MFCC to log spectra using DCT-III.
544 * This uses the "unitary" form of the DCT-III, i.e. with a scaling
545 * factor of sqrt(2/N) and a "beta" factor of sqrt(1/2) applied to the
546 * cos(0) basis vector (i.e. the one corresponding to the DC
547 * coefficient in the input).
550 int fe_mfcc_dct3(fe_t *fe, /**< A fe structure */
551 const mfcc_t *fr_cep, /**< One frame of cepstrum */
552 mfcc_t *fr_spec /**< One frame of spectrum */