1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
52 #include "sphinxbase/prim_type.h"
53 #include "sphinxbase/byteorder.h"
54 #include "sphinxbase/fixpoint.h"
55 #include "sphinxbase/genrand.h"
56 #include "sphinxbase/err.h"
57 #include "sphinxbase/cmd_ln.h"
58 #include "sphinxbase/ckd_alloc.h"
60 #include "fe_internal.h"
/* Front-end argument table: the definitions supplied by
 * waveform_to_cepstral_command_line_macro(), followed by an all-NULL/zero
 * entry (presumably the end-of-table marker). */
63 static const arg_t fe_args[] = {
64 waveform_to_cepstral_command_line_macro(),
65 { NULL, 0, NULL, NULL }
/*
 * Copy the general front-end settings from `config` into `fe`: sampling
 * rate, frame rate, dither seed, input byte-swap flag, window length,
 * pre-emphasis alpha, number of cepstra, FFT size and order, DC removal,
 * transform type and log-spectrum mode.  Frame rate and FFT size are
 * narrowed to int16 and the cepstrum count to uint8 by explicit casts.
 *
 * Reports through E_ERROR when the FFT size is not a power of 2, when it is
 * smaller than window_length * sampling_rate samples, or when -transform is
 * not recognised.
 * NOTE(review): the return statements are not visible in this view; the
 * caller fe_init_auto_r tests the result with "< 0" -- confirm.
 */
69 fe_parse_general_params(cmd_ln_t *config, fe_t * fe)
74 fe->sampling_rate = cmd_ln_float32_r(config, "-samprate");
75 fe->frame_rate = (int16)cmd_ln_int32_r(config, "-frate");
/* The seed is read inside the -dither branch. */
76 if (cmd_ln_boolean_r(config, "-dither")) {
78 fe->seed = cmd_ln_int32_r(config, "-seed");
/* swap is 0 when -input_endian names the byte order being compared against
 * ("big" under WORDS_BIGENDIAN) and 1 otherwise.  NOTE(review): the second
 * assignment is presumably the #else branch -- the directive itself is not
 * visible in this view. */
80 #ifdef WORDS_BIGENDIAN
81 fe->swap = strcmp("big", cmd_ln_str_r(config, "-input_endian")) == 0 ? 0 : 1;
83 fe->swap = strcmp("little", cmd_ln_str_r(config, "-input_endian")) == 0 ? 0 : 1;
85 fe->window_length = cmd_ln_float32_r(config, "-wlen");
86 fe->pre_emphasis_alpha = cmd_ln_float32_r(config, "-alpha");
88 fe->num_cepstra = (uint8)cmd_ln_int32_r(config, "-ncep");
89 fe->fft_size = (int16)cmd_ln_int32_r(config, "-nfft");
91 /* Check FFT size, compute FFT order (log_2(n)) */
/* j is halved on every pass while fft_order counts the passes; an odd
 * intermediate j means fft_size is not a power of 2.  The loop body only
 * runs while j > 1. */
92 for (j = fe->fft_size, fe->fft_order = 0; j > 1; j >>= 1, fe->fft_order++) {
93 if (((j % 2) != 0) || (fe->fft_size <= 0)) {
94 E_ERROR("fft: number of points must be a power of 2 (is %d)\n",
99 /* Verify that FFT size is greater or equal to window length. */
/* The window length is converted to samples by truncating
 * window_length * sampling_rate to int. */
100 if (fe->fft_size < (int)(fe->window_length * fe->sampling_rate)) {
101 E_ERROR("FFT: Number of points must be greater or equal to frame size (%d samples)\n",
102 (int)(fe->window_length * fe->sampling_rate));
106 fe->remove_dc = cmd_ln_boolean_r(config, "-remove_dc");
/* Map the -transform string onto the matching transform constant. */
108 if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "dct"))
109 fe->transform = DCT_II;
110 else if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "legacy"))
111 fe->transform = LEGACY_DCT;
112 else if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "htk"))
113 fe->transform = DCT_HTK;
115 E_ERROR("Invalid transform type (values are 'dct', 'legacy', 'htk')\n");
/* -smoothspec is tested after -logspec, so it wins when both are set. */
119 if (cmd_ln_boolean_r(config, "-logspec"))
120 fe->log_spec = RAW_LOG_SPEC;
121 if (cmd_ln_boolean_r(config, "-smoothspec"))
122 fe->log_spec = SMOOTH_LOG_SPEC;
/*
 * Fill the mel filterbank description `mel` from `fe` and `config`.
 *
 * Copies sampling rate, FFT size and number of cepstra from `fe`; reads
 * -nfilt, -upperf, -lowerf, -doublebw, -warp_type, -warp_params, -lifter,
 * -unit_area and -round_filters from `config`; sets fe->feature_dimension;
 * then selects the warping function with fe_warp_set() and passes it its
 * parameters with fe_warp_set_parameters().
 */
128 fe_parse_melfb_params(cmd_ln_t *config, fe_t *fe, melfb_t * mel)
130 mel->sampling_rate = fe->sampling_rate;
131 mel->fft_size = fe->fft_size;
132 mel->num_cepstra = fe->num_cepstra;
133 mel->num_filters = cmd_ln_int32_r(config, "-nfilt");
/* feature_dimension becomes either the filter count or the cepstrum count.
 * NOTE(review): the condition choosing between these two assignments is not
 * visible in this view. */
136 fe->feature_dimension = mel->num_filters;
138 fe->feature_dimension = fe->num_cepstra;
140 mel->upper_filt_freq = cmd_ln_float32_r(config, "-upperf");
141 mel->lower_filt_freq = cmd_ln_float32_r(config, "-lowerf");
143 mel->doublewide = cmd_ln_boolean_r(config, "-doublebw");
/* The warp strings are stored as returned by cmd_ln_str_r (no copy is made
 * here). */
145 mel->warp_type = cmd_ln_str_r(config, "-warp_type");
146 mel->warp_params = cmd_ln_str_r(config, "-warp_params");
147 mel->lifter_val = cmd_ln_int32_r(config, "-lifter");
149 mel->unit_area = cmd_ln_boolean_r(config, "-unit_area");
150 mel->round_filters = cmd_ln_boolean_r(config, "-round_filters");
/* A failure to select the warping function is reported with E_ERROR. */
152 if (fe_warp_set(mel, mel->warp_type) != FE_SUCCESS) {
153 E_ERROR("Failed to initialize the warping function.\n");
156 fe_warp_set_parameters(mel, mel->warp_params, mel->sampling_rate);
/*
 * Log the current parameters of `fe`, and of its mel filterbank
 * fe->mel_fb, through E_INFO.  Nothing in `fe` is modified (it is taken as
 * a pointer to const).
 */
161 fe_print_current(fe_t const *fe)
163 E_INFO("Current FE Parameters:\n");
164 E_INFO("\tSampling Rate: %f\n", fe->sampling_rate);
165 E_INFO("\tFrame Size: %d\n", fe->frame_size);
166 E_INFO("\tFrame Shift: %d\n", fe->frame_shift);
167 E_INFO("\tFFT Size: %d\n", fe->fft_size);
168 E_INFO("\tLower Frequency: %g\n",
169 fe->mel_fb->lower_filt_freq);
170 E_INFO("\tUpper Frequency: %g\n",
171 fe->mel_fb->upper_filt_freq);
172 E_INFO("\tNumber of filters: %d\n", fe->mel_fb->num_filters);
173 E_INFO("\tNumber of Overflow Samps: %d\n", fe->num_overflow_samps);
174 E_INFO("\tStart Utt Status: %d\n", fe->start_flag);
/* The "%s" slot receives "" or "not " so one format string covers both
 * the enabled and the disabled case. */
175 E_INFO("Will %sremove DC offset at frame level\n",
176 fe->remove_dc ? "" : "not ");
/* Dither status.  NOTE(review): the condition choosing between the
 * "will" and "will not" dither messages is not visible in this view. */
178 E_INFO("Will add dither to audio\n");
179 E_INFO("Dither seeded with %d\n", fe->seed);
182 E_INFO("Will not add dither to audio\n");
/* Liftering is reported only when lifter_val is non-zero. */
184 if (fe->mel_fb->lifter_val) {
185 E_INFO("Will apply sine-curve liftering, period %d\n",
186 fe->mel_fb->lifter_val);
188 E_INFO("Will %snormalize filters to unit area\n",
189 fe->mel_fb->unit_area ? "" : "not ");
190 E_INFO("Will %sround filter frequencies to DFT points\n",
191 fe->mel_fb->round_filters ? "" : "not ");
192 E_INFO("Will %suse double bandwidth in mel filter\n",
193 fe->mel_fb->doublewide ? "" : "not ");
/* Build a front end from the configuration returned by cmd_ln_get(), passed
 * through cmd_ln_retain(), by delegating to fe_init_auto_r().
 * NOTE(review): the enclosing function header is not visible in this view. */
199 return fe_init_auto_r(cmd_ln_retain(cmd_ln_get()));
/*
 * Allocate and initialise a front end from `config`.
 *
 * Visible steps, in order: zero-allocate the fe_t; parse the general
 * parameters; derive frame_shift and frame_size in samples (rounded to the
 * nearest integer); compare frame_size against fft_size; call
 * fe_init_dither(); allocate the overflow buffer and Hamming window; build
 * the mel filterbank and its cosine table; allocate the per-frame work
 * buffers and the FFT twiddle tables; and print the parameters when
 * -verbose is set.
 * NOTE(review): the failure handling, the condition around fe_init_dither()
 * and the return statements are not visible in this view.
 */
203 fe_init_auto_r(cmd_ln_t *config)
207 fe = ckd_calloc(1, sizeof(*fe));
210 /* transfer params to front end */
211 if (fe_parse_general_params(config, fe) < 0) {
216 /* compute remaining fe parameters */
217 /* We add 0.5 to approximate the float with the closest
218 * integer. E.g., 2.3 is truncated to 2, whereas 3.7 becomes 4
220 fe->frame_shift = (int32) (fe->sampling_rate / fe->frame_rate + 0.5);
221 fe->frame_size = (int32) (fe->window_length * fe->sampling_rate + 0.5);
223 fe->frame_counter = 0;
225 if (fe->frame_size > (fe->fft_size)) {
227 ("Number of FFT points has to be a power of 2 higher than %d\n",
234 fe_init_dither(fe->seed);
236 /* establish buffers for overflow samps and hamming window */
237 fe->overflow_samps = ckd_calloc(fe->frame_size, sizeof(int16));
/* NOTE(review): only frame_size/2 window entries are allocated although
 * fe_create_hamming() is given frame_size -- presumably only half of a
 * symmetric window is stored; confirm against fe_create_hamming(). */
238 fe->hamming_window = ckd_calloc(fe->frame_size/2, sizeof(window_t));
240 /* create hamming window */
241 fe_create_hamming(fe->hamming_window, fe->frame_size);
243 /* init and fill appropriate filter structure */
244 fe->mel_fb = ckd_calloc(1, sizeof(*fe->mel_fb));
246 /* transfer params to mel fb */
247 fe_parse_melfb_params(config, fe, fe->mel_fb);
248 fe_build_melfilters(fe->mel_fb);
249 fe_compute_melcosine(fe->mel_fb);
251 /* Create temporary FFT, spectrum and mel-spectrum buffers. */
252 /* FIXME: Gosh there are a lot of these. */
/* spch holds frame_size entries, frame and spec hold fft_size entries, and
 * mfspec holds one entry per mel filter. */
253 fe->spch = ckd_calloc(fe->frame_size, sizeof(*fe->spch));
254 fe->frame = ckd_calloc(fe->fft_size, sizeof(*fe->frame));
255 fe->spec = ckd_calloc(fe->fft_size, sizeof(*fe->spec));
256 fe->mfspec = ckd_calloc(fe->mel_fb->num_filters, sizeof(*fe->mfspec));
258 /* create twiddle factors */
/* Each twiddle table has fft_size / 4 entries. */
259 fe->ccc = ckd_calloc(fe->fft_size / 4, sizeof(*fe->ccc));
260 fe->sss = ckd_calloc(fe->fft_size / 4, sizeof(*fe->sss));
261 fe_create_twiddle(fe);
263 if (cmd_ln_boolean_r(config, "-verbose")) {
264 fe_print_current(fe);
268 /*** Initialize the overflow buffers ***/
/* NOTE(review): only the signature is visible in this view; presumably this
 * returns the configuration held by `fe` -- confirm against the body. */
280 fe_get_config(fe_t *fe)
/*
 * Seed the s3_rand generator for dithering and log which seed source is
 * used.
 *
 * Two paths are visible: one logs that an internally generated seed is used
 * and seeds from GetTickCount() or from time(0); the other logs the
 * caller-supplied `seed`.
 * NOTE(review): the condition on `seed`, the preprocessor test choosing
 * between GetTickCount() and time(0), and any s3_rand_seed(seed) call are
 * not visible in this view.
 */
286 fe_init_dither(int32 seed)
289 E_INFO("You are using the internal mechanism to generate the seed.\n");
291 s3_rand_seed(GetTickCount());
293 s3_rand_seed((long) time(0));
297 E_INFO("You are using %d as the seed.\n", seed);
/* Begin a new utterance: forget any buffered overflow samples and zero the
 * overflow buffer (frame_size int16 samples).
 * NOTE(review): the return statement is not visible in this view. */
303 fe_start_utt(fe_t * fe)
305 fe->num_overflow_samps = 0;
306 memset(fe->overflow_samps, 0, fe->frame_size * sizeof(int16));
/* Return fe->feature_dimension, converted to int. */
313 fe_get_output_size(fe_t *fe)
315 return (int)fe->feature_dimension;
/* Report the frame shift and frame size of `fe` through the
 * out_frame_shift and out_frame_size out-parameters.
 * NOTE(review): the second parameter line and any guards around the two
 * stores are not visible in this view. */
319 fe_get_input_size(fe_t *fe, int *out_frame_shift,
323 *out_frame_shift = fe->frame_shift;
325 *out_frame_size = fe->frame_size;
/* Process a single frame: load `nsamps` samples from `spch` with
 * fe_read_frame(), then return the result of fe_write_frame() writing
 * into `fr_cep`. */
329 fe_process_frame(fe_t * fe, int16 const *spch, int32 nsamps, mfcc_t * fr_cep)
331 fe_read_frame(fe, spch, nsamps);
332 return fe_write_frame(fe, fr_cep);
/*
 * Convert as many whole frames of audio as possible into feature frames.
 *
 * inout_spch    in: start of the input samples; out: advanced past the
 *               samples consumed, including those copied into the overflow
 *               buffer.
 * inout_nsamps  in: number of samples available; out: reduced by the number
 *               consumed.
 * buf_cep       output frames, indexed as buf_cep[frame]; when NULL nothing
 *               is processed and *inout_nframes is returned after the
 *               frame-count computation.
 * inout_nframes in: number of output frames available; out: set from outidx
 *               at the end.
 *
 * Samples that do not complete a frame are kept in fe->overflow_samps, with
 * their count in fe->num_overflow_samps, for a later call.
 * NOTE(review): the buf_cep parameter line, the declaration of frame_count,
 * the updates of outidx, several return statements and a number of closing
 * braces / else lines are not visible in this view.
 */
336 fe_process_frames(fe_t *fe,
337 int16 const **inout_spch,
338 size_t *inout_nsamps,
340 int32 *inout_nframes)
343 int outidx, i, n, n_overflow, orig_n_overflow;
344 int16 const *orig_spch;
346 /* In the special case where there is no output buffer, return the
347 * maximum number of frames which would be generated. */
348 if (buf_cep == NULL) {
349 if (*inout_nsamps + fe->num_overflow_samps < (size_t)fe->frame_size)
353 + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
355 return *inout_nframes;
358 /* Are there not enough samples to make at least 1 frame? */
359 if (*inout_nsamps + fe->num_overflow_samps < (size_t)fe->frame_size) {
360 if (*inout_nsamps > 0) {
361 /* Append them to the overflow buffer. */
362 memcpy(fe->overflow_samps + fe->num_overflow_samps,
363 *inout_spch, *inout_nsamps * (sizeof(int16)));
364 fe->num_overflow_samps += *inout_nsamps;
365 /* Update input-output pointers and counters. */
366 *inout_spch += *inout_nsamps;
369 /* We produced no frames of output, sorry! */
374 /* Can't write a frame? Then do nothing! */
375 if (*inout_nframes < 1) {
380 /* Keep track of the original start of the buffer. */
381 orig_spch = *inout_spch;
382 orig_n_overflow = fe->num_overflow_samps;
383 /* How many frames will we be able to get? */
385 + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
387 /* Limit it to the number of output frames available. */
388 if (frame_count > *inout_nframes)
389 frame_count = *inout_nframes;
390 /* Index of output frame. */
393 /* Start processing, taking care of any incoming overflow. */
394 if (fe->num_overflow_samps) {
/* offset = number of fresh input samples needed to top the overflow
 * buffer up to one full frame. */
395 int offset = fe->frame_size - fe->num_overflow_samps;
397 /* Append start of spch to overflow samples to make a full frame. */
398 memcpy(fe->overflow_samps + fe->num_overflow_samps,
399 *inout_spch, offset * sizeof(**inout_spch));
400 fe_read_frame(fe, fe->overflow_samps, fe->frame_size);
401 assert(outidx < frame_count);
402 if ((n = fe_write_frame(fe, buf_cep[outidx])) < 0)
405 /* Update input-output pointers and counters. */
406 *inout_spch += offset;
407 *inout_nsamps -= offset;
408 fe->num_overflow_samps -= fe->frame_shift;
/* First frame read straight from the input.  NOTE(review): presumably this
 * is the else branch of the overflow test above -- the else line itself is
 * not visible in this view. */
411 fe_read_frame(fe, *inout_spch, fe->frame_size);
412 assert(outidx < frame_count);
413 if ((n = fe_write_frame(fe, buf_cep[outidx])) < 0)
416 /* Update input-output pointers and counters. */
417 *inout_spch += fe->frame_size;
418 *inout_nsamps -= fe->frame_size;
421 /* Process all remaining frames. */
/* The loop starts at 1 because the first frame was handled above; each
 * pass consumes frame_shift new samples through fe_shift_frame(). */
422 for (i = 1; i < frame_count; ++i) {
423 assert(*inout_nsamps >= (size_t)fe->frame_shift);
425 fe_shift_frame(fe, *inout_spch, fe->frame_shift);
426 assert(outidx < frame_count);
427 if ((n = fe_write_frame(fe, buf_cep[outidx])) < 0)
430 /* Update input-output pointers and counters. */
431 *inout_spch += fe->frame_shift;
432 *inout_nsamps -= fe->frame_shift;
433 /* Amount of data behind the original input which is still needed. */
434 if (fe->num_overflow_samps > 0)
435 fe->num_overflow_samps -= fe->frame_shift;
438 /* How many relevant overflow samples are there left? */
439 if (fe->num_overflow_samps <= 0) {
440 /* Maximum number of overflow samples past *inout_spch to save. */
441 n_overflow = *inout_nsamps;
442 if (n_overflow > fe->frame_shift)
443 n_overflow = fe->frame_shift;
444 fe->num_overflow_samps = fe->frame_size - fe->frame_shift;
445 /* Make sure this isn't an illegal read! */
/* *inout_spch - orig_spch is how far the input pointer has advanced during
 * this call. */
446 if (fe->num_overflow_samps > *inout_spch - orig_spch)
447 fe->num_overflow_samps = *inout_spch - orig_spch;
448 fe->num_overflow_samps += n_overflow;
449 if (fe->num_overflow_samps > 0) {
/* NOTE(review): the copy source stays frame_size - frame_shift samples
 * behind *inout_spch even if the count was clamped just above -- confirm
 * the clamp cannot take effect on this path. */
450 memcpy(fe->overflow_samps,
451 *inout_spch - (fe->frame_size - fe->frame_shift),
452 fe->num_overflow_samps * sizeof(**inout_spch));
453 /* Update the input pointer to cover this stuff. */
454 *inout_spch += n_overflow;
455 *inout_nsamps -= n_overflow;
459 /* There is still some relevant data left in the overflow buffer. */
460 /* Shift existing data to the beginning. */
/* Source and destination both lie inside fe->overflow_samps, hence memmove
 * rather than memcpy. */
461 memmove(fe->overflow_samps,
462 fe->overflow_samps + orig_n_overflow - fe->num_overflow_samps,
463 fe->num_overflow_samps * sizeof(*fe->overflow_samps));
464 /* Copy in whatever we had in the original speech buffer. */
/* n_overflow starts as the total input seen in this call (consumed plus
 * remaining) and is capped to the free space left in a frame_size buffer. */
465 n_overflow = *inout_spch - orig_spch + *inout_nsamps;
466 if (n_overflow > fe->frame_size - fe->num_overflow_samps)
467 n_overflow = fe->frame_size - fe->num_overflow_samps;
468 memcpy(fe->overflow_samps + fe->num_overflow_samps,
469 orig_spch, n_overflow * sizeof(*orig_spch));
470 fe->num_overflow_samps += n_overflow;
471 /* Advance the input pointers. */
/* Only the part of the copy that lies beyond the already-consumed input
 * advances the pointer and reduces the remaining count. */
472 if (n_overflow > *inout_spch - orig_spch) {
473 n_overflow -= (*inout_spch - orig_spch);
474 *inout_spch += n_overflow;
475 *inout_nsamps -= n_overflow;
479 /* Finally update the frame counter with the number of frames we processed. */
480 *inout_nframes = outidx; /* FIXME: Not sure why I wrote it this way... */
/*
 * Process a whole block of samples, allocating the output array.
 *
 * First calls fe_process_frames() with NULL sample and output buffers to
 * obtain the frame count in *nframes, then allocates a 2-D array with either
 * *nframes rows or a single row, each of fe->feature_dimension values, and
 * finally calls fe_process_frames() again to fill it.
 * NOTE(review): the condition choosing between the two allocations, the
 * store into *cep_block and the return statement are not visible in this
 * view.
 */
485 fe_process_utt(fe_t * fe, int16 const * spch, size_t nsamps,
486 mfcc_t *** cep_block, int32 * nframes)
491 /* Figure out how many frames we will need. */
492 fe_process_frames(fe, NULL, &nsamps, NULL, nframes);
493 /* Create the output buffer (it has to exist, even if there are no output frames). */
495 cep = (mfcc_t **)ckd_calloc_2d(*nframes, fe->feature_dimension, sizeof(**cep));
497 cep = (mfcc_t **)ckd_calloc_2d(1, fe->feature_dimension, sizeof(**cep));
498 /* Now just call fe_process_frames() with the allocated buffer. */
499 rv = fe_process_frames(fe, &spch, &nsamps, cep, nframes);
/*
 * Finish an utterance: if overflow samples remain, read them as one last
 * frame, write it into `cepvector` and store fe_write_frame()'s result in
 * *nframes; then clear the overflow count.
 * NOTE(review): the branch taken when no samples remain and the return
 * statement are not visible in this view.
 */
507 fe_end_utt(fe_t * fe, mfcc_t * cepvector, int32 * nframes)
509 /* Process any remaining data. */
510 if (fe->num_overflow_samps > 0) {
/* The final frame is read with only num_overflow_samps samples, not a full
 * frame_size. */
511 fe_read_frame(fe, fe->overflow_samps, fe->num_overflow_samps);
512 *nframes = fe_write_frame(fe, cepvector);
518 /* reset overflow buffers... */
519 fe->num_overflow_samps = 0;
/* Drop one reference to `fe` and test whether the count is still positive
 * after the decrement; the teardown below releases the mel filterbank, its
 * tables, several fe buffers and the configuration.
 * NOTE(review): the function header, the statement controlled by the
 * refcount test and several other release calls are not visible in this
 * view. */
537 if (--fe->refcount > 0)
540 /* kill FE instance - free everything... */
/* mel_cosine goes through fe_free_2d(), and only when it is non-NULL; the
 * other filterbank tables go through ckd_free(). */
542 if (fe->mel_fb->mel_cosine)
543 fe_free_2d((void *) fe->mel_fb->mel_cosine);
544 ckd_free(fe->mel_fb->lifter);
545 ckd_free(fe->mel_fb->spec_start);
546 ckd_free(fe->mel_fb->filt_start);
547 ckd_free(fe->mel_fb->filt_width);
548 ckd_free(fe->mel_fb->filt_coeffs);
549 ckd_free(fe->mel_fb);
556 ckd_free(fe->mfspec);
557 ckd_free(fe->overflow_samps);
558 ckd_free(fe->hamming_window);
/* The configuration is released with cmd_ln_free_r(), not ckd_free(). */
559 cmd_ln_free_r(fe->config);
566 * Convert a block of mfcc_t to float32 (can be done in-place)
/*
 * Convert nframes * fe->feature_dimension values from mfcc_t to float32
 * with MFCC2FLOAT.  Only input[0] and output[0] are indexed, so each is
 * treated as one contiguous run of values.  When `input` and `output` are
 * the same pointer nothing is converted and the value count is returned
 * immediately.
 * NOTE(review): the final return statement is not visible in this view.
 */
569 fe_mfcc_to_float(fe_t * fe,
570 mfcc_t ** input, float32 ** output, int32 nframes)
575 if ((void *) input == (void *) output)
576 return nframes * fe->feature_dimension;
578 for (i = 0; i < nframes * fe->feature_dimension; ++i)
579 output[0][i] = MFCC2FLOAT(input[0][i]);
585 * Convert a block of float32 to mfcc_t (can be done in-place)
/*
 * Convert nframes * fe->feature_dimension values from float32 to mfcc_t
 * with FLOAT2MFCC.  Only input[0] and output[0] are indexed, so each is
 * treated as one contiguous run of values.  When `input` and `output` are
 * the same pointer nothing is converted and the value count is returned
 * immediately.
 * NOTE(review): the final return statement is not visible in this view.
 */
588 fe_float_to_mfcc(fe_t * fe,
589 float32 ** input, mfcc_t ** output, int32 nframes)
594 if ((void *) input == (void *) output)
595 return nframes * fe->feature_dimension;
597 for (i = 0; i < nframes * fe->feature_dimension; ++i)
598 output[0][i] = FLOAT2MFCC(input[0][i]);
/*
 * Run fe_spec2cep() on one frame `fr_spec`, writing the result to `fr_cep`.
 *
 * In the FIXED_POINT build fr_spec is passed to fe_spec2cep() directly;
 * otherwise it is first copied, element by element with a cast, into a
 * temporary powspec_t array of fe->mel_fb->num_filters entries.
 * NOTE(review): the opening preprocessor test, the release of the temporary
 * array and the return statement are not visible in this view.
 */
604 fe_logspec_to_mfcc(fe_t * fe, const mfcc_t * fr_spec, mfcc_t * fr_cep)
607 fe_spec2cep(fe, fr_spec, fr_cep);
608 #else /* ! FIXED_POINT */
612 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
613 for (i = 0; i < fe->mel_fb->num_filters; ++i)
614 powspec[i] = (powspec_t) fr_spec[i];
615 fe_spec2cep(fe, powspec, fr_cep);
617 #endif /* ! FIXED_POINT */
/*
 * Run fe_dct2() on one frame `fr_spec`, writing the result to `fr_cep`; the
 * last argument to fe_dct2() is always 0.
 *
 * In the FIXED_POINT build fr_spec is passed to fe_dct2() directly;
 * otherwise it is first copied, element by element with a cast, into a
 * temporary powspec_t array of fe->mel_fb->num_filters entries.
 * NOTE(review): the opening preprocessor test, the release of the temporary
 * array and the return statement are not visible in this view.
 */
622 fe_logspec_dct2(fe_t * fe, const mfcc_t * fr_spec, mfcc_t * fr_cep)
625 fe_dct2(fe, fr_spec, fr_cep, 0);
626 #else /* ! FIXED_POINT */
630 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
631 for (i = 0; i < fe->mel_fb->num_filters; ++i)
632 powspec[i] = (powspec_t) fr_spec[i];
633 fe_dct2(fe, powspec, fr_cep, 0);
635 #endif /* ! FIXED_POINT */
/*
 * Run fe_dct3() on one frame `fr_cep`, writing the result to `fr_spec`.
 *
 * In the FIXED_POINT build fe_dct3() writes into fr_spec directly;
 * otherwise it writes into a temporary powspec_t array of
 * fe->mel_fb->num_filters entries, which is then copied, element by element
 * with a cast to mfcc_t, into fr_spec.
 * NOTE(review): the opening preprocessor test, the release of the temporary
 * array and the return statement are not visible in this view.
 */
640 fe_mfcc_dct3(fe_t * fe, const mfcc_t * fr_cep, mfcc_t * fr_spec)
643 fe_dct3(fe, fr_cep, fr_spec);
644 #else /* ! FIXED_POINT */
648 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
649 fe_dct3(fe, fr_cep, powspec);
650 for (i = 0; i < fe->mel_fb->num_filters; ++i)
651 fr_spec[i] = (mfcc_t) powspec[i];
653 #endif /* ! FIXED_POINT */