1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
51 #include <sphinxbase/fe.h>
52 #include <sphinxbase/strfuncs.h>
53 #include <sphinxbase/pio.h>
54 #include <sphinxbase/filename.h>
55 #include <sphinxbase/cmd_ln.h>
56 #include <sphinxbase/err.h>
57 #include <sphinxbase/ckd_alloc.h>
58 #include <sphinxbase/byteorder.h>
59 #include <sphinxbase/hash_table.h>
61 #include "sphinx_wave2feat.h"
62 #include "cmd_ln_defn.h"
/* One supported input audio type: a detector that probes a candidate
 * input file and a decoder that processes it once detected. */
64 typedef struct audio_type_s {
/* Probe infile; the detectors in this file document TRUE, FALSE
 * (not this type) or -1 (error) as results. */
66 int (*detect)(sphinx_wave2feat_t *wtf, char const *infile);
/* Decode the detected input; the caller treats a negative result as
 * failure and otherwise passes it on as the count for the header. */
67 int (*decode)(sphinx_wave2feat_t *wtf);
/* One supported output format: an optional header writer plus a frame
 * writer. */
70 typedef struct output_type_s {
/* Write the file header; may be NULL (the text entry in outtypes has
 * none).  Called once with 0 as a placeholder before decoding and
 * again with the final count afterwards; negative means error. */
72 int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
/* Write nfr frames; callers treat a negative result as an error. */
73 int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
/**
 * State of one wave2feat converter: configuration, front end, the
 * input/output files currently being processed and the working buffers.
 */
76 struct sphinx_wave2feat_s {
77 int refcount; /**< Reference count. */
78 cmd_ln_t *config; /**< Configuration parameters (retained via cmd_ln_retain). */
79 fe_t *fe; /**< Front end object. */
80 char *infile; /**< Path to input file (copy made with ckd_salloc). */
81 char *outfile; /**< Path to output file (copy made with ckd_salloc). */
82 FILE *infh; /**< Input file handle. */
83 FILE *outfh; /**< Output file handle. */
84 short *audio; /**< Audio buffer of blocksize samples. */
85 mfcc_t **feat; /**< Feature buffer of featsize vectors. */
86 int blocksize; /**< Size of audio buffer, in samples across all channels. */
87 int featsize; /**< Size of feature buffer, in frames. */
88 int veclen; /**< Length of each output vector. */
89 int in_veclen; /**< Length of each input vector (for cep<->spec). */
90 int byteswap; /**< Whether byteswapping of the input is necessary. */
92 SNDFILE *insfh; /**< Input sndfile handle. */
94 output_type_t const *ot;/**< Output type object (entry of outtypes). */
97 /** RIFF 44-byte header structure for MS wav files.
 *
 * The listed field sizes add up to 44 bytes.
 * NOTE(review): detect_riff reads sizeof(hdr) bytes with a single
 * fread, presumably into this structure; that relies on the compiler
 * adding no padding and on the fixed canonical chunk layout -- confirm.
 */
98 typedef struct RIFFHeader{
99 char rifftag[4]; /* "RIFF" string */
100 int32 TotalLength; /* Total length */
101 char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
102 int32 RemainingLength; /* Remaining length */
103 int16 data_format; /* data format tag, 1 = PCM */
104 int16 numchannels; /* Number of channels in file */
105 int32 SamplingFreq; /* Sampling frequency */
106 int32 BytesPerSec; /* Average bytes/sec */
107 int16 BlockAlign; /* Block align */
108 int16 BitsPerSample; /* 8 or 16 bit */
109 char datatag[4]; /* "data" string */
110 int32 datalength; /* Raw data length */
114 * Detect RIFF file and parse its header if detected.
116 * @return TRUE if it's a RIFF file, FALSE if not, -1 if an error occurred.
/*
 * wtf:    converter whose config receives -nchans and -samprate and
 *         whose stored input path is replaced on success.
 * infile: path of the file to probe.
 */
119 detect_riff(sphinx_wave2feat_t *wtf, char const *infile)
124 if ((fh = fopen(infile, "rb")) == NULL) {
125 E_ERROR_SYSTEM("Failed to open %s", infile);
/* The whole header is read in one piece. */
128 if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
129 E_ERROR_SYSTEM("Failed to read RIFF header");
133 /* Make sure it is actually a RIFF file. */
134 if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
139 /* Get relevant information. */
/* NOTE(review): the header fields are used exactly as read; no byte
 * swapping is visible here, so this presumably assumes a little-endian
 * host -- confirm. */
140 cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
141 cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
/* Replace any previously stored input path with a copy of this one. */
143 ckd_free(wtf->infile);
144 wtf->infile = ckd_salloc(infile);
/*
 * Open infile, check for the NIST_1A magic at its start and copy the
 * sample_rate, channel_count and sample_byte_format header fields into
 * wtf->config (-samprate, -nchans, -input_endian).
 *
 * out_fh: detect_nist passes the address of a FILE pointer here, while
 *         detect_sph2pipe passes NULL.
 */
151 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh)
157 if ((fh = fopen(infile, "rb")) == NULL) {
158 E_ERROR_SYSTEM("Failed to open %s", infile);
/* Only the first 7 bytes are needed to recognise the format. */
161 if (fread(&nist, 1, 7, fh) != 7) {
162 E_ERROR_SYSTEM("Failed to read NIST header");
166 /* Is this actually a NIST file? */
167 if (0 != strncmp(nist, "NIST_1A", 7)) {
171 /* Rewind, parse lines. */
172 fseek(fh, 0, SEEK_SET);
173 for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
177 string_trim(li->buf, STRING_BOTH);
178 if (strlen(li->buf) == 0) {
/* First call only counts the words; the second fills an array of
 * exactly that size. */
182 nword = str2words(li->buf, NULL, 0);
185 words = ckd_calloc(nword, sizeof(*words));
186 str2words(li->buf, words, nword);
/* The value is taken from the third word of each line.
 * NOTE(review): confirm that nword >= 3 is checked in the lines not
 * shown before words[2] is read. */
187 if (0 == strcmp(words[0], "sample_rate")) {
188 cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
190 if (0 == strcmp(words[0], "channel_count")) {
191 cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
/* A byte format of 10 selects big-endian input, anything else
 * little-endian. */
193 if (0 == strcmp(words[0], "sample_byte_format")) {
194 cmd_ln_set_str_r(wtf->config, "-input_endian",
195 (0 == strcmp(words[2], "10")) ? "big" : "little");
/* Position the stream at byte offset 1024, presumably the end of the
 * fixed-size NIST header -- TODO confirm. */
200 fseek(fh, 1024, SEEK_SET);
/*
 * Detect a NIST file and, if it is one, read it through a pipe from the
 * external sph2pipe program instead of opening it directly.
 */
210 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile)
216 /* Determine if it's NIST file and get parameters. */
217 if ((rv = open_nist_file(wtf, infile, NULL)) != TRUE)
220 /* Now popen it with sph2pipe. */
/* NOTE(review): infile is pasted into a shell command between single
 * quotes without escaping, so a path that itself contains a single
 * quote breaks the quoting and reaches the shell.  Worth escaping or
 * rejecting such paths. */
221 cmdline = string_join("sph2pipe -f raw '", infile, "'", NULL);
222 if ((fh = popen(cmdline, "r")) == NULL) {
223 E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", infile);
/* Replace any previously stored input path with a copy of this one. */
229 ckd_free(wtf->infile);
230 wtf->infile = ckd_salloc(infile);
234 #else /* !HAVE_POPEN */
/* Fallback variant for builds without popen: only reports the error. */
236 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile)
238 E_ERROR("popen() not available, cannot run sph2pipe\n");
241 #endif /* !HAVE_POPEN */
244 * Detect NIST file and parse its header if detected.
246 * @return TRUE if it's a NIST file, FALSE if not, -1 if an error occurred.
/* Thin wrapper around open_nist_file that keeps the opened stream. */
249 detect_nist(sphinx_wave2feat_t *wtf, char const *infile)
/* Any result other than TRUE (FALSE or -1) is handled right away. */
254 if ((rv = open_nist_file(wtf, infile, &fh)) != TRUE)
/* Replace any previously stored input path with a copy of this one. */
257 ckd_free(wtf->infile);
258 wtf->infile = ckd_salloc(infile);
265 * Default "detection" function, just opens the file and keeps the
266 * default configuration parameters.
268 * @return TRUE, or -1 on error.
/* No header is inspected: opening the file is the whole check. */
271 detect_raw(sphinx_wave2feat_t *wtf, char const *infile)
275 if ((fh = fopen(infile, "rb")) == NULL) {
276 E_ERROR_SYSTEM("Failed to open %s", infile);
/* Replace any previously stored input path with a copy of this one. */
280 ckd_free(wtf->infile);
281 wtf->infile = ckd_salloc(infile);
287 * "Detect" Sphinx MFCC files, meaning verify their lousy headers, and
288 * set up some parameters from the config object.
290 * @return TRUE, or -1 on error.
/*
 * wtf:    converter; on success its config may get -input_endian set and
 *         its in_veclen / veclen fields are filled from the config.
 * infile: path of the Sphinx feature file to check.
 */
293 detect_sphinx_mfc(sphinx_wave2feat_t *wtf, char const *infile)
299 if ((fh = fopen(infile, "rb")) == NULL) {
300 E_ERROR_SYSTEM("Failed to open %s", infile);
/* The file starts with one 4-byte length word. */
303 if (fread(&len, 4, 1, fh) != 1) {
304 E_ERROR_SYSTEM("Failed to read header from %s\n", infile);
307 fseek(fh, 0, SEEK_END);
310 /* figure out whether to byteswap */
/* Convert a byte count into a count of 4-byte values, excluding the
 * header word.  flen presumably holds the file size in bytes at this
 * point (its assignment is not visible) -- TODO confirm. */
311 flen = (flen / 4) - 1;
313 /* First make sure this is an endianness problem, otherwise fail. */
317 E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
321 /* Set the input endianness to the opposite of the machine endianness... */
322 cmd_ln_set_str_r(wtf->config, "-input_endian",
323 (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
324 ? "little" : "big"));
/* Leave the stream positioned just past the 4-byte header. */
327 fseek(fh, 4, SEEK_SET);
329 ckd_free(wtf->infile);
330 wtf->infile = ckd_salloc(infile);
/* -spec2cep: input vectors have -nfilt values. */
332 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
333 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
/* -cep2spec: input vectors have -ncep values, output has -nfilt. */
335 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
336 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
337 wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
340 /* Should not happen. */
341 E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
/*
 * Reduce interleaved multi-channel audio to a single channel, in place
 * at the start of buf.
 *
 * buf:       interleaved samples, overwritten with the reduced signal.
 * nsamp:     number of samples in buf, counted across all channels.
 * nchans:    number of interleaved channels.
 * whichchan: channel to pick, counted from 1 (index whichchan - 1 is the
 *            first sample copied).  The condition choosing between the
 *            two loops below is not visible in this listing.
 *
 * Callers store the result back into their sample count.
 */
349 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
/* Pick: copy every nchans-th sample, starting at the chosen channel. */
354 for (i = whichchan - 1; i < nsamp; i += nchans)
355 buf[i/nchans] = buf[i];
/* Mix: one output sample per group of nchans input samples; tmp
 * presumably accumulates the group's sum -- TODO confirm.  The inner
 * bound also stops at nsamp for a short final group. */
358 for (i = 0; i < nsamp; i += nchans) {
360 for (j = 0; j < nchans && i + j < nsamp; ++j) {
363 buf[i/nchans] = (int16)(tmp / nchans);
369 #ifdef HAVE_SNDFILE_H
371 * Detect a file supported by libsndfile and parse its header if detected.
373 * @return TRUE if it's a supported file, FALSE if not, -1 if an error occurred.
/*
 * wtf:    converter whose config receives -nchans and -samprate.
 * infile: path handed to sf_open for reading.
 */
376 detect_sndfile(sphinx_wave2feat_t *wtf, char const *infile)
/* Start from a zeroed info structure before opening for read. */
381 memset(&sfinfo, 0, sizeof(sfinfo));
382 /* We let other detectors catch I/O errors, since there is
383 no way to tell them from format errors when opening :( */
384 if ((sf = sf_open(infile, SFM_READ, &sfinfo)) == NULL) {
387 /* Get relevant information. */
388 cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels);
389 cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate);
/* Replace any previously stored input path with a copy of this one. */
391 ckd_free(wtf->infile);
392 wtf->infile = ckd_salloc(infile);
400 * Process PCM audio from a libsndfile file. FIXME: looks a lot like
401 * decode_pcm! Also needs stereo support (as does decode_pcm).
/*
 * Reads blocks of 16-bit samples from wtf->insfh, reduces them to one
 * channel, runs them through the front end and hands the resulting
 * frames to the configured output writer.  The caller treats a negative
 * result as failure.
 */
404 decode_sndfile(sphinx_wave2feat_t *wtf)
407 int32 nfr, nchans, whichchan;
410 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
411 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
412 fe_start_utt(wtf->fe);
/* Loop until a read returns no samples. */
414 while ((nsamp = sf_read_short(wtf->insfh,
416 wtf->blocksize)) != 0) {
417 int16 const *inspeech;
420 /* Mix or pick channels. */
422 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
424 inspeech = wtf->audio;
425 nvec = wtf->featsize;
426 /* Consume all samples. */
429 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
/* A negative result from the writer is an error. */
431 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
436 inspeech = wtf->audio;
438 /* Now process any leftover audio frames. */
439 fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
441 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
446 sf_close(wtf->insfh);
450 #endif /* HAVE_SNDFILE_H */
453 * Process PCM audio from a filehandle. Assume that wtf->infh is
454 * positioned just after the file header.
/*
 * Reads blocks of 2-byte samples from wtf->infh, byte-swaps and reduces
 * them to one channel as needed, runs them through the front end and
 * hands the resulting frames to the configured output writer.  The
 * caller treats a negative result as failure.
 */
457 decode_pcm(sphinx_wave2feat_t *wtf)
460 int32 nfr, nchans, whichchan;
463 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
464 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
465 fe_start_utt(wtf->fe);
/* Loop until a read returns no samples (end of file or error). */
467 while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
469 int16 const *inspeech;
471 /* Byteswap stuff here if necessary. */
473 for (n = 0; n < nsamp; ++n)
474 SWAP_INT16(wtf->audio + n);
477 /* Mix or pick channels. */
479 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
481 inspeech = wtf->audio;
482 nvec = wtf->featsize;
483 /* Consume all samples. */
486 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
/* A negative result from the writer is an error. */
488 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
493 inspeech = wtf->audio;
495 /* Now process any leftover audio frames. */
496 fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
498 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
/* NOTE(review): the -sph2pipe type also decodes through this function
 * and its detector obtains a stream from popen; if that stream ends up
 * in wtf->infh it should be closed with pclose rather than fclose --
 * confirm. */
503 if (fclose(wtf->infh) == EOF)
504 E_ERROR_SYSTEM("Failed to close input file");
510 * Process Sphinx MFCCs/logspectra from a filehandle. Assume that
511 * wtf->infh is positioned just after the file header.
/*
 * Reads float vectors of in_veclen values from wtf->infh into
 * wtf->feat, converts them (-spec2cep or -cep2spec) in place and hands
 * them to the configured output writer.
 */
514 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
517 int featsize = wtf->featsize;
519 /* If the input vector length is less than the output length, we
520 * need to do this one frame at a time, because there's empty
521 * space at the end of each vector in wtf->feat. */
522 if (wtf->in_veclen < wtf->veclen)
/* n is the number of values read, not the number of frames. */
524 while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
525 featsize * wtf->in_veclen, wtf->infh)) != 0) {
526 int i, nfr = n / wtf->in_veclen;
/* A partial trailing vector means the file size is inconsistent. */
527 if (n % wtf->in_veclen) {
528 E_ERROR("Size of file %d not a multiple of veclen %d\n",
532 /* Byteswap stuff here if necessary. */
534 for (i = 0; i < n; ++i)
535 SWAP_FLOAT32(wtf->feat[0] + i);
/* Convert in place: the same buffer is passed as float input and as
 * mfcc_t output. */
537 fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
538 for (i = 0; i < nfr; ++i) {
/* Log spectrum to cepstrum; the legacy setting of -transform selects
 * fe_logspec_to_mfcc, fe_logspec_dct2 is used otherwise (the else line
 * itself is not visible -- TODO confirm). */
539 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
540 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
541 fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
543 fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
/* Cepstrum back to log spectrum. */
545 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
546 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
549 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
554 if (fclose(wtf->infh) == EOF)
555 E_ERROR_SYSTEM("Failed to close input file");
/*
 * Table of supported audio input types.  Each name doubles as a boolean
 * command-line option: detect_audio_type checks it with
 * cmd_ln_boolean_r and loops over indices 0 .. ntypes-1 in order.
 */
560 static const audio_type_t types[] = {
561 #ifdef HAVE_SNDFILE_H
562 { "-sndfile", &detect_sndfile, &decode_sndfile },
564 { "-mswav", &detect_riff, &decode_pcm },
565 { "-nist", &detect_nist, &decode_pcm },
566 { "-raw", &detect_raw, &decode_pcm },
567 { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
/* Number of entries in types. */
569 static const int ntypes = sizeof(types)/sizeof(types[0]);
/* Separate type for Sphinx feature-file input; detect_audio_type uses
 * it when -spec2cep or -cep2spec is given. */
570 static const audio_type_t mfcc_type = {
571 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
575 * Output sphinx format "header"
577 * @return 0 for success, <0 for error.
/*
 * The header is the single value nfloat, written as 4 raw bytes exactly
 * as stored in memory (no byte swapping is visible here).
 *
 * NOTE(review): in sphinx_wave2feat_convert_file the placeholder header
 * is written before wtf->outfile is assigned, so the path printed by
 * the error message below may be stale or NULL on that first call --
 * confirm.
 */
580 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
582 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
583 E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
590 * Output frames in sphinx format.
592 * @return 0 for success, <0 for error.
/*
 * NOTE(review): the doc fragment above says 0 for success, while the
 * visible code accumulates the number of values written in nfloat; the
 * return statement is not visible -- confirm which one is returned.
 */
595 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
/* Convert in place: frames is reused as the float32 output buffer. */
599 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
600 for (i = 0; i < nfr; ++i) {
/* Each frame is written as veclen raw float32 values; a short write is
 * an error. */
601 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
602 E_ERROR_SYSTEM("Writing %d values to %s failed",
603 wtf->veclen, wtf->outfile);
606 nfloat += wtf->veclen;
/**
 * HTK parameter kind codes (the base feature type).  output_header_htk
 * writes FBANK or MFCC, the latter combined with a htk_feature_flag_t
 * bit, into the header.
 */
611 typedef enum htk_feature_kind_e {
612 WAVEFORM = 0, /* PCM audio (rarely used) */
613 LPC = 1, /* LPC filter coefficients */
614 LPCREFC = 2, /* LPC reflection coefficients */
615 LPCEPSTRA = 3, /* LPC-based cepstral coefficients */
616 LPCDELCEP = 4, /* LPCC plus deltas */
617 IREFC = 5, /* 16-bit integer LPC reflection coefficients */
618 MFCC = 6, /* MFCCs */
619 FBANK = 7, /* Log mel spectrum */
620 MELSPEC = 8, /* Linear mel spectrum */
621 USER = 9, /* User defined */
622 DISCRETE = 10, /* Vector quantized data */
623 PLP = 11 /* PLP coefficients */
624 } htk_feature_kind_t;
/**
 * Qualifier bits that are OR-ed with a htk_feature_kind_t value (see
 * MFCC | _O in output_header_htk).  The constants are octal literals.
 *
 * NOTE(review): identifiers that begin with an underscore followed by an
 * uppercase letter are reserved for the implementation in C; renaming
 * them would also require touching every user of these names.
 */
626 typedef enum htk_feature_flag_e {
627 _E = 0000100, /* has energy */
628 _N = 0000200, /* absolute energy suppressed */
629 _D = 0000400, /* has delta coefficients */
630 _A = 0001000, /* has acceleration (delta-delta) coefficients */
631 _C = 0002000, /* is compressed */
632 _Z = 0004000, /* has zero mean static coefficients (i.e. CMN) */
633 _K = 0010000, /* has CRC checksum */
634 _O = 0020000, /* has 0th cepstral coefficient */
635 _V = 0040000, /* has VQ data */
636 _T = 0100000 /* has third differential coefficients */
637 } htk_feature_flag_t;
640 * Output HTK format header.
/*
 * Writes the four HTK header fields in order: a 4-byte count, the
 * 4-byte sample period, the 2-byte sample size and the 2-byte parameter
 * kind.  Each field is byte-swapped first when swap is set.
 *
 * wtf:    converter providing the output stream, config and veclen.
 * nfloat: value written into the first header field.
 *
 * NOTE(review): the HTK file format defines the first field as the
 * number of samples (frames) in the file; confirm that the value passed
 * as nfloat is what HTK readers expect.
 */
643 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
650 /* HTK files are big-endian. */
651 if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
653 /* Same file size thing as in Sphinx files (I think) */
654 if (swap) SWAP_INT32(&nfloat);
655 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
657 /* Sample period in 100ns units. */
658 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
659 if (swap) SWAP_INT32(&samp_period);
660 if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
662 /* Sample size - veclen * sizeof each sample. */
663 samp_size = wtf->veclen * 4;
664 if (swap) SWAP_INT16(&samp_size);
665 if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
667 /* Format and flags. */
668 if (cmd_ln_boolean_r(wtf->config, "-logspec")
669 || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
670 param_kind = FBANK; /* log mel-filter bank outputs */
672 param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
/* Address-of operator restored on the next two lines: the ampersand
 * plus the leading letters of the name had been turned into a
 * paragraph sign by HTML entity decoding. */
673 if (swap) SWAP_INT16(&param_kind);
674 if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
681 * Output frames in HTK format.
/*
 * Converts the frames to float32 in place and writes each one as veclen
 * values, byte-swapped on little-endian hosts.  For the htk output type
 * with cepstral output (neither -logspec nor -cep2spec), the 0th
 * coefficient is moved from the front of each vector to its end, which
 * matches the _O flag written by the header.
 *
 * wtf:    converter providing the output stream, config and veclen.
 * frames: nfr feature vectors; modified in place.
 * nfr:    number of frames to write.
 */
684 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
686 int i, j, swap, htk_reorder, nfloat = 0;
/* Convert in place: frames is reused as the float32 output buffer. */
688 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
689 /* This is possibly inefficient, but probably not a big deal. */
690 swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
691 htk_reorder = (0 == strcmp("htk", wtf->ot->name)
692 && !(cmd_ln_boolean_r(wtf->config, "-logspec")
693 || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
694 for (i = 0; i < nfr; ++i) {
/* Move C0 to the end: save it, shift the remaining veclen - 1 values
 * down by one slot, then store it last.  The previous call copied in
 * the opposite direction (destination frames[i] + 1), which duplicated
 * C0 and overwrote the last two coefficients. */
696 mfcc_t c0 = frames[i][0];
697 memmove(frames[i], frames[i] + 1, (wtf->veclen - 1) * 4);
698 frames[i][wtf->veclen - 1] = c0;
701 for (j = 0; j < wtf->veclen; ++j)
702 SWAP_FLOAT32(frames[i] + j);
703 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
704 E_ERROR_SYSTEM("Writing %d values to %s failed",
705 wtf->veclen, wtf->outfile);
708 nfloat += wtf->veclen;
714 * Output frames in text format.
/*
 * Writes one line per frame: veclen values printed with the %.5g
 * format.  The last value of a frame is followed by a newline; a single
 * space is also written as separator (the else line that presumably
 * guards it is not visible).
 */
717 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
719 int i, j, nfloat = 0;
/* Convert in place: frames is reused as the float32 output buffer. */
721 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
722 for (i = 0; i < nfr; ++i) {
723 for (j = 0; j < wtf->veclen; ++j) {
/* NOTE(review): frames[i][j] still has type mfcc_t here; passing it to
 * a floating-point conversion assumes mfcc_t is a floating type --
 * confirm for fixed-point builds. */
724 fprintf(wtf->outfh, "%.5g", frames[i][j]);
725 if (j == wtf->veclen - 1)
726 fprintf(wtf->outfh, "\n");
728 fprintf(wtf->outfh, " ");
730 nfloat += wtf->veclen;
/*
 * Table of supported output formats, matched by name against the -ofmt
 * option in sphinx_wave2feat_init.  The first entry is the default.
 * The text format has no header writer (NULL).
 */
735 static const output_type_t outtypes[] = {
736 { "sphinx", &output_header_sphinx, &output_frames_sphinx },
737 { "htk", &output_header_htk, &output_frames_htk },
738 { "text", NULL, &output_frames_text }
/* Number of entries in outtypes. */
740 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
/*
 * Allocate a converter, retain the given configuration, create the
 * front end and select the output type named by -ofmt.
 *
 * config: parsed command-line configuration; retained, not copied.
 *
 * main treats a NULL result as initialization failure.
 */
743 sphinx_wave2feat_init(cmd_ln_t *config)
745 sphinx_wave2feat_t *wtf;
/* Zero-initialized, so all pointers and lengths start out as 0. */
748 wtf = ckd_calloc(1, sizeof(*wtf));
750 wtf->config = cmd_ln_retain(config);
/* NOTE(review): no check of the fe_init_auto_r result is visible. */
751 wtf->fe = fe_init_auto_r(wtf->config);
752 wtf->ot = outtypes; /* Default (sphinx) type. */
/* Look for the table entry whose name equals the -ofmt value. */
753 for (i = 0; i < nouttypes; ++i) {
754 output_type_t const *otype = &outtypes[i];
755 if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
/* Loop ran to the end without a match: unknown format. */
760 if (i == nouttypes) {
761 E_ERROR("Unknown output type: '%s'\n",
762 cmd_ln_str_r(config, "-ofmt"));
763 sphinx_wave2feat_free(wtf);
/*
 * Drop one reference to wtf.  While references remain, the remaining
 * count is returned and nothing is released; otherwise the buffers,
 * stored paths, file handles and the configuration are released.
 */
771 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
775 if (--wtf->refcount > 0)
776 return wtf->refcount;
778 ckd_free(wtf->audio);
779 ckd_free_2d(wtf->feat);
780 ckd_free(wtf->infile);
781 ckd_free(wtf->outfile);
/* NOTE(review): decode_pcm / decode_sphinx_mfc and
 * sphinx_wave2feat_convert_file also close these streams; confirm that
 * the handles are reset to NULL there and tested here (lines not
 * visible) so that no stream is closed twice. */
783 if (fclose(wtf->infh) == EOF)
784 E_ERROR_SYSTEM("Failed to close input file");
787 if (fclose(wtf->outfh) == EOF)
788 E_ERROR_SYSTEM("Failed to close output file");
790 cmd_ln_free_r(wtf->config);
/* Counterpart of sphinx_wave2feat_free; presumably takes an additional
 * reference on wtf (body not visible in this listing) -- TODO confirm. */
798 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
/*
 * Choose the audio type used to read infile, in three stages: the
 * Sphinx feature-file type when -spec2cep or -cep2spec is set, then any
 * type whose option is set on the command line, then each type in table
 * order.  The caller treats a NULL result as failure.
 */
804 static audio_type_t const *
805 detect_audio_type(sphinx_wave2feat_t *wtf, char const *infile)
807 audio_type_t const *atype;
810 /* Special case audio type for Sphinx MFCC inputs. */
811 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
812 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
813 int rv = mfcc_type.detect(wtf, infile);
819 /* Try to use the type of infile given on the command line. */
820 for (i = 0; i < ntypes; ++i) {
/* The type name is itself the name of a boolean option. */
823 if (cmd_ln_boolean_r(wtf->config, atype->name)) {
824 rv = (*atype->detect)(wtf, infile);
832 /* Detect file type of infile and get parameters. */
833 for (i = 0; i < ntypes; ++i) {
836 rv = (*atype->detect)(wtf, infile);
/*
 * Convert one input file to one output file: detect the input type,
 * size the FFT and the buffers, write a placeholder header, decode, and
 * then rewrite the header with the final count.
 *
 * wtf:     converter state; buffers, paths and handles are updated.
 * infile:  path of the input file.
 * outfile: path of the output file, opened for binary writing.
 */
854 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
855 char const *infile, char const *outfile)
857 int nchans, minfft, nfft, nfloat, veclen;
858 audio_type_t const *atype;
861 if (cmd_ln_boolean_r(wtf->config, "-verbose"))
862 E_INFO("Converting %s to %s\n", infile, outfile);
864 /* Detect input file type. */
865 if ((atype = detect_audio_type(wtf, infile)) == NULL)
868 /* Determine whether to byteswap input. */
/* Non-zero exactly when the two endianness strings differ. */
869 wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
870 cmd_ln_str_r(wtf->config, "-input_endian"));
872 /* Make sure the FFT size is sufficiently large. */
/* Window length in samples, rounded to nearest. */
873 minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
874 * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
/* Smallest power of two that is not below minfft. */
875 for (nfft = 1; nfft < minfft; nfft <<= 1)
877 if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
878 E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
879 cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
880 cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
/* NOTE(review): wtf->fe is reassigned here; confirm that the previous
 * front end is released in a line not visible in this listing. */
882 wtf->fe = fe_init_auto_r(wtf->config);
885 /* Get the output frame size (if not already set). */
886 if (wtf->veclen == 0)
887 wtf->veclen = fe_get_output_size(wtf->fe);
889 /* Set up the input and output buffers. */
890 fe_get_input_size(wtf->fe, &fshift, &fsize);
891 /* Want to get at least a whole frame plus shift in here. Also we
892 will either pick or mix multiple channels so we need to read
894 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
895 wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
896 if (wtf->blocksize < (fsize + fshift) * nchans) {
897 E_INFO("Block size of %d too small, increasing to %d\n",
899 (fsize + fshift) * nchans);
900 wtf->blocksize = (fsize + fshift) * nchans;
902 wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
903 wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
905 /* Use the maximum of the input and output frame sizes to allocate this. */
906 veclen = wtf->veclen;
907 if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
908 wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
911 if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
912 E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
915 /* Write an empty header, which we'll fill in later. */
/* Output types without a header writer skip this step. */
916 if (wtf->ot->output_header &&
917 (*wtf->ot->output_header)(wtf, 0) < 0) {
918 E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
921 wtf->outfile = ckd_salloc(outfile);
/* The decoder result is the count that goes into the final header. */
923 if ((nfloat = (*atype->decode)(wtf)) < 0)
/* Go back to the start of the file and overwrite the placeholder. */
926 if (wtf->ot->output_header) {
927 if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
928 E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
931 if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
932 E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
936 if (fclose(wtf->outfh) == EOF)
937 E_ERROR_SYSTEM("Failed to close output file");
/*
 * Build the input and output paths for one control-file entry from the
 * -di / -do and -ei / -eo options and the entry basename.
 *
 * config:      configuration holding the options.
 * basename:    file ID taken from the control file.
 * out_infile:  receives a newly joined input path.
 * out_outfile: receives a newly joined output path.
 *
 * With -build_outdirs set, the directory part of the output path is
 * also created.
 * NOTE(review): dirname is allocated with ckd_salloc below; confirm
 * that it is freed in a line not visible in this listing.
 */
950 build_filenames(cmd_ln_t *config, char const *basename,
951 char **out_infile, char **out_outfile)
953 char const *di, *do_, *ei, *eo;
955 di = cmd_ln_str_r(config, "-di");
956 do_ = cmd_ln_str_r(config, "-do");
957 ei = cmd_ln_str_r(config, "-ei");
958 eo = cmd_ln_str_r(config, "-eo");
/* An unset directory option contributes an empty string. */
960 *out_infile = string_join(di ? di : "",
966 *out_outfile = string_join(do_ ? do_ : "",
972 /* Build output directory structure if possible/requested (it is
974 if (cmd_ln_boolean_r(config, "-build_outdirs")) {
975 char *dirname = ckd_salloc(*out_outfile);
976 path2dirname(*out_outfile, dirname);
977 build_directory(dirname);
/*
 * Convert every file listed in a control file, honouring the -nskip,
 * -runlen, -npart and -part options.
 *
 * wtf:     converter used for each file.
 * ctlfile: path of the control file, one entry per line.
 *
 * rv starts at 0 and is overwritten by each conversion result.
 */
983 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
989 int nskip, runlen, npart, rv = 0;
991 if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
992 E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
995 nskip = cmd_ln_int32_r(wtf->config, "-nskip");
996 runlen = cmd_ln_int32_r(wtf->config, "-runlen");
/* A non-zero -npart splits the control file into equal parts; the
 * values read above are then replaced from the part geometry. */
997 if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
998 /* Count lines in the file. */
999 int partlen, part, nlines = 0;
1000 part = cmd_ln_int32_r(wtf->config, "-part");
1001 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
1003 fseek(ctlfh, 0, SEEK_SET);
/* Parts are numbered from 1: part 1 starts at line 0. */
1004 partlen = nlines / npart;
1005 nskip = partlen * (part - 1);
/* files remembers the inputs seen so far, keyed by input path. */
1012 E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
1013 files = hash_table_new(runlen, HASH_CASE_YES);
1016 E_INFO("Processing all remaining utterances at position %d\n", nskip);
1017 files = hash_table_new(1000, HASH_CASE_YES);
1019 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
1020 char *c, *infile, *outfile;
1030 string_trim(li->buf, STRING_BOTH);
1031 /* Extract the file ID from the control line. */
/* Only the text before the first space is used as the file ID
 * (presumably the line is cut there -- TODO confirm). */
1032 if ((c = strchr(li->buf, ' ')) != NULL)
1034 build_filenames(wtf->config, li->buf, &infile, &outfile);
/* A lookup result of 0 is tested to detect an input path that was
 * already entered. */
1035 if (hash_table_lookup(files, infile, NULL) == 0)
1037 rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
/* The table keeps both strings; they are freed in the loop below. */
1038 hash_table_enter(files, infile, outfile);
1041 if (fclose(ctlfh) == EOF)
1042 E_ERROR_SYSTEM("Failed to close control file");
/* Release every stored input path (key) and output path (value). */
1046 for (itor = hash_table_iter(files); itor;
1047 itor = hash_table_iter_next(itor)) {
1048 ckd_free((void *)hash_entry_key(itor->ent));
1049 ckd_free(hash_entry_val(itor->ent));
1051 hash_table_free(files);
/*
 * Program entry point: parse the command line (plus an optional
 * argument file), create the converter and run either a control file
 * (-c) or a single conversion (-i to -o).
 */
1056 main(int argc, char *argv[])
1058 sphinx_wave2feat_t *wtf;
1062 /* Initialize config. */
1063 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
1066 /* Parse an argument file if there's one in there. */
1067 if (cmd_ln_str_r(config, "-argfile"))
1068 config = cmd_ln_parse_file_r(config, defn,
1069 cmd_ln_str_r(config, "-argfile"), FALSE);
/* Parsing the argument file replaces config, so test it again. */
1070 if (config == NULL) {
1071 E_ERROR("Command line parsing failed\n");
1074 if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1075 E_ERROR("Failed to initialize wave2feat object\n");
1079 /* If there's a control file run through it, otherwise we will do
1080 * a single file (which is what run_control_file will do
1081 * internally too) */
1082 if (cmd_ln_str_r(config, "-c"))
1083 rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
/* NOTE(review): no check that -i and -o are set is visible before they
 * are passed on -- confirm. */
1085 rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1086 cmd_ln_str_r(config, "-o"));
1088 sphinx_wave2feat_free(wtf);