1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
41 #include <sphinxbase/cmd_ln.h>
42 #include <sphinxbase/yin.h>
43 #include <sphinxbase/ckd_alloc.h>
44 #include <sphinxbase/byteorder.h>
45 #include <sphinxbase/strfuncs.h>
46 #include <sphinxbase/err.h>
47 #include <sphinxbase/pio.h>
/* Fallback definition: when the build has not defined WORDS_BIGENDIAN,
 * define it as 0 so that the plain `if (WORDS_BIGENDIAN)` tests used while
 * reading RIFF header fields compile and take the no-swap path. */
49 #ifndef WORDS_BIGENDIAN
50 #define WORDS_BIGENDIAN 0
/* Command-line option table handed to cmd_ln_parse() in main().  Each entry
 * ends with its help text, and the table is closed by an all-NULL/0 sentinel
 * entry.
 * NOTE(review): keep every help string in sync with the option name that the
 * rest of this file queries through cmd_ln_*() (e.g. "-i", "-o", "-c",
 * "-samprate", "-input_endian", "-flen", "-fshift"). */
53 static arg_t defn[] = {
57 "Single audio input file" },
62 "Single text output file (standard output will be used if not given)" },
67 "Control file for batch processing" },
72 "If a control file was specified, the number of utterances to skip at the head of the file" },
77 "If a control file was specified, the number of utterances to process (see -nskip too)" },
82 "Input directory, input file names are relative to this, if defined" },
87 "Input extension to be applied to all input files" },
92 "Output directory, output files are relative to this" },
97 "Output extension to be applied to all output files" },
102 "Defines input format as NIST sphere" },
107 "Defines input format as raw binary data" },
112 "Defines input format as Microsoft Wav (RIFF)" },
117 "Sampling rate of audio data (will be determined automatically if 0)" },
122 "Endianness of audio data (will be determined automatically if not given)" },
127 "Frame shift: number of seconds between each analysis frame." },
132 "Number of seconds in each analysis frame (needs to be greater than twice the longest period you wish to detect - to detect down to 80Hz you need a frame length of 2.0/80 = 0.025)." },
137 "Number of frames on either side of the current frame to use for smoothing." },
142 "Threshold of normalized difference under which to search for the fundamental period." },
147 "Fraction of the best local estimate to use as a search range for smoothing." },
/* Sentinel entry marking the end of the table. */
149 { NULL, 0, NULL, NULL }
/* Forward declarations of the two processing entry points called from
 * main().  Both are tested with `< 0` by their callers, i.e. a negative
 * return value signals failure. */
/* Process one input file `in` and write the pitch track to `out`. */
152 static int extract_pitch(const char *in, const char *out);
/* Process the files listed in the control file `ctl`. */
153 static int run_control_file(const char *ctl);
/*
 * Program entry point.
 *
 * Parses the command line against the defn option table, runs the control
 * file named by -c when one is given, and calls extract_pitch() on the
 * -i / -o file pair.
 */
156 main(int argc, char *argv[])
/* NOTE(review): the result of cmd_ln_parse() is discarded here; if it reports
 * parse failures through its return value (TODO confirm against cmd_ln.h),
 * a bad command line goes unnoticed until a later lookup misbehaves. */
158 cmd_ln_parse(defn, argc, argv, TRUE);
160 /* Run a control file if requested. */
161 if (cmd_ln_str("-c")) {
162 if (run_control_file(cmd_ln_str("-c")) < 0)
/* NOTE(review): presumably the non-batch alternative to the -c branch above.
 * The value of -i is forwarded without a NULL check and extract_pitch()
 * passes it to fopen(); confirm what happens when -i is omitted. */
166 if (extract_pitch(cmd_ln_str("-i"), cmd_ln_str("-o")) < 0)
/*
 * Classify an input file by its first four bytes and record the result in
 * the command-line state.
 *
 * "RIFF" selects -mswav, "NIST" selects -nist, and the remaining case selects
 * -raw; in each case the other two format flags are cleared.  The stream is
 * rewound to offset 0 before the sniff and again afterwards.
 *
 * file: name of the input, used only in the log messages.
 * infh: open input stream to inspect.
 *
 * The caller tests the result with `< 0` to detect failure.
 */
175 guess_file_type(char const *file, FILE *infh)
/* Sniff from the start of the stream regardless of the current position.
 * NOTE(review): the fseek() result is not checked (e.g. non-seekable input). */
179 fseek(infh, 0, SEEK_SET);
/* NOTE(review): an input shorter than 4 bytes is reported as an error here
 * instead of being classified as raw data. */
180 if (fread(header, 1, 4, infh) != 4) {
181 E_ERROR_SYSTEM("Failed to read 4 byte header");
184 if (0 == memcmp(header, "RIFF", 4)) {
185 E_INFO("%s appears to be a WAV file\n", file);
186 cmd_ln_set_boolean("-mswav", TRUE);
187 cmd_ln_set_boolean("-nist", FALSE);
188 cmd_ln_set_boolean("-raw", FALSE);
190 else if (0 == memcmp(header, "NIST", 4)) {
191 E_INFO("%s appears to be a NIST SPHERE file\n", file);
192 cmd_ln_set_boolean("-mswav", FALSE);
193 cmd_ln_set_boolean("-nist", TRUE);
194 cmd_ln_set_boolean("-raw", FALSE);
/* Fallback classification (presumably the final else branch): no known
 * magic matched, so the data is taken to be headerless raw samples. */
197 E_INFO("%s appears to be raw data\n", file);
198 cmd_ln_set_boolean("-mswav", FALSE);
199 cmd_ln_set_boolean("-nist", FALSE);
200 cmd_ln_set_boolean("-raw", TRUE);
/* Rewind so that whoever reads next starts at offset 0 again. */
202 fseek(infh, 0, SEEK_SET);
/*
 * TRY_FREAD(ptr, size, nmemb, stream): fread() nmemb items of `size` bytes
 * from `stream` into `ptr`; when fewer than nmemb items are read, log a
 * system error and then run the failure action given in the continuation
 * lines of the macro.
 *
 * NOTE(review): the expansion is a bare `if` statement rather than a
 * do { ... } while (0) block, so it mis-binds when used as the body of an
 * if/else.  `size * nmemb` is expanded without parentheses and printed with
 * %d, which is only correct while both arguments are plain int expressions
 * (true for the literal-constant uses in this file).  `nmemb` is evaluated
 * more than once.
 */
206 #define TRY_FREAD(ptr, size, nmemb, stream) \
207 if (fread(ptr, size, nmemb, stream) != (nmemb)) { \
208 E_ERROR_SYSTEM("Failed to read %d bytes", size * nmemb); \
/*
 * Parse the header of a RIFF/WAVE stream and configure the command-line
 * state from it.
 *
 * Forces -input_endian to "little", then checks in order: the "WAVE" form
 * type, the "fmt " chunk id, format tag 1 (PCM), a channel count of 1 and
 * 16 bits per sample.  The sampling rate from the file is stored in
 * -samprate when that option is 0; when the option is non-zero and differs,
 * only a warning is logged.  After the format fields, chunk ids are read
 * and the "data" chunk's id and length are consumed; other chunks are
 * skipped by seeking over their payload.
 *
 * Multi-byte fields that are actually used are byte-swapped when
 * WORDS_BIGENDIAN is non-zero.
 *
 * infh: input stream positioned at the start of the RIFF header.
 *
 * The caller tests the result with `< 0` to detect failure.
 */
213 read_riff_header(FILE *infh)
216 int32 intval, header_len;
219 /* RIFF files are little-endian by definition. */
220 cmd_ln_set_str("-input_endian", "little");
222 /* Read in all the header chunks and etcetera. */
/* Outer chunk id: consumed but not compared against "RIFF" here. */
223 TRY_FREAD(id, 1, 4, infh);
224 /* Total file length (we don't care) */
225 TRY_FREAD(&intval, 4, 1, infh);
/* Form type, must be "WAVE". */
227 TRY_FREAD(id, 1, 4, infh);
228 if (0 != memcmp(id, "WAVE", 4)) {
229 E_ERROR("This is not a WAVE file\n");
/* The chunk immediately after the form type must be "fmt "; any other
 * chunk placed first is rejected rather than skipped. */
233 TRY_FREAD(id, 1, 4, infh);
234 if (0 != memcmp(id, "fmt ", 4)) {
235 E_ERROR("Format chunk missing\n");
238 /* Length of 'fmt ' chunk */
239 TRY_FREAD(&intval, 4, 1, infh);
240 if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
/* Format tag: only the value 1 is accepted. */
244 TRY_FREAD(&shortval, 2, 1, infh);
245 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
246 if (shortval != 1) { /* PCM */
247 E_ERROR("WAVE file is not in PCM format\n");
251 /* Number of channels. */
252 TRY_FREAD(&shortval, 2, 1, infh);
253 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
254 if (shortval != 1) { /* mono */
255 E_ERROR("WAVE file is not single channel\n");
259 /* Sampling rate (finally!) */
260 TRY_FREAD(&intval, 4, 1, infh);
261 if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
/* Adopt the file's rate when -samprate is 0; otherwise keep the option's
 * value and only warn about a mismatch. */
262 if (cmd_ln_int32("-samprate") == 0)
263 cmd_ln_set_int32("-samprate", intval);
264 else if (cmd_ln_int32("-samprate") != intval) {
265 E_WARN("WAVE file sampling rate %d != -samprate %d\n",
266 intval, cmd_ln_int32("-samprate"));
269 /* Average bytes per second (we don't care) */
270 TRY_FREAD(&intval, 4, 1, infh);
272 /* Block alignment (we don't care) */
273 TRY_FREAD(&shortval, 2, 1, infh);
275 /* Bits per sample (must be 16) */
276 TRY_FREAD(&shortval, 2, 1, infh);
277 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
278 if (shortval != 16) {
279 E_ERROR("WAVE file is not 16-bit\n");
283 /* Skip any extra format parameters beyond the 16 bytes parsed above. */
/* NOTE(review): header_len presumably holds the 'fmt ' chunk length read
 * earlier.  Confirm this seek is only reached when header_len > 16; a
 * smaller value would seek backwards.  The fseek() result is not checked. */
285 fseek(infh, header_len - 16, SEEK_CUR);
287 /* Now skip to the 'data' chunk. */
289 TRY_FREAD(id, 1, 4, infh);
290 if (0 == memcmp(id, "data", 4)) {
291 /* Total number of bytes of data (we don't care). */
292 TRY_FREAD(&intval, 4, 1, infh);
296 /* Some chunk other than 'data'. */
297 /* Read its payload length and skip over the payload. */
298 TRY_FREAD(&intval, 4, 1, infh);
299 if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
/* NOTE(review): the length comes straight from the file and is used as a
 * seek offset without validation (negative or oversized values), and the
 * fseek() result is ignored.  RIFF chunks are padded to an even size, so an
 * odd length presumably needs one extra pad byte skipped -- confirm. */
300 fseek(infh, intval, SEEK_CUR);
304 /* We are ready to rumble. */
/*
 * Roughly parse a NIST SPHERE header and configure the command-line state
 * from it.
 *
 * Reads 1024 bytes into hdr, then looks up the "sample_rate" and
 * "sample_byte_format" fields by substring search.  The sampling rate is
 * stored in -samprate when that option is 0; when the option is non-zero
 * and differs, only a warning is logged.  A byte format of "01" sets
 * -input_endian to "little", "10" sets it to "big", and anything else is
 * reported as an error.
 *
 * infh: input stream positioned at the start of the NIST header.
 *
 * The caller tests the result with `< 0` to detect failure.
 */
311 read_nist_header(FILE *infh)
/* NOTE(review): the header is assumed to be exactly 1024 bytes; confirm
 * whether files with a larger declared header size need to be supported.
 * The strstr() calls below require hdr to be NUL-terminated -- confirm the
 * buffer is terminated after this read. */
316 TRY_FREAD(hdr, 1, 1024, infh);
319 /* Roughly parse it to find the sampling rate and byte order
320 * (don't bother with other stuff) */
321 if ((line = strstr(hdr, "sample_rate")) == NULL) {
322 E_ERROR("No sampling rate in NIST header!\n");
/* Find the end of the field's line, then the last space in `line`; the value
 * is taken from the text at that position. */
325 c = strchr(line, '\n');
327 c = strrchr(line, ' ');
329 E_ERROR("Could not find sampling rate!\n");
/* NOTE(review): atoi() cannot report malformed input; a non-numeric value
 * yields 0 and would be stored as the sampling rate. */
333 if (cmd_ln_int32("-samprate") == 0)
334 cmd_ln_set_int32("-samprate", atoi(c));
335 else if (cmd_ln_int32("-samprate") != atoi(c)) {
336 E_WARN("NIST file sampling rate %d != -samprate %d\n",
337 atoi(c), cmd_ln_int32("-samprate"));
/* Overwrite the terminator at the end of `line` with a space (bounded so the
 * last byte of hdr is left alone), presumably undoing an in-place
 * termination made for the lookup above, so that the next strstr() can see
 * the rest of the header. */
340 if (line + strlen(line) < hdr + 1023)
341 line[strlen(line)] = ' ';
342 if ((line = strstr(hdr, "sample_byte_format")) == NULL) {
343 E_ERROR("No sample byte format in NIST header!\n");
346 c = strchr(line, '\n');
348 c = strrchr(line, ' ');
350 E_ERROR("Could not find sample byte order!\n");
/* "01" means low byte first (little-endian), "10" high byte first. */
354 if (0 == memcmp(c, "01", 2)) {
355 cmd_ln_set_str("-input_endian", "little");
357 else if (0 == memcmp(c, "10", 2)) {
358 cmd_ln_set_str("-input_endian", "big");
361 E_ERROR("Unknown byte order %s\n", c);
365 /* We are ready to rumble. */
/*
 * Estimate pitch over one audio file and write the result as text.
 *
 * Opens `in` for binary reading and `out` for writing, determines the input
 * format (guessing it when none of -raw / -mswav / -nist is set), reads the
 * matching header, and then feeds overlapping frames of -flen seconds,
 * advanced by -fshift seconds, to a YIN estimator created with the
 * -voice_thresh, -search_range and -smooth_window options.  Each available
 * estimate is printed as one "%.3f %.2f %.2f" line whose last two columns
 * are a voicing score, 1 - bestdiff/32768 floored at 0, and a pitch value,
 * sps/period (or sps itself when period is 0).
 *
 * in:  path of the audio file to read.
 * out: path of the text file to write.
 *
 * Callers test the result with `< 0` to detect failure.
 *
 * NOTE(review): -input_endian is set by the header readers and the raw
 * defaults, but no byte-swapping of the samples read into buf is visible in
 * this function -- confirm where (or whether) it is applied.
 */
372 extract_pitch(const char *in, const char *out)
374 FILE *infh = NULL, *outfh = NULL;
375 size_t flen, fshift, nsamps;
378 uint16 period, bestdiff;
/* NOTE(review): the -o help text and the stdout special case in the cleanup
 * below imply `out` may be NULL; confirm this fopen() is only reached when
 * out is non-NULL. */
382 if ((outfh = fopen(out, "w")) == NULL) {
383 E_ERROR_SYSTEM("Failed to open %s for writing", out);
/* NOTE(review): no NULL check on `in` is visible before this call; passing a
 * missing -i straight to fopen() is undefined behaviour. */
390 if ((infh = fopen(in, "rb")) == NULL) {
391 E_ERROR_SYSTEM("Failed to open %s for reading", in);
395 /* If we weren't told what the file type is, weakly try to
396 * determine it (actually it's pretty obvious) */
397 if (!(cmd_ln_boolean("-raw")
398 || cmd_ln_boolean("-mswav")
399 || cmd_ln_boolean("-nist"))) {
400 if (guess_file_type(in, infh) < 0)
404 /* Grab the sampling rate and byte order from the header and also
405 * make sure this is 16-bit linear PCM. */
406 if (cmd_ln_boolean("-mswav")) {
407 if (read_riff_header(infh) < 0)
410 else if (cmd_ln_boolean("-nist")) {
411 if (read_nist_header(infh) < 0)
414 else if (cmd_ln_boolean("-raw")) {
415 /* Just use some defaults for sampling rate and endian. */
416 if (cmd_ln_str("-input_endian") == NULL) {
/* Default byte order when none was given; which of the two assignments runs
 * is presumably selected by the host byte order -- confirm. */
418 cmd_ln_set_str("-input_endian", "big");
420 cmd_ln_set_str("-input_endian", "little");
/* Raw data carries no header, so fall back to 16 kHz. */
422 if (cmd_ln_int32("-samprate") == 0)
423 cmd_ln_set_int32("-samprate", 16000);
426 /* Now read frames and write pitch estimates. */
427 sps = cmd_ln_int32("-samprate");
/* Frame length and shift converted from seconds to samples, rounded to
 * nearest.
 * NOTE(review): neither value is validated.  fshift == 0 makes the refill
 * fread() below request 0 items, so end-of-file is never reached and the
 * loop cannot terminate; fshift > flen makes `flen - fshift` wrap around as
 * size_t in the memmove().  Confirm both are rejected somewhere. */
428 flen = (size_t)(0.5 + sps * cmd_ln_float32("-flen"));
429 fshift = (size_t)(0.5 + sps * cmd_ln_float32("-fshift"));
430 yin = yin_init(flen, cmd_ln_float32("-voice_thresh"),
431 cmd_ln_float32("-search_range"),
432 cmd_ln_int32("-smooth_window"));
434 E_ERROR("Failed to initialize YIN\n");
/* One frame of samples; zero-filled by ckd_calloc(). */
437 buf = ckd_calloc(flen, sizeof(*buf));
438 /* Read the first full frame of data. */
439 if (fread(buf, sizeof(*buf), flen, infh) != flen) {
440 /* Fail silently, which is probably okay. */
/* NOTE(review): the loop is controlled by feof() alone; a read error sets the
 * stream's error indicator rather than end-of-file, so ferror() should
 * presumably be checked as well to avoid spinning on a failing stream. */
444 while (!feof(infh)) {
445 /* Process a frame of data. */
447 if (yin_read(yin, &period, &bestdiff)) {
448 fprintf(outfh, "%.3f %.2f %.2f\n",
451 /* "Probability" of voicing. */
452 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
453 /* Pitch (possibly bogus) */
454 period == 0 ? sps : (double)sps / period);
457 /* Shift it back and get the next frame's overlap. */
458 memmove(buf, buf + fshift, (flen - fshift) * sizeof(*buf));
/* Refill the last fshift samples of the frame.  A short read here is ignored;
 * once it sets end-of-file, the feof() test above ends the loop. */
459 if (fread(buf + flen - fshift, sizeof(*buf), fshift, infh) != fshift) {
460 /* Fail silently (FIXME: really?) */
464 /* Process trailing frames of data. */
465 while (yin_read(yin, &period, &bestdiff)) {
466 fprintf(outfh, "%.3f %.2f %.2f\n",
469 /* "Probability" of voicing. */
470 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
471 /* Pitch (possibly bogus) */
472 period == 0 ? sps : (double)sps / period);
/* Cleanup: close whichever streams were opened, but never close stdout. */
486 if (infh) fclose(infh);
487 if (outfh && outfh != stdout) fclose(outfh);
492 run_control_file(const char *ctl)
496 char *di, *dout, *ei, *eio;
498 int rv, guess_type, guess_sps, guess_endian;
501 skip = cmd_ln_int32("-nskip");
502 runlen = cmd_ln_int32("-runlen");
504 /* Whether to guess file types */
505 guess_type = !(cmd_ln_boolean("-raw")
506 || cmd_ln_boolean("-mswav")
507 || cmd_ln_boolean("-nist"));
508 /* Whether to guess sampling rate */
509 guess_sps = (cmd_ln_int32("-samprate") == 0);
510 /* Whether to guess endian */
511 guess_endian = (cmd_ln_str("-input_endian") == NULL);
513 if ((ctlfh = fopen(ctl, "r")) == NULL) {
514 E_ERROR_SYSTEM("Failed to open control file %s", ctl);
517 if (cmd_ln_str("-di"))
518 di = string_join(cmd_ln_str("-di"), "/", NULL);
521 if (cmd_ln_str("-do"))
522 dout = string_join(cmd_ln_str("-do"), "/", NULL);
524 dout = ckd_salloc("");
525 if (cmd_ln_str("-ei"))
526 ei = string_join(".", cmd_ln_str("-ei"), NULL);
529 if (cmd_ln_str("-eo"))
530 eio = string_join(".", cmd_ln_str("-eo"), NULL);
532 eio = ckd_salloc("");
534 while ((line = fread_line(ctlfh, &len)) != NULL) {
535 char *infile, *outfile;
547 if (line[len-1] == '\n')
550 infile = string_join(di, line, ei, NULL);
551 outfile = string_join(dout, line, eio, NULL);
553 /* Reset various guessed information */
555 cmd_ln_set_boolean("-nist", FALSE);
556 cmd_ln_set_boolean("-mswav", FALSE);
557 cmd_ln_set_boolean("-raw", FALSE);
560 cmd_ln_set_int32("-samprate", 0);
562 cmd_ln_set_str("-input_endian", NULL);
564 rv = extract_pitch(infile, outfile);