1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * ====================================================================
38 * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances.
42 * $Log: cont_fileseg.c,v $
43 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
46 * Revision 1.13 2005/06/30 00:28:46 rkm
47 * Kept within-utterance silences in rawmode
50 * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
51 * Modified to use new state variables in cont_ad_t.
53 * Revision 1.12 2005/05/31 15:54:38 rkm
54 * *** empty log message ***
56 * Revision 1.11 2005/05/24 20:56:58 rkm
57 * Added min/max-noise parameters to cont_fileseg
59 * Revision 1.10 2005/05/13 23:28:43 egouvea
60 * Changed null device to system dependent one: NUL for windows, /dev/null for everything else
62 * $Log: cont_fileseg.c,v $
63 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
66 * Revision 1.13 2005/06/30 00:28:46 rkm
67 * Kept within-utterance silences in rawmode
69 * Revision 1.12 2005/05/31 15:54:38 rkm
70 * *** empty log message ***
72 * Revision 1.11 2005/05/24 20:56:58 rkm
73 * Added min/max-noise parameters to cont_fileseg
75 * Revision 1.9 2005/02/13 01:29:48 rkm
76 * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
78 * Revision 1.8 2005/02/01 22:21:13 rkm
79 * Added raw data logging, and raw data pass-through mode to cont_ad
81 * Revision 1.7 2004/07/16 00:57:11 egouvea
82 * Added Ravi's implementation of FSG support.
84 * Revision 1.3 2004/06/25 14:58:05 rkm
85 * *** empty log message ***
87 * Revision 1.2 2004/06/23 20:32:08 rkm
88 * Exposed several cont_ad config parameters
91 * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
101 #include <sphinxbase/prim_type.h>
102 #include <sphinxbase/ad.h>
103 #include <sphinxbase/cont_ad.h>
104 #include <sphinxbase/err.h>
/*
 * File-scope state shared by main and file_ad_read: main opens infp with
 * fopen and file_ad_read pulls samples from it with fread.
 */
106 static FILE *infp; /* File being segmented */
109 /* Max size read by file_ad_read function on each invocation, for debugging */
/*
 * main sets this to 0x7ffffff0 by default and overwrites it from the
 * -max-adreadsize option; file_ad_read clamps every request to it.
 */
110 static int32 max_ad_read_size;
/*
 * Platform-dependent name of the null device. main copies it into the segment
 * file name with strcpy before calling fopen.
 * NOTE(review): the condition selecting that path, and the else/endif lines
 * of this conditional, are not visible in this listing - presumably the null
 * device is used when segment writing is disabled; confirm against the full file.
 */
112 #if defined(WIN32) && !defined(GNUWINCE)
113 #define NULL_DEVICE "NUL"
115 #define NULL_DEVICE "/dev/null"
120 * Need to provide cont_ad_init with a read function to read the input file.
121 * This is it. The ad_rec_t *r argument is ignored since there is no A/D device involved.
/*
 * Read callback that main passes to cont_ad_init and cont_ad_init_rawmode.
 *
 * r   - not referenced by any visible statement of this function.
 * buf - destination buffer; filled by fread with int16-sized items from infp.
 * max - number of samples requested; clamped to max_ad_read_size.
 *
 * Returns the number of items fread delivered, or -1 when it delivered none.
 * End of file and a read error are not distinguished.
 *
 * NOTE(review): the return type, the braces and the declarations of i and k
 * are not visible in this listing.
 */
125 file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
/* Never ask fread for more than the configured per-call limit. */
129 if (max > max_ad_read_size)
130 max = max_ad_read_size;
132 k = fread(buf, sizeof(int16), max, infp);
/*
 * Exchange the low and high byte of each item that was read.
 * NOTE(review): any statement guarding this loop is not visible here;
 * presumably it only runs when byte swapping was requested - confirm.
 * NOTE(review): assuming int16 is a signed 16-bit type, buf[i] is promoted
 * to int before the left shift, so a negative sample makes the shift
 * undefined in C; converting to an unsigned 16-bit value first would avoid it.
 */
134 for (i = 0; i < k; i++) {
135 buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
/* A count of zero (nothing read) is reported to the caller as -1. */
139 return ((k > 0) ? k : -1);
/*
 * Body of the usage message: prints the command-line synopsis, one option per
 * line, through E_INFO and E_INFOCONT. pgm is substituted into the first line
 * as the program name. Each line of output ends with a backslash so the whole
 * synopsis reads as one continued command.
 *
 * NOTE(review): the header of the enclosing function is not visible in this
 * listing, and neither is the call name in front of the -silsep line below.
 * NOTE(review): in the -silsep line the closing square bracket sits before
 * the closing angle bracket; the intended nesting is presumably the other way
 * round. Only -i is shown without square brackets.
 */
146 E_INFO("Usage: %s \\\n", pgm);
147 E_INFOCONT("\t[-? | -h] \\\n");
148 E_INFOCONT("\t[-d | -debug] \\\n");
149 E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
150 E_INFOCONT("\t[-b | -byteswap] \\\n");
152 ("\t[{-s | -silsep} <length-silence-separator(sec) (0.5)]> \\\n");
153 E_INFOCONT("\t[-w | -writeseg] \\\n");
154 E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
155 E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
156 E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
157 E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
158 E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
159 E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
160 E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
161 E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
162 E_INFOCONT("\t[-c <copy-input-file>] \\\n");
163 E_INFOCONT("\t[-r | -rawmode] \\\n");
164 E_INFOCONT("\t-i <input-file>\n");
170 * Read specified input file, segment it into utterances wherever a silence segment of
171 * a given minimum duration is encountered. Filter out long silences.
172 * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc.
/*
 * Program entry point. Visible stages, in order:
 *   1. set option defaults and parse argv;
 *   2. open the input file into infp;
 *   3. create the cont_ad object with file_ad_read as its reader and calibrate it;
 *   4. fetch the module parameters, substitute them for options left unset,
 *      and write the resulting set back;
 *   5. repeatedly call cont_ad_read, opening a segment file when speech
 *      starts and ending the utterance after a long enough silence;
 *   6. log the totals.
 *
 * argc, argv - the usual command-line arguments; argv[i] is compared against
 *              the option names listed in the usage text.
 *
 * NOTE(review): this listing omits many lines (braces, the main loop header,
 * the statements that step i to an option value, error exits, file closes,
 * the return statement). The comments below describe only what is visible.
 */
175 main(int32 argc, char **argv)
178 int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
180 char *infile, *copyfile, segfile[1024];
185 int32 winsize, leader, trailer;
/* orig_* hold the values reported by cont_ad_get_params. */
186 int32 orig_min_noise, orig_max_noise;
187 int32 orig_delta_sil, orig_delta_speech;
188 int32 orig_speech_onset, orig_sil_onset;
/* Values from the command line; -1 until an option overrides them. */
189 int32 min_noise, max_noise;
190 int32 delta_sil, delta_speech;
191 int32 sil_onset, speech_onset;
192 float32 orig_adapt_rate;
194 int32 total_speech_samples;
195 float32 total_speech_sec;
198 /* Set argument defaults */
/* A negative value marks a parameter as not given; it is replaced further down by the value from cont_ad_get_params. */
204 min_noise = max_noise = -1;
205 delta_sil = delta_speech = -1;
206 sil_onset = speech_onset = -1;
/* Large default, so file_ad_read only limits reads when -max-adreadsize is given. */
208 max_ad_read_size = (int32) 0x7ffffff0;
215 /* Parse arguments */
/*
 * Each option name is matched with strcmp; options that take a value parse it
 * with sscanf and reject it with E_ERROR when parsing or the range check fails.
 * NOTE(review): several range checks and all failure actions are on lines not
 * visible in this listing.
 */
216 for (i = 1; i < argc; i++) {
217 if ((strcmp(argv[i], "-help") == 0)
218 || (strcmp(argv[i], "-h") == 0)
219 || (strcmp(argv[i], "-?") == 0)) {
222 else if ((strcmp(argv[i], "-debug") == 0)
223 || (strcmp(argv[i], "-d") == 0)) {
226 else if (strcmp(argv[i], "-sps") == 0) {
229 || (sscanf(argv[i], "%d", &sps) != 1)
231 E_ERROR("Invalid -sps argument\n");
235 else if ((strcmp(argv[i], "-byteswap") == 0)
236 || (strcmp(argv[i], "-b") == 0)) {
239 else if ((strcmp(argv[i], "-silsep") == 0)
240 || (strcmp(argv[i], "-s") == 0)) {
243 || (sscanf(argv[i], "%f", &endsil) != 1)
244 || (endsil <= 0.0)) {
245 E_ERROR("Invalid -silsep argument\n");
249 else if ((strcmp(argv[i], "-writeseg") == 0)
250 || (strcmp(argv[i], "-w") == 0)) {
253 else if (strcmp(argv[i], "-min-noise") == 0) {
256 (sscanf(argv[i], "%d", &min_noise) != 1) ||
258 E_ERROR("Invalid -min-noise argument\n");
262 else if (strcmp(argv[i], "-max-noise") == 0) {
265 (sscanf(argv[i], "%d", &max_noise) != 1) ||
267 E_ERROR("Invalid -max-noise argument\n");
271 else if (strcmp(argv[i], "-delta-sil") == 0) {
274 (sscanf(argv[i], "%d", &delta_sil) != 1) ||
276 E_ERROR("Invalid -delta-sil argument\n");
280 else if (strcmp(argv[i], "-delta-speech") == 0) {
283 (sscanf(argv[i], "%d", &delta_speech) != 1) ||
284 (delta_speech < 0)) {
285 E_ERROR("Invalid -delta-speech argument\n");
289 else if (strcmp(argv[i], "-sil-onset") == 0) {
292 (sscanf(argv[i], "%d", &sil_onset) != 1) ||
294 E_ERROR("Invalid -sil-onset argument\n");
298 else if (strcmp(argv[i], "-speech-onset") == 0) {
301 (sscanf(argv[i], "%d", &speech_onset) != 1) ||
302 (speech_onset < 1)) {
303 E_ERROR("Invalid -speech-onset argument\n");
307 else if (strcmp(argv[i], "-adapt-rate") == 0) {
310 (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
311 (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
312 E_ERROR("Invalid -adapt-rate argument\n");
316 else if (strcmp(argv[i], "-max-adreadsize") == 0) {
319 (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
320 (max_ad_read_size < 1)) {
321 E_ERROR("Invalid -max-adreadsize argument\n");
325 else if (strcmp(argv[i], "-c") == 0) {
328 E_ERROR("Invalid -c argument\n");
333 else if ((strcmp(argv[i], "-rawmode") == 0)
334 || (strcmp(argv[i], "-r") == 0)) {
337 else if (strcmp(argv[i], "-i") == 0) {
340 E_ERROR("Invalid -i argument\n");
/* The input file is the only mandatory option. */
350 if (infile == NULL) {
351 E_ERROR("No input file specified\n");
/* Binary mode: the file is read as raw int16-sized samples by file_ad_read. */
355 if ((infp = fopen(infile, "rb")) == NULL)
356 E_FATAL("Failed to open '%s' for reading: %s\n", infile, strerror(errno));
359 * Associate continuous listening module with opened input file and read function.
360 * No A/D device is involved, but need to fill in ad->sps.
361 * Calibrate input data using first few seconds of file, but then rewind it!!
/* Bytes per sample, matching the element size file_ad_read passes to fread. */
364 ad.bps = sizeof(int16);
/*
 * Two initialisers are visible; both register file_ad_read as the reader.
 * NOTE(review): the condition choosing between them is not visible here -
 * presumably the rawmode flag; confirm against the full file.
 */
366 cont = cont_ad_init(&ad, file_ad_read);
368 cont = cont_ad_init_rawmode(&ad, file_ad_read);
370 printf("Calibrating ...");
/* A negative result from cont_ad_calib is reported on stdout. NOTE(review): what happens after the message is not visible. */
372 if (cont_ad_calib(cont) < 0)
373 printf(" failed; file too short?\n");
378 /* Convert desired min. inter-utterance silence duration to #samples */
379 siltime = (int32) (endsil * sps);
381 /* Enable writing raw input to output by the cont module if specified */
/*
 * NOTE(review): a failed fopen is only logged with E_ERROR; whether the
 * cont_ad_set_rawfp call below is skipped in that case cannot be told from
 * the visible lines.
 */
383 if ((rawfp = fopen(copyfile, "wb")) == NULL)
384 E_ERROR("Failed to open raw output file '%s' for writing: %s\n",
385 copyfile, strerror(errno));
387 cont_ad_set_rawfp(cont, rawfp);
/* Read the current parameters of the cont_ad object so they can be logged and used as fallbacks. */
390 cont_ad_get_params(cont,
391 &orig_delta_sil, &orig_delta_speech,
392 &orig_min_noise, &orig_max_noise,
394 &orig_speech_onset, &orig_sil_onset,
395 &leader, &trailer, &orig_adapt_rate);
397 E_INFO("Default parameters:\n");
398 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
399 orig_min_noise, orig_max_noise);
400 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
401 orig_delta_sil, orig_delta_speech);
402 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
403 orig_sil_onset, orig_speech_onset);
404 E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
/*
 * Fall back to the values just fetched for parameters that were not given.
 * NOTE(review): only three of the seven guarding tests are visible; the
 * others presumably follow the same negative-means-unset pattern.
 */
407 min_noise = orig_min_noise;
409 max_noise = orig_max_noise;
411 delta_sil = orig_delta_sil;
412 if (delta_speech < 0)
413 delta_speech = orig_delta_speech;
415 sil_onset = orig_sil_onset;
416 if (speech_onset < 0)
417 speech_onset = orig_speech_onset;
418 if (adapt_rate < 0.0)
419 adapt_rate = orig_adapt_rate;
/* Write the merged parameter set back; leader and trailer are passed through as fetched. */
421 cont_ad_set_params(cont,
422 delta_sil, delta_speech,
423 min_noise, max_noise,
425 speech_onset, sil_onset,
426 leader, trailer, adapt_rate);
428 E_INFO("Current parameters:\n");
429 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
430 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
432 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
434 E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
436 E_INFO("Sampling rate: %d", sps);
437 E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
438 E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
/* NOTE(review): presumably only done in debug mode; the guarding test is not visible. */
441 cont_ad_set_logfp(cont, stdout);
/* Running totals over all utterances, reported after the loop. */
443 total_speech_samples = 0;
444 total_speech_sec = 0.0;
453 /* Get audio data from continuous listening module */
/* Request at most 4096 samples per call; k is the count delivered, negative at end of input. */
454 k = cont_ad_read(cont, buf, 4096);
456 if (k < 0) { /* End of input audio file; close any open output file and exit */
/* Report the utterance in progress: start time, end time and length in seconds, plus its sample count. */
462 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
463 uttid, (double) starttime / (double) sps,
464 (double) (starttime + uttlen) / (double) sps,
465 (double) uttlen / (double) sps, uttlen);
468 total_speech_samples += uttlen;
469 total_speech_sec += (double) uttlen / (double) sps;
477 if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */
478 if (fp != NULL) { /* Currently in an utterance */
479 if (cont->seglen > siltime) { /* Long enough silence detected; end the utterance */
/* Same report and bookkeeping as in the end-of-input case above. */
484 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
485 uttid, (double) starttime / (double) sps,
486 (double) (starttime + uttlen) / (double) sps,
487 (double) uttlen / (double) sps, uttlen);
490 total_speech_samples += uttlen;
491 total_speech_sec += (double) uttlen / (double) sps;
497 * Short silence within utt; write it to output. (Some extra trailing silence
498 * is included in the utterance, as a result. Not to worry about it.)
/* NOTE(review): the fwrite return value is not checked on this visible line. */
501 fwrite(buf, sizeof(int16), k, fp);
/* Any state other than silence must be speech. */
508 assert(cont->state == CONT_AD_STATE_SPEECH);
510 if (fp == NULL) { /* Not in an utt; open a new output file */
/*
 * The segment file is named from the utterance id, or is the null device.
 * NOTE(review): the test choosing between the two names is not visible -
 * presumably the writeseg flag; confirm against the full file.
 */
512 sprintf(segfile, "%08d.raw", uttid);
514 strcpy(segfile, NULL_DEVICE);
515 if ((fp = fopen(segfile, "wb")) == NULL)
516 E_FATAL("Failed to open segmentation file '%s' for writing: %s\n", segfile, strerror(errno));
/* Start of the utterance, taken as the read timestamp minus the k samples just delivered - assumes read_ts counts samples; TODO confirm. */
518 starttime = cont->read_ts - k;
522 /* Write data obtained to output file */
524 fwrite(buf, sizeof(int16), k, fp);
/* Final summary: totals kept by the cont_ad object, then the totals accumulated per utterance above. */
533 E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
534 cont->tot_frm, cont->tot_frm * cont->spf,
535 (cont->tot_frm * cont->spf) / (float32) cont->sps);
536 E_INFO("Total speech detected = %d samples, %.2f sec\n",
537 total_speech_samples, total_speech_sec);