src/sphinx_adtools/cont_fileseg.c

   1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* ====================================================================
   3  * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights
   4  * reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  *
  18  * This work was supported in part by funding from the Defense Advanced
  19  * Research Projects Agency and the National Science Foundation of the
  20  * United States of America, and the CMU Sphinx Speech Consortium.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33  *
  34  * ====================================================================
  35  *
  36  */
  37 /*
  38  * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances.
  39  *
  40  * HISTORY
  41  *
  42  * $Log: cont_fileseg.c,v $
  43  * Revision 1.1.1.1  2006/05/23 18:45:02  dhuggins
  44  * re-importation
  45  *
  46  * Revision 1.13  2005/06/30 00:28:46  rkm
  47  * Kept within-utterance silences in rawmode
  48  *
  49  *
  50  * 28-Jun-2005  M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  51  *              Modified to use new state variables in cont_ad_t.
  52  *
  53  * Revision 1.12  2005/05/31 15:54:38  rkm
  54  * *** empty log message ***
  55  *
  56  * Revision 1.11  2005/05/24 20:56:58  rkm
  57  * Added min/max-noise parameters to cont_fileseg
  58  *
  59  * Revision 1.10  2005/05/13 23:28:43  egouvea
  60  * Changed null device to system dependent one: NUL for windows, /dev/null for everything else
  61  *
  62  * $Log: cont_fileseg.c,v $
  63  * Revision 1.1.1.1  2006/05/23 18:45:02  dhuggins
  64  * re-importation
  65  *
  66  * Revision 1.13  2005/06/30 00:28:46  rkm
  67  * Kept within-utterance silences in rawmode
  68  *
  69  * Revision 1.12  2005/05/31 15:54:38  rkm
  70  * *** empty log message ***
  71  *
  72  * Revision 1.11  2005/05/24 20:56:58  rkm
  73  * Added min/max-noise parameters to cont_fileseg
  74  *
  75  * Revision 1.9  2005/02/13 01:29:48  rkm
  76  * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
  77  *
  78  * Revision 1.8  2005/02/01 22:21:13  rkm
  79  * Added raw data logging, and raw data pass-through mode to cont_ad
  80  *
  81  * Revision 1.7  2004/07/16 00:57:11  egouvea
  82  * Added Ravi's implementation of FSG support.
  83  *
  84  * Revision 1.3  2004/06/25 14:58:05  rkm
  85  * *** empty log message ***
  86  *
  87  * Revision 1.2  2004/06/23 20:32:08  rkm
  88  * Exposed several cont_ad config parameters
  89  *
  90  *
  91  * 27-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  92  *              Created.
  93  */
  94
  95 #include <stdio.h>
  96 #include <stdlib.h>
  97 #include <string.h>
  98 #include <assert.h>
  99 #include <math.h>
 100
 101 #include <sphinxbase/prim_type.h>
 102 #include <sphinxbase/ad.h>
 103 #include <sphinxbase/cont_ad.h>
 104 #include <sphinxbase/err.h>
 105
 106 static FILE *infp;              /* File being segmented */
 107 static int32 swap;
 108
 109 /* Max size read by file_ad_read function on each invocation, for debugging */
 110 static int32 max_ad_read_size;
 111
 112 #if defined(WIN32) && !defined(GNUWINCE)
 113 #define NULL_DEVICE "NUL"
 114 #else
 115 #define NULL_DEVICE "/dev/null"
 116 #endif
 117
 118
 119 /*
 120  * Need to provide cont_ad_init with a read function to read the input file.
 121  * This is it.  The ad_rec_t *r argument is ignored since there is no A/D
 122  * device involved.
 123  */
 124 static int32
 125 file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
 126 {
 127     int32 i, k;
 128
 129     if (max > max_ad_read_size)
 130         max = max_ad_read_size;
 131
 132     k = fread(buf, sizeof(int16), max, infp);
 133     if (swap) {
 134         for (i = 0; i < k; i++) {
 135             buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
 136         }
 137     }
 138
 139     return ((k > 0) ? k : -1);
 140 }
 141
 142
 143 static void
 144 usagemsg(char *pgm)
 145 {
 146     E_INFO("Usage: %s \\\n", pgm);
 147     E_INFOCONT("\t[-? | -h] \\\n");
 148     E_INFOCONT("\t[-d | -debug] \\\n");
 149     E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
 150     E_INFOCONT("\t[-b | -byteswap] \\\n");
 151     E_INFOCONT
 152         ("\t[{-s | -silsep} <length-silence-separator(sec) (0.5)]> \\\n");
 153     E_INFOCONT("\t[-w | -writeseg] \\\n");
 154     E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
 155     E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
 156     E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
 157     E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
 158     E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
 159     E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
 160     E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
 161     E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
 162     E_INFOCONT("\t[-c <copy-input-file>] \\\n");
 163     E_INFOCONT("\t[-r | -rawmode] \\\n");
 164     E_INFOCONT("\t-i <input-file>\n");
 165
 166     exit(0);
 167 }
 168
 169 /*
 170  * Read specified input file, segment it into utterances wherever a silence segment of
 171  * a given minimum duration is encountered.  Filter out long silences.
 172  * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc.
 173  */
 174 int
 175 main(int32 argc, char **argv)
 176 {
 177     cont_ad_t *cont;
 178     int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
 179     int16 buf[4096];
 180     char *infile, *copyfile, segfile[1024];
 181     FILE *fp;
 182     float endsil;
 183     ad_rec_t ad;
 184     int32 i, k;
 185     int32 winsize, leader, trailer;
 186     int32 orig_min_noise, orig_max_noise;
 187     int32 orig_delta_sil, orig_delta_speech;
 188     int32 orig_speech_onset, orig_sil_onset;
 189     int32 min_noise, max_noise;
 190     int32 delta_sil, delta_speech;
 191     int32 sil_onset, speech_onset;
 192     float32 orig_adapt_rate;
 193     float32 adapt_rate;
 194     int32 total_speech_samples;
 195     float32 total_speech_sec;
 196     FILE *rawfp;
 197
 198     /* Set argument defaults */
 199     cont = NULL;
 200     sps = 16000;
 201     swap = 0;
 202     endsil = 0.5;
 203     writeseg = 0;
 204     min_noise = max_noise = -1;
 205     delta_sil = delta_speech = -1;
 206     sil_onset = speech_onset = -1;
 207     adapt_rate = -1.0;
 208     max_ad_read_size = (int32) 0x7ffffff0;
 209     debug = 0;
 210     infile = NULL;
 211     copyfile = NULL;
 212     rawfp = NULL;
 213     rawmode = 0;
 214
 215     /* Parse arguments */
 216     for (i = 1; i < argc; i++) {
 217         if ((strcmp(argv[i], "-help") == 0)
 218             || (strcmp(argv[i], "-h") == 0)
 219             || (strcmp(argv[i], "-?") == 0)) {
 220             usagemsg(argv[0]);
 221         }
 222         else if ((strcmp(argv[i], "-debug") == 0)
 223                  || (strcmp(argv[i], "-d") == 0)) {
 224             debug = 1;
 225         }
 226         else if (strcmp(argv[i], "-sps") == 0) {
 227             i++;
 228             if ((i == argc)
 229                 || (sscanf(argv[i], "%d", &sps) != 1)
 230                 || (sps <= 0)) {
 231                 E_ERROR("Invalid -sps argument\n");
 232                 usagemsg(argv[0]);
 233             }
 234         }
 235         else if ((strcmp(argv[i], "-byteswap") == 0)
 236                  || (strcmp(argv[i], "-b") == 0)) {
 237             swap = 1;
 238         }
 239         else if ((strcmp(argv[i], "-silsep") == 0)
 240                  || (strcmp(argv[i], "-s") == 0)) {
 241             i++;
 242             if ((i == argc)
 243                 || (sscanf(argv[i], "%f", &endsil) != 1)
 244                 || (endsil <= 0.0)) {
 245                 E_ERROR("Invalid -silsep argument\n");
 246                 usagemsg(argv[0]);
 247             }
 248         }
 249         else if ((strcmp(argv[i], "-writeseg") == 0)
 250                  || (strcmp(argv[i], "-w") == 0)) {
 251             writeseg = 1;
 252         }
 253         else if (strcmp(argv[i], "-min-noise") == 0) {
 254             i++;
 255             if ((i == argc) ||
 256                 (sscanf(argv[i], "%d", &min_noise) != 1) ||
 257                 (min_noise < 0)) {
 258                 E_ERROR("Invalid -min-noise argument\n");
 259                 usagemsg(argv[0]);
 260             }
 261         }
 262         else if (strcmp(argv[i], "-max-noise") == 0) {
 263             i++;
 264             if ((i == argc) ||
 265                 (sscanf(argv[i], "%d", &max_noise) != 1) ||
 266                 (max_noise < 0)) {
 267                 E_ERROR("Invalid -max-noise argument\n");
 268                 usagemsg(argv[0]);
 269             }
 270         }
 271         else if (strcmp(argv[i], "-delta-sil") == 0) {
 272             i++;
 273             if ((i == argc) ||
 274                 (sscanf(argv[i], "%d", &delta_sil) != 1) ||
 275                 (delta_sil < 0)) {
 276                 E_ERROR("Invalid -delta-sil argument\n");
 277                 usagemsg(argv[0]);
 278             }
 279         }
 280         else if (strcmp(argv[i], "-delta-speech") == 0) {
 281             i++;
 282             if ((i == argc) ||
 283                 (sscanf(argv[i], "%d", &delta_speech) != 1) ||
 284                 (delta_speech < 0)) {
 285                 E_ERROR("Invalid -delta-speech argument\n");
 286                 usagemsg(argv[0]);
 287             }
 288         }
 289         else if (strcmp(argv[i], "-sil-onset") == 0) {
 290             i++;
 291             if ((i == argc) ||
 292                 (sscanf(argv[i], "%d", &sil_onset) != 1) ||
 293                 (sil_onset < 1)) {
 294                 E_ERROR("Invalid -sil-onset argument\n");
 295                 usagemsg(argv[0]);
 296             }
 297         }
 298         else if (strcmp(argv[i], "-speech-onset") == 0) {
 299             i++;
 300             if ((i == argc) ||
 301                 (sscanf(argv[i], "%d", &speech_onset) != 1) ||
 302                 (speech_onset < 1)) {
 303                 E_ERROR("Invalid -speech-onset argument\n");
 304                 usagemsg(argv[0]);
 305             }
 306         }
 307         else if (strcmp(argv[i], "-adapt-rate") == 0) {
 308             i++;
 309             if ((i == argc) ||
 310                 (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
 311                 (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
 312                 E_ERROR("Invalid -adapt-rate argument\n");
 313                 usagemsg(argv[0]);
 314             }
 315         }
 316         else if (strcmp(argv[i], "-max-adreadsize") == 0) {
 317             i++;
 318             if ((i == argc) ||
 319                 (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
 320                 (max_ad_read_size < 1)) {
 321                 E_ERROR("Invalid -max-adreadsize argument\n");
 322                 usagemsg(argv[0]);
 323             }
 324         }
 325         else if (strcmp(argv[i], "-c") == 0) {
 326             i++;
 327             if (i == argc) {
 328                 E_ERROR("Invalid -c argument\n");
 329                 usagemsg(argv[0]);
 330             }
 331             copyfile = argv[i];
 332         }
 333         else if ((strcmp(argv[i], "-rawmode") == 0)
 334                  || (strcmp(argv[i], "-r") == 0)) {
 335             rawmode = 1;
 336         }
 337         else if (strcmp(argv[i], "-i") == 0) {
 338             i++;
 339             if (i == argc) {
 340                 E_ERROR("Invalid -i argument\n");
 341                 usagemsg(argv[0]);
 342             }
 343             infile = argv[i];
 344         }
 345         else {
 346             usagemsg(argv[0]);
 347         }
 348     }
 349
 350     if (infile == NULL) {
 351         E_ERROR("No input file specified\n");
 352         usagemsg(argv[0]);
 353     }
 354
 355     if ((infp = fopen(infile, "rb")) == NULL)
 356         E_FATAL("Failed to open '%s' for reading: %s\n", infile, strerror(errno));
 357
 358     /*
 359      * Associate continuous listening module with opened input file and read function.
 360      * No A/D device is involved, but need to fill in ad->sps.
 361      * Calibrate input data using first few seconds of file, but then rewind it!!
 362      */
 363     ad.sps = sps;
 364     ad.bps = sizeof(int16);
 365     if (!rawmode)
 366         cont = cont_ad_init(&ad, file_ad_read);
 367     else
 368         cont = cont_ad_init_rawmode(&ad, file_ad_read);
 369
 370     printf("Calibrating ...");
 371     fflush(stdout);
 372     if (cont_ad_calib(cont) < 0)
 373         printf(" failed; file too short?\n");
 374     else
 375         printf(" done\n");
 376     rewind(infp);
 377
 378     /* Convert desired min. inter-utterance silence duration to #samples */
 379     siltime = (int32) (endsil * sps);
 380
 381     /* Enable writing raw input to output by the cont module if specified */
 382     if (copyfile) {
 383         if ((rawfp = fopen(copyfile, "wb")) == NULL)
 384             E_ERROR("Failed to open raw output file '%s' for writing: %s\n",
 385                     copyfile, strerror(errno));
 386         else
 387             cont_ad_set_rawfp(cont, rawfp);
 388     }
 389
 390     cont_ad_get_params(cont,
 391                        &orig_delta_sil, &orig_delta_speech,
 392                        &orig_min_noise, &orig_max_noise,
 393                        &winsize,
 394                        &orig_speech_onset, &orig_sil_onset,
 395                        &leader, &trailer, &orig_adapt_rate);
 396
 397     E_INFO("Default parameters:\n");
 398     E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
 399                orig_min_noise, orig_max_noise);
 400     E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
 401                orig_delta_sil, orig_delta_speech);
 402     E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
 403                orig_sil_onset, orig_speech_onset);
 404     E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
 405
 406     if (min_noise < 0)
 407         min_noise = orig_min_noise;
 408     if (max_noise < 0)
 409         max_noise = orig_max_noise;
 410     if (delta_sil < 0)
 411         delta_sil = orig_delta_sil;
 412     if (delta_speech < 0)
 413         delta_speech = orig_delta_speech;
 414     if (sil_onset < 0)
 415         sil_onset = orig_sil_onset;
 416     if (speech_onset < 0)
 417         speech_onset = orig_speech_onset;
 418     if (adapt_rate < 0.0)
 419         adapt_rate = orig_adapt_rate;
 420
 421     cont_ad_set_params(cont,
 422                        delta_sil, delta_speech,
 423                        min_noise, max_noise,
 424                        winsize,
 425                        speech_onset, sil_onset,
 426                        leader, trailer, adapt_rate);
 427
 428     E_INFO("Current parameters:\n");
 429     E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
 430     E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
 431                delta_speech);
 432     E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
 433                speech_onset);
 434     E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
 435
 436     E_INFO("Sampling rate: %d", sps);
 437     E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
 438     E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
 439
 440     if (debug)
 441         cont_ad_set_logfp(cont, stdout);
 442
 443     total_speech_samples = 0;
 444     total_speech_sec = 0.0;
 445
 446     uttid = 0;
 447     uttlen = 0;
 448     starttime = 0;
 449     fp = NULL;
 450
 451     /* Process data */
 452     for (;;) {
 453         /* Get audio data from continuous listening module */
 454         k = cont_ad_read(cont, buf, 4096);
 455
 456         if (k < 0) {            /* End of input audio file; close any open output file and exit */
 457             if (fp != NULL) {
 458                 fclose(fp);
 459                 fp = NULL;
 460
 461                 printf
 462                     ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
 463                      uttid, (double) starttime / (double) sps,
 464                      (double) (starttime + uttlen) / (double) sps,
 465                      (double) uttlen / (double) sps, uttlen);
 466                 fflush(stdout);
 467
 468                 total_speech_samples += uttlen;
 469                 total_speech_sec += (double) uttlen / (double) sps;
 470
 471                 uttid++;
 472             }
 473
 474             break;
 475         }
 476
 477         if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */
 478             if (fp != NULL) {   /* Currently in an utterance */
 479                 if (cont->seglen > siltime) {   /* Long enough silence detected; end the utterance */
 480                     fclose(fp);
 481                     fp = NULL;
 482
 483                     printf
 484                         ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
 485                          uttid, (double) starttime / (double) sps,
 486                          (double) (starttime + uttlen) / (double) sps,
 487                          (double) uttlen / (double) sps, uttlen);
 488                     fflush(stdout);
 489
 490                     total_speech_samples += uttlen;
 491                     total_speech_sec += (double) uttlen / (double) sps;
 492
 493                     uttid++;
 494                 }
 495                 else {
 496                     /*
 497                      * Short silence within utt; write it to output.  (Some extra trailing silence
 498                      * is included in the utterance, as a result.  Not to worry about it.)
 499                      */
 500                     if (k > 0) {
 501                         fwrite(buf, sizeof(int16), k, fp);
 502                         uttlen += k;
 503                     }
 504                 }
 505             }
 506         }
 507         else {
 508             assert(cont->state == CONT_AD_STATE_SPEECH);
 509
 510             if (fp == NULL) {   /* Not in an utt; open a new output file */
 511                 if (writeseg)
 512                     sprintf(segfile, "%08d.raw", uttid);
 513                 else
 514                     strcpy(segfile, NULL_DEVICE);
 515                 if ((fp = fopen(segfile, "wb")) == NULL)
 516                     E_FATAL("Failed to open segmentation file '%s' for writing: %s\n", segfile, strerror(errno));
 517
 518                 starttime = cont->read_ts - k;
 519                 uttlen = 0;
 520             }
 521
 522             /* Write data obtained to output file */
 523             if (k > 0) {
 524                 fwrite(buf, sizeof(int16), k, fp);
 525                 uttlen += k;
 526             }
 527         }
 528     }
 529
 530     if (rawfp)
 531         fclose(rawfp);
 532
 533     E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
 534            cont->tot_frm, cont->tot_frm * cont->spf,
 535            (cont->tot_frm * cont->spf) / (float32) cont->sps);
 536     E_INFO("Total speech detected = %d samples, %.2f sec\n",
 537            total_speech_samples, total_speech_sec);
 538
 539     cont_ad_close(cont);
 540
 541     return 0;
 542 }