SphinxBase
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 /* 00038 * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances. 00039 * 00040 * HISTORY 00041 * 00042 * $Log: cont_fileseg.c,v $ 00043 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins 00044 * re-importation 00045 * 00046 * Revision 1.13 2005/06/30 00:28:46 rkm 00047 * Kept within-utterance silences in rawmode 00048 * 00049 * 00050 * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00051 * Modified to use new state variables in cont_ad_t. 00052 * 00053 * Revision 1.12 2005/05/31 15:54:38 rkm 00054 * *** empty log message *** 00055 * 00056 * Revision 1.11 2005/05/24 20:56:58 rkm 00057 * Added min/max-noise parameters to cont_fileseg 00058 * 00059 * Revision 1.10 2005/05/13 23:28:43 egouvea 00060 * Changed null device to system dependent one: NUL for windows, /dev/null for everything else 00061 * 00062 * $Log: cont_fileseg.c,v $ 00063 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins 00064 * re-importation 00065 * 00066 * Revision 1.13 2005/06/30 00:28:46 rkm 00067 * Kept within-utterance silences in rawmode 00068 * 00069 * Revision 1.12 2005/05/31 15:54:38 rkm 00070 * *** empty log message *** 00071 * 00072 * Revision 1.11 2005/05/24 20:56:58 rkm 00073 * Added min/max-noise parameters to cont_fileseg 00074 * 00075 * Revision 1.9 2005/02/13 01:29:48 rkm 00076 * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode 00077 * 00078 * Revision 1.8 2005/02/01 22:21:13 rkm 00079 * Added raw data logging, and raw data pass-through mode to cont_ad 00080 * 00081 * Revision 1.7 2004/07/16 00:57:11 egouvea 00082 * Added Ravi's implementation of FSG support. 00083 * 00084 * Revision 1.3 2004/06/25 14:58:05 rkm 00085 * *** empty log message *** 00086 * 00087 * Revision 1.2 2004/06/23 20:32:08 rkm 00088 * Exposed several cont_ad config parameters 00089 * 00090 * 00091 * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00092 * Created. 00093 */ 00094 00095 #include <stdio.h> 00096 #include <stdlib.h> 00097 #include <string.h> 00098 #include <assert.h> 00099 #include <math.h> 00100 00101 #include <sphinxbase/prim_type.h> 00102 #include <sphinxbase/ad.h> 00103 #include <sphinxbase/cont_ad.h> 00104 #include <sphinxbase/err.h> 00105 00106 static FILE *infp; /* File being segmented */ 00107 static int32 swap; 00108 00109 /* Max size read by file_ad_read function on each invocation, for debugging */ 00110 static int32 max_ad_read_size; 00111 00112 #if defined(WIN32) && !defined(GNUWINCE) 00113 #define NULL_DEVICE "NUL" 00114 #else 00115 #define NULL_DEVICE "/dev/null" 00116 #endif 00117 00118 00119 /* 00120 * Need to provide cont_ad_init with a read function to read the input file. 00121 * This is it. The ad_rec_t *r argument is ignored since there is no A/D 00122 * device involved. 00123 */ 00124 static int32 00125 file_ad_read(ad_rec_t * r, int16 * buf, int32 max) 00126 { 00127 int32 i, k; 00128 00129 if (max > max_ad_read_size) 00130 max = max_ad_read_size; 00131 00132 k = fread(buf, sizeof(int16), max, infp); 00133 if (swap) { 00134 for (i = 0; i < k; i++) { 00135 buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00); 00136 } 00137 } 00138 00139 return ((k > 0) ? k : -1); 00140 } 00141 00142 00143 static void 00144 usagemsg(char *pgm) 00145 { 00146 E_INFO("Usage: %s \\\n", pgm); 00147 E_INFOCONT("\t[-? | -h] \\\n"); 00148 E_INFOCONT("\t[-d | -debug] \\\n"); 00149 E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n"); 00150 E_INFOCONT("\t[-b | -byteswap] \\\n"); 00151 E_INFOCONT 00152 ("\t[{-s | -silsep} <length-silence-separator(sec) (0.5)]> \\\n"); 00153 E_INFOCONT("\t[-w | -writeseg] \\\n"); 00154 E_INFOCONT("\t[-min-noise <min-noise>] \\\n"); 00155 E_INFOCONT("\t[-max-noise <max-noise>] \\\n"); 00156 E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n"); 00157 E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n"); 00158 E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n"); 00159 E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n"); 00160 E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n"); 00161 E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n"); 00162 E_INFOCONT("\t[-c <copy-input-file>] \\\n"); 00163 E_INFOCONT("\t[-r | -rawmode] \\\n"); 00164 E_INFOCONT("\t-i <input-file>\n"); 00165 00166 exit(0); 00167 } 00168 00169 /* 00170 * Read specified input file, segment it into utterances wherever a silence segment of 00171 * a given minimum duration is encountered. Filter out long silences. 00172 * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc. 00173 */ 00174 int 00175 main(int32 argc, char **argv) 00176 { 00177 cont_ad_t *cont; 00178 int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode; 00179 int16 buf[4096]; 00180 char *infile, *copyfile, segfile[1024]; 00181 FILE *fp; 00182 float endsil; 00183 ad_rec_t ad; 00184 int32 i, k; 00185 int32 winsize, leader, trailer; 00186 int32 orig_min_noise, orig_max_noise; 00187 int32 orig_delta_sil, orig_delta_speech; 00188 int32 orig_speech_onset, orig_sil_onset; 00189 int32 min_noise, max_noise; 00190 int32 delta_sil, delta_speech; 00191 int32 sil_onset, speech_onset; 00192 float32 orig_adapt_rate; 00193 float32 adapt_rate; 00194 int32 total_speech_samples; 00195 float32 total_speech_sec; 00196 FILE *rawfp; 00197 00198 /* Set argument defaults */ 00199 cont = NULL; 00200 sps = 16000; 00201 swap = 0; 00202 endsil = 0.5; 00203 writeseg = 0; 00204 min_noise = max_noise = -1; 00205 delta_sil = delta_speech = -1; 00206 sil_onset = speech_onset = -1; 00207 adapt_rate = -1.0; 00208 max_ad_read_size = (int32) 0x7ffffff0; 00209 debug = 0; 00210 infile = NULL; 00211 copyfile = NULL; 00212 rawfp = NULL; 00213 rawmode = 0; 00214 00215 /* Parse arguments */ 00216 for (i = 1; i < argc; i++) { 00217 if ((strcmp(argv[i], "-help") == 0) 00218 || (strcmp(argv[i], "-h") == 0) 00219 || (strcmp(argv[i], "-?") == 0)) { 00220 usagemsg(argv[0]); 00221 } 00222 else if ((strcmp(argv[i], "-debug") == 0) 00223 || (strcmp(argv[i], "-d") == 0)) { 00224 debug = 1; 00225 } 00226 else if (strcmp(argv[i], "-sps") == 0) { 00227 i++; 00228 if ((i == argc) 00229 || (sscanf(argv[i], "%d", &sps) != 1) 00230 || (sps <= 0)) { 00231 E_ERROR("Invalid -sps argument\n"); 00232 usagemsg(argv[0]); 00233 } 00234 } 00235 else if ((strcmp(argv[i], "-byteswap") == 0) 00236 || (strcmp(argv[i], "-b") == 0)) { 00237 swap = 1; 00238 } 00239 else if ((strcmp(argv[i], "-silsep") == 0) 00240 || (strcmp(argv[i], "-s") == 0)) { 00241 i++; 00242 if ((i == argc) 00243 || (sscanf(argv[i], "%f", &endsil) != 1) 00244 || (endsil <= 0.0)) { 00245 E_ERROR("Invalid -silsep argument\n"); 00246 usagemsg(argv[0]); 00247 } 00248 } 00249 else if ((strcmp(argv[i], "-writeseg") == 0) 00250 || (strcmp(argv[i], "-w") == 0)) { 00251 writeseg = 1; 00252 } 00253 else if (strcmp(argv[i], "-min-noise") == 0) { 00254 i++; 00255 if ((i == argc) || 00256 (sscanf(argv[i], "%d", &min_noise) != 1) || 00257 (min_noise < 0)) { 00258 E_ERROR("Invalid -min-noise argument\n"); 00259 usagemsg(argv[0]); 00260 } 00261 } 00262 else if (strcmp(argv[i], "-max-noise") == 0) { 00263 i++; 00264 if ((i == argc) || 00265 (sscanf(argv[i], "%d", &max_noise) != 1) || 00266 (max_noise < 0)) { 00267 E_ERROR("Invalid -max-noise argument\n"); 00268 usagemsg(argv[0]); 00269 } 00270 } 00271 else if (strcmp(argv[i], "-delta-sil") == 0) { 00272 i++; 00273 if ((i == argc) || 00274 (sscanf(argv[i], "%d", &delta_sil) != 1) || 00275 (delta_sil < 0)) { 00276 E_ERROR("Invalid -delta-sil argument\n"); 00277 usagemsg(argv[0]); 00278 } 00279 } 00280 else if (strcmp(argv[i], "-delta-speech") == 0) { 00281 i++; 00282 if ((i == argc) || 00283 (sscanf(argv[i], "%d", &delta_speech) != 1) || 00284 (delta_speech < 0)) { 00285 E_ERROR("Invalid -delta-speech argument\n"); 00286 usagemsg(argv[0]); 00287 } 00288 } 00289 else if (strcmp(argv[i], "-sil-onset") == 0) { 00290 i++; 00291 if ((i == argc) || 00292 (sscanf(argv[i], "%d", &sil_onset) != 1) || 00293 (sil_onset < 1)) { 00294 E_ERROR("Invalid -sil-onset argument\n"); 00295 usagemsg(argv[0]); 00296 } 00297 } 00298 else if (strcmp(argv[i], "-speech-onset") == 0) { 00299 i++; 00300 if ((i == argc) || 00301 (sscanf(argv[i], "%d", &speech_onset) != 1) || 00302 (speech_onset < 1)) { 00303 E_ERROR("Invalid -speech-onset argument\n"); 00304 usagemsg(argv[0]); 00305 } 00306 } 00307 else if (strcmp(argv[i], "-adapt-rate") == 0) { 00308 i++; 00309 if ((i == argc) || 00310 (sscanf(argv[i], "%f", &adapt_rate) != 1) || 00311 (adapt_rate < 0.0) || (adapt_rate > 1.0)) { 00312 E_ERROR("Invalid -adapt-rate argument\n"); 00313 usagemsg(argv[0]); 00314 } 00315 } 00316 else if (strcmp(argv[i], "-max-adreadsize") == 0) { 00317 i++; 00318 if ((i == argc) || 00319 (sscanf(argv[i], "%d", &max_ad_read_size) != 1) || 00320 (max_ad_read_size < 1)) { 00321 E_ERROR("Invalid -max-adreadsize argument\n"); 00322 usagemsg(argv[0]); 00323 } 00324 } 00325 else if (strcmp(argv[i], "-c") == 0) { 00326 i++; 00327 if (i == argc) { 00328 E_ERROR("Invalid -c argument\n"); 00329 usagemsg(argv[0]); 00330 } 00331 copyfile = argv[i]; 00332 } 00333 else if ((strcmp(argv[i], "-rawmode") == 0) 00334 || (strcmp(argv[i], "-r") == 0)) { 00335 rawmode = 1; 00336 } 00337 else if (strcmp(argv[i], "-i") == 0) { 00338 i++; 00339 if (i == argc) { 00340 E_ERROR("Invalid -i argument\n"); 00341 usagemsg(argv[0]); 00342 } 00343 infile = argv[i]; 00344 } 00345 else { 00346 usagemsg(argv[0]); 00347 } 00348 } 00349 00350 if (infile == NULL) { 00351 E_ERROR("No input file specified\n"); 00352 usagemsg(argv[0]); 00353 } 00354 00355 if ((infp = fopen(infile, "rb")) == NULL) 00356 E_FATAL("Failed to open '%s' for reading: %s\n", infile, strerror(errno)); 00357 00358 /* 00359 * Associate continuous listening module with opened input file and read function. 00360 * No A/D device is involved, but need to fill in ad->sps. 00361 * Calibrate input data using first few seconds of file, but then rewind it!! 00362 */ 00363 ad.sps = sps; 00364 ad.bps = sizeof(int16); 00365 if (!rawmode) 00366 cont = cont_ad_init(&ad, file_ad_read); 00367 else 00368 cont = cont_ad_init_rawmode(&ad, file_ad_read); 00369 00370 printf("Calibrating ..."); 00371 fflush(stdout); 00372 if (cont_ad_calib(cont) < 0) 00373 printf(" failed; file too short?\n"); 00374 else 00375 printf(" done\n"); 00376 rewind(infp); 00377 00378 /* Convert desired min. inter-utterance silence duration to #samples */ 00379 siltime = (int32) (endsil * sps); 00380 00381 /* Enable writing raw input to output by the cont module if specified */ 00382 if (copyfile) { 00383 if ((rawfp = fopen(copyfile, "wb")) == NULL) 00384 E_ERROR("Failed to open raw output file '%s' for writing: %s\n", 00385 copyfile, strerror(errno)); 00386 else 00387 cont_ad_set_rawfp(cont, rawfp); 00388 } 00389 00390 cont_ad_get_params(cont, 00391 &orig_delta_sil, &orig_delta_speech, 00392 &orig_min_noise, &orig_max_noise, 00393 &winsize, 00394 &orig_speech_onset, &orig_sil_onset, 00395 &leader, &trailer, &orig_adapt_rate); 00396 00397 E_INFO("Default parameters:\n"); 00398 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", 00399 orig_min_noise, orig_max_noise); 00400 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", 00401 orig_delta_sil, orig_delta_speech); 00402 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", 00403 orig_sil_onset, orig_speech_onset); 00404 E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate); 00405 00406 if (min_noise < 0) 00407 min_noise = orig_min_noise; 00408 if (max_noise < 0) 00409 max_noise = orig_max_noise; 00410 if (delta_sil < 0) 00411 delta_sil = orig_delta_sil; 00412 if (delta_speech < 0) 00413 delta_speech = orig_delta_speech; 00414 if (sil_onset < 0) 00415 sil_onset = orig_sil_onset; 00416 if (speech_onset < 0) 00417 speech_onset = orig_speech_onset; 00418 if (adapt_rate < 0.0) 00419 adapt_rate = orig_adapt_rate; 00420 00421 cont_ad_set_params(cont, 00422 delta_sil, delta_speech, 00423 min_noise, max_noise, 00424 winsize, 00425 speech_onset, sil_onset, 00426 leader, trailer, adapt_rate); 00427 00428 E_INFO("Current parameters:\n"); 00429 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise); 00430 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil, 00431 delta_speech); 00432 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset, 00433 speech_onset); 00434 E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate); 00435 00436 E_INFO("Sampling rate: %d", sps); 00437 E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No"); 00438 E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size); 00439 00440 if (debug) 00441 cont_ad_set_logfp(cont, stdout); 00442 00443 total_speech_samples = 0; 00444 total_speech_sec = 0.0; 00445 00446 uttid = 0; 00447 uttlen = 0; 00448 starttime = 0; 00449 fp = NULL; 00450 00451 /* Process data */ 00452 for (;;) { 00453 /* Get audio data from continuous listening module */ 00454 k = cont_ad_read(cont, buf, 4096); 00455 00456 if (k < 0) { /* End of input audio file; close any open output file and exit */ 00457 if (fp != NULL) { 00458 fclose(fp); 00459 fp = NULL; 00460 00461 printf 00462 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n", 00463 uttid, (double) starttime / (double) sps, 00464 (double) (starttime + uttlen) / (double) sps, 00465 (double) uttlen / (double) sps, uttlen); 00466 fflush(stdout); 00467 00468 total_speech_samples += uttlen; 00469 total_speech_sec += (double) uttlen / (double) sps; 00470 00471 uttid++; 00472 } 00473 00474 break; 00475 } 00476 00477 if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */ 00478 if (fp != NULL) { /* Currently in an utterance */ 00479 if (cont->seglen > siltime) { /* Long enough silence detected; end the utterance */ 00480 fclose(fp); 00481 fp = NULL; 00482 00483 printf 00484 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n", 00485 uttid, (double) starttime / (double) sps, 00486 (double) (starttime + uttlen) / (double) sps, 00487 (double) uttlen / (double) sps, uttlen); 00488 fflush(stdout); 00489 00490 total_speech_samples += uttlen; 00491 total_speech_sec += (double) uttlen / (double) sps; 00492 00493 uttid++; 00494 } 00495 else { 00496 /* 00497 * Short silence within utt; write it to output. (Some extra trailing silence 00498 * is included in the utterance, as a result. Not to worry about it.) 00499 */ 00500 if (k > 0) { 00501 fwrite(buf, sizeof(int16), k, fp); 00502 uttlen += k; 00503 } 00504 } 00505 } 00506 } 00507 else { 00508 assert(cont->state == CONT_AD_STATE_SPEECH); 00509 00510 if (fp == NULL) { /* Not in an utt; open a new output file */ 00511 if (writeseg) 00512 sprintf(segfile, "%08d.raw", uttid); 00513 else 00514 strcpy(segfile, NULL_DEVICE); 00515 if ((fp = fopen(segfile, "wb")) == NULL) 00516 E_FATAL("Failed to open segmentation file '%s' for writing: %s\n", segfile, strerror(errno)); 00517 00518 starttime = cont->read_ts - k; 00519 uttlen = 0; 00520 } 00521 00522 /* Write data obtained to output file */ 00523 if (k > 0) { 00524 fwrite(buf, sizeof(int16), k, fp); 00525 uttlen += k; 00526 } 00527 } 00528 } 00529 00530 if (rawfp) 00531 fclose(rawfp); 00532 00533 E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n", 00534 cont->tot_frm, cont->tot_frm * cont->spf, 00535 (cont->tot_frm * cont->spf) / (float32) cont->sps); 00536 E_INFO("Total speech detected = %d samples, %.2f sec\n", 00537 total_speech_samples, total_speech_sec); 00538 00539 cont_ad_close(cont); 00540 00541 return 0; 00542 }