SphinxBase  0.6
src/sphinx_fe/sphinx_fe.c
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1996-2004 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 #include <stdio.h>
00038 #include <stdlib.h>
00039 #include <string.h>
00040 #include <time.h>
00041 #include <assert.h>
00042 
00043 #ifdef HAVE_CONFIG_H
00044 #include <config.h>
00045 #endif
00046 
00047 #ifdef HAVE_SNDFILE_H
00048 #include <sndfile.h>
00049 #endif
00050 
00051 #include <sphinxbase/fe.h>
00052 #include <sphinxbase/strfuncs.h>
00053 #include <sphinxbase/pio.h>
00054 #include <sphinxbase/filename.h>
00055 #include <sphinxbase/cmd_ln.h>
00056 #include <sphinxbase/err.h>
00057 #include <sphinxbase/ckd_alloc.h>
00058 #include <sphinxbase/byteorder.h>
00059 #include <sphinxbase/hash_table.h>
00060 
00061 #include "sphinx_wave2feat.h"
00062 #include "cmd_ln_defn.h"
00063 
00064 typedef struct audio_type_s {
00065     char const *name;
00066     int (*detect)(sphinx_wave2feat_t *wtf, char const *infile);
00067     int (*decode)(sphinx_wave2feat_t *wtf);
00068 } audio_type_t;
00069 
00070 typedef struct output_type_s {
00071     char const *name;
00072     int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
00073     int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
00074 } output_type_t;
00075 
00076 struct sphinx_wave2feat_s {
00077     int refcount;     
00078     cmd_ln_t *config; 
00079     fe_t *fe;         
00080     char *infile;     
00081     char *outfile;    
00082     FILE *infh;       
00083     FILE *outfh;      
00084     short *audio;     
00085     mfcc_t **feat;    
00086     int blocksize;    
00087     int featsize;     
00088     int veclen;       
00089     int in_veclen;    
00090     int byteswap;     
00091 #ifdef HAVE_SNDFILE_H
00092     SNDFILE *insfh;   
00093 #endif
00094     output_type_t const *ot;
00095 };
00096 
00098 typedef struct RIFFHeader{
00099     char rifftag[4];      /* "RIFF" string */
00100     int32 TotalLength;      /* Total length */
00101     char wavefmttag[8];   /* "WAVEfmt " string (note space after 't') */
00102     int32 RemainingLength;  /* Remaining length */
00103     int16 data_format;    /* data format tag, 1 = PCM */
00104     int16 numchannels;    /* Number of channels in file */
00105     int32 SamplingFreq;     /* Sampling frequency */
00106     int32 BytesPerSec;      /* Average bytes/sec */
00107     int16 BlockAlign;     /* Block align */
00108     int16 BitsPerSample;  /* 8 or 16 bit */
00109     char datatag[4];      /* "data" string */
00110     int32 datalength;       /* Raw data length */
00111 } MSWAV_hdr;
00112 
00118 static int
00119 detect_riff(sphinx_wave2feat_t *wtf, char const *infile)
00120 {
00121     FILE *fh;
00122     MSWAV_hdr hdr;
00123 
00124     if ((fh = fopen(infile, "rb")) == NULL) {
00125         E_ERROR_SYSTEM("Failed to open %s", infile);
00126         return -1;
00127     }
00128     if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
00129         E_ERROR_SYSTEM("Failed to read RIFF header");
00130         fclose(fh);
00131         return -1;
00132     }
00133     /* Make sure it is actually a RIFF file. */
00134     if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
00135         fclose(fh);
00136         return FALSE;
00137     }
00138 
00139     /* Get relevant information. */
00140     cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
00141     cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
00142     if (wtf->infile)
00143         ckd_free(wtf->infile);
00144     wtf->infile = ckd_salloc(infile);
00145     wtf->infh = fh;
00146 
00147     return TRUE;
00148 }
00149 
00150 static int
00151 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh)
00152 {
00153     char nist[7];
00154     lineiter_t *li;
00155     FILE *fh;
00156 
00157     if ((fh = fopen(infile, "rb")) == NULL) {
00158         E_ERROR_SYSTEM("Failed to open %s", infile);
00159         return -1;
00160     }
00161     if (fread(&nist, 1, 7, fh) != 7) {
00162         E_ERROR_SYSTEM("Failed to read NIST header");
00163         fclose(fh);
00164         return -1;
00165     }
00166     /* Is this actually a NIST file? */
00167     if (0 != strncmp(nist, "NIST_1A", 7)) {
00168         fclose(fh);
00169         return FALSE;
00170     }
00171     /* Rewind, parse lines. */
00172     fseek(fh, 0, SEEK_SET);
00173     for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
00174         char **words;
00175         int nword;
00176 
00177         string_trim(li->buf, STRING_BOTH);
00178         if (strlen(li->buf) == 0) {
00179             lineiter_free(li);
00180             break;
00181         }
00182         nword = str2words(li->buf, NULL, 0);
00183         if (nword != 3)
00184             continue;
00185         words = ckd_calloc(nword, sizeof(*words));
00186         str2words(li->buf, words, nword);
00187         if (0 == strcmp(words[0], "sample_rate")) {
00188             cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
00189         }
00190         if (0 == strcmp(words[0], "channel_count")) {
00191             cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
00192         }
00193         if (0 == strcmp(words[0], "sample_byte_format")) {
00194             cmd_ln_set_str_r(wtf->config, "-input_endian",
00195                              (0 == strcmp(words[2], "10")) ? "big" : "little");
00196         }
00197         ckd_free(words);
00198     }
00199 
00200     fseek(fh, 1024, SEEK_SET);
00201     if (out_fh)
00202         *out_fh = fh;
00203     else
00204         fclose(fh);
00205     return TRUE;
00206 }
00207 
00208 #ifdef HAVE_POPEN
00209 static int
00210 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile)
00211 {
00212     FILE *fh;
00213     char *cmdline;
00214     int rv;
00215 
00216     /* Determine if it's NIST file and get parameters. */
00217     if ((rv = open_nist_file(wtf, infile, NULL)) != TRUE)
00218         return rv;
00219 
00220     /* Now popen it with sph2pipe. */
00221     cmdline = string_join("sph2pipe -f raw '", infile, "'", NULL);
00222     if ((fh = popen(cmdline, "r")) == NULL) {
00223         E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", infile);
00224         ckd_free(cmdline);
00225         return -1;
00226     }
00227 
00228     if (wtf->infile)
00229         ckd_free(wtf->infile);
00230     wtf->infile = ckd_salloc(infile);
00231     wtf->infh = fh;
00232     return TRUE;
00233 }
00234 #else /* !HAVE_POPEN */
00235 static int
00236 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile)
00237 {
00238     E_ERROR("popen() not available, cannot run sph2pipe\n");
00239     return -1;
00240 }
00241 #endif /* !HAVE_POPEN */
00242 
00248 static int
00249 detect_nist(sphinx_wave2feat_t *wtf, char const *infile)
00250 {
00251     FILE *fh;
00252     int rv;
00253 
00254     if ((rv = open_nist_file(wtf, infile, &fh)) != TRUE)
00255         return rv;
00256     if (wtf->infile)
00257         ckd_free(wtf->infile);
00258     wtf->infile = ckd_salloc(infile);
00259     wtf->infh = fh;
00260     return TRUE;
00261 }
00262 
00263 
00270 static int
00271 detect_raw(sphinx_wave2feat_t *wtf, char const *infile)
00272 {
00273     FILE *fh;
00274 
00275     if ((fh = fopen(infile, "rb")) == NULL) {
00276         E_ERROR_SYSTEM("Failed to open %s", infile);
00277         return -1;
00278     }
00279     if (wtf->infile)
00280         ckd_free(wtf->infile);
00281     wtf->infile = ckd_salloc(infile);
00282     wtf->infh = fh;
00283     return TRUE;
00284 }
00285 
00292 static int
00293 detect_sphinx_mfc(sphinx_wave2feat_t *wtf, char const *infile)
00294 {
00295     FILE *fh;
00296     int32 len;
00297     long flen;
00298 
00299     if ((fh = fopen(infile, "rb")) == NULL) {
00300         E_ERROR_SYSTEM("Failed to open %s", infile);
00301         return -1;
00302     }
00303     if (fread(&len, 4, 1, fh) != 1) {
00304         E_ERROR_SYSTEM("Failed to read header from %s\n", infile);
00305         return -1;
00306     }
00307     fseek(fh, 0, SEEK_END);
00308     flen = ftell(fh);
00309 
00310     /* figure out whether to byteswap */
00311     flen = (flen / 4) - 1;
00312     if (flen != len) {
00313         /* First make sure this is an endianness problem, otherwise fail. */
00314         SWAP_INT32(&len);
00315         if (flen != len) {
00316             SWAP_INT32(&len);
00317             E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
00318                     len, flen);
00319             return -1;
00320         }
00321         /* Set the input endianness to the opposite of the machine endianness... */
00322         cmd_ln_set_str_r(wtf->config, "-input_endian",
00323                          (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
00324                           ? "little" : "big"));
00325     }
00326     
00327     fseek(fh, 4, SEEK_SET);
00328     if (wtf->infile)
00329         ckd_free(wtf->infile);
00330     wtf->infile = ckd_salloc(infile);
00331     wtf->infh = fh;
00332     if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
00333         wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
00334     }
00335     else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00336         wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
00337         wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
00338     }
00339     else {
00340         /* Should not happen. */
00341         E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
00342         assert(FALSE);
00343     }
00344             
00345     return TRUE;
00346 }
00347 
/**
 * Reduce interleaved multi-channel audio to a single channel, in place.
 *
 * @param buf       Interleaved samples; overwritten with the result.
 * @param nsamp     Total number of samples in buf (all channels).
 * @param nchans    Number of interleaved channels.
 * @param whichchan 1-based channel to keep, or a value <= 0 to average
 *                  all channels.  Expected to be at most nchans.
 * @return Number of samples left at the start of buf.  If nchans is
 *         less than 1 the buffer is left untouched and nsamp is
 *         returned.
 */
int
mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
{
    int i, j;

    /* Without a valid channel count the loops below would divide by
     * zero or never advance. */
    if (nchans < 1)
        return nsamp;

    if (whichchan > 0) {
        /* Keep one sample from every frame. */
        for (i = whichchan - 1; i < nsamp; i += nchans)
            buf[i/nchans] = buf[i];
    }
    else {
        /* Average each frame; a truncated final frame is still divided
         * by the full channel count. */
        for (i = 0; i < nsamp; i += nchans) {
            float64 tmp = 0.0;
            for (j = 0; j < nchans && i + j < nsamp; ++j) {
                tmp += buf[i + j];
            }
            buf[i/nchans] = (int16)(tmp / nchans);
        }
    }
    return i/nchans;
}
00368 
00369 #ifdef HAVE_SNDFILE_H
00370 
00375 static int
00376 detect_sndfile(sphinx_wave2feat_t *wtf, char const *infile)
00377 {
00378     SNDFILE *sf;
00379     SF_INFO sfinfo;
00380 
00381     memset(&sfinfo, 0, sizeof(sfinfo));
00382     /* We let other detectors catch I/O errors, since there is
00383        no way to tell them from format errors when opening :( */
00384     if ((sf = sf_open(infile, SFM_READ, &sfinfo)) == NULL) {
00385         return FALSE;
00386     }
00387     /* Get relevant information. */
00388     cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels);
00389     cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate);
00390     if (wtf->infile)
00391         ckd_free(wtf->infile);
00392     wtf->infile = ckd_salloc(infile);
00393     wtf->insfh = sf;
00394     wtf->infh = NULL;
00395 
00396     return TRUE;
00397 }
00398 
00403 static int
00404 decode_sndfile(sphinx_wave2feat_t *wtf)
00405 {
00406     size_t nsamp;
00407     int32 nfr, nchans, whichchan;
00408     int nfloat, n;
00409 
00410     nchans = cmd_ln_int32_r(wtf->config, "-nchans");
00411     whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
00412     fe_start_utt(wtf->fe);
00413     nfloat = 0;
00414     while ((nsamp = sf_read_short(wtf->insfh,
00415                                   wtf->audio,
00416                                   wtf->blocksize)) != 0) {
00417         int16 const *inspeech;
00418         size_t nvec;
00419 
00420         /* Mix or pick channels. */
00421         if (nchans > 1)
00422             nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
00423 
00424         inspeech = wtf->audio;
00425         nvec = wtf->featsize;
00426         /* Consume all samples. */
00427         while (nsamp) {
00428             nfr = nvec;
00429             fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
00430             if (nfr) {
00431                 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00432                     return -1;
00433                 nfloat += n;
00434             }
00435         }
00436         inspeech = wtf->audio;
00437     }
00438     /* Now process any leftover audio frames. */
00439     fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
00440     if (nfr) {
00441         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00442             return -1;
00443         nfloat += n;
00444     }
00445 
00446     sf_close(wtf->insfh);
00447     wtf->insfh = NULL;
00448     return nfloat;
00449 }
00450 #endif /* HAVE_SNDFILE_H */
00451 
00456 static int
00457 decode_pcm(sphinx_wave2feat_t *wtf)
00458 {
00459     size_t nsamp;
00460     int32 nfr, nchans, whichchan;
00461     int nfloat, n;
00462 
00463     nchans = cmd_ln_int32_r(wtf->config, "-nchans");
00464     whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
00465     fe_start_utt(wtf->fe);
00466     nfloat = 0;
00467     while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
00468         size_t nvec;
00469         int16 const *inspeech;
00470 
00471         /* Byteswap stuff here if necessary. */
00472         if (wtf->byteswap) {
00473             for (n = 0; n < nsamp; ++n)
00474                 SWAP_INT16(wtf->audio + n);
00475         }
00476 
00477         /* Mix or pick channels. */
00478         if (nchans > 1)
00479             nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
00480             
00481         inspeech = wtf->audio;
00482         nvec = wtf->featsize;
00483         /* Consume all samples. */
00484         while (nsamp) {
00485             nfr = nvec;
00486             fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
00487             if (nfr) {
00488                 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00489                     return -1;
00490                 nfloat += n;
00491             }
00492         }
00493         inspeech = wtf->audio;
00494     }
00495     /* Now process any leftover audio frames. */
00496     fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
00497     if (nfr) {
00498         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00499             return -1;
00500         nfloat += n;
00501     }
00502 
00503     if (fclose(wtf->infh) == EOF)
00504         E_ERROR_SYSTEM("Failed to close input file");
00505     wtf->infh = NULL;
00506     return nfloat;
00507 }
00508 
00513 static int
00514 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
00515 {
00516     int nfloat = 0, n;
00517     int featsize = wtf->featsize;
00518 
00519     /* If the input vector length is less than the output length, we
00520      * need to do this one frame at a time, because there's empty
00521      * space at the end of each vector in wtf->feat. */
00522     if (wtf->in_veclen < wtf->veclen)
00523         featsize = 1;
00524     while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
00525                       featsize * wtf->in_veclen, wtf->infh)) != 0) {
00526         int i, nfr = n / wtf->in_veclen;
00527         if (n % wtf->in_veclen) {
00528             E_ERROR("Size of file %d not a multiple of veclen %d\n",
00529                     n, wtf->in_veclen);
00530             return -1;
00531         }
00532         /* Byteswap stuff here if necessary. */
00533         if (wtf->byteswap) {
00534             for (i = 0; i < n; ++i)
00535                 SWAP_FLOAT32(wtf->feat[0] + i);
00536         }
00537         fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
00538         for (i = 0; i < nfr; ++i) {
00539             if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
00540                 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
00541                     fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
00542                 else
00543                     fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
00544             }
00545             else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00546                 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
00547             }
00548         }
00549         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00550             return -1;
00551         nfloat += n;
00552     }
00553 
00554     if (fclose(wtf->infh) == EOF)
00555         E_ERROR_SYSTEM("Failed to close input file");
00556     wtf->infh = NULL;
00557     return nfloat;
00558 }
00559 
00560 static const audio_type_t types[] = {
00561 #ifdef HAVE_SNDFILE_H
00562     { "-sndfile", &detect_sndfile, &decode_sndfile },
00563 #endif
00564     { "-mswav", &detect_riff, &decode_pcm },
00565     { "-nist", &detect_nist, &decode_pcm },
00566     { "-raw", &detect_raw, &decode_pcm },
00567     { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
00568 };
00569 static const int ntypes = sizeof(types)/sizeof(types[0]);
00570 static const audio_type_t mfcc_type = {
00571     "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
00572 };
00573 
00579 static int
00580 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
00581 {
00582     if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
00583         E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
00584         return -1;
00585     }
00586     return 0;
00587 }
00588 
00594 static int
00595 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00596 {
00597     int i, nfloat = 0;
00598 
00599     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00600     for (i = 0; i < nfr; ++i) {
00601         if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
00602             E_ERROR_SYSTEM("Writing %d values to %s failed",
00603                            wtf->veclen, wtf->outfile);
00604             return -1;
00605         }
00606         nfloat += wtf->veclen;
00607     }
00608     return nfloat;
00609 }
00610 
/**
 * Basic parameter kinds stored in the low bits of the HTK header's
 * parameter-kind field.
 */
typedef enum htk_feature_kind_e {
    WAVEFORM  = 0,  /* PCM audio (rarely used) */
    LPC       = 1,  /* LPC filter coefficients */
    LPCREFC   = 2,  /* LPC reflection coefficients */
    LPCEPSTRA = 3,  /* LPC-based cepstral coefficients */
    LPCDELCEP = 4,  /* LPC cepstra plus deltas */
    IREFC     = 5,  /* 16-bit integer LPC reflection coefficients */
    MFCC      = 6,  /* Mel-frequency cepstral coefficients */
    FBANK     = 7,  /* Log mel spectrum */
    MELSPEC   = 8,  /* Linear mel spectrum */
    USER      = 9,  /* User defined */
    DISCRETE  = 10, /* Vector quantized data */
    PLP       = 11  /* PLP coefficients */
} htk_feature_kind_t;
00625 
/**
 * Qualifier bits OR-ed with a basic parameter kind in the HTK header.
 */
typedef enum htk_feature_flag_e {
    _E = 1 << 6,    /* 0000100: has energy */
    _N = 1 << 7,    /* 0000200: absolute energy suppressed */
    _D = 1 << 8,    /* 0000400: has delta coefficients */
    _A = 1 << 9,    /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 1 << 10,   /* 0002000: is compressed */
    _Z = 1 << 11,   /* 0004000: has zero mean static coefficients (CMN) */
    _K = 1 << 12,   /* 0010000: has CRC checksum */
    _O = 1 << 13,   /* 0020000: has 0th cepstral coefficient */
    _V = 1 << 14,   /* 0040000: has VQ data */
    _T = 1 << 15    /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
00638 
00642 static int
00643 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
00644 {
00645     int32 samp_period;
00646     int16 samp_size;
00647     int16 param_kind;
00648     int swap = FALSE;
00649 
00650     /* HTK files are big-endian. */
00651     if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
00652         swap = TRUE;
00653     /* Same file size thing as in Sphinx files (I think) */
00654     if (swap) SWAP_INT32(&nfloat);
00655     if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
00656         return -1;
00657     /* Sample period in 100ns units. */
00658     samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
00659     if (swap) SWAP_INT32(&samp_period);
00660     if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
00661         return -1;
00662     /* Sample size - veclen * sizeof each sample. */
00663     samp_size = wtf->veclen * 4;
00664     if (swap) SWAP_INT16(&samp_size);
00665     if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
00666         return -1;
00667     /* Format and flags. */
00668     if (cmd_ln_boolean_r(wtf->config, "-logspec")
00669         || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
00670         param_kind = FBANK; /* log mel-filter bank outputs */
00671     else
00672         param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
00673     if (swap) SWAP_INT16(&param_kind);
00674     if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
00675         return -1;
00676 
00677     return 0;
00678 }
00679 
00683 static int
00684 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00685 {
00686     int i, j, swap, htk_reorder, nfloat = 0;
00687 
00688     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00689     /* This is possibly inefficient, but probably not a big deal. */
00690     swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
00691     htk_reorder = (0 == strcmp("htk", wtf->ot->name)
00692                    && !(cmd_ln_boolean_r(wtf->config, "-logspec")
00693                         || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
00694     for (i = 0; i < nfr; ++i) {
00695         if (htk_reorder) {
00696             mfcc_t c0 = frames[i][0];
00697             memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
00698             frames[i][wtf->veclen - 1] = c0;
00699         }
00700         if (swap)
00701             for (j = 0; j < wtf->veclen; ++j)
00702                 SWAP_FLOAT32(frames[i] + j);
00703         if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
00704             E_ERROR_SYSTEM("Writing %d values to %s failed",
00705                            wtf->veclen, wtf->outfile);
00706             return -1;
00707         }
00708         nfloat += wtf->veclen;
00709     }
00710     return nfloat;
00711 }
00712 
00716 static int
00717 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00718 {
00719     int i, j, nfloat = 0;
00720 
00721     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00722     for (i = 0; i < nfr; ++i) {
00723         for (j = 0; j < wtf->veclen; ++j) {
00724             fprintf(wtf->outfh, "%.5g", frames[i][j]);
00725             if (j == wtf->veclen - 1)
00726                 fprintf(wtf->outfh, "\n");
00727             else
00728                 fprintf(wtf->outfh, " ");
00729         }
00730         nfloat += wtf->veclen;
00731     }
00732     return nfloat;
00733 }
00734 
00735 static const output_type_t outtypes[] = {
00736     { "sphinx", &output_header_sphinx, &output_frames_sphinx },
00737     { "htk", &output_header_htk, &output_frames_htk },
00738     { "text", NULL, &output_frames_text }
00739 };
00740 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
00741 
00742 sphinx_wave2feat_t *
00743 sphinx_wave2feat_init(cmd_ln_t *config)
00744 {
00745     sphinx_wave2feat_t *wtf;
00746     int i;
00747 
00748     wtf = ckd_calloc(1, sizeof(*wtf));
00749     wtf->refcount = 1;
00750     wtf->config = cmd_ln_retain(config);
00751     wtf->fe = fe_init_auto_r(wtf->config);
00752     wtf->ot = outtypes; /* Default (sphinx) type. */
00753     for (i = 0; i < nouttypes; ++i) {
00754         output_type_t const *otype = &outtypes[i];
00755         if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
00756             wtf->ot = otype;
00757             break;
00758         }
00759     }
00760     if (i == nouttypes) {
00761         E_ERROR("Unknown output type: '%s'\n",
00762                 cmd_ln_str_r(config, "-ofmt"));
00763         sphinx_wave2feat_free(wtf);
00764         return NULL;
00765     }
00766 
00767     return wtf;
00768 }
00769 
00770 int
00771 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
00772 {
00773     if (wtf == NULL)
00774         return 0;
00775     if (--wtf->refcount > 0)
00776         return wtf->refcount;
00777 
00778     ckd_free(wtf->audio);
00779     ckd_free_2d(wtf->feat);
00780     ckd_free(wtf->infile);
00781     ckd_free(wtf->outfile);
00782     if (wtf->infh) {
00783         if (fclose(wtf->infh) == EOF)
00784             E_ERROR_SYSTEM("Failed to close input file");
00785     }
00786     if (wtf->outfh) {
00787         if (fclose(wtf->outfh) == EOF)
00788             E_ERROR_SYSTEM("Failed to close output file");
00789     }
00790     cmd_ln_free_r(wtf->config);
00791     fe_free(wtf->fe);
00792     ckd_free(wtf);
00793 
00794     return 0;
00795 }
00796 
00797 sphinx_wave2feat_t *
00798 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
00799 {
00800     ++wtf->refcount;
00801     return wtf;
00802 }
00803 
00804 static audio_type_t const *
00805 detect_audio_type(sphinx_wave2feat_t *wtf, char const *infile)
00806 {
00807     audio_type_t const *atype;
00808     int i;
00809 
00810     /* Special case audio type for Sphinx MFCC inputs. */
00811     if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
00812         || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00813         int rv = mfcc_type.detect(wtf, infile);
00814         if (rv == -1)
00815             goto error_out;
00816         return &mfcc_type;
00817     }
00818 
00819     /* Try to use the type of infile given on the command line. */
00820     for (i = 0; i < ntypes; ++i) {
00821         int rv;
00822         atype = &types[i];
00823         if (cmd_ln_boolean_r(wtf->config, atype->name)) {
00824             rv = (*atype->detect)(wtf, infile);
00825             if (rv == -1)
00826                 goto error_out;
00827             else if (rv == TRUE)
00828                 break;
00829         }
00830     }
00831     if (i == ntypes) {
00832         /* Detect file type of infile and get parameters. */
00833         for (i = 0; i < ntypes; ++i) {
00834             int rv;
00835             atype = &types[i];
00836             rv = (*atype->detect)(wtf, infile);
00837             if (rv == -1)
00838                 goto error_out;
00839             else if (rv == TRUE)
00840                 break;
00841         }
00842         if (i == ntypes)
00843             goto error_out;
00844     }
00845     return atype;
00846  error_out:
00847     if (wtf->infh)
00848         fclose(wtf->infh);
00849     wtf->infh = NULL;
00850     return NULL;
00851 }
00852 
00853 int
00854 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
00855                               char const *infile, char const *outfile)
00856 {
00857     int nchans, minfft, nfft, nfloat, veclen;
00858     audio_type_t const *atype;
00859     int fshift, fsize;
00860 
00861     if (cmd_ln_boolean_r(wtf->config, "-verbose"))
00862         E_INFO("Converting %s to %s\n", infile, outfile);
00863 
00864     /* Detect input file type. */
00865     if ((atype = detect_audio_type(wtf, infile)) == NULL)
00866         return -1;
00867 
00868     /* Determine whether to byteswap input. */
00869     wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
00870                            cmd_ln_str_r(wtf->config, "-input_endian"));
00871 
00872     /* Make sure the FFT size is sufficiently large. */
00873     minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
00874                    * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
00875     for (nfft = 1; nfft < minfft; nfft <<= 1)
00876         ;
00877     if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
00878         E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
00879                cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
00880         cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
00881         fe_free(wtf->fe);
00882         wtf->fe = fe_init_auto_r(wtf->config);
00883     }
00884 
00885     /* Get the output frame size (if not already set). */
00886     if (wtf->veclen == 0)
00887         wtf->veclen = fe_get_output_size(wtf->fe);
00888 
00889     /* Set up the input and output buffers. */
00890     fe_get_input_size(wtf->fe, &fshift, &fsize);
00891     /* Want to get at least a whole frame plus shift in here.  Also we
00892        will either pick or mix multiple channels so we need to read
00893        them all at once. */
00894     nchans = cmd_ln_int32_r(wtf->config, "-nchans");
00895     wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
00896     if (wtf->blocksize < (fsize + fshift) * nchans) {
00897         E_INFO("Block size of %d too small, increasing to %d\n",
00898                wtf->blocksize,
00899                (fsize + fshift) * nchans);
00900         wtf->blocksize = (fsize + fshift) * nchans;
00901     }
00902     wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
00903     wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
00904 
00905     /* Use the maximum of the input and output frame sizes to allocate this. */
00906     veclen = wtf->veclen;
00907     if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
00908     wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
00909 
00910     /* Let's go! */
00911     if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
00912         E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
00913         return -1;
00914     }
00915     /* Write an empty header, which we'll fill in later. */
00916     if (wtf->ot->output_header &&
00917         (*wtf->ot->output_header)(wtf, 0) < 0) {
00918         E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
00919         goto error_out;
00920     }
00921     wtf->outfile = ckd_salloc(outfile);
00922 
00923     if ((nfloat = (*atype->decode)(wtf)) < 0)
00924         return -1;
00925 
00926     if (wtf->ot->output_header) {
00927         if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
00928             E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
00929             goto error_out;
00930         }
00931         if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
00932             E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
00933             goto error_out;
00934         }
00935     }
00936     if (fclose(wtf->outfh) == EOF)
00937         E_ERROR_SYSTEM("Failed to close output file");
00938     wtf->outfh = NULL;
00939 
00940     return 0;
00941 error_out:
00942     if (wtf->outfh) {
00943         fclose(wtf->outfh);
00944         wtf->outfh = NULL;
00945     }
00946     return -1;
00947 }
00948 
00949 void
00950 build_filenames(cmd_ln_t *config, char const *basename,
00951                 char **out_infile, char **out_outfile)
00952 {
00953     char const *di, *do_, *ei, *eo;
00954 
00955     di = cmd_ln_str_r(config, "-di");
00956     do_ = cmd_ln_str_r(config, "-do");
00957     ei = cmd_ln_str_r(config, "-ei");
00958     eo = cmd_ln_str_r(config, "-eo");
00959 
00960     *out_infile = string_join(di ? di : "",
00961                               di ? "/" : "",
00962                               basename,
00963                               ei ? "." : "",
00964                               ei ? ei : "",
00965                               NULL);
00966     *out_outfile = string_join(do_ ? do_ : "",
00967                                do_ ? "/" : "",
00968                                basename,
00969                                eo ? "." : "",
00970                                eo ? eo : "",
00971                               NULL);
00972     /* Build output directory structure if possible/requested (it is
00973      * by default). */
00974     if (cmd_ln_boolean_r(config, "-build_outdirs")) {
00975         char *dirname = ckd_salloc(*out_outfile);
00976         path2dirname(*out_outfile, dirname);
00977         build_directory(dirname);
00978         ckd_free(dirname);
00979     }
00980 }
00981 
00982 static int
00983 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
00984 {
00985     hash_table_t *files;
00986     hash_iter_t *itor;
00987     lineiter_t *li;
00988     FILE *ctlfh;
00989     int nskip, runlen, npart, rv = 0;
00990 
00991     if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
00992         E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
00993         return -1;
00994     }
00995     nskip = cmd_ln_int32_r(wtf->config, "-nskip");
00996     runlen = cmd_ln_int32_r(wtf->config, "-runlen");
00997     if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
00998         /* Count lines in the file. */
00999         int partlen, part, nlines = 0;
01000         part = cmd_ln_int32_r(wtf->config, "-part");
01001         for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
01002             ++nlines;
01003         fseek(ctlfh, 0, SEEK_SET);
01004         partlen = nlines / npart;
01005         nskip = partlen * (part - 1);
01006         if (part == npart)
01007             runlen = -1;
01008         else
01009             runlen = partlen;
01010     }
01011     if (runlen != -1){
01012         E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
01013         files = hash_table_new(runlen, HASH_CASE_YES);
01014     }
01015     else {
01016         E_INFO("Processing all remaining utterances at position %d\n", nskip);
01017         files = hash_table_new(1000, HASH_CASE_YES);
01018     }
01019     for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
01020         char *c, *infile, *outfile;
01021 
01022         if (nskip-- > 0)
01023             continue;
01024         if (runlen == 0) {
01025             lineiter_free(li);
01026             break;
01027         }
01028         --runlen;
01029 
01030         string_trim(li->buf, STRING_BOTH);
01031         /* Extract the file ID from the control line. */
01032         if ((c = strchr(li->buf, ' ')) != NULL)
01033             *c = '\0';
01034         build_filenames(wtf->config, li->buf, &infile, &outfile);
01035         if (hash_table_lookup(files, infile, NULL) == 0)
01036             continue;
01037         rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
01038         hash_table_enter(files, infile, outfile);
01039         if (rv != 0) {
01040             lineiter_free(li);
01041             if (fclose(ctlfh) == EOF)
01042                 E_ERROR_SYSTEM("Failed to close control file");
01043             break;
01044         }
01045     }
01046     for (itor = hash_table_iter(files); itor;
01047          itor = hash_table_iter_next(itor)) {
01048         ckd_free((void *)hash_entry_key(itor->ent));
01049         ckd_free(hash_entry_val(itor->ent));
01050     }
01051     hash_table_free(files);
01052     return rv;
01053 }
01054 
01055 int
01056 main(int argc, char *argv[])
01057 {
01058     sphinx_wave2feat_t *wtf;
01059     cmd_ln_t *config;
01060     int rv;
01061 
01062     /* Initialize config. */
01063     if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
01064         return 2;
01065 
01066     /* Parse an argument file if there's one in there. */
01067     if (cmd_ln_str_r(config, "-argfile"))
01068         config = cmd_ln_parse_file_r(config, defn,
01069                                      cmd_ln_str_r(config, "-argfile"), FALSE);
01070     if (config == NULL) {
01071         E_ERROR("Command line parsing failed\n");
01072         return 1;
01073     }
01074     if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
01075         E_ERROR("Failed to initialize wave2feat object\n");
01076         return 1;
01077     }
01078 
01079     /* If there's a control file run through it, otherwise we will do
01080      * a single file (which is what run_control_file will do
01081      * internally too) */
01082     if (cmd_ln_str_r(config, "-c"))
01083         rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
01084     else
01085         rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
01086                                            cmd_ln_str_r(config, "-o"));
01087 
01088     sphinx_wave2feat_free(wtf);
01089     return rv;
01090 }