src/sphinx_lmtools/sphinx_lm_eval.c

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 2008 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
#include <logmath.h>
#include <ngram_model.h>
#include <cmd_ln.h>
#include <ckd_alloc.h>
#include <err.h>
#include <pio.h>
#include <strfuncs.h>

#include <stdio.h>
#include <string.h>
#include <math.h>

static const arg_t defn[] = {
  { "-help",
    ARG_BOOLEAN,
    "no",
    "Shows the usage of the tool"},

  { "-logbase",
    ARG_FLOAT64,
    "1.0001",
    "Base in which all log-likelihoods are calculated" },

  { "-lm",
    ARG_STRING,
    NULL,
    "Language model file"},

  { "-probdef",
    ARG_STRING,
    NULL,
    "Probability definition file for classes in LM"},

  { "-lmctlfn",
    ARG_STRING,
    NULL,
    "Control file listing a set of language models"},

  { "-lmname",
    ARG_STRING,
    NULL,
    "Name of language model in -lmctlfn to use for all utterances" },

  { "-lsn",
    ARG_STRING,
    NULL,
    "Transcription file to evaluate"},

  { "-text",
    ARG_STRING,
    NULL,
    "Text string to evaluate"},

  { "-mmap",
    ARG_BOOLEAN,
    "no",
    "Use memory-mapped I/O for reading binary LM files"},

  { "-lw",
    ARG_FLOAT32,
    "1.0",
    "Language model weight" },

  { "-wip",
    ARG_FLOAT32,
    "1.0",
    "Word insertion probability" },

  { "-uw",
    ARG_FLOAT32,
    "1.0",
    "Unigram probability weight (interpolated with uniform distribution)"},

  /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
  { NULL, 0, NULL, NULL }
};
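
/* Example invocations (a sketch, not from the original source; the
 * binary name is assumed from this file's name, and "turtle.lm" and
 * "test.transcription" are hypothetical files):
 *
 *   sphinx_lm_eval -lm turtle.lm -text "go forward ten meters"
 *   sphinx_lm_eval -lm turtle.lm -lsn test.transcription -logbase 1.0001
 */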

static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
             int32 *out_n_ccs, int32 *out_n_oovs)
{
        int32 *wids;
        int32 startwid;
        int32 i, ch, nccs, noovs, unk;

        if (n == 0)
                return 0;

        unk = ngram_unknown_wid(lm);

        /* Reverse this array into an array of word IDs. */
        wids = ckd_calloc(n, sizeof(*wids));
        for (i = 0; i < n; ++i)
                wids[n-i-1] = ngram_wid(lm, words[i]);
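        /* e.g. words = {"go", "forward"} yields wids = {wid("forward"),
         * wid("go")}: most-recent-first, the order ngram_ng_score()
         * expects for its history argument. */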
        /* Skip <s> as it's a context cue (HACK, this should be configurable). */
        startwid = ngram_wid(lm, "<s>");

        /* Now evaluate the list of words in reverse using the
         * remainder of the array as the history. */
        ch = noovs = nccs = 0;
        for (i = 0; i < n; ++i) {
                int32 n_used;
                int32 prob;

                /* Skip <s> as it's a context cue (HACK, this should be configurable). */
                if (wids[i] == startwid) {
                        ++nccs;
                        continue;
                }
                /* Skip and count OOVs. */
                if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
                        ++noovs;
                        continue;
                }
                /* Sum up information for each N-gram */
                prob = ngram_ng_score(lm,
                                      wids[i], wids + i + 1,
                                      n - i - 1, &n_used);
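                /* prob is log P(w|h) in logmath units (a negative
                 * value), so subtracting accumulates -sum log P. */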
                ch -= prob;
        }

        if (out_n_ccs) *out_n_ccs = nccs;
        if (out_n_oovs) *out_n_oovs = noovs;
        ckd_free(wids);

        /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
        n -= (nccs + noovs);
        if (n <= 0)
                return 0;
        return ch / n;
}
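
/* Note: calc_entropy() returns CH in logmath units (the base set by
 * -logbase).  The callers below convert to bits as
 *
 *     CH_bits = CH * log(base) / log(2)
 *
 * and report perplexity as 2^CH_bits, which equals base^CH. */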
00170 
static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
        FILE *fh;
        lineiter_t *litor;
        int32 nccs, noovs, nwords;
        float64 ch, log_to_log2;

        if ((fh = fopen(lsnfn, "r")) == NULL)
                E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

        /* We have to keep ch in floating-point to avoid overflows, so
         * we might as well use log2. */
        log_to_log2 = log(logmath_get_base(lmath)) / log(2);
        nccs = noovs = nwords = 0;
        ch = 0.0;
        for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
                char **words;
                int32 n, tmp_ch, tmp_noovs, tmp_nccs;

                n = str2words(litor->buf, NULL, 0);
                if (n < 0)
                        E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
                if (n == 0) /* Do nothing! */
                        continue;
                words = ckd_calloc(n, sizeof(*words));
                str2words(litor->buf, words, n);

                /* Remove any utterance ID (FIXME: has to be a single "word") */
                if (words[n-1][0] == '('
                    && words[n-1][strlen(words[n-1])-1] == ')')
                        n = n - 1;

                tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs);

                ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
                nccs += tmp_nccs;
                noovs += tmp_noovs;
                nwords += n;

                ckd_free(words);
        }

        fclose(fh);
        ch /= (nwords - nccs - noovs);
        printf("cross-entropy: %f bits\n", ch);

        /* Calculate perplexity pplx = exp CH */
        printf("perplexity: %f\n", pow(2.0, ch));

        /* Report OOVs and CCs */
        printf("%d words evaluated\n", nwords);
        printf("%d OOVs (%.2f%%), %d context cues removed\n",
               noovs, (double)noovs / nwords * 100, nccs);
}
00225 
static void
evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
{
        char *textfoo;
        char **words;
        int32 n, ch, noovs, nccs;

        /* Split it into an array of strings. */
        textfoo = ckd_salloc(text);
        n = str2words(textfoo, NULL, 0);
        if (n < 0)
                E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
        if (n == 0) { /* Do nothing! */
                ckd_free(textfoo);
                return;
        }
        words = ckd_calloc(n, sizeof(*words));
        str2words(textfoo, words, n);

        ch = calc_entropy(lm, words, n, &nccs, &noovs);

        printf("input: %s\n", text);
        printf("cross-entropy: %f bits\n",
               ch * log(logmath_get_base(lmath)) / log(2));

        /* Calculate perplexity pplx = exp CH */
        printf("perplexity: %f\n", logmath_exp(lmath, ch));

        /* Report OOVs and CCs */
        printf("%d words evaluated\n", n);
        printf("%d OOVs, %d context cues removed\n",
               noovs, nccs);

        ckd_free(textfoo);
        ckd_free(words);
}
00260 
int
main(int argc, char *argv[])
{
        cmd_ln_t *config;
        ngram_model_t *lm = NULL;
        logmath_t *lmath;
        const char *lmfn, *probdefn, *lsnfn, *text;

        if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
                return 1;

        /* Create log math object. */
        if ((lmath = logmath_init
             (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
                E_FATAL("Failed to initialize log math\n");
        }

        /* Load the language model. */
        lmfn = cmd_ln_str_r(config, "-lm");
        if (lmfn == NULL
            || (lm = ngram_model_read(config, lmfn,
                                      NGRAM_AUTO, lmath)) == NULL) {
                E_FATAL("Failed to load language model from %s\n",
                        cmd_ln_str_r(config, "-lm"));
        }
        if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
                ngram_model_read_classdef(lm, probdefn);
        ngram_model_apply_weights(lm,
                                  cmd_ln_float32_r(config, "-lw"),
                                  cmd_ln_float32_r(config, "-wip"),
                                  cmd_ln_float32_r(config, "-uw"));

        /* Now evaluate some text. */
        lsnfn = cmd_ln_str_r(config, "-lsn");
        text = cmd_ln_str_r(config, "-text");
        if (lsnfn) {
                evaluate_file(lm, lmath, lsnfn);
        }
        else if (text) {
                evaluate_string(lm, lmath, text);
        }
        else {
                E_ERROR("No -lsn or -text argument given; nothing to evaluate\n");
        }

        ngram_model_free(lm);
        logmath_free(lmath);
        cmd_ln_free_r(config);
        return 0;
}
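
For reference, a minimal standalone sketch (not part of the file above)
using the same SphinxBase calls to score a single N-gram; "model.lm" and
the word strings are hypothetical:

#include <stdio.h>

#include <logmath.h>
#include <ngram_model.h>

int
main(void)
{
        logmath_t *lmath = logmath_init(1.0001, 0, 0);
        /* A NULL configuration is accepted; defaults apply. */
        ngram_model_t *lm = ngram_model_read(NULL, "model.lm",
                                             NGRAM_AUTO, lmath);
        int32 hist[2];
        int32 n_used, score;

        if (lm == NULL)
                return 1;
        /* History is most-recent-first, as in calc_entropy() above. */
        hist[0] = ngram_wid(lm, "forward");
        hist[1] = ngram_wid(lm, "go");
        score = ngram_ng_score(lm, ngram_wid(lm, "ten"), hist, 2, &n_used);
        /* Convert the logmath-domain score to a linear probability. */
        printf("P(ten | go forward) = %f\n", logmath_exp(lmath, score));
        ngram_model_free(lm);
        logmath_free(lmath);
        return 0;
}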
