• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/sphinx_lmtools/sphinx_lm_convert.c

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2009 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00041 #include <logmath.h>
00042 #include <ngram_model.h>
00043 #include <cmd_ln.h>
00044 #include <ckd_alloc.h>
00045 #include <err.h>
00046 #include <pio.h>
00047 #include <strfuncs.h>
00048 
00049 #include <stdio.h>
00050 #include <string.h>
00051 #include <math.h>
00052 
00053 static const arg_t defn[] = {
00054   { "-help",
00055     ARG_BOOLEAN,
00056     "no",
00057     "Shows the usage of the tool"},
00058 
00059   { "-logbase",
00060     ARG_FLOAT64,
00061     "1.0001",
00062     "Base in which all log-likelihoods calculated" },
00063 
00064   { "-i",
00065     REQARG_STRING,
00066     NULL,
00067     "Input language model file (required)"},
00068 
00069   { "-o",
00070     REQARG_STRING,
00071     NULL,
00072     "Output language model file (required)"},
00073 
00074   { "-ifmt",
00075     ARG_STRING,
00076     NULL,
00077     "Input language model format (will guess if not specified)"},
00078 
00079   { "-ofmt",
00080     ARG_STRING,
00081     NULL,
00082     "Output language model file (will guess if not specified)"},
00083 
00084   { "-ienc",
00085     ARG_STRING,
00086     NULL,
00087     "Input language model text encoding (no conversion done if not specified)"},
00088 
00089   { "-oenc",
00090     ARG_STRING,
00091     "utf8",
00092     "Output language model text encoding"},
00093 
00094   { "-case",
00095     ARG_STRING,
00096     NULL,
00097     "Ether 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" },
00098 
00099   { "-mmap",
00100     ARG_BOOLEAN,
00101     "no",
00102     "Use memory-mapped I/O for reading binary LM files"},
00103 
00104   { "-debug",
00105     ARG_INT32,
00106     NULL,
00107     "Verbosity level for debugging messages"
00108   },
00109 
00110   { NULL, 0, NULL, NULL }
00111 };
00112 
00113 int
00114 main(int argc, char *argv[])
00115 {
00116         cmd_ln_t *config;
00117         ngram_model_t *lm = NULL;
00118         logmath_t *lmath;
00119         int itype, otype;
00120         char const *kase;
00121 
00122         if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00123                 return 1;
00124 
00125         err_set_debug_level(cmd_ln_int32_r(config, "-debug"));
00126 
00127         /* Create log math object. */
00128         if ((lmath = logmath_init
00129              (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00130                 E_FATAL("Failed to initialize log math\n");
00131         }
00132 
00133         /* Load the input language model. */
00134         if (cmd_ln_str_r(config, "-ifmt")) {
00135             if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt")))
00136                 == NGRAM_INVALID) {
00137                 E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt"));
00138                 goto error_out;
00139             }
00140             lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
00141                                   itype, lmath);
00142         }
00143         else {
00144             lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
00145                                   NGRAM_AUTO, lmath);
00146         }
00147 
00148         /* Guess or set the output language model type. */
00149         if (cmd_ln_str_r(config, "-ofmt")) {
00150             if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt")))
00151                 == NGRAM_INVALID) {
00152                 E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt"));
00153                 goto error_out;
00154             }
00155         }
00156         else {
00157             otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o"));
00158         }
00159 
00160         /* Recode the language model if desired. */
00161         if (cmd_ln_str_r(config, "-ienc")) {
00162             if (ngram_model_recode(lm, cmd_ln_str_r(config, "-ienc"),
00163                                    cmd_ln_str_r(config, "-oenc")) != 0) {
00164                 E_ERROR("Failed to recode language model from %s to %s\n",
00165                         cmd_ln_str_r(config, "-ienc"),
00166                         cmd_ln_str_r(config, "-oenc"));
00167                 goto error_out;
00168             }
00169         }
00170 
00171         /* Case fold if requested. */
00172         if ((kase = cmd_ln_str_r(config, "-case"))) {
00173             if (0 == strcmp(kase, "lower")) {
00174                 ngram_model_casefold(lm, NGRAM_LOWER);
00175             }
00176             else if (0 == strcmp(kase, "upper")) {
00177                 ngram_model_casefold(lm, NGRAM_UPPER);
00178             }
00179             else {
00180                 E_ERROR("Unknown value for -case: %s\n", kase);
00181                 goto error_out;
00182             }
00183         }
00184 
00185         /* Write the output language model. */
00186         if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) {
00187             E_ERROR("Failed to write language model in format %s to %s\n",
00188                     ngram_type_to_str(otype), cmd_ln_str_r(config, "-o"));
00189             goto error_out;
00190         }
00191 
00192         /* That's all folks! */
00193         ngram_model_free(lm);
00194         return 0;
00195 
00196 error_out:
00197         ngram_model_free(lm);
00198         return 1;
00199 }

Generated on Mon Aug 29 2011 for SphinxBase by  doxygen 1.7.1