00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00041 #include <logmath.h>
00042 #include <ngram_model.h>
00043 #include <cmd_ln.h>
00044 #include <ckd_alloc.h>
00045 #include <err.h>
00046 #include <pio.h>
00047 #include <strfuncs.h>
00048
00049 #include <stdio.h>
00050 #include <string.h>
00051 #include <math.h>
00052
00053 static const arg_t defn[] = {
00054 { "-help",
00055 ARG_BOOLEAN,
00056 "no",
00057 "Shows the usage of the tool"},
00058
00059 { "-logbase",
00060 ARG_FLOAT64,
00061 "1.0001",
00062 "Base in which all log-likelihoods calculated" },
00063
00064 { "-i",
00065 REQARG_STRING,
00066 NULL,
00067 "Input language model file (required)"},
00068
00069 { "-o",
00070 REQARG_STRING,
00071 NULL,
00072 "Output language model file (required)"},
00073
00074 { "-ifmt",
00075 ARG_STRING,
00076 NULL,
00077 "Input language model format (will guess if not specified)"},
00078
00079 { "-ofmt",
00080 ARG_STRING,
00081 NULL,
00082 "Output language model file (will guess if not specified)"},
00083
00084 { "-ienc",
00085 ARG_STRING,
00086 NULL,
00087 "Input language model text encoding (no conversion done if not specified)"},
00088
00089 { "-oenc",
00090 ARG_STRING,
00091 "utf8",
00092 "Output language model text encoding"},
00093
00094 { "-case",
00095 ARG_STRING,
00096 NULL,
00097 "Ether 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" },
00098
00099 { "-mmap",
00100 ARG_BOOLEAN,
00101 "no",
00102 "Use memory-mapped I/O for reading binary LM files"},
00103
00104 { "-debug",
00105 ARG_INT32,
00106 NULL,
00107 "Verbosity level for debugging messages"
00108 },
00109
00110 { NULL, 0, NULL, NULL }
00111 };
00112
00113 int
00114 main(int argc, char *argv[])
00115 {
00116 cmd_ln_t *config;
00117 ngram_model_t *lm = NULL;
00118 logmath_t *lmath;
00119 int itype, otype;
00120 char const *kase;
00121
00122 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00123 return 1;
00124
00125 err_set_debug_level(cmd_ln_int32_r(config, "-debug"));
00126
00127
00128 if ((lmath = logmath_init
00129 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00130 E_FATAL("Failed to initialize log math\n");
00131 }
00132
00133
00134 if (cmd_ln_str_r(config, "-ifmt")) {
00135 if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt")))
00136 == NGRAM_INVALID) {
00137 E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt"));
00138 goto error_out;
00139 }
00140 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
00141 itype, lmath);
00142 }
00143 else {
00144 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
00145 NGRAM_AUTO, lmath);
00146 }
00147
00148
00149 if (cmd_ln_str_r(config, "-ofmt")) {
00150 if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt")))
00151 == NGRAM_INVALID) {
00152 E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt"));
00153 goto error_out;
00154 }
00155 }
00156 else {
00157 otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o"));
00158 }
00159
00160
00161 if (cmd_ln_str_r(config, "-ienc")) {
00162 if (ngram_model_recode(lm, cmd_ln_str_r(config, "-ienc"),
00163 cmd_ln_str_r(config, "-oenc")) != 0) {
00164 E_ERROR("Failed to recode language model from %s to %s\n",
00165 cmd_ln_str_r(config, "-ienc"),
00166 cmd_ln_str_r(config, "-oenc"));
00167 goto error_out;
00168 }
00169 }
00170
00171
00172 if ((kase = cmd_ln_str_r(config, "-case"))) {
00173 if (0 == strcmp(kase, "lower")) {
00174 ngram_model_casefold(lm, NGRAM_LOWER);
00175 }
00176 else if (0 == strcmp(kase, "upper")) {
00177 ngram_model_casefold(lm, NGRAM_UPPER);
00178 }
00179 else {
00180 E_ERROR("Unknown value for -case: %s\n", kase);
00181 goto error_out;
00182 }
00183 }
00184
00185
00186 if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) {
00187 E_ERROR("Failed to write language model in format %s to %s\n",
00188 ngram_type_to_str(otype), cmd_ln_str_r(config, "-o"));
00189 goto error_out;
00190 }
00191
00192
00193 ngram_model_free(lm);
00194 return 0;
00195
00196 error_out:
00197 ngram_model_free(lm);
00198 return 1;
00199 }