SphinxBase
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 /* 00038 * \file ngram_model_dmp.c DMP format language models 00039 * 00040 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> 00041 */ 00042 00043 #include <assert.h> 00044 #include <stdio.h> 00045 #include <string.h> 00046 #include <stdlib.h> 00047 #include <limits.h> 00048 00049 #include "sphinxbase/ckd_alloc.h" 00050 #include "sphinxbase/pio.h" 00051 #include "sphinxbase/err.h" 00052 #include "sphinxbase/byteorder.h" 00053 #include "sphinxbase/listelem_alloc.h" 00054 00055 #include "ngram_model_dmp.h" 00056 00057 static const char darpa_hdr[] = "Darpa Trigram LM"; 00058 static ngram_funcs_t ngram_model_dmp_funcs; 00059 00060 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ]) 00061 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams) 00062 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams)) 00063 00064 static unigram_t * 00065 new_unigram_table(int32 n_ug) 00066 { 00067 unigram_t *table; 00068 int32 i; 00069 00070 table = ckd_calloc(n_ug, sizeof(unigram_t)); 00071 for (i = 0; i < n_ug; i++) { 00072 table[i].prob1.f = -99.0; 00073 table[i].bo_wt1.f = -99.0; 00074 } 00075 return table; 00076 } 00077 00078 ngram_model_t * 00079 ngram_model_dmp_read(cmd_ln_t *config, 00080 const char *file_name, 00081 logmath_t *lmath) 00082 { 00083 ngram_model_t *base; 00084 ngram_model_dmp_t *model; 00085 FILE *fp; 00086 int do_mmap, do_swap; 00087 int32 is_pipe; 00088 int32 i, j, k, vn, n, ts; 00089 int32 n_unigram; 00090 int32 n_bigram; 00091 int32 n_trigram; 00092 char str[1024]; 00093 unigram_t *ugptr; 00094 bigram_t *bgptr; 00095 trigram_t *tgptr; 00096 char *tmp_word_str; 00097 char *map_base = NULL; 00098 size_t offset = 0, filesize; 00099 00100 base = NULL; 00101 do_mmap = FALSE; 00102 if (config) 00103 do_mmap = cmd_ln_boolean_r(config, "-mmap"); 00104 00105 if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { 00106 E_ERROR("Dump file %s not found\n", file_name); 00107 goto error_out; 00108 } 00109 00110 if (is_pipe && do_mmap) { 00111 E_WARN("Dump file is compressed, will not use memory-mapped I/O\n"); 00112 do_mmap = 0; 00113 } 00114 00115 do_swap = FALSE; 00116 if (fread(&k, sizeof(k), 1, fp) != 1) 00117 goto error_out; 00118 if (k != strlen(darpa_hdr)+1) { 00119 SWAP_INT32(&k); 00120 if (k != strlen(darpa_hdr)+1) { 00121 E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); 00122 goto error_out; 00123 } 00124 do_swap = 1; 00125 } 00126 if (fread(str, 1, k, fp) != (size_t) k) { 00127 E_ERROR("Cannot read header\n"); 00128 goto error_out; 00129 } 00130 if (strncmp(str, darpa_hdr, k) != 0) { 00131 E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr); 00132 goto error_out; 00133 } 00134 00135 if (do_mmap) { 00136 if (do_swap) { 00137 E_INFO 00138 ("Byteswapping required, will not use memory-mapped I/O for LM file\n"); 00139 do_mmap = 0; 00140 } 00141 else { 00142 E_INFO("Will use memory-mapped I/O for LM file\n"); 00143 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */ 00144 E_FATAL("memory mapping is not supported at the moment."); 00145 #else 00146 #endif 00147 } 00148 } 00149 00150 if (fread(&k, sizeof(k), 1, fp) != 1) 00151 goto error_out; 00152 if (do_swap) SWAP_INT32(&k); 00153 if (fread(str, 1, k, fp) != (size_t) k) { 00154 E_ERROR("Cannot read LM filename in header\n"); 00155 goto error_out; 00156 } 00157 00158 /* read version#, if present (must be <= 0) */ 00159 if (fread(&vn, sizeof(vn), 1, fp) != 1) 00160 goto error_out; 00161 if (do_swap) SWAP_INT32(&vn); 00162 if (vn <= 0) { 00163 /* read and don't compare timestamps (we don't care) */ 00164 if (fread(&ts, sizeof(ts), 1, fp) != 1) 00165 goto error_out; 00166 if (do_swap) SWAP_INT32(&ts); 00167 00168 /* read and skip format description */ 00169 for (;;) { 00170 if (fread(&k, sizeof(k), 1, fp) != 1) 00171 goto error_out; 00172 if (do_swap) SWAP_INT32(&k); 00173 if (k == 0) 00174 break; 00175 if (fread(str, 1, k, fp) != (size_t) k) { 00176 E_ERROR("fread(word) failed\n"); 00177 goto error_out; 00178 } 00179 } 00180 /* read model->ucount */ 00181 if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1) 00182 goto error_out; 00183 if (do_swap) SWAP_INT32(&n_unigram); 00184 } 00185 else { 00186 n_unigram = vn; 00187 } 00188 00189 /* read model->bcount, tcount */ 00190 if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1) 00191 goto error_out; 00192 if (do_swap) SWAP_INT32(&n_bigram); 00193 if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1) 00194 goto error_out; 00195 if (do_swap) SWAP_INT32(&n_trigram); 00196 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); 00197 00198 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ 00199 model = ckd_calloc(1, sizeof(*model)); 00200 base = &model->base; 00201 if (n_trigram > 0) 00202 n = 3; 00203 else if (n_bigram > 0) 00204 n = 2; 00205 else 00206 n = 1; 00207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram); 00208 base->n_counts[0] = n_unigram; 00209 base->n_counts[1] = n_bigram; 00210 base->n_counts[2] = n_trigram; 00211 00212 /* read unigrams (always in memory, as they contain dictionary 00213 * mappings that can't be precomputed, and also could have OOVs added) */ 00214 model->lm3g.unigrams = new_unigram_table(n_unigram + 1); 00215 ugptr = model->lm3g.unigrams; 00216 for (i = 0; i <= n_unigram; ++i) { 00217 /* Skip over the mapping ID, we don't care about it. */ 00218 if (fread(ugptr, sizeof(int32), 1, fp) != 1) { 00219 E_ERROR("fread(mapid[%d]) failed\n", i); 00220 goto error_out; 00221 } 00222 /* Read the actual unigram structure. */ 00223 if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) { 00224 E_ERROR("fread(unigrams) failed\n"); 00225 ngram_model_free(base); 00226 fclose_comp(fp, is_pipe); 00227 return NULL; 00228 } 00229 /* Byte swap if necessary. */ 00230 if (do_swap) { 00231 SWAP_INT32(&ugptr->prob1.l); 00232 SWAP_INT32(&ugptr->bo_wt1.l); 00233 SWAP_INT32(&ugptr->bigrams); 00234 } 00235 /* Convert values to log. */ 00236 ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f); 00237 ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f); 00238 E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n", 00239 i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams)); 00240 ++ugptr; 00241 } 00242 E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram); 00243 00244 /* Now mmap() the file and read in the rest of the (read-only) stuff. */ 00245 if (do_mmap) { 00246 offset = ftell(fp); 00247 fseek(fp, 0, SEEK_END); 00248 filesize = ftell(fp); 00249 fseek(fp, offset, SEEK_SET); 00250 00251 /* Check for improper word alignment. */ 00252 if (offset & 0x3) { 00253 E_WARN("-mmap specified, but tseg_base is not word-aligned. Will not memory-map.\n"); 00254 do_mmap = FALSE; 00255 } 00256 else { 00257 model->dump_mmap = mmio_file_read(file_name); 00258 if (model->dump_mmap == NULL) { 00259 do_mmap = FALSE; 00260 } 00261 else { 00262 map_base = mmio_file_ptr(model->dump_mmap); 00263 } 00264 } 00265 } 00266 00267 if (n_bigram > 0) { 00268 /* read bigrams */ 00269 if (do_mmap) { 00270 model->lm3g.bigrams = (bigram_t *) (map_base + offset); 00271 offset += (n_bigram + 1) * sizeof(bigram_t); 00272 } 00273 else { 00274 model->lm3g.bigrams = 00275 ckd_calloc(n_bigram + 1, sizeof(bigram_t)); 00276 if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp) 00277 != (size_t) n_bigram + 1) { 00278 E_ERROR("fread(bigrams) failed\n"); 00279 goto error_out; 00280 } 00281 if (do_swap) { 00282 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram; 00283 i++, bgptr++) { 00284 SWAP_INT16(&bgptr->wid); 00285 SWAP_INT16(&bgptr->prob2); 00286 SWAP_INT16(&bgptr->bo_wt2); 00287 SWAP_INT16(&bgptr->trigrams); 00288 } 00289 } 00290 } 00291 E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram); 00292 } 00293 00294 /* read trigrams */ 00295 if (n_trigram > 0) { 00296 if (do_mmap) { 00297 model->lm3g.trigrams = (trigram_t *) (map_base + offset); 00298 offset += n_trigram * sizeof(trigram_t); 00299 } 00300 else { 00301 model->lm3g.trigrams = 00302 ckd_calloc(n_trigram, sizeof(trigram_t)); 00303 if (fread 00304 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp) 00305 != (size_t) n_trigram) { 00306 E_ERROR("fread(trigrams) failed\n"); 00307 goto error_out; 00308 } 00309 if (do_swap) { 00310 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram; 00311 i++, tgptr++) { 00312 SWAP_INT16(&tgptr->wid); 00313 SWAP_INT16(&tgptr->prob3); 00314 } 00315 } 00316 } 00317 E_INFO("%8d = LM.trigrams read\n", n_trigram); 00318 /* Initialize tginfo */ 00319 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *)); 00320 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); 00321 } 00322 00323 if (n_bigram > 0) { 00324 /* read n_prob2 and prob2 array (in memory) */ 00325 if (do_mmap) 00326 fseek(fp, offset, SEEK_SET); 00327 if (fread(&k, sizeof(k), 1, fp) != 1) 00328 goto error_out; 00329 if (do_swap) SWAP_INT32(&k); 00330 model->lm3g.n_prob2 = k; 00331 model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2)); 00332 if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) { 00333 E_ERROR("fread(prob2) failed\n"); 00334 goto error_out; 00335 } 00336 for (i = 0; i < k; i++) { 00337 if (do_swap) 00338 SWAP_INT32(&model->lm3g.prob2[i].l); 00339 /* Convert values to log. */ 00340 model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f); 00341 } 00342 E_INFO("%8d = LM.prob2 entries read\n", k); 00343 } 00344 00345 /* read n_bo_wt2 and bo_wt2 array (in memory) */ 00346 if (base->n > 2) { 00347 if (fread(&k, sizeof(k), 1, fp) != 1) 00348 goto error_out; 00349 if (do_swap) SWAP_INT32(&k); 00350 model->lm3g.n_bo_wt2 = k; 00351 model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2)); 00352 if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) { 00353 E_ERROR("fread(bo_wt2) failed\n"); 00354 goto error_out; 00355 } 00356 for (i = 0; i < k; i++) { 00357 if (do_swap) 00358 SWAP_INT32(&model->lm3g.bo_wt2[i].l); 00359 /* Convert values to log. */ 00360 model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f); 00361 } 00362 E_INFO("%8d = LM.bo_wt2 entries read\n", k); 00363 } 00364 00365 /* read n_prob3 and prob3 array (in memory) */ 00366 if (base->n > 2) { 00367 if (fread(&k, sizeof(k), 1, fp) != 1) 00368 goto error_out; 00369 if (do_swap) SWAP_INT32(&k); 00370 model->lm3g.n_prob3 = k; 00371 model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3)); 00372 if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) { 00373 E_ERROR("fread(prob3) failed\n"); 00374 goto error_out; 00375 } 00376 for (i = 0; i < k; i++) { 00377 if (do_swap) 00378 SWAP_INT32(&model->lm3g.prob3[i].l); 00379 /* Convert values to log. */ 00380 model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f); 00381 } 00382 E_INFO("%8d = LM.prob3 entries read\n", k); 00383 } 00384 00385 /* read tseg_base size and tseg_base */ 00386 if (do_mmap) 00387 offset = ftell(fp); 00388 if (n_trigram > 0) { 00389 if (do_mmap) { 00390 memcpy(&k, map_base + offset, sizeof(k)); 00391 offset += sizeof(int32); 00392 model->lm3g.tseg_base = (int32 *) (map_base + offset); 00393 offset += k * sizeof(int32); 00394 } 00395 else { 00396 k = (n_bigram + 1) / BG_SEG_SZ + 1; 00397 if (fread(&k, sizeof(k), 1, fp) != 1) 00398 goto error_out; 00399 if (do_swap) SWAP_INT32(&k); 00400 model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32)); 00401 if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) != 00402 (size_t) k) { 00403 E_ERROR("fread(tseg_base) failed\n"); 00404 goto error_out; 00405 } 00406 if (do_swap) 00407 for (i = 0; i < k; i++) 00408 SWAP_INT32(&model->lm3g.tseg_base[i]); 00409 } 00410 E_INFO("%8d = LM.tseg_base entries read\n", k); 00411 } 00412 00413 /* read ascii word strings */ 00414 if (do_mmap) { 00415 memcpy(&k, map_base + offset, sizeof(k)); 00416 offset += sizeof(int32); 00417 tmp_word_str = (char *) (map_base + offset); 00418 offset += k; 00419 } 00420 else { 00421 base->writable = TRUE; 00422 if (fread(&k, sizeof(k), 1, fp) != 1) 00423 goto error_out; 00424 if (do_swap) SWAP_INT32(&k); 00425 tmp_word_str = ckd_calloc(k, 1); 00426 if (fread(tmp_word_str, 1, k, fp) != (size_t) k) { 00427 E_ERROR("fread(word-string) failed\n"); 00428 goto error_out; 00429 } 00430 } 00431 00432 /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */ 00433 for (i = 0, j = 0; i < k; i++) 00434 if (tmp_word_str[i] == '\0') 00435 j++; 00436 if (j != n_unigram) { 00437 E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n", 00438 j, n_unigram); 00439 goto error_out; 00440 } 00441 00442 /* Break up string just read into words */ 00443 if (do_mmap) { 00444 j = 0; 00445 for (i = 0; i < n_unigram; i++) { 00446 base->word_str[i] = tmp_word_str + j; 00447 if (hash_table_enter(base->wid, base->word_str[i], 00448 (void *)(long)i) != (void *)(long)i) { 00449 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); 00450 } 00451 j += strlen(base->word_str[i]) + 1; 00452 } 00453 } 00454 else { 00455 j = 0; 00456 for (i = 0; i < n_unigram; i++) { 00457 base->word_str[i] = ckd_salloc(tmp_word_str + j); 00458 if (hash_table_enter(base->wid, base->word_str[i], 00459 (void *)(long)i) != (void *)(long)i) { 00460 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); 00461 } 00462 j += strlen(base->word_str[i]) + 1; 00463 } 00464 free(tmp_word_str); 00465 } 00466 E_INFO("%8d = ascii word strings read\n", i); 00467 00468 fclose_comp(fp, is_pipe); 00469 return base; 00470 00471 error_out: 00472 if (fp) 00473 fclose_comp(fp, is_pipe); 00474 ngram_model_free(base); 00475 return NULL; 00476 } 00477 00478 ngram_model_dmp_t * 00479 ngram_model_dmp_build(ngram_model_t *base) 00480 { 00481 ngram_model_dmp_t *model; 00482 ngram_model_t *newbase; 00483 ngram_iter_t *itor; 00484 sorted_list_t sorted_prob2; 00485 sorted_list_t sorted_bo_wt2; 00486 sorted_list_t sorted_prob3; 00487 bigram_t *bgptr; 00488 trigram_t *tgptr; 00489 int i, bgcount, tgcount, seg; 00490 00491 if (base->funcs == &ngram_model_dmp_funcs) { 00492 E_INFO("Using existing DMP model.\n"); 00493 return (ngram_model_dmp_t *)ngram_model_retain(base); 00494 } 00495 00496 /* Initialize new base model structure with params from base. */ 00497 E_INFO("Building DMP model...\n"); 00498 model = ckd_calloc(1, sizeof(*model)); 00499 newbase = &model->base; 00500 ngram_model_init(newbase, &ngram_model_dmp_funcs, 00501 logmath_retain(base->lmath), 00502 base->n, base->n_counts[0]); 00503 /* Copy N-gram counts over. */ 00504 memcpy(newbase->n_counts, base->n_counts, 00505 base->n * sizeof(*base->n_counts)); 00506 /* Make sure word strings are freed. */ 00507 newbase->writable = TRUE; 00508 /* Initialize unigram table and string table. */ 00509 model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1); 00510 for (itor = ngram_model_mgrams(base, 0); itor; 00511 itor = ngram_iter_next(itor)) { 00512 int32 prob1, bo_wt1; 00513 int32 const *wids; 00514 00515 /* Can't guarantee they will go in unigram order, so just to 00516 * be correct, we do this... */ 00517 wids = ngram_iter_get(itor, &prob1, &bo_wt1); 00518 model->lm3g.unigrams[wids[0]].prob1.l = prob1; 00519 model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1; 00520 newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0])); 00521 if ((hash_table_enter_int32(newbase->wid, 00522 newbase->word_str[wids[0]], wids[0])) 00523 != wids[0]) { 00524 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]); 00525 } 00526 } 00527 E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]); 00528 00529 if (newbase->n < 2) 00530 return model; 00531 00532 /* Construct quantized probability table for bigrams and 00533 * (optionally) trigrams. Hesitate to use the "sorted list" thing 00534 * since it isn't so useful, but it's there already. */ 00535 init_sorted_list(&sorted_prob2); 00536 if (newbase->n > 2) { 00537 init_sorted_list(&sorted_bo_wt2); 00538 init_sorted_list(&sorted_prob3); 00539 } 00540 /* Construct bigram and trigram arrays. */ 00541 bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t)); 00542 if (newbase->n > 2) { 00543 tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t)); 00544 model->lm3g.tseg_base = 00545 ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32)); 00546 } 00547 else 00548 tgptr = NULL; 00549 /* Since bigrams and trigrams have to be contiguous with others 00550 * with the same N-1-gram, we traverse them in depth-first order 00551 * to build the bigram and trigram arrays. */ 00552 for (i = 0; i < newbase->n_counts[0]; ++i) { 00553 ngram_iter_t *uitor; 00554 bgcount = bgptr - model->lm3g.bigrams; 00555 /* First bigram index (same as next if no bigrams...) */ 00556 model->lm3g.unigrams[i].bigrams = bgcount; 00557 E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount)); 00558 /* All bigrams corresponding to unigram i */ 00559 uitor = ngram_ng_iter(base, i, NULL, 0); 00560 for (itor = ngram_iter_successors(uitor); 00561 itor; ++bgptr, itor = ngram_iter_next(itor)) { 00562 int32 prob2, bo_wt2; 00563 int32 const *wids; 00564 ngram_iter_t *titor; 00565 00566 wids = ngram_iter_get(itor, &prob2, &bo_wt2); 00567 00568 assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]); 00569 00570 bgptr->wid = wids[1]; 00571 bgptr->prob2 = sorted_id(&sorted_prob2, &prob2); 00572 if (newbase->n > 2) { 00573 tgcount = (tgptr - model->lm3g.trigrams); 00574 bgcount = (bgptr - model->lm3g.bigrams); 00575 00576 /* Backoff weight (only if there are trigrams...) */ 00577 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2); 00578 00579 /* Find bigram segment for this bigram (this isn't 00580 * used unless there are trigrams) */ 00581 seg = bgcount >> LOG_BG_SEG_SZ; 00582 /* If we just crossed a bigram segment boundary, then 00583 * point tseg_base for the new segment to the current 00584 * trigram pointer. */ 00585 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) 00586 model->lm3g.tseg_base[seg] = tgcount; 00587 /* Now calculate the trigram offset. */ 00588 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; 00589 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n", 00590 bgcount, 00591 newbase->word_str[wids[0]], 00592 newbase->word_str[wids[1]], 00593 seg, bgptr->trigrams)); 00594 00595 /* And fill in successors' trigram info. */ 00596 for (titor = ngram_iter_successors(itor); 00597 titor; ++tgptr, titor = ngram_iter_next(titor)) { 00598 int32 prob3, dummy; 00599 00600 assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]); 00601 wids = ngram_iter_get(titor, &prob3, &dummy); 00602 tgptr->wid = wids[2]; 00603 tgptr->prob3 = sorted_id(&sorted_prob3, &prob3); 00604 E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n", 00605 tgcount, 00606 newbase->word_str[wids[0]], 00607 newbase->word_str[wids[1]], 00608 newbase->word_str[wids[2]], 00609 tgptr->prob3)); 00610 } 00611 } 00612 } 00613 ngram_iter_free(uitor); 00614 } 00615 /* Add sentinal unigram and bigram records. */ 00616 bgcount = bgptr - model->lm3g.bigrams; 00617 tgcount = tgptr - model->lm3g.trigrams; 00618 seg = bgcount >> LOG_BG_SEG_SZ; 00619 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) 00620 model->lm3g.tseg_base[seg] = tgcount; 00621 model->lm3g.unigrams[i].bigrams = bgcount; 00622 if (newbase->n > 2) 00623 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; 00624 00625 /* Now create probability tables. */ 00626 model->lm3g.n_prob2 = sorted_prob2.free; 00627 model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2); 00628 E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]); 00629 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2); 00630 free_sorted_list(&sorted_prob2); 00631 if (newbase->n > 2) { 00632 /* Create trigram bo-wts array. */ 00633 model->lm3g.n_bo_wt2 = sorted_bo_wt2.free; 00634 model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2); 00635 free_sorted_list(&sorted_bo_wt2); 00636 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2); 00637 /* Create trigram probability table. */ 00638 model->lm3g.n_prob3 = sorted_prob3.free; 00639 model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3); 00640 E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]); 00641 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3); 00642 free_sorted_list(&sorted_prob3); 00643 /* Initialize tginfo */ 00644 model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *)); 00645 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); 00646 } 00647 00648 return model; 00649 } 00650 00651 static void 00652 fwrite_int32(FILE *fh, int32 val) 00653 { 00654 fwrite(&val, 4, 1, fh); 00655 } 00656 00657 static void 00658 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath) 00659 { 00660 int32 bogus = -1; 00661 float32 log10val; 00662 00663 /* Bogus dictionary mapping field. */ 00664 fwrite(&bogus, 4, 1, fh); 00665 /* Convert values to log10. */ 00666 log10val = logmath_log_to_log10(lmath, ug->prob1.l); 00667 fwrite(&log10val, 4, 1, fh); 00668 log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l); 00669 fwrite(&log10val, 4, 1, fh); 00670 fwrite_int32(fh, ug->bigrams); 00671 } 00672 00673 static void 00674 fwrite_bg(FILE *fh, bigram_t *bg) 00675 { 00676 fwrite(bg, sizeof(*bg), 1, fh); 00677 } 00678 00679 static void 00680 fwrite_tg(FILE *fh, trigram_t *tg) 00681 { 00682 fwrite(tg, sizeof(*tg), 1, fh); 00683 } 00684 00687 static char const *fmtdesc[] = { 00688 "BEGIN FILE FORMAT DESCRIPTION", 00689 "Header string length (int32) and string (including trailing 0)", 00690 "Original LM filename string-length (int32) and filename (including trailing 0)", 00691 "(int32) version number (present iff value <= 0)", 00692 "(int32) original LM file modification timestamp (iff version# present)", 00693 "(int32) string-length and string (including trailing 0) (iff version# present)", 00694 "... previous entry continued any number of times (iff version# present)", 00695 "(int32) 0 (terminating sequence of strings) (iff version# present)", 00696 "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)", 00697 "(int32) lm_t.ucount (must be > 0)", 00698 "(int32) lm_t.bcount", 00699 "(int32) lm_t.tcount", 00700 "lm_t.ucount+1 unigrams (including sentinel)", 00701 "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3", 00702 "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)", 00703 "(int32) lm_t.n_prob2", 00704 "(int32) lm_t.prob2[]", 00705 "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)", 00706 "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)", 00707 "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)", 00708 "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)", 00709 "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)", 00710 "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)", 00711 "(int32) Sum(all word string-lengths, including trailing 0 for each)", 00712 "All word strings (including trailing 0 for each)", 00713 "END FILE FORMAT DESCRIPTION", 00714 NULL, 00715 }; 00716 00717 static void 00718 ngram_model_dmp_write_header(FILE * fh) 00719 { 00720 int32 k; 00721 k = strlen(darpa_hdr) + 1; 00722 fwrite_int32(fh, k); 00723 fwrite(darpa_hdr, 1, k, fh); 00724 } 00725 00726 static void 00727 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile) 00728 { 00729 int32 k; 00730 00731 k = strlen(lmfile) + 1; 00732 fwrite_int32(fh, k); 00733 fwrite(lmfile, 1, k, fh); 00734 } 00735 00736 #define LMDMP_VERSION_TG_16BIT -1 00740 static void 00741 ngram_model_dmp_write_version(FILE * fh, int32 mtime) 00742 { 00743 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */ 00744 fwrite_int32(fh, mtime); 00745 } 00746 00747 static void 00748 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model) 00749 { 00750 fwrite_int32(fh, model->n_counts[0]); 00751 fwrite_int32(fh, model->n_counts[1]); 00752 fwrite_int32(fh, model->n_counts[2]); 00753 } 00754 00755 static void 00756 ngram_model_dmp_write_fmtdesc(FILE * fh) 00757 { 00758 int32 i, k; 00759 long pos; 00760 00761 /* Write file format description into header */ 00762 for (i = 0; fmtdesc[i] != NULL; i++) { 00763 k = strlen(fmtdesc[i]) + 1; 00764 fwrite_int32(fh, k); 00765 fwrite(fmtdesc[i], 1, k, fh); 00766 } 00767 /* Pad it out in order to achieve 32-bit alignment */ 00768 pos = ftell(fh); 00769 k = pos & 3; 00770 if (k) { 00771 fwrite_int32(fh, 4-k); 00772 fwrite("!!!!", 1, 4-k, fh); 00773 } 00774 fwrite_int32(fh, 0); 00775 } 00776 00777 static void 00778 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model) 00779 { 00780 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00781 int32 i; 00782 00783 for (i = 0; i <= model->n_counts[0]; i++) { 00784 fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath); 00785 } 00786 } 00787 00788 00789 static void 00790 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model) 00791 { 00792 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00793 int32 i; 00794 00795 for (i = 0; i <= model->n_counts[1]; i++) { 00796 fwrite_bg(fh, &(lm->lm3g.bigrams[i])); 00797 } 00798 00799 } 00800 00801 static void 00802 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model) 00803 { 00804 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00805 int32 i; 00806 00807 for (i = 0; i < model->n_counts[2]; i++) { 00808 fwrite_tg(fh, &(lm->lm3g.trigrams[i])); 00809 } 00810 } 00811 00812 static void 00813 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model) 00814 { 00815 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00816 int32 i; 00817 00818 fwrite_int32(fh, lm->lm3g.n_prob2); 00819 for (i = 0; i < lm->lm3g.n_prob2; i++) { 00820 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l); 00821 fwrite(&log10val, 4, 1, fh); 00822 } 00823 } 00824 00825 static void 00826 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model) 00827 { 00828 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00829 int32 i; 00830 00831 fwrite_int32(fh, lm->lm3g.n_bo_wt2); 00832 for (i = 0; i < lm->lm3g.n_bo_wt2; i++) { 00833 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l); 00834 fwrite(&log10val, 4, 1, fh); 00835 } 00836 } 00837 00838 static void 00839 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model) 00840 { 00841 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00842 int32 i; 00843 00844 fwrite_int32(fh, lm->lm3g.n_prob3); 00845 for (i = 0; i < lm->lm3g.n_prob3; i++) { 00846 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l); 00847 fwrite(&log10val, 4, 1, fh); 00848 } 00849 } 00850 00851 static void 00852 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model) 00853 { 00854 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00855 int32 i, k; 00856 00857 k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1; 00858 fwrite_int32(fh, k); 00859 for (i = 0; i < k; i++) 00860 fwrite_int32(fh, lm->lm3g.tseg_base[i]); 00861 } 00862 00863 static void 00864 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model) 00865 { 00866 int32 i, k; 00867 00868 k = 0; 00869 for (i = 0; i < model->n_counts[0]; i++) 00870 k += strlen(model->word_str[i]) + 1; 00871 fwrite_int32(fh, k); 00872 for (i = 0; i < model->n_counts[0]; i++) 00873 fwrite(model->word_str[i], 1, 00874 strlen(model->word_str[i]) + 1, fh); 00875 } 00876 00877 int 00878 ngram_model_dmp_write(ngram_model_t *base, 00879 const char *file_name) 00880 { 00881 ngram_model_dmp_t *model; 00882 ngram_model_t *newbase; 00883 FILE *fh; 00884 00885 /* First, construct a DMP model from the base model. */ 00886 model = ngram_model_dmp_build(base); 00887 newbase = &model->base; 00888 00889 /* Now write it, confident in the knowledge that it's the right 00890 * kind of language model internally. */ 00891 if ((fh = fopen(file_name, "wb")) == NULL) { 00892 E_ERROR("Cannot create file %s\n", file_name); 00893 return -1; 00894 } 00895 ngram_model_dmp_write_header(fh); 00896 ngram_model_dmp_write_lm_filename(fh, file_name); 00897 ngram_model_dmp_write_version(fh, 0); 00898 ngram_model_dmp_write_fmtdesc(fh); 00899 ngram_model_dmp_write_ngram_counts(fh, newbase); 00900 ngram_model_dmp_write_unigram(fh, newbase); 00901 if (newbase->n > 1) { 00902 ngram_model_dmp_write_bigram(fh, newbase); 00903 if (newbase->n > 2) { 00904 ngram_model_dmp_write_trigram(fh, newbase); 00905 } 00906 ngram_model_dmp_write_bgprob(fh, newbase); 00907 if (newbase->n > 2) { 00908 ngram_model_dmp_write_tgbowt(fh, newbase); 00909 ngram_model_dmp_write_tgprob(fh, newbase); 00910 ngram_model_dmp_write_tg_segbase(fh, newbase); 00911 } 00912 } 00913 ngram_model_dmp_write_wordstr(fh, newbase); 00914 ngram_model_free(newbase); 00915 00916 return fclose(fh); 00917 } 00918 00919 static int 00920 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw, 00921 float32 wip, float32 uw) 00922 { 00923 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; 00924 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw); 00925 return 0; 00926 } 00927 00928 /* Lousy "templating" for things that are largely the same in DMP and 00929 * ARPA models, except for the bigram and trigram types and some 00930 * names. */ 00931 #define NGRAM_MODEL_TYPE ngram_model_dmp_t 00932 #include "lm3g_templates.c" 00933 00934 static void 00935 ngram_model_dmp_free(ngram_model_t *base) 00936 { 00937 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; 00938 00939 ckd_free(model->lm3g.unigrams); 00940 ckd_free(model->lm3g.prob2); 00941 if (model->dump_mmap) { 00942 mmio_file_unmap(model->dump_mmap); 00943 } 00944 else { 00945 ckd_free(model->lm3g.bigrams); 00946 if (base->n > 2) { 00947 ckd_free(model->lm3g.trigrams); 00948 ckd_free(model->lm3g.tseg_base); 00949 } 00950 } 00951 if (base->n > 2) { 00952 ckd_free(model->lm3g.bo_wt2); 00953 ckd_free(model->lm3g.prob3); 00954 } 00955 00956 lm3g_tginfo_free(base, &model->lm3g); 00957 } 00958 00959 static ngram_funcs_t ngram_model_dmp_funcs = { 00960 ngram_model_dmp_free, /* free */ 00961 ngram_model_dmp_apply_weights, /* apply_weights */ 00962 lm3g_template_score, /* score */ 00963 lm3g_template_raw_score, /* raw_score */ 00964 lm3g_template_add_ug, /* add_ug */ 00965 lm3g_template_flush, /* flush */ 00966 lm3g_template_iter, /* iter */ 00967 lm3g_template_mgrams, /* mgrams */ 00968 lm3g_template_successors, /* successors */ 00969 lm3g_template_iter_get, /* iter_get */ 00970 lm3g_template_iter_next, /* iter_next */ 00971 lm3g_template_iter_free /* iter_free */ 00972 };