• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/libsphinxbase/lm/ngram_model_dmp.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file ngram_model_dmp.c DMP format language models
00039  *
00040  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
00041  */
00042 
00043 #include "ckd_alloc.h"
00044 #include "ngram_model_dmp.h"
00045 #include "pio.h"
00046 #include "err.h"
00047 #include "byteorder.h"
00048 #include "listelem_alloc.h"
00049 
00050 #include <assert.h>
00051 #include <stdio.h>
00052 #include <string.h>
00053 #include <stdlib.h>
00054 #include <limits.h>
00055 
/* Magic header string of a DMP file: its length (including the trailing
 * NUL) and bytes are checked on read and emitted on write. */
static const char darpa_hdr[] = "Darpa Trigram LM";
/* Function table handed to ngram_model_init() for models created in this
 * file; also compared against base->funcs to recognize existing DMP models. */
static ngram_funcs_t ngram_model_dmp_funcs;

/* Entry of tseg_base for the bigram segment that contains bigram index b. */
#define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram stored for unigram u. */
#define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
/* Index of the first trigram for bigram b: segment base plus the
 * per-bigram offset stored in bigrams[b].trigrams. */
#define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
00062 
00063 static unigram_t *
00064 new_unigram_table(int32 n_ug)
00065 {
00066     unigram_t *table;
00067     int32 i;
00068 
00069     table = ckd_calloc(n_ug, sizeof(unigram_t));
00070     for (i = 0; i < n_ug; i++) {
00071         table[i].prob1.f = -99.0;
00072         table[i].bo_wt1.f = -99.0;
00073     }
00074     return table;
00075 }
00076 
00077 ngram_model_t *
00078 ngram_model_dmp_read(cmd_ln_t *config,
00079                      const char *file_name,
00080                      logmath_t *lmath)
00081 {
00082     ngram_model_t *base;
00083     ngram_model_dmp_t *model;
00084     FILE *fp;
00085     int do_mmap, do_swap;
00086     int32 is_pipe;
00087     int32 i, j, k, vn, n, ts;
00088     int32 n_unigram;
00089     int32 n_bigram;
00090     int32 n_trigram;
00091     char str[1024];
00092     unigram_t *ugptr;
00093     bigram_t *bgptr;
00094     trigram_t *tgptr;
00095     char *tmp_word_str;
00096     char *map_base = NULL;
00097     size_t offset = 0, filesize;
00098 
00099     base = NULL;
00100     do_mmap = FALSE;
00101     if (config)
00102         do_mmap = cmd_ln_boolean_r(config, "-mmap");
00103 
00104     if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
00105         E_ERROR("Dump file %s not found\n", file_name);
00106         goto error_out;
00107     }
00108 
00109     if (is_pipe && do_mmap) {
00110         E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
00111         do_mmap = 0;
00112     }
00113 
00114     do_swap = FALSE;
00115     if (fread(&k, sizeof(k), 1, fp) != 1)
00116         goto error_out;
00117     if (k != strlen(darpa_hdr)+1) {
00118         SWAP_INT32(&k);
00119         if (k != strlen(darpa_hdr)+1) {
00120             E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
00121             goto error_out;
00122         }
00123         do_swap = 1;
00124     }
00125     if (fread(str, 1, k, fp) != (size_t) k) {
00126         E_ERROR("Cannot read header\n");
00127         goto error_out;
00128     }
00129     if (strncmp(str, darpa_hdr, k) != 0) {
00130         E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
00131         goto error_out;
00132     }
00133 
00134     if (do_mmap) {
00135         if (do_swap) {
00136             E_INFO
00137                 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
00138             do_mmap = 0;
00139         }
00140         else {
00141             E_INFO("Will use memory-mapped I/O for LM file\n");
00142 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
00143             E_FATAL("memory mapping is not supported at the moment.");
00144 #else
00145 #endif
00146         }
00147     }
00148 
00149     if (fread(&k, sizeof(k), 1, fp) != 1)
00150         goto error_out;
00151     if (do_swap) SWAP_INT32(&k);
00152     if (fread(str, 1, k, fp) != (size_t) k) {
00153         E_ERROR("Cannot read LM filename in header\n");
00154         goto error_out;
00155     }
00156 
00157     /* read version#, if present (must be <= 0) */
00158     if (fread(&vn, sizeof(vn), 1, fp) != 1)
00159         goto error_out;
00160     if (do_swap) SWAP_INT32(&vn);
00161     if (vn <= 0) {
00162         /* read and don't compare timestamps (we don't care) */
00163         if (fread(&ts, sizeof(ts), 1, fp) != 1)
00164             goto error_out;
00165         if (do_swap) SWAP_INT32(&ts);
00166 
00167         /* read and skip format description */
00168         for (;;) {
00169             if (fread(&k, sizeof(k), 1, fp) != 1)
00170                 goto error_out;
00171             if (do_swap) SWAP_INT32(&k);
00172             if (k == 0)
00173                 break;
00174             if (fread(str, 1, k, fp) != (size_t) k) {
00175                 E_ERROR("fread(word) failed\n");
00176                 goto error_out;
00177             }
00178         }
00179         /* read model->ucount */
00180         if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
00181             goto error_out;
00182         if (do_swap) SWAP_INT32(&n_unigram);
00183     }
00184     else {
00185         n_unigram = vn;
00186     }
00187 
00188     /* read model->bcount, tcount */
00189     if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
00190         goto error_out;
00191     if (do_swap) SWAP_INT32(&n_bigram);
00192     if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
00193         goto error_out;
00194     if (do_swap) SWAP_INT32(&n_trigram);
00195     E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
00196 
00197     /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
00198     model = ckd_calloc(1, sizeof(*model));
00199     base = &model->base;
00200     if (n_trigram > 0)
00201         n = 3;
00202     else if (n_bigram > 0)
00203         n = 2;
00204     else
00205         n = 1;
00206     ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
00207     base->n_counts[0] = n_unigram;
00208     base->n_counts[1] = n_bigram;
00209     base->n_counts[2] = n_trigram;
00210 
00211     /* read unigrams (always in memory, as they contain dictionary
00212      * mappings that can't be precomputed, and also could have OOVs added) */
00213     model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
00214     ugptr = model->lm3g.unigrams;
00215     for (i = 0; i <= n_unigram; ++i) {
00216         /* Skip over the mapping ID, we don't care about it. */
00217         if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
00218             E_ERROR("fread(mapid[%d]) failed\n", i);
00219             goto error_out;
00220         }
00221         /* Read the actual unigram structure. */
00222         if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1)  {
00223             E_ERROR("fread(unigrams) failed\n");
00224             ngram_model_free(base);
00225             fclose_comp(fp, is_pipe);
00226             return NULL;
00227         }
00228         /* Byte swap if necessary. */
00229         if (do_swap) {
00230             SWAP_INT32(&ugptr->prob1.l);
00231             SWAP_INT32(&ugptr->bo_wt1.l);
00232             SWAP_INT32(&ugptr->bigrams);
00233         }
00234         /* Convert values to log. */
00235         ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
00236         ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
00237         E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
00238                     i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
00239         ++ugptr;
00240     }
00241     E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
00242 
00243     /* Now mmap() the file and read in the rest of the (read-only) stuff. */
00244     if (do_mmap) {
00245         offset = ftell(fp);
00246         fseek(fp, 0, SEEK_END);
00247         filesize = ftell(fp);
00248         fseek(fp, offset, SEEK_SET);
00249 
00250         /* Check for improper word alignment. */
00251         if (offset & 0x3) {
00252             E_WARN("-mmap specified, but tseg_base is not word-aligned.  Will not memory-map.\n");
00253             do_mmap = FALSE;
00254         }
00255         else {
00256             model->dump_mmap = mmio_file_read(file_name);
00257             if (model->dump_mmap == NULL) {
00258                 do_mmap = FALSE;
00259             }
00260             else {
00261                 map_base = mmio_file_ptr(model->dump_mmap);
00262             }
00263         }
00264     }
00265 
00266     /* read bigrams */
00267     if (do_mmap) {
00268         model->lm3g.bigrams = (bigram_t *) (map_base + offset);
00269         offset += (n_bigram + 1) * sizeof(bigram_t);
00270     }
00271     else {
00272         model->lm3g.bigrams =
00273             ckd_calloc(n_bigram + 1, sizeof(bigram_t));
00274         if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
00275             != (size_t) n_bigram + 1) {
00276             E_ERROR("fread(bigrams) failed\n");
00277             goto error_out;
00278         }
00279         if (do_swap) {
00280             for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
00281                  i++, bgptr++) {
00282                 SWAP_INT16(&bgptr->wid);
00283                 SWAP_INT16(&bgptr->prob2);
00284                 SWAP_INT16(&bgptr->bo_wt2);
00285                 SWAP_INT16(&bgptr->trigrams);
00286             }
00287         }
00288     }
00289     E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
00290 
00291     /* read trigrams */
00292     if (n_trigram > 0) {
00293         if (do_mmap) {
00294             model->lm3g.trigrams = (trigram_t *) (map_base + offset);
00295             offset += n_trigram * sizeof(trigram_t);
00296         }
00297         else {
00298             model->lm3g.trigrams =
00299                 ckd_calloc(n_trigram, sizeof(trigram_t));
00300             if (fread
00301                 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
00302                 != (size_t) n_trigram) {
00303                 E_ERROR("fread(trigrams) failed\n");
00304                 goto error_out;
00305             }
00306             if (do_swap) {
00307                 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
00308                      i++, tgptr++) {
00309                     SWAP_INT16(&tgptr->wid);
00310                     SWAP_INT16(&tgptr->prob3);
00311                 }
00312             }
00313         }
00314         E_INFO("%8d = LM.trigrams read\n", n_trigram);
00315         /* Initialize tginfo */
00316         model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
00317         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00318     }
00319 
00320     /* read n_prob2 and prob2 array (in memory) */
00321     if (do_mmap)
00322         fseek(fp, offset, SEEK_SET);
00323     if (fread(&k, sizeof(k), 1, fp) != 1)
00324         goto error_out;
00325     if (do_swap) SWAP_INT32(&k);
00326     model->lm3g.n_prob2 = k;
00327     model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
00328     if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
00329         E_ERROR("fread(prob2) failed\n");
00330         goto error_out;
00331     }
00332     for (i = 0; i < k; i++) {
00333         if (do_swap)
00334             SWAP_INT32(&model->lm3g.prob2[i].l);
00335         /* Convert values to log. */
00336         model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
00337     }
00338     E_INFO("%8d = LM.prob2 entries read\n", k);
00339 
00340     /* read n_bo_wt2 and bo_wt2 array (in memory) */
00341     if (base->n > 2) {
00342         if (fread(&k, sizeof(k), 1, fp) != 1)
00343             goto error_out;
00344         if (do_swap) SWAP_INT32(&k);
00345         model->lm3g.n_bo_wt2 = k;
00346         model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
00347         if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
00348             E_ERROR("fread(bo_wt2) failed\n");
00349             goto error_out;
00350         }
00351         for (i = 0; i < k; i++) {
00352             if (do_swap)
00353                 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
00354             /* Convert values to log. */
00355             model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
00356         }
00357         E_INFO("%8d = LM.bo_wt2 entries read\n", k);
00358     }
00359 
00360     /* read n_prob3 and prob3 array (in memory) */
00361     if (base->n > 2) {
00362         if (fread(&k, sizeof(k), 1, fp) != 1)
00363             goto error_out;
00364         if (do_swap) SWAP_INT32(&k);
00365         model->lm3g.n_prob3 = k;
00366         model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
00367         if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
00368             E_ERROR("fread(prob3) failed\n");
00369             goto error_out;
00370         }
00371         for (i = 0; i < k; i++) {
00372             if (do_swap)
00373                 SWAP_INT32(&model->lm3g.prob3[i].l);
00374             /* Convert values to log. */
00375             model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
00376         }
00377         E_INFO("%8d = LM.prob3 entries read\n", k);
00378     }
00379 
00380     /* read tseg_base size and tseg_base */
00381     if (do_mmap)
00382         offset = ftell(fp);
00383     if (n_trigram > 0) {
00384         if (do_mmap) {
00385             memcpy(&k, map_base + offset, sizeof(k));
00386             offset += sizeof(int32);
00387             model->lm3g.tseg_base = (int32 *) (map_base + offset);
00388             offset += k * sizeof(int32);
00389         }
00390         else {
00391             k = (n_bigram + 1) / BG_SEG_SZ + 1;
00392             if (fread(&k, sizeof(k), 1, fp) != 1)
00393                 goto error_out;
00394             if (do_swap) SWAP_INT32(&k);
00395             model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
00396             if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
00397                 (size_t) k) {
00398                 E_ERROR("fread(tseg_base) failed\n");
00399                 goto error_out;
00400             }
00401             if (do_swap)
00402                 for (i = 0; i < k; i++)
00403                     SWAP_INT32(&model->lm3g.tseg_base[i]);
00404         }
00405         E_INFO("%8d = LM.tseg_base entries read\n", k);
00406     }
00407 
00408     /* read ascii word strings */
00409     if (do_mmap) {
00410         memcpy(&k, map_base + offset, sizeof(k));
00411         offset += sizeof(int32);
00412         tmp_word_str = (char *) (map_base + offset);
00413         offset += k;
00414     }
00415     else {
00416         base->writable = TRUE;
00417         if (fread(&k, sizeof(k), 1, fp) != 1)
00418             goto error_out;
00419         if (do_swap) SWAP_INT32(&k);
00420         tmp_word_str = ckd_calloc(k, 1);
00421         if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
00422             E_ERROR("fread(word-string) failed\n");
00423             goto error_out;
00424         }
00425     }
00426 
00427     /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
00428     for (i = 0, j = 0; i < k; i++)
00429         if (tmp_word_str[i] == '\0')
00430             j++;
00431     if (j != n_unigram) {
00432         E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
00433                 j, n_unigram);
00434         goto error_out;
00435     }
00436 
00437     /* Break up string just read into words */
00438     if (do_mmap) {
00439         j = 0;
00440         for (i = 0; i < n_unigram; i++) {
00441             base->word_str[i] = tmp_word_str + j;
00442             if (hash_table_enter(base->wid, base->word_str[i],
00443                                  (void *)(long)i) != (void *)(long)i) {
00444                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00445             }
00446             j += strlen(base->word_str[i]) + 1;
00447         }
00448     }
00449     else {
00450         j = 0;
00451         for (i = 0; i < n_unigram; i++) {
00452             base->word_str[i] = ckd_salloc(tmp_word_str + j);
00453             if (hash_table_enter(base->wid, base->word_str[i],
00454                                  (void *)(long)i) != (void *)(long)i) {
00455                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00456             }
00457             j += strlen(base->word_str[i]) + 1;
00458         }
00459         free(tmp_word_str);
00460     }
00461     E_INFO("%8d = ascii word strings read\n", i);
00462 
00463     fclose_comp(fp, is_pipe);
00464     return base;
00465 
00466 error_out:
00467     if (fp)
00468         fclose_comp(fp, is_pipe);
00469     ngram_model_free(base);
00470     return NULL;
00471 }
00472 
00473 ngram_model_dmp_t *
00474 ngram_model_dmp_build(ngram_model_t *base)
00475 {
00476     ngram_model_dmp_t *model;
00477     ngram_model_t *newbase;
00478     ngram_iter_t *itor;
00479     sorted_list_t sorted_prob2;
00480     sorted_list_t sorted_bo_wt2;
00481     sorted_list_t sorted_prob3;
00482     bigram_t *bgptr;
00483     trigram_t *tgptr;
00484     int i, bgcount, tgcount, seg;
00485 
00486     if (base->funcs == &ngram_model_dmp_funcs) {
00487         E_INFO("Using existing DMP model.\n");
00488         return (ngram_model_dmp_t *)ngram_model_retain(base);
00489     }
00490 
00491     /* Initialize new base model structure with params from base. */
00492     E_INFO("Building DMP model...\n");
00493     model = ckd_calloc(1, sizeof(*model));
00494     newbase = &model->base;
00495     ngram_model_init(newbase, &ngram_model_dmp_funcs,
00496                      logmath_retain(base->lmath),
00497                      base->n, base->n_counts[0]);
00498     /* Copy N-gram counts over. */
00499     memcpy(newbase->n_counts, base->n_counts,
00500            base->n * sizeof(*base->n_counts));
00501     /* Make sure word strings are freed. */
00502     newbase->writable = TRUE;
00503     /* Initialize unigram table and string table. */
00504     model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
00505     for (itor = ngram_model_mgrams(base, 0); itor;
00506          itor = ngram_iter_next(itor)) {
00507         int32 prob1, bo_wt1;
00508         int32 const *wids;
00509 
00510         /* Can't guarantee they will go in unigram order, so just to
00511          * be correct, we do this... */
00512         wids = ngram_iter_get(itor, &prob1, &bo_wt1);
00513         model->lm3g.unigrams[wids[0]].prob1.l = prob1;
00514         model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
00515         newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
00516         if ((hash_table_enter_int32(newbase->wid,
00517                                     newbase->word_str[wids[0]], wids[0]))
00518             != wids[0]) {
00519                 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
00520         }
00521     }
00522     E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
00523 
00524     /* Construct quantized probability table for bigrams and
00525      * (optionally) trigrams.  Hesitate to use the "sorted list" thing
00526      * since it isn't so useful, but it's there already. */
00527     init_sorted_list(&sorted_prob2);
00528     if (newbase->n > 2) {
00529         init_sorted_list(&sorted_bo_wt2);
00530         init_sorted_list(&sorted_prob3);
00531     }
00532     /* Construct bigram and trigram arrays. */
00533     bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
00534     if (newbase->n > 2) {
00535         tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
00536         model->lm3g.tseg_base =
00537             ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
00538     }
00539     else
00540         tgptr = NULL;
00541     /* Since bigrams and trigrams have to be contiguous with others
00542      * with the same N-1-gram, we traverse them in depth-first order
00543      * to build the bigram and trigram arrays. */
00544     for (i = 0; i < newbase->n_counts[0]; ++i) {
00545         ngram_iter_t *uitor;
00546         bgcount = bgptr - model->lm3g.bigrams;
00547         /* First bigram index (same as next if no bigrams...) */
00548         model->lm3g.unigrams[i].bigrams = bgcount;
00549         E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
00550         /* All bigrams corresponding to unigram i */
00551         uitor = ngram_ng_iter(base, i, NULL, 0);
00552         for (itor = ngram_iter_successors(uitor);
00553              itor; ++bgptr, itor = ngram_iter_next(itor)) {
00554             int32 prob2, bo_wt2;
00555             int32 const *wids;
00556             ngram_iter_t *titor;
00557 
00558             wids = ngram_iter_get(itor, &prob2, &bo_wt2);
00559             /* FIXME FIXME FIXME: not sure why this happens... */
00560             if (bgptr - model->lm3g.bigrams >= newbase->n_counts[1]) {
00561                 ngram_iter_free(itor);
00562                 break;
00563             }
00564 
00565             bgptr->wid = wids[1];
00566             bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
00567             if (newbase->n > 2) {
00568                 tgcount = (tgptr - model->lm3g.trigrams);
00569                 bgcount = (bgptr - model->lm3g.bigrams);
00570 
00571                 /* Backoff weight (only if there are trigrams...) */
00572                 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
00573 
00574                 /* Find bigram segment for this bigram (this isn't
00575                  * used unless there are trigrams) */
00576                 seg = bgcount >> LOG_BG_SEG_SZ;
00577                 /* If we just crossed a bigram segment boundary, then
00578                  * point tseg_base for the new segment to the current
00579                  * trigram pointer. */
00580                 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
00581                     model->lm3g.tseg_base[seg] = tgcount;
00582                 /* Now calculate the trigram offset. */
00583                 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
00584                 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
00585                             bgcount,
00586                             newbase->word_str[wids[0]],
00587                             newbase->word_str[wids[1]],
00588                             seg, bgptr->trigrams));
00589 
00590                 /* And fill in successors' trigram info. */
00591                 for (titor = ngram_iter_successors(itor);
00592                      titor; ++tgptr, titor = ngram_iter_next(titor)) {
00593                     int32 prob3, dummy;
00594 
00595                     assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
00596                     wids = ngram_iter_get(titor, &prob3, &dummy);
00597                     tgptr->wid = wids[2];
00598                     tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
00599                     E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
00600                                 tgcount,
00601                                 newbase->word_str[wids[0]],
00602                                 newbase->word_str[wids[1]],
00603                                 newbase->word_str[wids[2]],
00604                                 tgptr->prob3));
00605                 }
00606             }
00607         }
00608         ngram_iter_free(uitor);
00609     }
00610     /* Add sentinal unigram and bigram records. */
00611     bgcount = bgptr - model->lm3g.bigrams;
00612     tgcount = tgptr - model->lm3g.trigrams;
00613     seg = bgcount >> LOG_BG_SEG_SZ;
00614     if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
00615         model->lm3g.tseg_base[seg] = tgcount;
00616     model->lm3g.unigrams[i].bigrams = bgcount;
00617     bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
00618 
00619     /* Now create probability tables. */
00620     model->lm3g.n_prob2 = sorted_prob2.free;
00621     model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
00622     E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
00623     E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
00624     free_sorted_list(&sorted_prob2);
00625     if (newbase->n > 2) {
00626         /* Create trigram bo-wts array. */
00627         model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
00628         model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
00629         free_sorted_list(&sorted_bo_wt2);
00630         E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
00631         /* Create trigram probability table. */
00632         model->lm3g.n_prob3 = sorted_prob3.free;
00633         model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
00634         E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
00635         E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
00636         free_sorted_list(&sorted_prob3);
00637         /* Initialize tginfo */
00638         model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
00639         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00640     }
00641 
00642     return model;
00643 }
00644 
00645 static void
00646 fwrite_int32(FILE *fh, int32 val)
00647 {
00648     fwrite(&val, 4, 1, fh);
00649 }
00650 
00651 static void
00652 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
00653 {
00654     int32 bogus = -1;
00655     float32 log10val;
00656 
00657     /* Bogus dictionary mapping field. */
00658     fwrite(&bogus, 4, 1, fh);
00659     /* Convert values to log10. */
00660     log10val = logmath_log_to_log10(lmath, ug->prob1.l);
00661     fwrite(&log10val, 4, 1, fh);
00662     log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
00663     fwrite(&log10val, 4, 1, fh);
00664     fwrite_int32(fh, ug->bigrams);
00665 }
00666 
00667 static void
00668 fwrite_bg(FILE *fh, bigram_t *bg)
00669 {
00670     fwrite(bg, sizeof(*bg), 1, fh);
00671 }
00672 
00673 static void
00674 fwrite_tg(FILE *fh, trigram_t *tg)
00675 {
00676     fwrite(tg, sizeof(*tg), 1, fh);
00677 }
00678 
/*
 * Human-readable description of the dump file layout.  The list is
 * NULL-terminated; ngram_model_dmp_write_fmtdesc() writes every string
 * (with its length) into the file header.  The strings are file content,
 * so their text must not be edited casually.
 */
static char const *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL,   /* end-of-list marker */
};
00710 
00711 static void
00712 ngram_model_dmp_write_header(FILE * fh)
00713 {
00714     int32 k;
00715     k = strlen(darpa_hdr) + 1;
00716     fwrite_int32(fh, k);
00717     fwrite(darpa_hdr, 1, k, fh);
00718 }
00719 
00720 static void
00721 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
00722 {
00723     int32 k;
00724 
00725     k = strlen(lmfile) + 1;
00726     fwrite_int32(fh, k);
00727     fwrite(lmfile, 1, k, fh);
00728 }
00729 
00730 #define LMDMP_VERSION_TG_16BIT -1 
00734 static void
00735 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
00736 {
00737     fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);   /* version # */
00738     fwrite_int32(fh, mtime);
00739 }
00740 
00741 static void
00742 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
00743 {
00744     fwrite_int32(fh, model->n_counts[0]);
00745     fwrite_int32(fh, model->n_counts[1]);
00746     fwrite_int32(fh, model->n_counts[2]);
00747 }
00748 
00749 static void
00750 ngram_model_dmp_write_fmtdesc(FILE * fh)
00751 {
00752     int32 i, k;
00753     long pos;
00754 
00755     /* Write file format description into header */
00756     for (i = 0; fmtdesc[i] != NULL; i++) {
00757         k = strlen(fmtdesc[i]) + 1;
00758         fwrite_int32(fh, k);
00759         fwrite(fmtdesc[i], 1, k, fh);
00760     }
00761     /* Pad it out in order to achieve 32-bit alignment */
00762     pos = ftell(fh);
00763     k = pos & 3;
00764     if (k) {
00765         fwrite_int32(fh, 4-k);
00766         fwrite("!!!!", 1, 4-k, fh);
00767     }
00768     fwrite_int32(fh, 0);
00769 }
00770 
00771 static void
00772 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
00773 {
00774     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00775     int32 i;
00776 
00777     for (i = 0; i <= model->n_counts[0]; i++) {
00778         fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
00779     }
00780 }
00781 
00782 
00783 static void
00784 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
00785 {
00786     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00787     int32 i;
00788 
00789     for (i = 0; i <= model->n_counts[1]; i++) {
00790         fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
00791     }
00792 
00793 }
00794 
00795 static void
00796 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
00797 {
00798     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00799     int32 i;
00800 
00801     for (i = 0; i < model->n_counts[2]; i++) {
00802         fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
00803     }
00804 }
00805 
00806 static void
00807 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
00808 {
00809     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00810     int32 i;
00811 
00812     fwrite_int32(fh, lm->lm3g.n_prob2);
00813     for (i = 0; i < lm->lm3g.n_prob2; i++) {
00814         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
00815         fwrite(&log10val, 4, 1, fh);
00816     }
00817 }
00818 
00819 static void
00820 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
00821 {
00822     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00823     int32 i;
00824 
00825     fwrite_int32(fh, lm->lm3g.n_bo_wt2);
00826     for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
00827         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
00828         fwrite(&log10val, 4, 1, fh);
00829     }
00830 }
00831 
00832 static void
00833 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
00834 {
00835     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00836     int32 i;
00837 
00838     fwrite_int32(fh, lm->lm3g.n_prob3);
00839     for (i = 0; i < lm->lm3g.n_prob3; i++) {
00840         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
00841         fwrite(&log10val, 4, 1, fh);
00842     }
00843 }
00844 
00845 static void
00846 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
00847 {
00848     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00849     int32 i, k;
00850 
00851     k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
00852     fwrite_int32(fh, k);
00853     for (i = 0; i < k; i++)
00854         fwrite_int32(fh, lm->lm3g.tseg_base[i]);
00855 }
00856 
00857 static void
00858 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
00859 {
00860     int32 i, k;
00861 
00862     k = 0;
00863     for (i = 0; i < model->n_counts[0]; i++)
00864         k += strlen(model->word_str[i]) + 1;
00865     fwrite_int32(fh, k);
00866     for (i = 0; i < model->n_counts[0]; i++)
00867         fwrite(model->word_str[i], 1,
00868                strlen(model->word_str[i]) + 1, fh);
00869 }
00870 
00871 int
00872 ngram_model_dmp_write(ngram_model_t *base,
00873                       const char *file_name)
00874 {
00875     ngram_model_dmp_t *model;
00876     ngram_model_t *newbase;
00877     FILE *fh;
00878 
00879     /* First, construct a DMP model from the base model. */
00880     model = ngram_model_dmp_build(base);
00881     newbase = &model->base;
00882 
00883     /* Now write it, confident in the knowledge that it's the right
00884      * kind of language model internally. */
00885     if ((fh = fopen(file_name, "wb")) == NULL) {
00886         E_ERROR("Cannot create file %s\n", file_name);
00887         return -1;
00888     }
00889     ngram_model_dmp_write_header(fh);
00890     ngram_model_dmp_write_lm_filename(fh, file_name);
00891     ngram_model_dmp_write_version(fh, 0);
00892     ngram_model_dmp_write_fmtdesc(fh);
00893     ngram_model_dmp_write_ngram_counts(fh, newbase);
00894     ngram_model_dmp_write_unigram(fh, newbase);
00895     ngram_model_dmp_write_bigram(fh, newbase);
00896     ngram_model_dmp_write_trigram(fh, newbase);
00897     ngram_model_dmp_write_bgprob(fh, newbase);
00898     if (newbase->n > 2) {
00899         ngram_model_dmp_write_tgbowt(fh, newbase);
00900         ngram_model_dmp_write_tgprob(fh, newbase);
00901         ngram_model_dmp_write_tg_segbase(fh, newbase);
00902     }
00903     ngram_model_dmp_write_wordstr(fh, newbase);
00904     ngram_model_free(newbase);
00905 
00906     return fclose(fh);
00907 }
00908 
00909 static int
00910 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
00911                               float32 wip, float32 uw)
00912 {
00913     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00914     lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
00915     return 0;
00916 }
00917 
/* Lousy "templating" for things that are largely the same in DMP and
 * ARPA models, except for the bigram and trigram types and some
 * names.  NGRAM_MODEL_TYPE is defined immediately before the include,
 * presumably so that the shared code in lm3g_templates.c is compiled
 * against ngram_model_dmp_t here. */
#define NGRAM_MODEL_TYPE ngram_model_dmp_t
#include "lm3g_templates.c"
00923 
00924 static void
00925 ngram_model_dmp_free(ngram_model_t *base)
00926 {
00927     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00928 
00929     ckd_free(model->lm3g.unigrams);
00930     ckd_free(model->lm3g.prob2);
00931     if (model->dump_mmap) {
00932         mmio_file_unmap(model->dump_mmap);
00933     } 
00934     else {
00935         ckd_free(model->lm3g.bigrams);
00936         if (base->n > 2) {
00937             ckd_free(model->lm3g.trigrams);
00938             ckd_free(model->lm3g.tseg_base);
00939         }
00940     }
00941     if (base->n > 2) {
00942         ckd_free(model->lm3g.bo_wt2);
00943         ckd_free(model->lm3g.prob3);
00944     }
00945 
00946     lm3g_tginfo_free(base, &model->lm3g);
00947 }
00948 
/*
 * Function table for DMP models.
 *
 * The initializer is positional; the trailing comment on each entry
 * names the slot it is meant to fill.  Only free and apply_weights are
 * defined specifically for DMP models in this file; the remaining
 * entries are the lm3g_template_* functions (presumably provided by
 * the lm3g_templates.c include).
 */
static ngram_funcs_t ngram_model_dmp_funcs = {
    ngram_model_dmp_free,          /* free */
    ngram_model_dmp_apply_weights, /* apply_weights */
    lm3g_template_score,           /* score */
    lm3g_template_raw_score,       /* raw_score */
    lm3g_template_add_ug,          /* add_ug */
    lm3g_template_flush,           /* flush */
    lm3g_template_iter,             /* iter */
    lm3g_template_mgrams,          /* mgrams */
    lm3g_template_successors,      /* successors */
    lm3g_template_iter_get,        /* iter_get */
    lm3g_template_iter_next,       /* iter_next */
    lm3g_template_iter_free        /* iter_free */
};

Generated on Mon Aug 29 2011 for SphinxBase by  doxygen 1.7.1