SphinxBase  0.6
lm3g_model.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * \file lm3g_model.c Core Sphinx 3-gram code used in
39  * DMP/DMP32/ARPA (for now) model code.
40  *
41  * Author: A cast of thousands, probably.
42  */
43 #include <string.h>
44 #include <assert.h>
45 #include <limits.h>
46 
48 #include "sphinxbase/ckd_alloc.h"
49 #include "sphinxbase/err.h"
50 
51 #include "lm3g_model.h"
52 
53 void
54 lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g)
55 {
56  if (lm3g->tginfo == NULL)
57  return;
58  listelem_alloc_free(lm3g->le);
59  ckd_free(lm3g->tginfo);
60 }
61 
62 void
63 lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g)
64 {
65  if (lm3g->tginfo == NULL)
66  return;
67  listelem_alloc_free(lm3g->le);
68  memset(lm3g->tginfo, 0, base->n_counts[0] * sizeof(tginfo_t *));
69  lm3g->le = listelem_alloc_init(sizeof(tginfo_t));
70 }
71 
72 void
73 lm3g_apply_weights(ngram_model_t *base,
74  lm3g_model_t *lm3g,
75  float32 lw, float32 wip, float32 uw)
76 {
77  int32 log_wip, log_uw, log_uniform_weight;
78  int i;
79 
80  /* Precalculate some log values we will like. */
81  log_wip = logmath_log(base->lmath, wip);
82  log_uw = logmath_log(base->lmath, uw);
83  log_uniform_weight = logmath_log(base->lmath, 1.0 - uw);
84 
85  for (i = 0; i < base->n_counts[0]; ++i) {
86  int32 prob1, bo_wt, n_used;
87 
88  /* Backoff weights just get scaled by the lw. */
89  bo_wt = (int32)(lm3g->unigrams[i].bo_wt1.l / base->lw);
90  /* Unscaling unigram probs is a bit more complicated, so punt
91  * it back to the general code. */
92  prob1 = ngram_ng_prob(base, i, NULL, 0, &n_used);
93  /* Now compute the new scaled probabilities. */
94  lm3g->unigrams[i].bo_wt1.l = (int32)(bo_wt * lw);
95  if (strcmp(base->word_str[i], "<s>") == 0) { /* FIXME: configurable start_sym */
96  /* Apply language weight and WIP */
97  lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
98  }
99  else {
100  /* Interpolate unigram probability with uniform. */
101  prob1 += log_uw;
102  prob1 = logmath_add(base->lmath, prob1, base->log_uniform + log_uniform_weight);
103  /* Apply language weight and WIP */
104  lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
105  }
106  }
107 
108  for (i = 0; i < lm3g->n_prob2; ++i) {
109  int32 prob2;
110  /* Can't just punt this back to general code since it is quantized. */
111  prob2 = (int32)((lm3g->prob2[i].l - base->log_wip) / base->lw);
112  lm3g->prob2[i].l = (int32)(prob2 * lw) + log_wip;
113  }
114 
115  if (base->n > 2) {
116  for (i = 0; i < lm3g->n_bo_wt2; ++i) {
117  lm3g->bo_wt2[i].l = (int32)(lm3g->bo_wt2[i].l / base->lw * lw);
118  }
119  for (i = 0; i < lm3g->n_prob3; i++) {
120  int32 prob3;
121  /* Can't just punt this back to general code since it is quantized. */
122  prob3 = (int32)((lm3g->prob3[i].l - base->log_wip) / base->lw);
123  lm3g->prob3[i].l = (int32)(prob3 * lw) + log_wip;
124  }
125  }
126 
127  /* Store updated values in the model. */
128  base->log_wip = log_wip;
129  base->log_uw = log_uw;
130  base->log_uniform_weight = log_uniform_weight;
131  base->lw = lw;
132 }
133 
134 int32
135 lm3g_add_ug(ngram_model_t *base,
136  lm3g_model_t *lm3g, int32 wid, int32 lweight)
137 {
138  int32 score;
139 
140  /* This would be very bad if this happened! */
141  assert(!NGRAM_IS_CLASSWID(wid));
142 
143  /* Reallocate unigram array. */
144  lm3g->unigrams = ckd_realloc(lm3g->unigrams,
145  sizeof(*lm3g->unigrams) * base->n_1g_alloc);
146  memset(lm3g->unigrams + base->n_counts[0], 0,
147  (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->unigrams));
148  /* Reallocate tginfo array. */
149  lm3g->tginfo = ckd_realloc(lm3g->tginfo,
150  sizeof(*lm3g->tginfo) * base->n_1g_alloc);
151  memset(lm3g->tginfo + base->n_counts[0], 0,
152  (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->tginfo));
153  /* FIXME: we really ought to update base->log_uniform *and*
154  * renormalize all the other unigrams. This is really slow, so I
155  * will probably just provide a function to renormalize after
156  * adding unigrams, for anyone who really cares. */
157  /* This could be simplified but then we couldn't do it in logmath */
158  score = lweight + base->log_uniform + base->log_uw;
159  score = logmath_add(base->lmath, score,
160  base->log_uniform + base->log_uniform_weight);
161  lm3g->unigrams[wid].prob1.l = score;
162  /* This unigram by definition doesn't participate in any bigrams,
163  * so its backoff weight and bigram pointer are both undefined. */
164  lm3g->unigrams[wid].bo_wt1.l = 0;
165  lm3g->unigrams[wid].bigrams = 0;
166  /* Finally, increase the unigram count */
167  ++base->n_counts[0];
168  /* FIXME: Note that this can actually be quite bogus due to the
169  * presence of class words. If wid falls outside the unigram
170  * count, increase it to compensate, at the cost of no longer
171  * really knowing how many unigrams we have :( */
172  if (wid >= base->n_counts[0])
173  base->n_counts[0] = wid + 1;
174 
175  return score;
176 }
177 
178 void
179 init_sorted_list(sorted_list_t * l)
180 {
181  /* FIXME FIXME FIXME: Fixed size array!??! */
182  l->list = ckd_calloc(MAX_SORTED_ENTRIES,
183  sizeof(sorted_entry_t));
184  l->list[0].val.l = INT_MIN;
185  l->list[0].lower = 0;
186  l->list[0].higher = 0;
187  l->free = 1;
188 }
189 
190 void
191 free_sorted_list(sorted_list_t * l)
192 {
193  free(l->list);
194 }
195 
196 lmprob_t *
197 vals_in_sorted_list(sorted_list_t * l)
198 {
199  lmprob_t *vals;
200  int32 i;
201 
202  vals = ckd_calloc(l->free, sizeof(lmprob_t));
203  for (i = 0; i < l->free; i++)
204  vals[i] = l->list[i].val;
205  return (vals);
206 }
207 
208 int32
209 sorted_id(sorted_list_t * l, int32 *val)
210 {
211  int32 i = 0;
212 
213  for (;;) {
214  if (*val == l->list[i].val.l)
215  return (i);
216  if (*val < l->list[i].val.l) {
217  if (l->list[i].lower == 0) {
218  if (l->free >= MAX_SORTED_ENTRIES) {
219  /* Make the best of a bad situation. */
220  E_WARN("sorted list overflow (%d => %d)\n",
221  *val, l->list[i].val.l);
222  return i;
223  }
224 
225  l->list[i].lower = l->free;
226  (l->free)++;
227  i = l->list[i].lower;
228  l->list[i].val.l = *val;
229  return (i);
230  }
231  else
232  i = l->list[i].lower;
233  }
234  else {
235  if (l->list[i].higher == 0) {
236  if (l->free >= MAX_SORTED_ENTRIES) {
237  /* Make the best of a bad situation. */
238  E_WARN("sorted list overflow (%d => %d)\n",
239  *val, l->list[i].val);
240  return i;
241  }
242 
243  l->list[i].higher = l->free;
244  (l->free)++;
245  i = l->list[i].higher;
246  l->list[i].val.l = *val;
247  return (i);
248  }
249  else
250  i = l->list[i].higher;
251  }
252  }
253 }
254