00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
#include "config.h"
#include "ngram_model.h"
#include "ngram_model_internal.h"
#include "ckd_alloc.h"
#include "filename.h"
#include "pio.h"
#include "err.h"
#include "logmath.h"
#include "strfuncs.h"
#include "case.h"

#include <string.h>
#include <assert.h>
#include <errno.h> /* errno / E2BIG in ngram_model_recode() */
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
00059
00060 ngram_file_type_t
00061 ngram_file_name_to_type(const char *file_name)
00062 {
00063 const char *ext;
00064
00065 ext = strrchr(file_name, '.');
00066 if (ext == NULL) {
00067 return NGRAM_INVALID;
00068 }
00069 if (0 == strcmp_nocase(ext, ".gz")) {
00070 while (--ext >= file_name) {
00071 if (*ext == '.') break;
00072 }
00073 if (ext < file_name) {
00074 return NGRAM_INVALID;
00075 }
00076 }
00077 else if (0 == strcmp_nocase(ext, ".bz2")) {
00078 while (--ext >= file_name) {
00079 if (*ext == '.') break;
00080 }
00081 if (ext < file_name) {
00082 return NGRAM_INVALID;
00083 }
00084 }
00085
00086 if (0 == strncmp_nocase(ext, ".ARPA", 5))
00087 return NGRAM_ARPA;
00088 if (0 == strncmp_nocase(ext, ".DMP", 4))
00089 return NGRAM_DMP;
00090 return NGRAM_INVALID;
00091 }
00092
00093 ngram_file_type_t
00094 ngram_str_to_type(const char *str_name)
00095 {
00096 if (0 == strcmp_nocase(str_name, "arpa"))
00097 return NGRAM_ARPA;
00098 if (0 == strcmp_nocase(str_name, "dmp"))
00099 return NGRAM_DMP;
00100 return NGRAM_INVALID;
00101 }
00102
00103 char const *
00104 ngram_type_to_str(int type)
00105 {
00106 switch (type) {
00107 case NGRAM_ARPA:
00108 return "arpa";
00109 case NGRAM_DMP:
00110 return "dmp";
00111 default:
00112 return NULL;
00113 }
00114 }
00115
00116
00117 ngram_model_t *
00118 ngram_model_read(cmd_ln_t *config,
00119 const char *file_name,
00120 ngram_file_type_t file_type,
00121 logmath_t *lmath)
00122 {
00123 ngram_model_t *model = NULL;
00124
00125 switch (file_type) {
00126 case NGRAM_AUTO: {
00127 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
00128 break;
00129 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
00130 break;
00131 return NULL;
00132 }
00133 case NGRAM_ARPA:
00134 model = ngram_model_arpa_read(config, file_name, lmath);
00135 break;
00136 case NGRAM_DMP:
00137 model = ngram_model_dmp_read(config, file_name, lmath);
00138 break;
00139 default:
00140 E_ERROR("language model file type not supported\n");
00141 return NULL;
00142 }
00143
00144
00145 if (config) {
00146 float32 lw = 1.0;
00147 float32 wip = 1.0;
00148 float32 uw = 1.0;
00149
00150 if (cmd_ln_exists_r(config, "-lw"))
00151 lw = cmd_ln_float32_r(config, "-lw");
00152 if (cmd_ln_exists_r(config, "-wip"))
00153 wip = cmd_ln_float32_r(config, "-wip");
00154 if (cmd_ln_exists_r(config, "-uw"))
00155 uw = cmd_ln_float32_r(config, "-uw");
00156
00157 ngram_model_apply_weights(model, lw, wip, uw);
00158 }
00159
00160 return model;
00161 }
00162
00163 int
00164 ngram_model_write(ngram_model_t *model, const char *file_name,
00165 ngram_file_type_t file_type)
00166 {
00167 switch (file_type) {
00168 case NGRAM_AUTO: {
00169 file_type = ngram_file_name_to_type(file_name);
00170
00171 if (file_type == NGRAM_INVALID)
00172 file_type = NGRAM_ARPA;
00173 return ngram_model_write(model, file_name, file_type);
00174 }
00175 case NGRAM_ARPA:
00176 return ngram_model_arpa_write(model, file_name);
00177 case NGRAM_DMP:
00178 return ngram_model_dmp_write(model, file_name);
00179 default:
00180 E_ERROR("language model file type not supported\n");
00181 return -1;
00182 }
00183 E_ERROR("language model file type not supported\n");
00184 return -1;
00185 }
00186
00187 int32
00188 ngram_model_init(ngram_model_t *base,
00189 ngram_funcs_t *funcs,
00190 logmath_t *lmath,
00191 int32 n, int32 n_unigram)
00192 {
00193 base->refcount = 1;
00194 base->funcs = funcs;
00195 base->n = n;
00196
00197 if (base->n_counts == NULL)
00198 base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
00199
00200 if (base->lmath != lmath) {
00201
00202 base->lw = 1.0;
00203 base->log_wip = 0;
00204 base->log_uw = 0;
00205 base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
00206 base->log_uniform_weight = logmath_get_zero(lmath);
00207 base->log_zero = logmath_get_zero(lmath);
00208 base->lmath = lmath;
00209 }
00210
00211 if (base->word_str) {
00212
00213 if (base->writable) {
00214 int32 i;
00215 for (i = 0; i < base->n_words; ++i) {
00216 ckd_free(base->word_str[i]);
00217 base->word_str[i] = NULL;
00218 }
00219 }
00220 base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
00221 }
00222 else
00223 base->word_str = ckd_calloc(n_unigram, sizeof(char *));
00224
00225
00226 if (base->wid)
00227 hash_table_empty(base->wid);
00228 else
00229 base->wid = hash_table_new(n_unigram, FALSE);
00230 base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
00231
00232 return 0;
00233 }
00234
/*
 * Take an extra reference to a model and return it.  Paired with
 * ngram_model_free(), which drops one reference.
 */
ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}
00241
00242
/*
 * Ask the backend to flush any cached n-gram state; a no-op when the
 * backend provides no flush hook.
 */
void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}
00249
/*
 * Drop one reference to the model; free all storage when the count
 * reaches zero.  Returns the remaining reference count (0 once freed).
 */
int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    /* Let the concrete implementation release its own storage first. */
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Writable model: every word string is owned by us. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Non-writable model: only class-member strings were allocated
         * here; the rest point into storage owned elsewhere. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            /* NOTE(review): nword_hash[].wid is stored with class tag
             * bits by ngram_class_add_word()'s callers; indexing
             * word_str[] with it without NGRAM_BASEWID looks suspicious
             * -- verify against the NGRAM_CLASSWID macro. */
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
00294
00295 int
00296 ngram_model_casefold(ngram_model_t *model, int kase)
00297 {
00298 int writable, i;
00299 hash_table_t *new_wid;
00300
00301
00302 writable = model->writable;
00303
00304 model->writable = TRUE;
00305
00306
00307
00308 new_wid = hash_table_new(model->n_words, FALSE);
00309 for (i = 0; i < model->n_words; ++i) {
00310 char *outstr;
00311 if (writable) {
00312 outstr = model->word_str[i];
00313 }
00314 else {
00315 outstr = ckd_salloc(model->word_str[i]);
00316 }
00317
00318 if (outstr[0] == '<' || outstr[0] == '[') {
00319 }
00320 else {
00321 switch (kase) {
00322 case NGRAM_UPPER:
00323 ucase(outstr);
00324 break;
00325 case NGRAM_LOWER:
00326 lcase(outstr);
00327 break;
00328 default:
00329 ;
00330 }
00331 }
00332 model->word_str[i] = outstr;
00333
00334
00335
00336 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00337 E_WARN("Duplicate word in dictionary after conversion: %s\n",
00338 model->word_str[i]);
00339 }
00340 }
00341
00342 hash_table_free(model->wid);
00343 model->wid = new_wid;
00344 return 0;
00345 }
00346
00347 #ifdef HAVE_ICONV
00348 int
00349 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00350 {
00351 iconv_t ic;
00352 char *outbuf;
00353 size_t maxlen;
00354 int i, writable;
00355 hash_table_t *new_wid;
00356
00357
00358
00359 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
00360 E_ERROR_SYSTEM("iconv_open() failed");
00361 return -1;
00362 }
00363
00364
00365
00366
00367
00368
00369
00370 maxlen = 0;
00371 for (i = 0; i < model->n_words; ++i) {
00372 if (strlen(model->word_str[i]) > maxlen)
00373 maxlen = strlen(model->word_str[i]);
00374 }
00375
00376 writable = model->writable;
00377
00378 model->writable = TRUE;
00379
00380 maxlen = maxlen * sizeof(int) + 15;
00381 outbuf = ckd_calloc(maxlen, 1);
00382
00383
00384 new_wid = hash_table_new(model->n_words, FALSE);
00385 for (i = 0; i < model->n_words; ++i) {
00386 ICONV_CONST char *in;
00387 char *out;
00388 size_t inleft, outleft, result;
00389
00390 start_conversion:
00391 in = (ICONV_CONST char *)model->word_str[i];
00392
00393 inleft = strlen(in);
00394 out = outbuf;
00395 outleft = maxlen;
00396
00397 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
00398 if (errno != E2BIG) {
00399
00400
00401 E_ERROR_SYSTEM("iconv() failed");
00402 ckd_free(outbuf);
00403 hash_table_free(new_wid);
00404 return -1;
00405 }
00406
00407 iconv(ic, NULL, NULL, NULL, NULL);
00408
00409 maxlen *= 2;
00410 out = outbuf = ckd_realloc(outbuf, maxlen);
00411
00412 in = (ICONV_CONST char *)model->word_str[i];
00413 inleft = strlen(in);
00414 }
00415
00416
00417 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
00418 if (errno != E2BIG) {
00419
00420
00421 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
00422 ckd_free(outbuf);
00423 hash_table_free(new_wid);
00424 return -1;
00425 }
00426
00427 iconv(ic, NULL, NULL, NULL, NULL);
00428
00429 maxlen *= 2;
00430 outbuf = ckd_realloc(outbuf, maxlen);
00431
00432 goto start_conversion;
00433 }
00434
00435 result = maxlen - outleft;
00436
00437 if (writable) {
00438
00439 model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
00440 model->word_str[i][result] = '\0';
00441 }
00442 else {
00443
00444 model->word_str[i] = ckd_calloc(result + 1, 1);
00445 }
00446
00447 memcpy(model->word_str[i], outbuf, result);
00448
00449
00450
00451
00452 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00453 E_WARN("Duplicate word in dictionary after conversion: %s\n",
00454 model->word_str[i]);
00455 }
00456 }
00457 ckd_free(outbuf);
00458 iconv_close(ic);
00459
00460 hash_table_free(model->wid);
00461 model->wid = new_wid;
00462
00463 return 0;
00464 }
00465 #else
/*
 * Stub used when iconv support is not compiled in (HAVE_ICONV unset):
 * character set recoding is unavailable, so always fail with -1.
 */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    return -1;
}
00471 #endif
00472
/*
 * Apply language weight, word insertion penalty and unigram weight by
 * delegating to the backend.  NOTE(review): model->funcs is
 * dereferenced unconditionally -- callers must pass a valid model.
 */
int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}
00479
/*
 * Report the model's current weights: returns the language weight and,
 * through the optional out parameters, the log word insertion penalty
 * and log unigram weight.
 */
float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}
00488
00489
/*
 * Weighted score for wid given n_hist history words (most of the work
 * is delegated to the backend's score function).
 *
 * NOTE: class word ids in history[] are rewritten in place to their
 * class tag ids, so the caller's array is modified.
 */
int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* No valid word id: impossible event. */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* Class word: score its class tag and add the word's within-class
     * log probability on top. */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* ngram_class_prob()'s "not found" sentinel */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    /* Replace class words in the history with their class tags. */
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    return score + class_weight;
}
00519
00520 int32
00521 ngram_score(ngram_model_t *model, const char *word, ...)
00522 {
00523 va_list history;
00524 const char *hword;
00525 int32 *histid;
00526 int32 n_hist;
00527 int32 n_used;
00528 int32 prob;
00529
00530 va_start(history, word);
00531 n_hist = 0;
00532 while ((hword = va_arg(history, const char *)) != NULL)
00533 ++n_hist;
00534 va_end(history);
00535
00536 histid = ckd_calloc(n_hist, sizeof(*histid));
00537 va_start(history, word);
00538 n_hist = 0;
00539 while ((hword = va_arg(history, const char *)) != NULL) {
00540 histid[n_hist] = ngram_wid(model, hword);
00541 ++n_hist;
00542 }
00543 va_end(history);
00544
00545 prob = ngram_ng_score(model, ngram_wid(model, word),
00546 histid, n_hist, &n_used);
00547 ckd_free(histid);
00548 return prob;
00549 }
00550
00551 int32
00552 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
00553 {
00554 int32 hist[2];
00555 hist[0] = w2;
00556 hist[1] = w1;
00557 return ngram_ng_score(model, w3, hist, 2, n_used);
00558 }
00559
/*
 * Bigram convenience wrapper: score w2 given the single history word w1.
 */
int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}
00565
00566 int32
00567 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
00568 int32 n_hist, int32 *n_used)
00569 {
00570 int32 prob, class_weight = 0;
00571 int i;
00572
00573
00574 if (wid == NGRAM_INVALID_WID)
00575 return model->log_zero;
00576
00577
00578 if (NGRAM_IS_CLASSWID(wid)) {
00579 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00580
00581 class_weight = ngram_class_prob(lmclass, wid);
00582 if (class_weight == 1)
00583 return class_weight;
00584 wid = lmclass->tag_wid;
00585 }
00586 for (i = 0; i < n_hist; ++i) {
00587 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00588 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00589 }
00590 prob = (*model->funcs->raw_score)(model, wid, history,
00591 n_hist, n_used);
00592
00593 return prob + class_weight;
00594 }
00595
00596 int32
00597 ngram_prob(ngram_model_t *model, const char *word, ...)
00598 {
00599 va_list history;
00600 const char *hword;
00601 int32 *histid;
00602 int32 n_hist;
00603 int32 n_used;
00604 int32 prob;
00605
00606 va_start(history, word);
00607 n_hist = 0;
00608 while ((hword = va_arg(history, const char *)) != NULL)
00609 ++n_hist;
00610 va_end(history);
00611
00612 histid = ckd_calloc(n_hist, sizeof(*histid));
00613 va_start(history, word);
00614 n_hist = 0;
00615 while ((hword = va_arg(history, const char *)) != NULL) {
00616 histid[n_hist] = ngram_wid(model, hword);
00617 ++n_hist;
00618 }
00619 va_end(history);
00620
00621 prob = ngram_ng_prob(model, ngram_wid(model, word),
00622 histid, n_hist, &n_used);
00623 ckd_free(histid);
00624 return prob;
00625 }
00626
00627 int32
00628 ngram_score_to_prob(ngram_model_t *base, int32 score)
00629 {
00630 int32 prob;
00631
00632
00633 prob = score - base->log_wip;
00634
00635 prob = (int32)(prob / base->lw);
00636
00637 return prob;
00638 }
00639
00640 int32
00641 ngram_unknown_wid(ngram_model_t *model)
00642 {
00643 int32 val;
00644
00645
00646
00647 if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
00648 return NGRAM_INVALID_WID;
00649 else
00650 return val;
00651 }
00652
/*
 * The log value this model uses to represent zero probability
 * ("impossible") events.
 */
int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}
00658
00659 int32
00660 ngram_model_get_size(ngram_model_t *model)
00661 {
00662 if (model != NULL)
00663 return model->n;
00664 return 0;
00665 }
00666
00667 int32 const *
00668 ngram_model_get_counts(ngram_model_t *model)
00669 {
00670 if (model != NULL)
00671 return model->n_counts;
00672 return NULL;
00673 }
00674
/*
 * Shared initialization for iterator implementations: record the model,
 * the iterator's order (m) and the successor flag, and allocate a
 * word-id scratch array with one slot per model order.
 */
void
ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
                int m, int successor)
{
    itor->model = model;
    itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
    itor->m = m;
    itor->successor = successor;
}
00684
00685 ngram_iter_t *
00686 ngram_model_mgrams(ngram_model_t *model, int m)
00687 {
00688 ngram_iter_t *itor;
00689
00690 if (m >= model->n)
00691 return NULL;
00692 if (model->funcs->mgrams == NULL)
00693 return NULL;
00694 itor = (*model->funcs->mgrams)(model, m);
00695 return itor;
00696 }
00697
00698 ngram_iter_t *
00699 ngram_iter(ngram_model_t *model, const char *word, ...)
00700 {
00701 va_list history;
00702 const char *hword;
00703 int32 *histid;
00704 int32 n_hist;
00705 ngram_iter_t *itor;
00706
00707 va_start(history, word);
00708 n_hist = 0;
00709 while ((hword = va_arg(history, const char *)) != NULL)
00710 ++n_hist;
00711 va_end(history);
00712
00713 histid = ckd_calloc(n_hist, sizeof(*histid));
00714 va_start(history, word);
00715 n_hist = 0;
00716 while ((hword = va_arg(history, const char *)) != NULL) {
00717 histid[n_hist] = ngram_wid(model, hword);
00718 ++n_hist;
00719 }
00720 va_end(history);
00721
00722 itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
00723 ckd_free(histid);
00724 return itor;
00725 }
00726
00727 ngram_iter_t *
00728 ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
00729 {
00730 if (n_hist >= model->n)
00731 return NULL;
00732 if (model->funcs->iter == NULL)
00733 return NULL;
00734 return (*model->funcs->iter)(model, wid, history, n_hist);
00735 }
00736
/*
 * Iterator over the (m+1)-gram successors of the current iterator, or
 * NULL when the iterator is already at the model's highest order.
 */
ngram_iter_t *
ngram_iter_successors(ngram_iter_t *itor)
{
    /* Nothing beyond the highest order. */
    if (itor->m == itor->model->n - 1)
        return NULL;
    return (*itor->model->funcs->successors)(itor);
}
00745
/*
 * Fetch the current iterator entry: returns the word-id tuple and fills
 * in the score and backoff weight through the out parameters
 * (delegated to the backend).
 */
int32 const *
ngram_iter_get(ngram_iter_t *itor,
               int32 *out_score,
               int32 *out_bowt)
{
    return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
}
00753
/*
 * Advance the iterator to the next entry (delegated to the backend).
 */
ngram_iter_t *
ngram_iter_next(ngram_iter_t *itor)
{
    return (*itor->model->funcs->iter_next)(itor);
}
00759
/*
 * Release an iterator: free the shared word-id scratch array, then let
 * the backend free its own iterator storage.
 */
void
ngram_iter_free(ngram_iter_t *itor)
{
    ckd_free(itor->wids);
    (*itor->model->funcs->iter_free)(itor);
}
00766
00767 int32
00768 ngram_wid(ngram_model_t *model, const char *word)
00769 {
00770 int32 val;
00771
00772 if (hash_table_lookup_int32(model->wid, word, &val) == -1)
00773 return ngram_unknown_wid(model);
00774 else
00775 return val;
00776 }
00777
/*
 * Map a word id back to its string, or NULL when out of range.  Class
 * tag bits are stripped first, so class word ids resolve as well.
 */
const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Strip any class tag bits to recover the base unigram id. */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}
00787
/*
 * Append a word to the unigram string table and the wid hash.  When
 * classid >= 0 the returned id carries class tag bits (NGRAM_CLASSWID).
 * The model keeps its own copy of the string; the hash key aliases that
 * copy.  Returns the new wid, or NGRAM_INVALID_WID for a duplicate.
 */
int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reject duplicates outright. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }

    /* Grow the string table in UG_ALLOC_STEP increments. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }

    model->word_str[model->n_words] = ckd_salloc(word);

    /* NOTE(review): the (void *)(long) cast truncates on LLP64 targets;
     * harmless here since it is only used in an error message. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }

    ++model->n_words;
    return wid;
}
00827
/*
 * Add a word with the given (linear) probability weight to the model.
 * Returns the new word id, NGRAM_INVALID_WID for a duplicate, or -1
 * when the backend refuses the unigram.
 */
int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Let the backend insert the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        /* NOTE(review): this leaves word_str[wid] dangling, n_words
         * incremented, and the wid hash still referencing the freed
         * string -- verify whether a full rollback is needed here. */
        if (model->writable)
            ckd_free(model->word_str[wid]);
        return -1;
    }
    return wid;
}
00848
/*
 * Build a word class from a glist of float32 in-class weights.
 * start_wid is the first base wid of the class's contiguous word range.
 * Weights are renormalized if they do not sum to roughly one, then
 * converted to log probabilities in prob1[].
 */
ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    /* The hash for non-contiguous members is created lazily on first add. */
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    /* Allow ~10% slack before renormalizing, to tolerate rounding. */
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}
00881
/*
 * Add a word to a class's chained hash of non-contiguous members.
 * Free slots have wid == -1 (the table is memset to 0xff); collisions
 * are chained through the .next index.  Returns the slot index used.
 */
int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Lazily create the table; 0xff fill makes wid/next == -1. */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }

    /* Home slot: n_hash is a power of two, so the mask is a modulo. */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Home slot free: take it. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;

        /* Walk to the end of the collision chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);

        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* Table full: double it; existing entries keep their slots.
             * NOTE(review): entries are not rehashed, yet lookups mask
             * with the new (larger) n_hash - 1; verify that lookups of
             * pre-resize entries still reach their home slot. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* The first slot of the new half is guaranteed free. */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Linear scan for any free slot. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}
00937
00938 void
00939 ngram_class_free(ngram_class_t *lmclass)
00940 {
00941 ckd_free(lmclass->nword_hash);
00942 ckd_free(lmclass->prob1);
00943 ckd_free(lmclass);
00944 }
00945
/*
 * Add a word to an existing class.  The new word receives
 * weight/(current class size + 1) of the class's probability mass, and
 * all existing members are scaled by log(1 - fprob) as an approximate
 * renormalization.  Returns the slot from ngram_class_add_word(), or
 * NGRAM_INVALID_WID on error.
 */
int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* The class must already exist as a tag word... */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    /* ...and have a matching ngram_class_t registered. */
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add the word to the global vocabulary with class tag bits. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* The new word's share of the class's probability mass. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);

    /* Scale all existing in-class probabilities down by (1 - fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}
00995
/*
 * Create a new word class with the given tag name, class weight, and
 * member words/weights.  The tag is added as a regular unigram if not
 * already present.  Returns the new class id, or -1 on failure.
 */
int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* A tag that maps to the unknown wid is not yet in the LM. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    /* NOTE(review): 128 presumably reflects the width of the class-id
     * bit-field in NGRAM_CLASSWID -- confirm against the macro. */
    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        /* Members get consecutive base wids; remember the first one. */
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    /* glist_add_* prepends, so reverse to restore the input order. */
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
01046
01047 int32
01048 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
01049 {
01050 int32 base_wid = NGRAM_BASEWID(wid);
01051
01052 if (base_wid < lmclass->start_wid
01053 || base_wid > lmclass->start_wid + lmclass->n_words) {
01054 int32 hash;
01055
01056
01057 hash = wid & (lmclass->n_hash - 1);
01058 while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
01059 hash = lmclass->nword_hash[hash].next;
01060 if (hash == -1)
01061 return 1;
01062 return lmclass->nword_hash[hash].prob1;
01063 }
01064 else {
01065 return lmclass->prob1[base_wid - lmclass->start_wid];
01066 }
01067 }
01068
/*
 * Parse a Sphinx class-definition file into the given hash table.
 * Format: "LMCLASS <name>" opens a class, each following line is
 * "<word> [weight]", and "END <name>" closes it.  On success each class
 * name maps to a freshly allocated classdef_t; the hash table takes
 * ownership of both the name string and the classdef.
 * Returns 0 on success, -1 on error.
 */
int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* "END <name>" terminates the current class. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                /* The END tag must match the opening LMCLASS tag. */
                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Flatten the accumulated lists into a classdef. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords); /* restore file order */
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* The hash takes ownership of classname and classdef;
                 * a duplicate class name is an error. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Word strings now belong to classdef; free nodes only. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f; /* weight defaults to 1.0 */

                /* Prepend; order is restored by glist_reverse at END. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Outside a class, only "LMCLASS <name>" is meaningful. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                /* NOTE(review): this check is unreachable -- inclass is
                 * always FALSE in this branch. */
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }

        }
    }
    rv = 0;

error_out:
    /* Shared cleanup for both the success and failure paths. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
01177
01178 void
01179 classdef_free(classdef_t *classdef)
01180 {
01181 int32 i;
01182 for (i = 0; i < classdef->n_words; ++i)
01183 ckd_free(classdef->words[i]);
01184 ckd_free(classdef->words);
01185 ckd_free(classdef->weights);
01186 ckd_free(classdef);
01187 }
01188
01189
01190 int32
01191 ngram_model_read_classdef(ngram_model_t *model,
01192 const char *file_name)
01193 {
01194 hash_table_t *classes;
01195 glist_t hl = NULL;
01196 gnode_t *gn;
01197 int32 rv = -1;
01198
01199 classes = hash_table_new(0, FALSE);
01200 if (read_classdef_file(classes, file_name) < 0) {
01201 hash_table_free(classes);
01202 return -1;
01203 }
01204
01205
01206 hl = hash_table_tolist(classes, NULL);
01207 for (gn = hl; gn; gn = gnode_next(gn)) {
01208 hash_entry_t *he = gnode_ptr(gn);
01209 classdef_t *classdef = he->val;
01210
01211 if (ngram_model_add_class(model, he->key, 1.0,
01212 classdef->words,
01213 classdef->weights,
01214 classdef->n_words) < 0)
01215 goto error_out;
01216 }
01217 rv = 0;
01218
01219 error_out:
01220 for (gn = hl; gn; gn = gnode_next(gn)) {
01221 hash_entry_t *he = gnode_ptr(gn);
01222 ckd_free((char *)he->key);
01223 classdef_free(he->val);
01224 }
01225 glist_free(hl);
01226 hash_table_free(classes);
01227 return rv;
01228 }