From 012af21e6305b5a0c566b67d5584fa0edb939b14 Mon Sep 17 00:00:00 2001
From: Andre Natal
Date: Wed, 5 Aug 2015 00:33:00 +0200
Subject: [PATCH] Bug 1180113 - Introducing g2p algorithm inside pocketsphinx to allow out of dictionary words to be added to grammars. r=smaug

Signed-off-by: Andre Natal
---
 media/pocketsphinx/src/dict.c                | 250 ++++++++++++++++++-
 media/pocketsphinx/src/dict.h                |  20 +-
 media/pocketsphinx/src/fsg_search.c          |  19 +-
 media/pocketsphinx/src/fsg_search_internal.h |   3 +-
 media/pocketsphinx/src/pocketsphinx.c        |   4 +-
 media/pocketsphinx/src/ps_lattice.c          |   3 +-
 6 files changed, 289 insertions(+), 10 deletions(-)

diff --git a/media/pocketsphinx/src/dict.c b/media/pocketsphinx/src/dict.c
index a74249322084..edbe14b5c751 100644
--- a/media/pocketsphinx/src/dict.c
+++ b/media/pocketsphinx/src/dict.c
@@ -37,6 +37,7 @@
 
 /* System headers. */
 #include
+#include <limits.h> // We need this for LONG_MIN
 
 /* SphinxBase headers. */
 #include
@@ -249,14 +250,14 @@ dict_write(dict_t *dict, char const *filename, char const *format)
 
 
 dict_t *
-dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
+dict_init(cmd_ln_t *config, bin_mdef_t * mdef, logmath_t *logmath)
 {
     FILE *fp, *fp2;
     int32 n;
     lineiter_t *li;
     dict_t *d;
     s3cipid_t sil;
-    char const *dictfile = NULL, *fillerfile = NULL;
+    char const *dictfile = NULL, *fillerfile = NULL, *arpafile = NULL;
 
     if (config) {
         dictfile = cmd_ln_str_r(config, "-dict");
@@ -303,6 +304,19 @@ dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
      * Also check for type size restrictions.
      */
     d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
+    if (dictfile)       /* -dict may be unset even when a config is given */
+        arpafile = string_join(dictfile, ".dmp", NULL); /* g2p model lives next to the dictionary */
+    if (arpafile) {
+        ngram_model_t *ngram_g2p_model = ngram_model_read(NULL, arpafile, NGRAM_AUTO, logmath);
+        ckd_free((void *) arpafile);    /* cast: arpafile is declared char const * */
+        if (!ngram_g2p_model) {
+            E_ERROR("No arpa model found \n");
+            ckd_free((void *) d);       /* do not leak the dict allocated just above */
+            return NULL;
+        }
+        d->ngram_g2p_model = ngram_g2p_model;   /* released in dict_free() */
+    }
+
     d->refcnt = 1;
     d->max_words =
         (n + S3DICT_INC_SZ < MAX_S3WID) ?
n + S3DICT_INC_SZ : MAX_S3WID;
@@ -474,6 +488,8 @@ dict_free(dict_t * d)
         hash_table_free(d->ht);
     if (d->mdef)
         bin_mdef_free(d->mdef);
+    if (d->ngram_g2p_model)
+        ngram_model_free(d->ngram_g2p_model);
     ckd_free((void *) d);
 
     return 0;
@@ -487,3 +503,233 @@ dict_report(dict_t * d)
     E_INFO_NOFN("No of word: %d\n", d->n_word);
     E_INFO_NOFN("\n");
 }
+
+// Returns non-zero if the string (str) starts with the given prefix (pre), 0 otherwise
+int
+dict_starts_with(const char *pre, const char *str)
+{
+    size_t lenpre = strlen(pre), lenstr = strlen(str);
+    return lenstr < lenpre ? 0 : strncmp(pre, str, lenpre) == 0;
+}
+
+// Helper function to release the two strings owned by a unigram (not the struct itself)
+void
+free_unigram_t(unigram_t *unigram)
+{
+    ckd_free(unigram->word);
+    ckd_free(unigram->phone);
+}
+
+// This function splits a unigram (in the format e|w}UW) and returns a structure containing two fields:
+// the grapheme (before }) in unigram.word and the phoneme (after }) in unigram.phone. Free it with free_unigram_t().
+unigram_t
+dict_split_unigram(const char * word)
+{
+    size_t total_graphemes = 0;
+    size_t total_phone = 0;
+    int token_pos = -1; // index of '}'; -1 until it is seen, because 0 is a valid index
+    int w ;
+    char *phone;
+    char *letter;
+    size_t lenword = 0;
+    char unigram_letter;
+    int add;
+
+    lenword = strlen(word);
+    for (w = 0; w < lenword; w++) {
+        unigram_letter = word[w];
+        if (unigram_letter == '}') {
+            token_pos = w;
+            continue;
+        }
+        if (token_pos < 0)
+            total_graphemes++;
+        else
+            total_phone++;
+    }
+
+    letter = ckd_calloc(1, total_graphemes+1);
+    add = 0;    // number of '|' separators dropped from the grapheme so far
+    for (w = 0; w < total_graphemes; w++) {
+        if (word[w] == '|')
+        {
+            add++;
+            continue;
+        }
+        letter[w - add] = word[w];
+    }
+
+    phone = ckd_calloc(1, total_phone+1);
+    for (w = 0; w < total_phone; w++) {
+        if (word[w + 1 + total_graphemes] == '|') {
+            phone[w] = ' ';     // '|' separates phones; emit them space-separated
+        } else {
+            phone[w] = word[w + 1 + total_graphemes];
+        }
+    }
+
+    unigram_t unigram = { letter , phone};
+
+    return unigram;
+}
+
+// This function calculates the most likely unigram to appear in the current position at the word
+// based on the three latest chosen/winners
unigrams (history) and return a structure containing
+// the word id (wid), and lengths of the phoneme and the word. length_match is 0 when no unigram matches.
+struct winner_t
+dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, int word_offset)
+{
+    long current_prob = LONG_MIN;
+    struct winner_t winner = { 0, 0, 0 };   // callers test length_match == 0 to detect "no match"
+    int32 i = 0, j = 0;
+    int nused;
+    int32 ngram_order = ngram_model_get_size(model);
+    int32 *history = ckd_calloc((size_t)ngram_order, sizeof(int32));
+    gnode_t *gn;
+    const char *vocab;
+    const char *sub;
+    int32 prob;
+    unigram_t unigram;
+    const int32 *total_unigrams = ngram_model_get_counts(model);
+
+    for (gn = history_list; gn; gn = gnode_next(gn)) {
+        // glist returns items from last to first, so fill from the end. NOTE(review): only the first j slots are passed to ngram_ng_prob() below - confirm the order and offset it expects
+        history[ngram_order - j - 1] = gnode_int32(gn);
+        j++;
+        if (j >= ngram_order)
+            break;
+    }
+
+    for (i = 0; i < *total_unigrams; i++) {
+        vocab = ngram_word(model, i);
+        unigram = dict_split_unigram(vocab);
+        sub = word_grapheme + word_offset;
+        if (dict_starts_with(unigram.word, sub)) {
+            prob = ngram_ng_prob(model, i, history, j, &nused);
+            if (current_prob < prob) {
+                current_prob = prob;
+                winner.winner_wid = i;
+                winner.length_match = strlen(unigram.word);
+                winner.len_phoneme = strlen(unigram.phone);
+            }
+        }
+
+        free_unigram_t(&unigram);
+    }
+
+    if (history)
+        ckd_free(history);
+
+    return winner;
+}
+
+// This function manages the winner unigrams and builds the history of winners to properly generate the final phoneme. In the first part,
+// it gets the most likely unigrams which graphemes compose the word and build a history of wids that is used in this search. In second part, the we
+// use the history of wids to get each correspondent unigram, and on third part, we build the final phoneme word from this history.
+char *
+dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model)
+{
+    char *final_phone = NULL;
+    int totalh = 0;
+    size_t increment = 1;
+    int word_offset = 0;
+    int j;
+    size_t grapheme_len = 0, final_phoneme_len = 0;
+    glist_t history_list = NULL;
+    gnode_t *gn;
+    int first = 0;
+    struct winner_t winner;
+    const char *word;
+    unigram_t unigram;
+
+    int32 wid_sentence = ngram_wid(ngram_g2p_model,"<s>"); // start with sentence
+    history_list = glist_add_int32(history_list, wid_sentence);
+    grapheme_len = strlen(word_grapheme);
+    for (j = 0 ; j < grapheme_len ; j += increment) {
+        winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, word_offset);
+        increment = winner.length_match;
+        if (increment == 0) {
+            E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
+            glist_free(history_list);   // history_list is a glist: release every node, not just the head
+            return NULL;
+        }
+        history_list = glist_add_int32(history_list, winner.winner_wid);
+        totalh = j + 1;
+        word_offset += winner.length_match;
+        final_phoneme_len += winner.len_phoneme;
+    }
+
+    history_list = glist_reverse(history_list);
+    final_phone = ckd_calloc(1, (final_phoneme_len * 2)+1);    // room for each phone char plus a separating space
+    for (gn = history_list; gn; gn = gnode_next(gn)) {
+        if (!first) {   // skip the first entry: it is the sentence-start wid added above
+            first = 1;
+            continue;
+        }
+        word = ngram_word(ngram_g2p_model, gnode_int32(gn));
+
+        if (!word)
+            continue;
+
+        unigram = dict_split_unigram(word);
+
+        if (strcmp(unigram.phone, "_") == 0) {     // "_" phones contribute nothing to the output
+            free_unigram_t(&unigram);
+            continue;
+        }
+        strcat(final_phone, unigram.phone);
+        strcat(final_phone, " ");
+
+        free_unigram_t(&unigram);
+    }
+
+    if (history_list)
+        glist_free(history_list);
+
+    return final_phone;
+}
+
+// This function just receives the dict lacking word from fsg_search, call the main function dict_g2p, and then adds the word to the memory dict.
+// The second part of this function is the same as pocketsphinx.c: https://github.com/cmusphinx/pocketsphinx/blob/ba6bd21b3601339646d2db6d2297d02a8a6b7029/src/libpocketsphinx/pocketsphinx.c#L816
+int
+dict_add_g2p_word(dict_t *dict, char const *word)
+{
+    int32 wid = 0;
+    s3cipid_t *pron;
+    char **phonestr, *tmp;
+    int np, i;
+    char *phones;
+
+    phones = dict_g2p(word, dict->ngram_g2p_model);
+    if (phones == NULL)
+        return -1;      /* same failure value as every other error path below */
+
+    E_INFO("Adding phone %s for word %s \n", phones, word);
+    tmp = ckd_salloc(phones);   /* scratch copy handed to str2words() */
+    np = str2words(tmp, NULL, 0);
+    phonestr = ckd_calloc(np, sizeof(*phonestr));
+    str2words(tmp, phonestr, np);
+    pron = ckd_calloc(np, sizeof(*pron));
+    for (i = 0; i < np; ++i) {
+        pron[i] = bin_mdef_ciphone_id(dict->mdef, phonestr[i]);
+        if (pron[i] == -1) {
+            E_ERROR("Unknown phone %s in phone string %s\n",
+                    phonestr[i], tmp);
+            ckd_free(phonestr);
+            ckd_free(tmp);
+            ckd_free(pron);
+            ckd_free(phones);
+            return -1;
+        }
+    }
+    ckd_free(phonestr);
+    ckd_free(tmp);
+    ckd_free(phones);
+    if ((wid = dict_add_word(dict, word, pron, np)) == -1) {
+        ckd_free(pron);
+        return -1;
+    }
+    ckd_free(pron);
+
+    return wid;
+}
diff --git a/media/pocketsphinx/src/dict.h b/media/pocketsphinx/src/dict.h
index 26ffd2b49942..ee57e5a57aef 100644
--- a/media/pocketsphinx/src/dict.h
+++ b/media/pocketsphinx/src/dict.h
@@ -44,6 +44,7 @@
 
 /* SphinxBase headers. */
 #include
+#include <sphinxbase/ngram_model.h>
 
 /* Local headers. */
 #include "s3types.h"
@@ -86,8 +87,21 @@ typedef struct {
     s3wid_t finishwid; /**< FOR INTERNAL-USE ONLY */
     s3wid_t silwid; /**< FOR INTERNAL-USE ONLY */
     int nocase;
+    ngram_model_t *ngram_g2p_model; /**< N-gram model used by the g2p functions; freed in dict_free() */
 } dict_t;
 
+struct winner_t
+{
+    size_t length_match;    /**< Length of the winning unigram's grapheme part */
+    int winner_wid;         /**< Index of the winning unigram in the g2p model */
+    size_t len_phoneme;     /**< Length of the winning unigram's phoneme part */
+};
+
+typedef struct
+{
+    char *word;     /**< Grapheme part (text before '}') */
+    char *phone;    /**< Phoneme part (text after '}') */
+} unigram_t;
 
 /**
  * Initialize a new dictionary.
@@ -101,7 +115,8 @@ typedef struct {
  * Return ptr to dict_t if successful, NULL otherwise.
*/ dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */ - bin_mdef_t *mdef /**< For looking up CI phone IDs (or NULL) */ + bin_mdef_t *mdef, /**< For looking up CI phone IDs (or NULL) */ + logmath_t *logmath // To load ngram_model for g2p load. logmath must be retained with logmath_retain() if it is to be used elsewhere. ); /** @@ -203,6 +218,9 @@ int dict_free(dict_t *d); void dict_report(dict_t *d /**< A dictionary structure */ ); +// g2p functions +int dict_add_g2p_word(dict_t * dict, char const *word); + #ifdef __cplusplus } #endif diff --git a/media/pocketsphinx/src/fsg_search.c b/media/pocketsphinx/src/fsg_search.c index 14ed91ca4e64..f24a0fb83f8f 100644 --- a/media/pocketsphinx/src/fsg_search.c +++ b/media/pocketsphinx/src/fsg_search.c @@ -65,6 +65,7 @@ #include "fsg_search_internal.h" #include "fsg_history.h" #include "fsg_lextree.h" +#include "dict.h" /* Turn this on for detailed debugging dump */ #define __FSG_DBG__ 0 @@ -139,9 +140,21 @@ fsg_search_check_dict(fsg_search_t *fsgs, fsg_model_t *fsg) word = fsg_model_word_str(fsg, i); wid = dict_wordid(dict, word); if (wid == BAD_S3WID) { - E_ERROR("The word '%s' is missing in the dictionary\n", word); - return FALSE; - } + E_WARN("The word '%s' is missing in the dictionary. Trying to create new phoneme \n", word); + if (!dict->ngram_g2p_model) { + E_ERROR("NO dict->ngram_g2p_model. Aborting.."); + return FALSE; + } + + int new_wid = dict_add_g2p_word(dict, word); + if (new_wid > 0){ + /* Now we also have to add it to dict2pid. */ + dict2pid_add_word(ps_search_dict2pid(fsgs), new_wid); + } else { + E_ERROR("Exiting... 
\n");
+                return FALSE;
+            }
+        }
     }
 
     return TRUE;
diff --git a/media/pocketsphinx/src/fsg_search_internal.h b/media/pocketsphinx/src/fsg_search_internal.h
index 0f6258789b99..15324daf7adb 100644
--- a/media/pocketsphinx/src/fsg_search_internal.h
+++ b/media/pocketsphinx/src/fsg_search_internal.h
@@ -69,7 +69,8 @@ typedef struct fsg_search_s {
     ps_search_t base;
 
     hmm_context_t *hmmctx; /**< HMM context. */
-
+    char const *arpafile; /**< NOTE(review): not referenced by any other hunk of this patch - confirm it is needed */
+    cmd_ln_t *config; /**< NOTE(review): not referenced by any other hunk of this patch - confirm it is needed */
     fsg_model_t *fsg; /**< FSG model */
     struct fsg_lextree_s *lextree;/**< Lextree structure for the currently
                                      active FSG */
diff --git a/media/pocketsphinx/src/pocketsphinx.c b/media/pocketsphinx/src/pocketsphinx.c
index abce5255b03e..7514c1fb5dc8 100644
--- a/media/pocketsphinx/src/pocketsphinx.c
+++ b/media/pocketsphinx/src/pocketsphinx.c
@@ -278,7 +278,7 @@ ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
 
     /* Dictionary and triphone mappings (depends on acmod). */
     /* FIXME: pass config, change arguments, implement LTS, etc. */
-    if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL)
+    if ((ps->dict = dict_init(ps->config, ps->acmod->mdef, ps->acmod->lmath)) == NULL)
         return -1;
     if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL)
         return -1;
@@ -720,7 +720,7 @@ ps_load_dict(ps_decoder_t *ps, char const *dictfile,
                        cmd_ln_str_r(ps->config, "-fdict"));
 
     /* Try to load it.
*/
-    if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) {
+    if ((dict = dict_init(newconfig, ps->acmod->mdef, ps->acmod->lmath)) == NULL) {
         cmd_ln_free_r(newconfig);
         return -1;
     }
diff --git a/media/pocketsphinx/src/ps_lattice.c b/media/pocketsphinx/src/ps_lattice.c
index 7426cc2989da..6f44a282b57e 100644
--- a/media/pocketsphinx/src/ps_lattice.c
+++ b/media/pocketsphinx/src/ps_lattice.c
@@ -404,10 +404,11 @@ ps_lattice_read(ps_decoder_t *ps,
         dag->search = ps->search;
         dag->dict = dict_retain(ps->dict);
         dag->lmath = logmath_retain(ps->lmath);
         dag->frate = cmd_ln_int32_r(dag->search->config, "-frate");
     }
     else {
-        dag->dict = dict_init(NULL, NULL);
         dag->lmath = logmath_init(1.0001, 0, FALSE);
+        /* Create the logmath first: dict_init() now takes it as an argument. */
+        dag->dict = dict_init(NULL, NULL, dag->lmath);
         dag->frate = 100;
     }