Bug 1180113 - Introducing g2p algorithm inside pocketsphinx to allow out of dictionary words to be added to grammars. r=smaug

Signed-off-by: Andre Natal <anatal@gmail.com>
This commit is contained in:
Andre Natal 2015-08-05 00:33:00 +02:00
parent af73e8302b
commit 012af21e63
6 changed files with 289 additions and 10 deletions

View File

@ -37,6 +37,7 @@
/* System headers. */
#include <string.h>
#include <limits.h> // We need this for LONG_MIN
/* SphinxBase headers. */
#include <sphinxbase/pio.h>
@ -249,14 +250,14 @@ dict_write(dict_t *dict, char const *filename, char const *format)
dict_t *
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
dict_init(cmd_ln_t *config, bin_mdef_t * mdef, logmath_t *logmath)
{
FILE *fp, *fp2;
int32 n;
lineiter_t *li;
dict_t *d;
s3cipid_t sil;
char const *dictfile = NULL, *fillerfile = NULL;
char const *dictfile = NULL, *fillerfile = NULL, *arpafile = NULL;
if (config) {
dictfile = cmd_ln_str_r(config, "-dict");
@ -303,6 +304,19 @@ dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
* Also check for type size restrictions.
*/
d = (dict_t *) ckd_calloc(1, sizeof(dict_t)); /* freed in dict_free() */
if (config){
arpafile = string_join(dictfile, ".dmp", NULL);
}
if (arpafile) {
ngram_model_t *ngram_g2p_model = ngram_model_read(NULL,arpafile,NGRAM_AUTO,logmath);
ckd_free(arpafile);
if (!ngram_g2p_model) {
E_ERROR("No arpa model found \n");
return NULL;
}
d->ngram_g2p_model = ngram_g2p_model;
}
d->refcnt = 1;
d->max_words =
(n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
@ -474,6 +488,8 @@ dict_free(dict_t * d)
hash_table_free(d->ht);
if (d->mdef)
bin_mdef_free(d->mdef);
if (d->ngram_g2p_model)
ngram_model_free(d->ngram_g2p_model);
ckd_free((void *) d);
return 0;
@ -487,3 +503,233 @@ dict_report(dict_t * d)
E_INFO_NOFN("No of word: %d\n", d->n_word);
E_INFO_NOFN("\n");
}
/*
 * Prefix test: returns 1 when the string `str` begins with `pre`,
 * 0 otherwise. An empty `pre` is a prefix of every string.
 */
int
dict_starts_with(const char *pre, const char *str)
{
    const size_t prefix_len = strlen(pre);

    /* A candidate shorter than the prefix can never match. */
    if (strlen(str) < prefix_len)
        return 0;

    return strncmp(pre, str, prefix_len) == 0;
}
/*
 * Release the two strings owned by a unigram_t (as produced by
 * dict_split_unigram()). The unigram_t structure itself is not freed.
 */
void
free_unigram_t(unigram_t *unigram)
{
    char *grapheme = unigram->word;
    char *phoneme = unigram->phone;

    ckd_free(grapheme);
    ckd_free(phoneme);
}
// This function splits an unigram received (in format e|w}UW) and return a structure
// containing two fields: the grapheme (before }) in unigram.word and the phoneme (after }) unigram.phone
unigram_t
dict_split_unigram(const char * word)
{
size_t total_graphemes = 0;
size_t total_phone = 0;
int token_pos = 0;
int w ;
char *phone;
char *letter;
size_t lenword = 0;
char unigram_letter;
int add;
lenword = strlen(word);
for (w = 0; w < lenword; w++) {
unigram_letter = word[w];
if (unigram_letter == '}') {
token_pos = w;
continue;
}
if (!token_pos)
total_graphemes++;
else
total_phone++;
}
letter = ckd_calloc(1, total_graphemes+1);
add = 0;
for (w = 0; w < total_graphemes; w++) {
if (word[w] == '|')
{
add++;
continue;
}
letter[w - add] = word[w];
}
phone = ckd_calloc(1, total_phone+1);
for (w = 0; w < total_phone; w++) {
if (word[w + 1 + total_graphemes] == '|') {
phone[w] = ' ';
} else {
phone[w] = word[w + 1 + total_graphemes];
}
}
unigram_t unigram = { letter , phone};
return unigram;
};
/*
 * Pick the most probable g2p unigram for the current position in a word.
 *
 * Every unigram of `model` whose grapheme part is a prefix of
 * `word_grapheme + word_offset` is scored with ngram_ng_prob() against the
 * most recent entries of `history_list` (at most the model order); the
 * best-scoring one wins.
 *
 * Returns a winner_t holding the winning word id and the lengths of its
 * grapheme and phoneme parts. When nothing matches, length_match is 0,
 * len_phoneme is 0 and winner_wid is -1.
 */
struct winner_t
dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, int word_offset)
{
    struct winner_t winner;
    long best_prob = LONG_MIN;
    int32 n_hist = 0;
    int32 wid;
    int nused;
    const int32 ngram_order = ngram_model_get_size(model);
    const int32 n_unigrams = *ngram_model_get_counts(model);
    const char *remaining = word_grapheme + word_offset;
    int32 *history = ckd_calloc((size_t)ngram_order, sizeof(int32));
    gnode_t *gn;

    /* Callers test length_match == 0 to detect "no match", so the result
     * must be well defined even when the search loop never assigns it. */
    winner.length_match = 0;
    winner.winner_wid = -1;
    winner.len_phoneme = 0;

    /* history_list is walked from its head; entries are stored starting at
     * the end of the array and moving towards the front.
     * NOTE(review): ngram_ng_prob() is then given the first n_hist slots of
     * this array, which are still zero when fewer than ngram_order entries
     * were copied - confirm this layout is what the n-gram API expects. */
    for (gn = history_list; gn && n_hist < ngram_order; gn = gnode_next(gn)) {
        history[ngram_order - n_hist - 1] = gnode_int32(gn);
        n_hist++;
    }

    for (wid = 0; wid < n_unigrams; wid++) {
        unigram_t unigram = dict_split_unigram(ngram_word(model, wid));

        /* An empty grapheme is a prefix of everything but consumes no
         * input, so it can never be a useful winner. */
        if (unigram.word[0] != '\0' && dict_starts_with(unigram.word, remaining)) {
            int32 prob = ngram_ng_prob(model, wid, history, n_hist, &nused);

            if (best_prob < prob) {
                best_prob = prob;
                winner.winner_wid = wid;
                winner.length_match = strlen(unigram.word);
                winner.len_phoneme = strlen(unigram.phone);
            }
        }
        free_unigram_t(&unigram);
    }

    ckd_free(history);
    return winner;
}
/*
 * Convert a word (grapheme string) into a phone string using the g2p
 * n-gram model.
 *
 * Step 1: starting from "<s>", repeatedly ask dict_get_winner_wid() for the
 *         best unigram at the current offset and record its word id in a
 *         history list, advancing by the length of the matched graphemes.
 * Step 2: walk the recorded word ids in order (skipping the leading "<s>"),
 *         split each unigram again and append its phoneme part followed by
 *         a single space. Unigrams whose phoneme part is "_" are skipped.
 *
 * Returns a string allocated with ckd_calloc() that the caller must release
 * with ckd_free(), or NULL if some position of the word matches no unigram.
 */
char *
dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model)
{
    char *final_phone = NULL;
    glist_t history_list = NULL;
    gnode_t *gn;
    struct winner_t winner;
    size_t grapheme_len = strlen(word_grapheme);
    size_t phones_len = 0;   /* sum of the winners' phoneme lengths */
    size_t n_winners = 0;    /* one separating space is written per winner */
    size_t out = 0;
    int word_offset = 0;

    /* start with sentence */
    history_list = glist_add_int32(history_list, ngram_wid(ngram_g2p_model, "<s>"));

    while ((size_t)word_offset < grapheme_len) {
        winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, word_offset);
        if (winner.length_match == 0) {
            E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
            /* Free every node of the list, not only its head. */
            glist_free(history_list);
            return NULL;
        }
        history_list = glist_add_int32(history_list, winner.winner_wid);
        word_offset += (int)winner.length_match;
        phones_len += winner.len_phoneme;
        n_winners++;
    }

    /* glist_add_int32() prepends, so restore chronological order. */
    history_list = glist_reverse(history_list);

    /* Each winner contributes its phoneme plus one space; sizing the buffer
     * this way stays correct even when a phoneme part is empty. */
    final_phone = ckd_calloc(1, phones_len + n_winners + 1);

    /* The first node is the "<s>" marker added above: skip it. */
    for (gn = gnode_next(history_list); gn; gn = gnode_next(gn)) {
        const char *token = ngram_word(ngram_g2p_model, gnode_int32(gn));
        unigram_t unigram;

        if (!token)
            continue;

        unigram = dict_split_unigram(token);
        if (strcmp(unigram.phone, "_") != 0) {
            size_t len = strlen(unigram.phone);

            memcpy(final_phone + out, unigram.phone, len);
            out += len;
            final_phone[out++] = ' ';
        }
        free_unigram_t(&unigram);
    }

    glist_free(history_list);
    return final_phone;
}
/*
 * Add an out-of-dictionary word to the in-memory dictionary.
 *
 * The pronunciation is generated by dict_g2p() from dict->ngram_g2p_model,
 * each phone is mapped to a CI phone id through dict->mdef, and the result
 * is handed to dict_add_word().
 * The second half mirrors the word-adding code in pocketsphinx.c:
 * https://github.com/cmusphinx/pocketsphinx/blob/ba6bd21b3601339646d2db6d2297d02a8a6b7029/src/libpocketsphinx/pocketsphinx.c#L816
 *
 * Returns the value of dict_add_word() on success, 0 if dict_g2p() could not
 * produce a pronunciation, and -1 if the pronunciation is empty, contains a
 * phone unknown to the model definition, or dict_add_word() fails.
 */
int
dict_add_g2p_word(dict_t *dict, char const *word)
{
    int32 wid = -1;
    s3cipid_t *pron = NULL;
    char **phonestr = NULL;
    char *scratch = NULL;
    char *phones;
    int np, i;

    phones = dict_g2p(word, dict->ngram_g2p_model);
    if (phones == NULL)
        return 0;

    E_INFO("Adding phone %s for word %s \n", phones, word);

    /* str2words() splits its argument in place, so tokenize a copy and keep
     * `phones` intact for diagnostics. */
    scratch = ckd_salloc(phones);
    np = str2words(scratch, NULL, 0);
    if (np <= 0) {
        /* A word without any phone cannot be given a pronunciation. */
        E_ERROR("No phones generated for word %s\n", word);
        goto cleanup;
    }

    phonestr = ckd_calloc(np, sizeof(*phonestr));
    str2words(scratch, phonestr, np);

    pron = ckd_calloc(np, sizeof(*pron));
    for (i = 0; i < np; ++i) {
        pron[i] = bin_mdef_ciphone_id(dict->mdef, phonestr[i]);
        if (pron[i] == -1) {
            E_ERROR("Unknown phone %s in phone string %s\n",
                    phonestr[i], phones);
            goto cleanup;
        }
    }

    /* dict_add_word() reports failure as -1, which is passed through. */
    wid = dict_add_word(dict, word, pron, np);

cleanup:
    ckd_free(pron);
    ckd_free(phonestr);
    ckd_free(scratch);
    ckd_free(phones);
    return wid;
}

View File

@ -44,6 +44,7 @@
/* SphinxBase headers. */
#include <sphinxbase/hash_table.h>
#include <sphinxbase/ngram_model.h>
/* Local headers. */
#include "s3types.h"
@ -86,8 +87,21 @@ typedef struct {
s3wid_t finishwid; /**< FOR INTERNAL-USE ONLY */
s3wid_t silwid; /**< FOR INTERNAL-USE ONLY */
int nocase;
ngram_model_t *ngram_g2p_model;
} dict_t;
/**
 * Result of one g2p search step: the best-scoring unigram of the g2p
 * n-gram model whose grapheme part matches at the current word position.
 */
struct winner_t
{
/** strlen() of the winning unigram's grapheme part. */
size_t length_match;
/** Word id of the winning unigram in the g2p n-gram model. */
int winner_wid;
/** strlen() of the winning unigram's phoneme part. */
size_t len_phoneme;
};
/**
 * A g2p unigram token (format "e|w}UW") split into its two halves.
 * Both strings are heap-allocated by dict_split_unigram() and released by
 * free_unigram_t(). Field order matters: the structure is filled with a
 * positional initializer.
 */
typedef struct
{
/** Grapheme part: the text before '}' with '|' removed. */
char *word;
/** Phoneme part: the text after '}' with '|' replaced by spaces. */
char *phone;
} unigram_t;
/**
* Initialize a new dictionary.
@ -101,7 +115,8 @@ typedef struct {
* Return ptr to dict_t if successful, NULL otherwise.
*/
dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */
bin_mdef_t *mdef /**< For looking up CI phone IDs (or NULL) */
bin_mdef_t *mdef, /**< For looking up CI phone IDs (or NULL) */
logmath_t *logmath // To load ngram_model for g2p load. logmath must be retained with logmath_retain() if it is to be used elsewhere.
);
/**
@ -203,6 +218,9 @@ int dict_free(dict_t *d);
void dict_report(dict_t *d /**< A dictionary structure */
);
// g2p functions
/**
 * Generate a pronunciation for a word that is missing from the dictionary,
 * using the g2p n-gram model stored in dict->ngram_g2p_model, and add the
 * word to the dictionary.
 *
 * @return the word id returned by dict_add_word() on success; 0 if no
 *         pronunciation could be generated; -1 if a generated phone is
 *         unknown to the model definition or dict_add_word() fails.
 */
int dict_add_g2p_word(dict_t * dict, char const *word);
#ifdef __cplusplus
}
#endif

View File

@ -65,6 +65,7 @@
#include "fsg_search_internal.h"
#include "fsg_history.h"
#include "fsg_lextree.h"
#include "dict.h"
/* Turn this on for detailed debugging dump */
#define __FSG_DBG__ 0
@ -139,9 +140,21 @@ fsg_search_check_dict(fsg_search_t *fsgs, fsg_model_t *fsg)
word = fsg_model_word_str(fsg, i);
wid = dict_wordid(dict, word);
if (wid == BAD_S3WID) {
E_ERROR("The word '%s' is missing in the dictionary\n", word);
return FALSE;
}
E_WARN("The word '%s' is missing in the dictionary. Trying to create new phoneme \n", word);
if (!dict->ngram_g2p_model) {
E_ERROR("NO dict->ngram_g2p_model. Aborting..");
return FALSE;
}
int new_wid = dict_add_g2p_word(dict, word);
if (new_wid > 0){
/* Now we also have to add it to dict2pid. */
dict2pid_add_word(ps_search_dict2pid(fsgs), new_wid);
} else {
E_ERROR("Exiting... \n");
return FALSE;
}
}
}
return TRUE;

View File

@ -69,7 +69,8 @@ typedef struct fsg_search_s {
ps_search_t base;
hmm_context_t *hmmctx; /**< HMM context. */
char const *arpafile;
cmd_ln_t *config;
fsg_model_t *fsg; /**< FSG model */
struct fsg_lextree_s *lextree;/**< Lextree structure for the currently
active FSG */

View File

@ -278,7 +278,7 @@ ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
/* Dictionary and triphone mappings (depends on acmod). */
/* FIXME: pass config, change arguments, implement LTS, etc. */
if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL)
if ((ps->dict = dict_init(ps->config, ps->acmod->mdef, ps->acmod->lmath)) == NULL)
return -1;
if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL)
return -1;
@ -720,7 +720,7 @@ ps_load_dict(ps_decoder_t *ps, char const *dictfile,
cmd_ln_str_r(ps->config, "-fdict"));
/* Try to load it. */
if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) {
if ((dict = dict_init(newconfig, ps->acmod->mdef, ps->acmod->lmath)) == NULL) {
cmd_ln_free_r(newconfig);
return -1;
}

View File

@ -404,10 +404,11 @@ ps_lattice_read(ps_decoder_t *ps,
dag->search = ps->search;
dag->dict = dict_retain(ps->dict);
dag->lmath = logmath_retain(ps->lmath);
dag->dict = dict_init(NULL, NULL, dag->lmath);
dag->frate = cmd_ln_int32_r(dag->search->config, "-frate");
}
else {
dag->dict = dict_init(NULL, NULL);
dag->dict = dict_init(NULL, NULL, dag->lmath);
dag->lmath = logmath_init(1.0001, 0, FALSE);
dag->frate = 100;
}