mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-07 15:12:28 +00:00
Bug 1180113 - Introducing g2p algorithm inside pocketsphinx to allow out of dictionary words to be added to grammars. r=smaug
Signed-off-by: Andre Natal <anatal@gmail.com>
This commit is contained in:
parent
af73e8302b
commit
012af21e63
@ -37,6 +37,7 @@
|
||||
|
||||
/* System headers. */
|
||||
#include <string.h>
|
||||
#include <limits.h> // We need this for LONG_MIN
|
||||
|
||||
/* SphinxBase headers. */
|
||||
#include <sphinxbase/pio.h>
|
||||
@ -249,14 +250,14 @@ dict_write(dict_t *dict, char const *filename, char const *format)
|
||||
|
||||
|
||||
dict_t *
|
||||
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
|
||||
dict_init(cmd_ln_t *config, bin_mdef_t * mdef, logmath_t *logmath)
|
||||
{
|
||||
FILE *fp, *fp2;
|
||||
int32 n;
|
||||
lineiter_t *li;
|
||||
dict_t *d;
|
||||
s3cipid_t sil;
|
||||
char const *dictfile = NULL, *fillerfile = NULL;
|
||||
char const *dictfile = NULL, *fillerfile = NULL, *arpafile = NULL;
|
||||
|
||||
if (config) {
|
||||
dictfile = cmd_ln_str_r(config, "-dict");
|
||||
@ -303,6 +304,19 @@ dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
|
||||
* Also check for type size restrictions.
|
||||
*/
|
||||
d = (dict_t *) ckd_calloc(1, sizeof(dict_t)); /* freed in dict_free() */
|
||||
if (config){
|
||||
arpafile = string_join(dictfile, ".dmp", NULL);
|
||||
}
|
||||
if (arpafile) {
|
||||
ngram_model_t *ngram_g2p_model = ngram_model_read(NULL,arpafile,NGRAM_AUTO,logmath);
|
||||
ckd_free(arpafile);
|
||||
if (!ngram_g2p_model) {
|
||||
E_ERROR("No arpa model found \n");
|
||||
return NULL;
|
||||
}
|
||||
d->ngram_g2p_model = ngram_g2p_model;
|
||||
}
|
||||
|
||||
d->refcnt = 1;
|
||||
d->max_words =
|
||||
(n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
|
||||
@ -474,6 +488,8 @@ dict_free(dict_t * d)
|
||||
hash_table_free(d->ht);
|
||||
if (d->mdef)
|
||||
bin_mdef_free(d->mdef);
|
||||
if (d->ngram_g2p_model)
|
||||
ngram_model_free(d->ngram_g2p_model);
|
||||
ckd_free((void *) d);
|
||||
|
||||
return 0;
|
||||
@ -487,3 +503,233 @@ dict_report(dict_t * d)
|
||||
E_INFO_NOFN("No of word: %d\n", d->n_word);
|
||||
E_INFO_NOFN("\n");
|
||||
}
|
||||
|
||||
// Returns nonzero iff str begins with the prefix pre (0 otherwise).
int
dict_starts_with(const char *pre, const char *str)
{
    /* strncmp stops at str's terminating NUL, so a str shorter than pre
     * compares unequal on its own; the original's extra strlen(str) pass
     * over the whole candidate string is unnecessary. */
    return strncmp(pre, str, strlen(pre)) == 0;
}
|
||||
|
||||
// Helper function to clear unigram
|
||||
void
|
||||
free_unigram_t(unigram_t *unigram)
|
||||
{
|
||||
ckd_free(unigram->word);
|
||||
ckd_free(unigram->phone);
|
||||
}
|
||||
|
||||
// This function splits an unigram received (in format e|w}UW) and return a structure
|
||||
// containing two fields: the grapheme (before }) in unigram.word and the phoneme (after }) unigram.phone
|
||||
unigram_t
|
||||
dict_split_unigram(const char * word)
|
||||
{
|
||||
size_t total_graphemes = 0;
|
||||
size_t total_phone = 0;
|
||||
int token_pos = 0;
|
||||
int w ;
|
||||
char *phone;
|
||||
char *letter;
|
||||
size_t lenword = 0;
|
||||
char unigram_letter;
|
||||
int add;
|
||||
|
||||
lenword = strlen(word);
|
||||
for (w = 0; w < lenword; w++) {
|
||||
unigram_letter = word[w];
|
||||
if (unigram_letter == '}') {
|
||||
token_pos = w;
|
||||
continue;
|
||||
}
|
||||
if (!token_pos)
|
||||
total_graphemes++;
|
||||
else
|
||||
total_phone++;
|
||||
}
|
||||
|
||||
letter = ckd_calloc(1, total_graphemes+1);
|
||||
add = 0;
|
||||
for (w = 0; w < total_graphemes; w++) {
|
||||
if (word[w] == '|')
|
||||
{
|
||||
add++;
|
||||
continue;
|
||||
}
|
||||
letter[w - add] = word[w];
|
||||
}
|
||||
|
||||
phone = ckd_calloc(1, total_phone+1);
|
||||
for (w = 0; w < total_phone; w++) {
|
||||
if (word[w + 1 + total_graphemes] == '|') {
|
||||
phone[w] = ' ';
|
||||
} else {
|
||||
phone[w] = word[w + 1 + total_graphemes];
|
||||
}
|
||||
}
|
||||
|
||||
unigram_t unigram = { letter , phone};
|
||||
|
||||
return unigram;
|
||||
};
|
||||
|
||||
// This function calculates the most likely unigram to appear in the current position at the word
|
||||
// based on the three latest chosen/winners unigrams (history) and return a structure containing
|
||||
// the word id (wid), and lengths of the phoneme and the word
|
||||
struct winner_t
|
||||
dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, int word_offset)
|
||||
{
|
||||
long current_prob = LONG_MIN;
|
||||
struct winner_t winner;
|
||||
int32 i = 0, j = 0;
|
||||
int nused;
|
||||
int32 ngram_order = ngram_model_get_size(model);
|
||||
int32 *history = ckd_calloc((size_t)ngram_order, sizeof(int32));
|
||||
gnode_t *gn;
|
||||
const char *vocab;
|
||||
const char *sub;
|
||||
int32 prob;
|
||||
unigram_t unigram;
|
||||
const int32 *total_unigrams = ngram_model_get_counts(model);
|
||||
|
||||
for (gn = history_list; gn; gn = gnode_next(gn)) {
|
||||
// we need to build history from last to first because glist returns itens from last to first
|
||||
history[ngram_order - j - 1] = gnode_int32(gn);
|
||||
j++;
|
||||
if (j >= ngram_order)
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; i < *total_unigrams; i++) {
|
||||
vocab = ngram_word(model, i);
|
||||
unigram = dict_split_unigram(vocab);
|
||||
sub = word_grapheme + word_offset;
|
||||
if (dict_starts_with(unigram.word, sub)) {
|
||||
prob = ngram_ng_prob(model, i, history, j, &nused);
|
||||
if (current_prob < prob) {
|
||||
current_prob = prob;
|
||||
winner.winner_wid = i;
|
||||
winner.length_match = strlen(unigram.word);
|
||||
winner.len_phoneme = strlen(unigram.phone);
|
||||
}
|
||||
}
|
||||
|
||||
free_unigram_t(&unigram);
|
||||
}
|
||||
|
||||
if (history)
|
||||
ckd_free(history);
|
||||
|
||||
return winner;
|
||||
}
|
||||
|
||||
// This function manages the winner unigrams and builds the history of winners to properly generate the final phoneme. In the first part,
|
||||
// it gets the most likely unigrams which graphemes compose the word and build a history of wids that is used in this search. In second part, the we
|
||||
// use the history of wids to get each correspondent unigram, and on third part, we build the final phoneme word from this history.
|
||||
char *
|
||||
dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model)
|
||||
{
|
||||
char *final_phone = NULL;
|
||||
int totalh = 0;
|
||||
size_t increment = 1;
|
||||
int word_offset = 0;
|
||||
int j;
|
||||
size_t grapheme_len = 0, final_phoneme_len = 0;
|
||||
glist_t history_list = NULL;
|
||||
gnode_t *gn;
|
||||
int first = 0;
|
||||
struct winner_t winner;
|
||||
const char *word;
|
||||
unigram_t unigram;
|
||||
|
||||
int32 wid_sentence = ngram_wid(ngram_g2p_model,"<s>"); // start with sentence
|
||||
history_list = glist_add_int32(history_list, wid_sentence);
|
||||
grapheme_len = strlen(word_grapheme);
|
||||
for (j = 0 ; j < grapheme_len ; j += increment) {
|
||||
winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, word_offset);
|
||||
increment = winner.length_match;
|
||||
if (increment == 0) {
|
||||
E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
|
||||
ckd_free(history_list);
|
||||
return NULL;
|
||||
}
|
||||
history_list = glist_add_int32(history_list, winner.winner_wid);
|
||||
totalh = j + 1;
|
||||
word_offset += winner.length_match;
|
||||
final_phoneme_len += winner.len_phoneme;
|
||||
}
|
||||
|
||||
history_list = glist_reverse(history_list);
|
||||
final_phone = ckd_calloc(1, (final_phoneme_len * 2)+1);
|
||||
for (gn = history_list; gn; gn = gnode_next(gn)) {
|
||||
if (!first) {
|
||||
first = 1;
|
||||
continue;
|
||||
}
|
||||
word = ngram_word(ngram_g2p_model, gnode_int32(gn));
|
||||
|
||||
if (!word)
|
||||
continue;
|
||||
|
||||
unigram = dict_split_unigram(word);
|
||||
|
||||
if (strcmp(unigram.phone, "_") == 0) {
|
||||
free_unigram_t(&unigram);
|
||||
continue;
|
||||
}
|
||||
strcat(final_phone, unigram.phone);
|
||||
strcat(final_phone, " ");
|
||||
|
||||
free_unigram_t(&unigram);
|
||||
}
|
||||
|
||||
if (history_list)
|
||||
glist_free(history_list);
|
||||
|
||||
return final_phone;
|
||||
}
|
||||
|
||||
// This function just receives the dict lacking word from fsg_search, call the main function dict_g2p, and then adds the word to the memory dict.
|
||||
// The second part of this function is the same as pocketsphinx.c: https://github.com/cmusphinx/pocketsphinx/blob/ba6bd21b3601339646d2db6d2297d02a8a6b7029/src/libpocketsphinx/pocketsphinx.c#L816
|
||||
int
|
||||
dict_add_g2p_word(dict_t *dict, char const *word)
|
||||
{
|
||||
int32 wid = 0;
|
||||
s3cipid_t *pron;
|
||||
char **phonestr, *tmp;
|
||||
int np, i;
|
||||
char *phones;
|
||||
|
||||
phones = dict_g2p(word, dict->ngram_g2p_model);
|
||||
if (phones == NULL)
|
||||
return 0;
|
||||
|
||||
E_INFO("Adding phone %s for word %s \n", phones, word);
|
||||
tmp = ckd_salloc(phones);
|
||||
np = str2words(tmp, NULL, 0);
|
||||
phonestr = ckd_calloc(np, sizeof(*phonestr));
|
||||
str2words(tmp, phonestr, np);
|
||||
pron = ckd_calloc(np, sizeof(*pron));
|
||||
for (i = 0; i < np; ++i) {
|
||||
pron[i] = bin_mdef_ciphone_id(dict->mdef, phonestr[i]);
|
||||
if (pron[i] == -1) {
|
||||
E_ERROR("Unknown phone %s in phone string %s\n",
|
||||
phonestr[i], tmp);
|
||||
ckd_free(phonestr);
|
||||
ckd_free(tmp);
|
||||
ckd_free(pron);
|
||||
ckd_free(phones);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
ckd_free(phonestr);
|
||||
ckd_free(tmp);
|
||||
ckd_free(phones);
|
||||
if ((wid = dict_add_word(dict, word, pron, np)) == -1) {
|
||||
ckd_free(pron);
|
||||
return -1;
|
||||
}
|
||||
ckd_free(pron);
|
||||
|
||||
return wid;
|
||||
}
|
||||
|
@ -44,6 +44,7 @@
|
||||
|
||||
/* SphinxBase headers. */
|
||||
#include <sphinxbase/hash_table.h>
|
||||
#include <sphinxbase/ngram_model.h>
|
||||
|
||||
/* Local headers. */
|
||||
#include "s3types.h"
|
||||
@ -86,8 +87,21 @@ typedef struct {
|
||||
s3wid_t finishwid; /**< FOR INTERNAL-USE ONLY */
|
||||
s3wid_t silwid; /**< FOR INTERNAL-USE ONLY */
|
||||
int nocase;
|
||||
ngram_model_t *ngram_g2p_model;
|
||||
} dict_t;
|
||||
|
||||
// Result of a g2p unigram search (see dict_get_winner_wid in dict.c).
struct winner_t
{
    size_t length_match;   // length of the matched grapheme part
    int winner_wid;        // word id of the winning unigram in the g2p model
    size_t len_phoneme;    // length of the winning unigram's phoneme part
};

// A g2p unigram "grapheme}phoneme" split into its two halves by
// dict_split_unigram (dict.c).  Both strings are heap-allocated;
// release them with free_unigram_t().
typedef struct
{
    char *word;    // grapheme part (text before '}')
    char *phone;   // phoneme part (text after '}')
} unigram_t;
|
||||
|
||||
/**
|
||||
* Initialize a new dictionary.
|
||||
@ -101,7 +115,8 @@ typedef struct {
|
||||
* Return ptr to dict_t if successful, NULL otherwise.
|
||||
*/
|
||||
dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */
|
||||
bin_mdef_t *mdef /**< For looking up CI phone IDs (or NULL) */
|
||||
bin_mdef_t *mdef, /**< For looking up CI phone IDs (or NULL) */
|
||||
logmath_t *logmath // To load ngram_model for g2p load. logmath must be retained with logmath_retain() if it is to be used elsewhere.
|
||||
);
|
||||
|
||||
/**
|
||||
@ -203,6 +218,9 @@ int dict_free(dict_t *d);
|
||||
void dict_report(dict_t *d /**< A dictionary structure */
|
||||
);
|
||||
|
||||
// g2p functions
|
||||
int dict_add_g2p_word(dict_t * dict, char const *word);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -65,6 +65,7 @@
|
||||
#include "fsg_search_internal.h"
|
||||
#include "fsg_history.h"
|
||||
#include "fsg_lextree.h"
|
||||
#include "dict.h"
|
||||
|
||||
/* Turn this on for detailed debugging dump */
|
||||
#define __FSG_DBG__ 0
|
||||
@ -139,9 +140,21 @@ fsg_search_check_dict(fsg_search_t *fsgs, fsg_model_t *fsg)
|
||||
word = fsg_model_word_str(fsg, i);
|
||||
wid = dict_wordid(dict, word);
|
||||
if (wid == BAD_S3WID) {
|
||||
E_ERROR("The word '%s' is missing in the dictionary\n", word);
|
||||
return FALSE;
|
||||
}
|
||||
E_WARN("The word '%s' is missing in the dictionary. Trying to create new phoneme \n", word);
|
||||
if (!dict->ngram_g2p_model) {
|
||||
E_ERROR("NO dict->ngram_g2p_model. Aborting..");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
int new_wid = dict_add_g2p_word(dict, word);
|
||||
if (new_wid > 0){
|
||||
/* Now we also have to add it to dict2pid. */
|
||||
dict2pid_add_word(ps_search_dict2pid(fsgs), new_wid);
|
||||
} else {
|
||||
E_ERROR("Exiting... \n");
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
@ -69,7 +69,8 @@ typedef struct fsg_search_s {
|
||||
ps_search_t base;
|
||||
|
||||
hmm_context_t *hmmctx; /**< HMM context. */
|
||||
|
||||
char const *arpafile;
|
||||
cmd_ln_t *config;
|
||||
fsg_model_t *fsg; /**< FSG model */
|
||||
struct fsg_lextree_s *lextree;/**< Lextree structure for the currently
|
||||
active FSG */
|
||||
|
@ -278,7 +278,7 @@ ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
|
||||
|
||||
/* Dictionary and triphone mappings (depends on acmod). */
|
||||
/* FIXME: pass config, change arguments, implement LTS, etc. */
|
||||
if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL)
|
||||
if ((ps->dict = dict_init(ps->config, ps->acmod->mdef, ps->acmod->lmath)) == NULL)
|
||||
return -1;
|
||||
if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL)
|
||||
return -1;
|
||||
@ -720,7 +720,7 @@ ps_load_dict(ps_decoder_t *ps, char const *dictfile,
|
||||
cmd_ln_str_r(ps->config, "-fdict"));
|
||||
|
||||
/* Try to load it. */
|
||||
if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) {
|
||||
if ((dict = dict_init(newconfig, ps->acmod->mdef, ps->acmod->lmath)) == NULL) {
|
||||
cmd_ln_free_r(newconfig);
|
||||
return -1;
|
||||
}
|
||||
|
@ -404,10 +404,11 @@ ps_lattice_read(ps_decoder_t *ps,
|
||||
dag->search = ps->search;
|
||||
dag->dict = dict_retain(ps->dict);
|
||||
dag->lmath = logmath_retain(ps->lmath);
|
||||
dag->dict = dict_init(NULL, NULL, dag->lmath);
|
||||
dag->frate = cmd_ln_int32_r(dag->search->config, "-frate");
|
||||
}
|
||||
else {
|
||||
dag->dict = dict_init(NULL, NULL);
|
||||
dag->dict = dict_init(NULL, NULL, dag->lmath);
|
||||
dag->lmath = logmath_init(1.0001, 0, FALSE);
|
||||
dag->frate = 100;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user