mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-07 15:12:28 +00:00
Bug 1180113 - Introducing g2p algorithm inside pocketsphinx to allow out of dictionary words to be added to grammars. r=smaug
Signed-off-by: Andre Natal <anatal@gmail.com>
This commit is contained in:
parent
af73e8302b
commit
012af21e63
@ -37,6 +37,7 @@
|
||||
|
||||
/* System headers. */
|
||||
#include <string.h>
|
||||
#include <limits.h> // We need this for LONG_MIN
|
||||
|
||||
/* SphinxBase headers. */
|
||||
#include <sphinxbase/pio.h>
|
||||
@ -249,14 +250,14 @@ dict_write(dict_t *dict, char const *filename, char const *format)
|
||||
|
||||
|
||||
dict_t *
|
||||
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
|
||||
dict_init(cmd_ln_t *config, bin_mdef_t * mdef, logmath_t *logmath)
|
||||
{
|
||||
FILE *fp, *fp2;
|
||||
int32 n;
|
||||
lineiter_t *li;
|
||||
dict_t *d;
|
||||
s3cipid_t sil;
|
||||
char const *dictfile = NULL, *fillerfile = NULL;
|
||||
char const *dictfile = NULL, *fillerfile = NULL, *arpafile = NULL;
|
||||
|
||||
if (config) {
|
||||
dictfile = cmd_ln_str_r(config, "-dict");
|
||||
@ -303,6 +304,19 @@ dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
|
||||
* Also check for type size restrictions.
|
||||
*/
|
||||
d = (dict_t *) ckd_calloc(1, sizeof(dict_t)); /* freed in dict_free() */
|
||||
if (config){
|
||||
arpafile = string_join(dictfile, ".dmp", NULL);
|
||||
}
|
||||
if (arpafile) {
|
||||
ngram_model_t *ngram_g2p_model = ngram_model_read(NULL,arpafile,NGRAM_AUTO,logmath);
|
||||
ckd_free(arpafile);
|
||||
if (!ngram_g2p_model) {
|
||||
E_ERROR("No arpa model found \n");
|
||||
return NULL;
|
||||
}
|
||||
d->ngram_g2p_model = ngram_g2p_model;
|
||||
}
|
||||
|
||||
d->refcnt = 1;
|
||||
d->max_words =
|
||||
(n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
|
||||
@ -474,6 +488,8 @@ dict_free(dict_t * d)
|
||||
hash_table_free(d->ht);
|
||||
if (d->mdef)
|
||||
bin_mdef_free(d->mdef);
|
||||
if (d->ngram_g2p_model)
|
||||
ngram_model_free(d->ngram_g2p_model);
|
||||
ckd_free((void *) d);
|
||||
|
||||
return 0;
|
||||
@ -487,3 +503,233 @@ dict_report(dict_t * d)
|
||||
E_INFO_NOFN("No of word: %d\n", d->n_word);
|
||||
E_INFO_NOFN("\n");
|
||||
}
|
||||
|
||||
// Returns nonzero iff str begins with the prefix pre (0 otherwise).
int
dict_starts_with(const char *pre, const char *str)
{
    /* strncmp stops at str's terminating NUL, so a str shorter than pre
     * compares unequal on its own; the original's extra strlen(str) pass
     * over the whole candidate string is unnecessary. */
    return strncmp(pre, str, strlen(pre)) == 0;
}
|
||||
|
||||
// Helper function to clear unigram
|
||||
void
|
||||
free_unigram_t(unigram_t *unigram)
|
||||
{
|
||||
ckd_free(unigram->word);
|
||||
ckd_free(unigram->phone);
|
||||
}
|
||||
|
||||
// This function splits an unigram received (in format e|w}UW) and return a structure
|
||||
// containing two fields: the grapheme (before }) in unigram.word and the phoneme (after }) unigram.phone
|
||||
unigram_t
|
||||
dict_split_unigram(const char * word)
|
||||
{
|
||||
size_t total_graphemes = 0;
|
||||
size_t total_phone = 0;
|
||||
int token_pos = 0;
|
||||
int w ;
|
||||
char *phone;
|
||||
char *letter;
|
||||
size_t lenword = 0;
|
||||
char unigram_letter;
|
||||
int add;
|
||||
|
||||
lenword = strlen(word);
|
||||
for (w = 0; w < lenword; w++) {
|
||||
unigram_letter = word[w];
|
||||
if (unigram_letter == '}') {
|
||||
token_pos = w;
|
||||
continue;
|
||||
}
|
||||
if (!token_pos)
|
||||
total_graphemes++;
|
||||
else
|
||||
total_phone++;
|
||||
}
|
||||
|
||||
letter = ckd_calloc(1, total_graphemes+1);
|
||||
add = 0;
|
||||
for (w = 0; w < total_graphemes; w++) {
|
||||
if (word[w] == '|')
|
||||
{
|
||||
add++;
|
||||
continue;
|
||||
}
|
||||
letter[w - add] = word[w];
|
||||
}
|
||||
|
||||
phone = ckd_calloc(1, total_phone+1);
|
||||
for (w = 0; w < total_phone; w++) {
|
||||
if (word[w + 1 + total_graphemes] == '|') {
|
||||
phone[w] = ' ';
|
||||
} else {
|
||||
phone[w] = word[w + 1 + total_graphemes];
|
||||
}
|
||||
}
|
||||
|
||||
unigram_t unigram = { letter , phone};
|
||||
|
||||
return unigram;
|
||||
};
|
||||
|
||||
// This function calculates the most likely unigram to appear in the current position at the word
|
||||
// based on the three latest chosen/winners unigrams (history) and return a structure containing
|
||||
// the word id (wid), and lengths of the phoneme and the word
|
||||
struct winner_t
|
||||
dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, int word_offset)
|
||||
{
|
||||
long current_prob = LONG_MIN;
|
||||
struct winner_t winner;
|
||||
int32 i = 0, j = 0;
|
||||
int nused;
|
||||
int32 ngram_order = ngram_model_get_size(model);
|
||||
int32 *history = ckd_calloc((size_t)ngram_order, sizeof(int32));
|
||||
gnode_t *gn;
|
||||
const char *vocab;
|
||||
const char *sub;
|
||||
int32 prob;
|
||||
unigram_t unigram;
|
||||
const int32 *total_unigrams = ngram_model_get_counts(model);
|
||||
|
||||
for (gn = history_list; gn; gn = gnode_next(gn)) {
|
||||
// we need to build history from last to first because glist returns itens from last to first
|
||||
history[ngram_order - j - 1] = gnode_int32(gn);
|
||||
j++;
|
||||
if (j >= ngram_order)
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; i < *total_unigrams; i++) {
|
||||
vocab = ngram_word(model, i);
|
||||
unigram = dict_split_unigram(vocab);
|
||||
sub = word_grapheme + word_offset;
|
||||
if (dict_starts_with(unigram.word, sub)) {
|
||||
prob = ngram_ng_prob(model, i, history, j, &nused);
|
||||
if (current_prob < prob) {
|
||||
current_prob = prob;
|
||||
winner.winner_wid = i;
|
||||
winner.length_match = strlen(unigram.word);
|
||||
winner.len_phoneme = strlen(unigram.phone);
|
||||
}
|
||||
}
|
||||
|
||||
free_unigram_t(&unigram);
|
||||
}
|
||||
|
||||
if (history)
|
||||
ckd_free(history);
|
||||
|
||||
return winner;
|
||||
}
|
||||
|
||||
// This function manages the winner unigrams and builds the history of winners to properly generate the final phoneme. In the first part,
|
||||
// it gets the most likely unigrams which graphemes compose the word and build a history of wids that is used in this search. In second part, the we
|
||||
// use the history of wids to get each correspondent unigram, and on third part, we build the final phoneme word from this history.
|
||||
char *
|
||||
dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model)
|
||||
{
|
||||
char *final_phone = NULL;
|
||||
int totalh = 0;
|
||||
size_t increment = 1;
|
||||
int word_offset = 0;
|
||||
int j;
|
||||
size_t grapheme_len = 0, final_phoneme_len = 0;
|
||||
glist_t history_list = NULL;
|
||||
gnode_t *gn;
|
||||
int first = 0;
|
||||
struct winner_t winner;
|
||||
const char *word;
|
||||
unigram_t unigram;
|
||||
|
||||
int32 wid_sentence = ngram_wid(ngram_g2p_model,"<s>"); // start with sentence
|
||||
history_list = glist_add_int32(history_list, wid_sentence);
|
||||
grapheme_len = strlen(word_grapheme);
|
||||
for (j = 0 ; j < grapheme_len ; j += increment) {
|
||||
winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, word_offset);
|
||||
increment = winner.length_match;
|
||||
if (increment == 0) {
|
||||
E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
|
||||
ckd_free(history_list);
|
||||
return NULL;
|
||||
}
|
||||
history_list = glist_add_int32(history_list, winner.winner_wid);
|
||||
totalh = j + 1;
|
||||
word_offset += winner.length_match;
|
||||
final_phoneme_len += winner.len_phoneme;
|
||||
}
|
||||
|
||||
history_list = glist_reverse(history_list);
|
||||
final_phone = ckd_calloc(1, (final_phoneme_len * 2)+1);
|
||||
for (gn = history_list; gn; gn = gnode_next(gn)) {
|
||||
if (!first) {
|
||||
first = 1;
|
||||
continue;
|
||||
}
|
||||
word = ngram_word(ngram_g2p_model, gnode_int32(gn));
|
||||
|
||||
if (!word)
|
||||
continue;
|
||||
|
||||
unigram = dict_split_unigram(word);
|
||||
|
||||
if (strcmp(unigram.phone, "_") == 0) {
|
||||
free_unigram_t(&unigram);
|
||||
continue;
|
||||
}
|
||||
strcat(final_phone, unigram.phone);
|
||||
strcat(final_phone, " ");
|
||||
|
||||
free_unigram_t(&unigram);
|
||||
}
|
||||
|
||||
if (history_list)
|
||||
glist_free(history_list);
|
||||
|
||||
return final_phone;
|
||||
}
|
||||
|
||||
// This function just receives the dict lacking word from fsg_search, call the main function dict_g2p, and then adds the word to the memory dict.
|
||||
// The second part of this function is the same as pocketsphinx.c: https://github.com/cmusphinx/pocketsphinx/blob/ba6bd21b3601339646d2db6d2297d02a8a6b7029/src/libpocketsphinx/pocketsphinx.c#L816
|
||||
int
|
||||
dict_add_g2p_word(dict_t *dict, char const *word)
|
||||
{
|
||||
int32 wid = 0;
|
||||
s3cipid_t *pron;
|
||||
char **phonestr, *tmp;
|
||||
int np, i;
|
||||
char *phones;
|
||||
|
||||
phones = dict_g2p(word, dict->ngram_g2p_model);
|
||||
if (phones == NULL)
|
||||
return 0;
|
||||
|
||||
E_INFO("Adding phone %s for word %s \n", phones, word);
|
||||
tmp = ckd_salloc(phones);
|
||||
np = str2words(tmp, NULL, 0);
|
||||
phonestr = ckd_calloc(np, sizeof(*phonestr));
|
||||
str2words(tmp, phonestr, np);
|
||||
pron = ckd_calloc(np, sizeof(*pron));
|
||||
for (i = 0; i < np; ++i) {
|
||||
pron[i] = bin_mdef_ciphone_id(dict->mdef, phonestr[i]);
|
||||
if (pron[i] == -1) {
|
||||
E_ERROR("Unknown phone %s in phone string %s\n",
|
||||
phonestr[i], tmp);
|
||||
ckd_free(phonestr);
|
||||
ckd_free(tmp);
|
||||
ckd_free(pron);
|
||||
ckd_free(phones);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
ckd_free(phonestr);
|
||||
ckd_free(tmp);
|
||||
ckd_free(phones);
|
||||
if ((wid = dict_add_word(dict, word, pron, np)) == -1) {
|
||||
ckd_free(pron);
|
||||
return -1;
|
||||
}
|
||||
ckd_free(pron);
|
||||
|
||||
return wid;
|
||||
}
|
||||
|
@ -44,6 +44,7 @@
|
||||
|
||||
/* SphinxBase headers. */
|
||||
#include <sphinxbase/hash_table.h>
|
||||
#include <sphinxbase/ngram_model.h>
|
||||
|
||||
/* Local headers. */
|
||||
#include "s3types.h"
|
||||
@ -86,8 +87,21 @@ typedef struct {
|
||||
s3wid_t finishwid; /**< FOR INTERNAL-USE ONLY */
|
||||
s3wid_t silwid; /**< FOR INTERNAL-USE ONLY */
|
||||
int nocase;
|
||||
ngram_model_t *ngram_g2p_model;
|
||||
} dict_t;
|
||||
|
||||
// Result of a g2p unigram search (see dict_get_winner_wid in dict.c).
struct winner_t
{
    size_t length_match;   // length of the matched grapheme part
    int winner_wid;        // word id of the winning unigram in the g2p model
    size_t len_phoneme;    // length of the winning unigram's phoneme part
};

// A g2p unigram "grapheme}phoneme" split into its two halves by
// dict_split_unigram (dict.c).  Both strings are heap-allocated;
// release them with free_unigram_t().
typedef struct
{
    char *word;    // grapheme part (text before '}')
    char *phone;   // phoneme part (text after '}')
} unigram_t;
|
||||
|
||||
/**
|
||||
* Initialize a new dictionary.
|
||||
@ -101,7 +115,8 @@ typedef struct {
|
||||
* Return ptr to dict_t if successful, NULL otherwise.
|
||||
*/
|
||||
dict_t *dict_init(cmd_ln_t *config, /**< Configuration (-dict, -fdict, -dictcase) or NULL */
|
||||
bin_mdef_t *mdef /**< For looking up CI phone IDs (or NULL) */
|
||||
bin_mdef_t *mdef, /**< For looking up CI phone IDs (or NULL) */
|
||||
logmath_t *logmath // To load ngram_model for g2p load. logmath must be retained with logmath_retain() if it is to be used elsewhere.
|
||||
);
|
||||
|
||||
/**
|
||||
@ -203,6 +218,9 @@ int dict_free(dict_t *d);
|
||||
void dict_report(dict_t *d /**< A dictionary structure */
|
||||
);
|
||||
|
||||
// g2p functions
|
||||
int dict_add_g2p_word(dict_t * dict, char const *word);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -65,6 +65,7 @@
|
||||
#include "fsg_search_internal.h"
|
||||
#include "fsg_history.h"
|
||||
#include "fsg_lextree.h"
|
||||
#include "dict.h"
|
||||
|
||||
/* Turn this on for detailed debugging dump */
|
||||
#define __FSG_DBG__ 0
|
||||
@ -139,9 +140,21 @@ fsg_search_check_dict(fsg_search_t *fsgs, fsg_model_t *fsg)
|
||||
word = fsg_model_word_str(fsg, i);
|
||||
wid = dict_wordid(dict, word);
|
||||
if (wid == BAD_S3WID) {
|
||||
E_ERROR("The word '%s' is missing in the dictionary\n", word);
|
||||
return FALSE;
|
||||
}
|
||||
E_WARN("The word '%s' is missing in the dictionary. Trying to create new phoneme \n", word);
|
||||
if (!dict->ngram_g2p_model) {
|
||||
E_ERROR("NO dict->ngram_g2p_model. Aborting..");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
int new_wid = dict_add_g2p_word(dict, word);
|
||||
if (new_wid > 0){
|
||||
/* Now we also have to add it to dict2pid. */
|
||||
dict2pid_add_word(ps_search_dict2pid(fsgs), new_wid);
|
||||
} else {
|
||||
E_ERROR("Exiting... \n");
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
@ -69,7 +69,8 @@ typedef struct fsg_search_s {
|
||||
ps_search_t base;
|
||||
|
||||
hmm_context_t *hmmctx; /**< HMM context. */
|
||||
|
||||
char const *arpafile;
|
||||
cmd_ln_t *config;
|
||||
fsg_model_t *fsg; /**< FSG model */
|
||||
struct fsg_lextree_s *lextree;/**< Lextree structure for the currently
|
||||
active FSG */
|
||||
|
@ -278,7 +278,7 @@ ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
|
||||
|
||||
/* Dictionary and triphone mappings (depends on acmod). */
|
||||
/* FIXME: pass config, change arguments, implement LTS, etc. */
|
||||
if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL)
|
||||
if ((ps->dict = dict_init(ps->config, ps->acmod->mdef, ps->acmod->lmath)) == NULL)
|
||||
return -1;
|
||||
if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL)
|
||||
return -1;
|
||||
@ -720,7 +720,7 @@ ps_load_dict(ps_decoder_t *ps, char const *dictfile,
|
||||
cmd_ln_str_r(ps->config, "-fdict"));
|
||||
|
||||
/* Try to load it. */
|
||||
if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) {
|
||||
if ((dict = dict_init(newconfig, ps->acmod->mdef, ps->acmod->lmath)) == NULL) {
|
||||
cmd_ln_free_r(newconfig);
|
||||
return -1;
|
||||
}
|
||||
|
@ -404,10 +404,11 @@ ps_lattice_read(ps_decoder_t *ps,
|
||||
dag->search = ps->search;
|
||||
dag->dict = dict_retain(ps->dict);
|
||||
dag->lmath = logmath_retain(ps->lmath);
|
||||
dag->dict = dict_init(NULL, NULL, dag->lmath);
|
||||
dag->frate = cmd_ln_int32_r(dag->search->config, "-frate");
|
||||
}
|
||||
else {
|
||||
dag->dict = dict_init(NULL, NULL);
|
||||
dag->dict = dict_init(NULL, NULL, dag->lmath);
|
||||
dag->lmath = logmath_init(1.0001, 0, FALSE);
|
||||
dag->frate = 100;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user