From acb7bbecba87e0e5230fb8c634701dcc1b8c8017 Mon Sep 17 00:00:00 2001 From: Janos Kovacs Date: Thu, 6 Jun 2013 16:32:45 +0300 Subject: [PATCH] recognizer: next iteration of notification + plugin support for it --- src/daemon/recognizer.h | 8 +- src/plugins/fake-speech-engine/fake.c | 8 +- src/plugins/sphinx-speech-engine/decoder-set.c | 5 +- src/plugins/sphinx-speech-engine/options.c | 150 +++++++++--------- src/plugins/sphinx-speech-engine/options.h | 2 + src/plugins/sphinx-speech-engine/sphinx-plugin.c | 6 +- src/plugins/sphinx-speech-engine/sphinx-plugin.h | 5 +- src/plugins/sphinx-speech-engine/utterance.c | 191 ++++++++++++----------- src/plugins/sphinx-speech-engine/utterance.h | 7 +- 9 files changed, 201 insertions(+), 181 deletions(-) diff --git a/src/daemon/recognizer.h b/src/daemon/recognizer.h index b103506..14a7dbd 100644 --- a/src/daemon/recognizer.h +++ b/src/daemon/recognizer.h @@ -63,10 +63,10 @@ typedef struct { * a single speech token */ typedef struct { - char *token; /* recognized tokens */ - double score; /* correctness probability */ - uint32_t start; /* start in audio buffer */ - uint32_t end; /* end in audio buffer */ + const char *token; /* recognized tokens */ + double score; /* correctness probability */ + uint32_t start; /* start in audio buffer */ + uint32_t end; /* end in audio buffer */ } srs_srec_token_t; /* diff --git a/src/plugins/fake-speech-engine/fake.c b/src/plugins/fake-speech-engine/fake.c index ea1bc7a..5604e0c 100644 --- a/src/plugins/fake-speech-engine/fake.c +++ b/src/plugins/fake-speech-engine/fake.c @@ -95,7 +95,7 @@ static void push_token_cb(mrp_timer_t *t, void *user_data) fake_t *fake = (fake_t *)user_data; fake_candidate_t *fcnd = fake->cand + fake->candidx++; srs_srec_token_t tokens[fcnd->ntoken]; - srs_srec_candidate_t cand, *candptr; + srs_srec_candidate_t cand, *cands[2]; srs_srec_utterance_t utt; int i; @@ -138,13 +138,15 @@ static void push_token_cb(mrp_timer_t *t, void *user_data) cand.score = 1; cand.tokens 
= &tokens[0]; cand.ntoken = fcnd->ntoken; - candptr = &cand; + + cands[0] = &cand; + cands[1] = NULL; utt.id = "fake backend utterance"; utt.score = 1; utt.length = fcnd->ntoken * 2; utt.ncand = 1; - utt.cands = &candptr; + utt.cands = cands; fake->notify(&utt, fake->notify_data); } diff --git a/src/plugins/sphinx-speech-engine/decoder-set.c b/src/plugins/sphinx-speech-engine/decoder-set.c index 10c051a..4588593 100644 --- a/src/plugins/sphinx-speech-engine/decoder-set.c +++ b/src/plugins/sphinx-speech-engine/decoder-set.c @@ -143,7 +143,10 @@ int decoder_set_add(context_t *ctx, const char *decoder_name, cmd_ln_set_str_r(cfg, "-dict", dict); cmd_ln_set_int_r(cfg, "-topn", topn); cmd_ln_set_float_r(cfg, "-samprate", (double)opts->rate); - cmd_ln_set_str_r(cfg, "-logfn", opts->logfn); + cmd_ln_set_boolean_r(cfg, "-verbose", ctx->verbose); + + if (opts->logfn) + cmd_ln_set_str_r(cfg, "-logfn", opts->logfn); if (fsg) cmd_ln_set_str_r(cfg, "-fsg", opts->fsg); diff --git a/src/plugins/sphinx-speech-engine/options.c b/src/plugins/sphinx-speech-engine/options.c index 54f73f8..f37551a 100644 --- a/src/plugins/sphinx-speech-engine/options.c +++ b/src/plugins/sphinx-speech-engine/options.c @@ -25,12 +25,15 @@ int options_create(context_t *ctx, int ncfg, srs_cfg_t *cfgs) bool verbose; int i; int sts; + size_t pfxlen; if (!ctx) { errno = EINVAL; return -1; } + pfxlen = strlen(SPHINX_PREFIX); + if (!(opts = mrp_allocz(sizeof(options_t)))) return -1; @@ -50,84 +53,85 @@ int options_create(context_t *ctx, int ncfg, srs_cfg_t *cfgs) for (i = 0; i < ncfg; i++) { cfg = cfgs + i; - key = cfg->key; + key = cfg->key + pfxlen; value = cfg->value; - switch (key[0]) { - - case 'd': - if (!strcmp(key, "dict")) { - mrp_free((void *)opts->lm); - opts->dict = mrp_strdup(value); - cfg->used = TRUE; - } - break; - - case 'f': - if (!strcmp(key, "fsg")) { - mrp_free((void *)opts->fsg); - opts->fsg = mrp_strdup(value); - cfg->used = TRUE; - } - break; - - case 'h': - if (!strcmp(key, "hmm")) { - 
mrp_free((void *)opts->hmm); - opts->hmm = mrp_strdup(value); - cfg->used = TRUE; - } - break; - - case 'l': - if (!strcmp(key, "lm")) { - mrp_free((void *)opts->lm); - opts->lm = mrp_strdup(value); - cfg->used = TRUE; - } - break; - - case 'p': - if (!strcmp(key, "pulsesrc")) { - mrp_free((void *)opts->srcnam); - opts->srcnam = mrp_strdup(value); - cfg->used = TRUE; - } - break; - - case 'r': - if (!strcmp(key, "record")) { - mrp_free((void *)opts->audio); - opts->audio = mrp_strdup(value); - cfg->used = TRUE; - } - break; - - case 's': - if (!strcmp(key, "samplerate")) { - opts->rate = strtoul(value, &e, 10); - if (e[0] || e == value || - opts->rate < 8000 || opts->rate > 4800) - { - mrp_log_error("invalid value %s for samplerate", value); - sts = -1; + if (!strncmp(cfg->key, SPHINX_PREFIX, pfxlen)) { + + switch (key[0]) { + + case 'd': + if (!strcmp(key, "dict")) { + mrp_free((void *)opts->dict); + opts->dict = mrp_strdup(value); + } + break; + + case 'f': + if (!strcmp(key, "fsg")) { + mrp_free((void *)opts->fsg); + opts->fsg = mrp_strdup(value); + } + break; + + case 'h': + if (!strcmp(key, "hmm")) { + mrp_free((void *)opts->hmm); + opts->hmm = mrp_strdup(value); } - cfg->used = TRUE; - } - break; - - case 't': - if (!strcmp(key, "topn")) { - opts->topn = strtoul(value, &e, 10); - if (e[0] || e == value || opts->topn < 1 || opts->topn > 100) { - mrp_log_error("invalid value %s for topn", value); - sts = -1; + break; + + case 'l': + if (!strcmp(key, "lm")) { + mrp_free((void *)opts->lm); + opts->lm = mrp_strdup(value); } - cfg->used = TRUE; - } - break; + break; + + case 'p': + if (!strcmp(key, "pulsesrc")) { + mrp_free((void *)opts->srcnam); + opts->srcnam = mrp_strdup(value); + } + break; + + case 'r': + if (!strcmp(key, "record")) { + mrp_free((void *)opts->audio); + opts->audio = mrp_strdup(value); + } + break; + + case 's': + if (!strcmp(key, "samplerate")) { + opts->rate = strtoul(value, &e, 10); + if (e[0] || e == value || + opts->rate < 8000 || 
opts->rate > 48000) /* accept 8000..48000 Hz; the old bound 4800 rejected every value */ + { + mrp_log_error("invalid value %s for samplerate",value); + sts = -1; + } + } + break; + + case 't': + if (!strcmp(key, "topn")) { + opts->topn = strtoul(value, &e, 10); + if (e[0] || e == value || + opts->topn < 1 || opts->topn > 100) + { + mrp_log_error("invalid value %s for topn", value); + sts = -1; + } + } + break; + + default: + cfg->used = FALSE; + break; - } /* switch key */ + } /* switch key */ + } } /* for cfg */ if (sts == 0) { diff --git a/src/plugins/sphinx-speech-engine/options.h b/src/plugins/sphinx-speech-engine/options.h index f113810..8a76074 100644 --- a/src/plugins/sphinx-speech-engine/options.h +++ b/src/plugins/sphinx-speech-engine/options.h @@ -5,6 +5,8 @@ #include "sphinx-plugin.h" +#define SPHINX_PREFIX "sphinx." + struct options_s { const char *hmm; const char *lm; diff --git a/src/plugins/sphinx-speech-engine/sphinx-plugin.c b/src/plugins/sphinx-speech-engine/sphinx-plugin.c index c2fce3e..9345f82 100644 --- a/src/plugins/sphinx-speech-engine/sphinx-plugin.c +++ b/src/plugins/sphinx-speech-engine/sphinx-plugin.c @@ -33,9 +33,6 @@ #include #include -#include "src/daemon/plugin.h" -#include "src/daemon/recognizer.h" - #include "options.h" #include "decoder-set.h" #include "utterance.h" @@ -48,7 +45,6 @@ #define SPHINX_DESCRIPTION "A CMU Sphinx-based speech engine backend plugin." #define SPHINX_AUTHORS "Janos Kovacs " #define SPHINX_VERSION "0.0.1" -#define SPHINX_PREFIX "sphinx." struct plugin_s { srs_plugin_t *self; /* us, the backend plugin */ @@ -59,7 +55,7 @@ struct plugin_s { }; -int32_t plugin_utterance_handler(context_t *ctx, utterance_t *utt) +int32_t plugin_utterance_handler(context_t *ctx, srs_srec_utterance_t *utt) { int32_t length = utt->length ? 
utt->length : -1; diff --git a/src/plugins/sphinx-speech-engine/sphinx-plugin.h b/src/plugins/sphinx-speech-engine/sphinx-plugin.h index 031d9f2..df76dc6 100644 --- a/src/plugins/sphinx-speech-engine/sphinx-plugin.h +++ b/src/plugins/sphinx-speech-engine/sphinx-plugin.h @@ -4,6 +4,9 @@ #include #include +#include "src/daemon/plugin.h" +#include "src/daemon/recognizer.h" + typedef enum utterance_processor_e utterance_processor_t; typedef struct context_s context_t; @@ -36,7 +39,7 @@ struct context_s { }; -int32_t plugin_utterance_handler(context_t *ctx, utterance_t *utt); +int32_t plugin_utterance_handler(context_t *ctx, srs_srec_utterance_t *utt); #endif /* __SRS_POCKET_SPHINX_PLUGIN_H__ */ diff --git a/src/plugins/sphinx-speech-engine/utterance.c b/src/plugins/sphinx-speech-engine/utterance.c index 6f2a212..1e04c3b 100644 --- a/src/plugins/sphinx-speech-engine/utterance.c +++ b/src/plugins/sphinx-speech-engine/utterance.c @@ -19,17 +19,18 @@ static void process_utterance(context_t *); -static void acoustic_processor(context_t *, utterance_t *, - candidate_t *, candidate_t **); -static void fsg_processor(context_t *, utterance_t *, - candidate_t *, candidate_t **); -static void print_utterance(context_t *, utterance_t *); +static void acoustic_processor(context_t *, srs_srec_utterance_t *, + srs_srec_candidate_t *,srs_srec_candidate_t **); +static void fsg_processor(context_t *, srs_srec_utterance_t *, + srs_srec_candidate_t *, srs_srec_candidate_t **); +static void print_utterance(context_t *, srs_srec_utterance_t *); -static candidate_t *candidate_equal(candidate_t *, candidate_t *); -static double candidate_quality(candidate_t *); -static uint32_t candidate_sort(candidate_t *, candidate_t **); +static srs_srec_candidate_t *candidate_equal(srs_srec_candidate_t *, + srs_srec_candidate_t *); +static double candidate_score(srs_srec_candidate_t *); +static uint32_t candidate_sort(srs_srec_candidate_t *,srs_srec_candidate_t **); -static bool wdeq(const char *, const 
char *); +static bool tkneq(const char *, const char *); void utterance_start(context_t *ctx) @@ -65,13 +66,18 @@ static void process_utterance(context_t *ctx) { decoder_set_t *decset; decoder_t *dec; - utterance_t utt; - candidate_t cands[CANDIDATE_MAX + 1]; - candidate_t *sorted[CANDIDATE_MAX + 1]; + srs_srec_utterance_t utt; + srs_srec_token_t token_pool[CANDIDATE_MAX * (CANDIDATE_TOKEN_MAX + 1)]; + srs_srec_candidate_t cands[CANDIDATE_MAX + 1]; + srs_srec_candidate_t *sorted[CANDIDATE_MAX + 1]; int32_t purgelen; + int i; if (ctx && (decset = ctx->decset) && (dec = decset->curdec)) { + for (i = 0; i < CANDIDATE_MAX; i++) + cands[i].tokens = token_pool + (i * (CANDIDATE_TOKEN_MAX + 1)); + switch (dec->utproc) { case UTTERANCE_PROCESSOR_ACOUSTIC: @@ -96,9 +102,9 @@ static void process_utterance(context_t *ctx) } static void acoustic_processor(context_t *ctx, - utterance_t *utt, - candidate_t *cands, - candidate_t **sorted) + srs_srec_utterance_t *utt, + srs_srec_candidate_t *cands, + srs_srec_candidate_t **sorted) { decoder_set_t *decset; decoder_t *dec; @@ -115,8 +121,8 @@ static void acoustic_processor(context_t *ctx, ps_latnode_t *nod; int32 start, end; size_t ncand, nsort; - candidate_t *cand; - word_t *wd; + srs_srec_candidate_t *cand; + srs_srec_token_t *tkn; int32_t length, purgelen; if (!ctx || !(decset = ctx->decset) || !(dec = decset->curdec)) @@ -151,18 +157,18 @@ static void acoustic_processor(context_t *ctx, cand = cands + ncand; - cand->quality = logmath_exp(lmath, score) / prob; - cand->nword = 0; + cand->score = logmath_exp(lmath, score) / prob; + cand->ntoken = 0; length = 0; while ((seg = ps_seg_next(seg))) { if ((hyp = ps_seg_word(seg))) { if (!strcmp(hyp, "") || - cand->nword >= CANDIDATE_WORD_MAX) + cand->ntoken >= CANDIDATE_TOKEN_MAX) { ncand++; - memset(cand+1, 0, sizeof(candidate_t)); + cand[1].score = 0; cand[1].ntoken = 0; /* end marker; must not clear cand[1].tokens set up by process_utterance() */ ps_seg_frames(seg, &start, &end); ps_seg_free(seg); //printf("hyp= ncand=%d\n", ncand); @@ -174,39 +180,39 
@@ static void acoustic_processor(context_t *ctx, //printf("hyp= skip it\n"); } else { - wd = cand->words + cand->nword++; - wd->word = hyp; - ps_seg_frames(seg, &wd->start, &wd->end); - //printf("hyp=%s (%d, %d) wd count %d\n", - // wd->word, wd->start,wd->end, cand->nword); + tkn = cand->tokens + cand->ntoken++; + tkn->token = hyp; + ps_seg_frames(seg, &tkn->start, &tkn->end); + //printf("hyp=%s (%d, %d) tkn count %d\n", + // tkn->word, tkn->start,tkn->end, cand->ntoken); } } } /* while seg */ - if (!seg && cand->nword > 0) { + if (!seg && cand->ntoken > 0) { ncand++; - cand->quality *= 0.9; /* some penalty */ - memset(cand+1, 0, sizeof(candidate_t)); + cand->score *= 0.9; /* some penalty */ + cand[1].score = 0; cand[1].ntoken = 0; /* end marker; must not clear cand[1].tokens set up by process_utterance() */ } if (!length) { - wd = cand->words + (cand->nword - 1); - length = wd->end; + tkn = cand->tokens + (cand->ntoken - 1); + length = tkn->end; } } } /* for nb */ utt->id = uttid; - utt->quality = prob; + utt->score = prob; utt->length = length; utt->ncand = candidate_sort(cands, sorted); utt->cands = sorted; } static void fsg_processor(context_t *ctx, - utterance_t *utt, - candidate_t *cands, - candidate_t **sorted) + srs_srec_utterance_t *utt, + srs_srec_candidate_t *cands, + srs_srec_candidate_t **sorted) { decoder_set_t *decset; decoder_t *dec; @@ -215,12 +221,12 @@ static void fsg_processor(context_t *ctx, const char *uttid; int32_t score; double prob; - candidate_t *cand; - word_t *wd; + srs_srec_candidate_t *cand; + srs_srec_token_t *tkn; ps_lattice_t *dag; ps_latlink_t *lnk; ps_latnode_t *nod; - const char *word; + const char *token; int32_t start; int16 fef, lef; int32_t purgelen; @@ -233,10 +239,10 @@ static void fsg_processor(context_t *ctx, prob = logmath_exp(lmath, score); cand = cands; - cand->quality = 1.0; - cand->nword = 0; + cand->score = 1.0; + cand->ntoken = 0; - wd = NULL; + tkn = NULL; if ((dag = ps_get_lattice(dec->ps))) { @@ -244,11 +250,11 @@ static void fsg_processor(context_t *ctx, 
ps_latlink_nodes(lnk, &nod); - if (nod && (word = ps_latnode_word(dag, nod)) && *word != '<') { - wd = cand->words + cand->nword++; - wd->word = word; - wd->start = ps_latnode_times(nod, &fef, &lef); - wd->end = (fef + lef) / 2; + if (nod && (token = ps_latnode_word(dag, nod)) && *token != '<') { + tkn = cand->tokens + cand->ntoken++; + tkn->token = token; + tkn->start = ps_latnode_times(nod, &fef, &lef); + tkn->end = (fef + lef) / 2; } goto handle_destination_node; @@ -258,17 +264,18 @@ static void fsg_processor(context_t *ctx, handle_destination_node: nod = ps_latlink_nodes(lnk, NULL); - if (nod && (word = ps_latnode_word(dag, nod)) && *word != '<'){ + if (nod && (token = ps_latnode_word(dag,nod)) && *token != '<') + { start = ps_latnode_times(nod, &fef, &lef); - if (wd && start < wd->end) + if (tkn && start < tkn->end) break; /* just take one candidate */ - if (!wd || !wdeq(word, wd->word)) { - wd = cand->words + cand->nword++; - wd->word = word; - wd->start = start; - wd->end = fef; + if (!tkn || !tkneq(token, tkn->token)) { + tkn = cand->tokens + cand->ntoken++; + tkn->token = token; + tkn->start = start; + tkn->end = fef; } } } @@ -279,91 +286,93 @@ static void fsg_processor(context_t *ctx, sorted[1] = NULL; utt->id = uttid; - utt->quality = prob < 0.00001 ? 0.00001 : prob; + utt->score = prob < 0.00001 ? 0.00001 : prob; utt->length = dag ? 
ps_lattice_n_frames(dag) : 0; utt->ncand = 1; utt->cands = sorted; } -static void print_utterance(context_t *ctx, utterance_t *utt) +static void print_utterance(context_t *ctx, srs_srec_utterance_t *utt) { decoder_set_t *decset; decoder_t *dec; - candidate_t *cand; - word_t *wd; + srs_srec_candidate_t *cand; + srs_srec_token_t *tkn; size_t i,j; if (ctx && (decset = ctx->decset) && (dec = decset->curdec)) { mrp_log_info("*** %15s (%.4lf) %u candidates, length %u", - utt->id, utt->quality, utt->ncand, utt->length); + utt->id, utt->score, utt->ncand, utt->length); for (i = 0; cand = utt->cands[i]; i++) { - mrp_log_info(" (%.4lf) ----------------------", cand->quality); + mrp_log_info(" (%.4lf) ----------------------", cand->score); - for (j = 0; j < cand->nword; j++) { - wd = cand->words + j; - mrp_log_info(" %d - %d %s\n", - wd->start, wd->end, wd->word); + for (j = 0; j < cand->ntoken; j++) { + tkn = cand->tokens + j; + mrp_log_info(" %d - %d %s", + tkn->start, tkn->end, tkn->token); } } - mrp_log_info(" ----------------------\n"); + mrp_log_info(" ----------------------"); } } -static candidate_t *candidate_equal(candidate_t *a, candidate_t *b) +static srs_srec_candidate_t *candidate_equal(srs_srec_candidate_t *a, + srs_srec_candidate_t *b) { - word_t *aw,*bw; + srs_srec_token_t *at,*bt; size_t i,n; if (!a || !b) return NULL; - if ((n = a->nword) != b->nword) + if ((n = a->ntoken) != b->ntoken) return false; for (i = 0; i < n; i++) { - aw = a->words + i; - bw = b->words + i; + at = a->tokens + i; + bt = b->tokens + i; - if (!wdeq(aw->word, bw->word)) + if (!tkneq(at->token, bt->token)) return NULL; } - return (a->quality > b->quality) ? a : b; + return (a->score > b->score) ? a : b; } -static double candidate_quality(candidate_t *cand) +static double candidate_score(srs_srec_candidate_t *cand) { - return cand ? cand->quality : 0.0; + return cand ? 
cand->score : 0.0; } -static uint32_t candidate_sort(candidate_t *cands, candidate_t **sorted) +static uint32_t candidate_sort(srs_srec_candidate_t *cands, + srs_srec_candidate_t **sorted) { - candidate_t *c, **s; - candidate_t *better_quality; + srs_srec_candidate_t *c, **s; + srs_srec_candidate_t *better_score; size_t i,j,n; - memset(sorted, 0, sizeof(candidate_t *) * (CANDIDATE_MAX + 1)); + memset(sorted, 0, sizeof(srs_srec_candidate_t *) * (CANDIDATE_MAX + 1)); for (i = n = 0; i < CANDIDATE_MAX; i++) { - if (!(c = cands + i)->nword) + if (!(c = cands + i)->ntoken) break; for (j = 0; j <= n; j++) { s = sorted + j; - if ((better_quality = candidate_equal(c, *s))) { - *s = better_quality; + if ((better_score = candidate_equal(c, *s))) { + *s = better_score; break; } - if (candidate_quality(c) > candidate_quality(*s)) { + if (candidate_score(c) > candidate_score(*s)) { if (j < n) { memmove(sorted + j+1, sorted + j, - sizeof(candidate_t *) * (n - j)); + sizeof(srs_srec_candidate_t *) * (n - j)); } *s = c; n++; @@ -375,27 +384,27 @@ static uint32_t candidate_sort(candidate_t *cands, candidate_t **sorted) return n; } -static bool wdeq(const char *wd1, const char *wd2) +static bool tkneq(const char *tkn1, const char *tkn2) { const char *e1, *e2; int l1, l2, l; - if (!wd1 || !wd2) + if (!tkn1 || !tkn2) return false; - if (!strcmp(wd1, wd2)) + if (!strcmp(tkn1, tkn2)) return true; - if (*wd1 == *wd2) { - l1 = (e1 = strchr(wd1, '(')) ? e1 - wd1 : 0; - l2 = (e2 = strchr(wd2, '(')) ? e2 - wd2 : 0; + if (*tkn1 == *tkn2) { + l1 = (e1 = strchr(tkn1, '(')) ? e1 - tkn1 : 0; + l2 = (e2 = strchr(tkn2, '(')) ? 
e2 - tkn2 : 0; if (l1 || l2) { - if (l1 == l2 && !strncmp(wd1, wd2, l1)) + if (l1 == l2 && !strncmp(tkn1, tkn2, l1)) return true; - if (l1 && !l2 && !strncmp(wd1, wd2, l1)) + if (l1 && !l2 && !strncmp(tkn1, tkn2, l1)) return true; - if (!l1 && l2 && !strncmp(wd1, wd2, l2)) + if (!l1 && l2 && !strncmp(tkn1, tkn2, l2)) return true; } } diff --git a/src/plugins/sphinx-speech-engine/utterance.h b/src/plugins/sphinx-speech-engine/utterance.h index 4fca823..f2cfb38 100644 --- a/src/plugins/sphinx-speech-engine/utterance.h +++ b/src/plugins/sphinx-speech-engine/utterance.h @@ -3,10 +3,11 @@ #include "sphinx-plugin.h" -#define CANDIDATE_WORD_MAX 50 -#define CANDIDATE_MAX 1000 +#define CANDIDATE_TOKEN_MAX 50 +#define CANDIDATE_MAX 1000 +#if 0 struct word_s { const char *word; int32_t start; @@ -26,7 +27,7 @@ struct utternace_s { size_t ncand; candidate_t **cands; }; - +#endif void utterance_start(context_t *ctx); void utterance_end(context_t *ctx); -- 2.7.4