src/daemon/recognizer.h

   1 /*
   2  * Copyright (c) 2012, Intel Corporation
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are
   6  * met:
   7  *
   8  *   * Redistributions of source code must retain the above copyright notice,
   9  *     this list of conditions and the following disclaimer.
  10  *   * Redistributions in binary form must reproduce the above copyright
  11  *     notice, this list of conditions and the following disclaimer in the
  12  *     documentation and/or other materials provided with the distribution.
  13  *   * Neither the name of Intel Corporation nor the names of its contributors
  14  *     may be used to endorse or promote products derived from this software
  15  *     without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28  */
  29
  30 #ifndef __SRS_DAEMON_RECOGNIZER_H__
  31 #define __SRS_DAEMON_RECOGNIZER_H__
  32
  33 #include "src/daemon/client.h"
  34 #include "src/daemon/audiobuf.h"
  35
  36 /*
  37  * speech recognition backend interface
  38  */
  39
  40 /** Type for tokens recognized by a speech recognition backend. */
  41 typedef struct srs_srec_utterance_s srs_srec_utterance_t;
  42
  43 /** Type for a backend recognition notification callback. */
  44 typedef int (*srs_srec_notify_t)(srs_srec_utterance_t *utt, void *notify_data);
  45
  46 /** Notification callback return value for flushing the full audio buffer. */
  47 #define SRS_SREC_FLUSH_ALL -1
  48
  49
  50 /*
  51  * API to a speech recognition backend.
  52  */
  53 typedef struct {
  54     /** Activate speech recognition. */
  55     int (*activate)(void *user_data);
  56     /** Deactivate speech recognition. */
  57     void (*deactivate)(void *user_data);
  58     /** Flush part or whole of the audio buffer. */
  59     int (*flush)(uint32_t start, uint32_t end, void *user_data);
  60     /** Schedule a rescan of the given portion of the audio buffer. */
  61     int (*rescan)(uint32_t start, uint32_t end, void *user_data);
  62     /** Get a copy of the audio samples in the buffer. */
  63     srs_audiobuf_t *(*sampledup)(uint32_t start, uint32_t end, void *user_data);
  64     /** Check if the given language model exists/is usable. */
  65     int (*check_decoder)(const char *decoder, void *user_data);
  66     /** Set language model to be used. */
  67     int (*select_decoder)(const char *decoder, void *user_data);
  68     /** Get the used language model. */
  69     const char *(*active_decoder)(void *user_data);
  70 } srs_srec_api_t;
  71
  72 /*
  73  * a single speech token
  74  */
  75 typedef struct {
  76     const char *token;                     /* recognized tokens */
  77     double      score;                     /* correctness probability */
  78     uint32_t    start;                     /* start in audio buffer */
  79     uint32_t    end;                       /* end in audio buffer */
  80 } srs_srec_token_t;
  81
  82 /*
  83  * a single candidate (essentially a set of speech tokens)
  84  */
  85 typedef struct {
  86     double            score;             /* overall candidate quality score */
  87     size_t            ntoken;            /* number of tokens in candidate */
  88     srs_srec_token_t *tokens;            /* actual tokens of this candidate */
  89 } srs_srec_candidate_t;
  90
  91 /*
  92  * an utterance (candidates for a silence-terminated audio sequence)
  93  */
  94 struct srs_srec_utterance_s {
  95     const char            *id;           /* backend ID for this utterance */
  96     double                 score;        /* overall quality score */
  97     uint32_t               length;       /* length in the audio buffer */
  98     size_t                 ncand;        /* number of candidates */
  99     srs_srec_candidate_t **cands;        /* actual candidates */
 100 };
 101
 102 /** Register a speech recognition backend. */
 103 int srs_register_srec(srs_context_t *srs, const char *name,
 104                       srs_srec_api_t *api, void *api_data,
 105                       srs_srec_notify_t *notify, void **notify_data);
 106
 107 /** Unregister a speech recognition backend. */
 108 void srs_unregister_srec(srs_context_t *srs, const char *name);
 109
 110 /** Macro to refer to the default recognizer backend. */
 111 #define SRS_DEFAULT_RECOGNIZER NULL
 112
 113 /** Activate speech recognition using the specified backend. */
 114 int srs_activate_srec(srs_context_t *srs, const char *name);
 115
 116 /** Deactivate the specified speech recognition backend. */
 117 void srs_deactivate_srec(srs_context_t *srs, const char *name);
 118
 119 /** Check if a decoder (model/dictionary combination) exists for a backend. */
 120 int srs_check_decoder(srs_context_t *srs, const char *name,
 121                       const char *decoder);
 122
 123 /** Select a decoder for a backend. */
 124 int srs_set_decoder(srs_context_t *srs, const char *name, const char *decoder);
 125
 126
 127 /*
 128  * speech recognition disambiguator interface
 129  */
 130
 131 /** Type for disambiguated speech recognition results. */
 132 typedef struct srs_srec_result_s srs_srec_result_t;
 133
 134 /*
 135  * disambiguation result
 136  */
 137
 138 typedef enum {
 139     SRS_DISAMB_UNKNOWN = 0,
 140     SRS_DISAMB_MATCH,                    /* full match */
 141     SRS_DISAMB_RESCAN,                   /* rescan (after dictionary switch) */
 142     SRS_DISAMB_AMBIGUOUS,                /* failed to (fully) disambiguate */
 143 } srs_disamb_type_t;
 144
 145 typedef enum {
 146     SRS_SREC_RESULT_UNKNOWN = 0,         /* unknown result */
 147     SRS_SREC_RESULT_MATCH,               /* full command match */
 148     SRS_SREC_RESULT_DICT,                /* dictionary switch required */
 149     SRS_SREC_RESULT_AMBIGUOUS,           /* further disambiguation needed */
 150     SRS_SREC_RESULT_UNRECOGNIZED,        /* did not recognize */
 151 } srs_srec_result_type_t;
 152
 153 typedef struct {
 154     mrp_list_hook_t   hook;              /* to more commands */
 155     srs_client_t     *client;            /* actual client */
 156     int               index;             /* client command index */
 157     double            score;             /* backend score */
 158     int               fuzz;              /* disambiguation fuzz */
 159     char            **tokens;            /* command tokens */
 160 } srs_srec_match_t;
 161
 162 struct srs_srec_result_s {
 163     srs_srec_result_type_t   type;       /* result type */
 164     mrp_list_hook_t          hook;       /* to list of results */
 165     srs_audiobuf_t          *samplebuf;  /* audio sample buffer */
 166     uint32_t                 sampleoffs; /* extra audio sample offset */
 167     char                   **tokens;     /* matched tokens */
 168     uint32_t                *start;      /* token start offset */
 169     uint32_t                *end;        /* token end offsets */
 170     int                      ntoken;     /* number of tokens */
 171     char                   **dicts;      /* dictionary stack */
 172     int                      ndict;      /* stack depth */
 173
 174     union {                              /* type specific data */
 175         mrp_list_hook_t    matches;      /* full match(es) */
 176         struct {
 177             srs_dict_op_t  op;           /* push/pop/switch */
 178             char          *dict;         /* dictionary for switch/push */
 179             int            rescan;       /* rescan starting at this token */
 180             void          *state;        /* disambiguator continuation */
 181         } dict;
 182     } result;
 183 };
 184
 185
 186 /*
 187  * API to a disambiguator implementation.
 188  */
 189
 190 typedef struct {
 191     /** Register the commands of a client. */
 192     int (*add_client)(srs_client_t *client, void *api_data);
 193     /** Unregister the commands of a client. */
 194     void (*del_client)(srs_client_t *client, void *api_data);
 195     /** Disambiguate an utterance with candidates. */
 196     int (*disambiguate)(srs_srec_utterance_t *utt, srs_srec_result_t **result,
 197                         void *api_data);
 198 } srs_disamb_api_t;
 199
 200
 201 /** Register a disambiguator implementation. */
 202 int srs_register_disambiguator(srs_context_t *srs, const char *name,
 203                                srs_disamb_api_t *api, void *api_data);
 204
 205 /** Unregister a disambiguator implementation. */
 206 void srs_unregister_disambiguator(srs_context_t *srs, const char *name);
 207
 208 /** Register a client for speech recognition. */
 209 int srs_srec_add_client(srs_context_t *srs, srs_client_t *client);
 210
 211 /** Unregister a client from speech recognition. */
 212 void srs_srec_del_client(srs_context_t *srs, srs_client_t *client);
 213
 214
 215 /** Macro to refer to the default disambiguator. */
 216 #define SRS_DEFAULT_DISAMBIGUATOR NULL
 217
 218 #endif /* __SRS_DAEMON_RECOGNIZER_H__ */