2 * Copyright (c) 2012, Intel Corporation
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Intel Corporation nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #ifndef __SRS_DAEMON_RECOGNIZER_H__
31 #define __SRS_DAEMON_RECOGNIZER_H__
33 #include "src/daemon/client.h"
34 #include "src/daemon/audiobuf.h"
37 * speech recognition backend interface
40 /** Type for an utterance reported by a speech recognition backend (struct srs_srec_utterance_s). */
41 typedef struct srs_srec_utterance_s srs_srec_utterance_t;
43 /** Type for a backend recognition notification callback.
 *
 * The callback receives the utterance (utt) and an opaque notify_data
 * pointer. Its int return value may be SRS_SREC_FLUSH_ALL; the meaning of
 * other return values is not visible in this header - TODO confirm against
 * the callback implementation.
 */
44 typedef int (*srs_srec_notify_t)(srs_srec_utterance_t *utt, void *notify_data);
46 /** Notification callback (srs_srec_notify_t) return value for flushing the full audio buffer.
 *  NOTE(review): the negative replacement list is unparenthesized; consider (-1). */
47 #define SRS_SREC_FLUSH_ALL -1
51 * API to a speech recognition backend.
54 /** Activate speech recognition.
 *  user_data (here and in every member below) is an opaque pointer handed
 *  to the backend - presumably the api_data given to srs_register_srec();
 *  TODO confirm. */
55 int (*activate)(void *user_data);
56 /** Deactivate speech recognition. */
57 void (*deactivate)(void *user_data);
58 /** Flush part or whole of the audio buffer; start and end presumably
 *  delimit the part to flush - TODO confirm units and inclusiveness. */
59 int (*flush)(uint32_t start, uint32_t end, void *user_data);
60 /** Schedule a rescan of the given portion (start, end) of the audio buffer. */
61 int (*rescan)(uint32_t start, uint32_t end, void *user_data);
62 /** Get a copy of the audio samples in the buffer for the given start and end.
 *  NOTE(review): the name suggests the caller owns the returned
 *  srs_audiobuf_t - confirm who releases it. */
63 srs_audiobuf_t *(*sampledup)(uint32_t start, uint32_t end, void *user_data);
64 /** Check if the given decoder (language model) exists/is usable. */
65 int (*check_decoder)(const char *decoder, void *user_data);
66 /** Set the decoder (language model) to be used. */
67 int (*select_decoder)(const char *decoder, void *user_data);
68 /** Get the decoder (language model) currently in use. */
69 const char *(*active_decoder)(void *user_data);
73 * a single speech token
76 const char *token; /* recognized token */
77 double score; /* correctness probability of this token */
78 uint32_t start; /* start of the token in the audio buffer */
79 uint32_t end; /* end of the token in the audio buffer */
83 * a single candidate (essentially a set of speech tokens)
86 double score; /* overall candidate quality score */
87 size_t ntoken; /* number of tokens in candidate (presumably the length of tokens) */
88 srs_srec_token_t *tokens; /* actual tokens of this candidate */
89 } srs_srec_candidate_t;
92 * an utterance (candidates for a silence-terminated audio sequence)
94 struct srs_srec_utterance_s {
95 const char *id; /* backend ID for this utterance */
96 double score; /* overall quality score */
97 uint32_t length; /* length of the utterance in the audio buffer */
98 size_t ncand; /* number of candidates */
99 srs_srec_candidate_t **cands; /* actual candidates, as pointers (presumably ncand entries) */
102 /** Register a speech recognition backend.
 *
 * Takes the daemon context (srs), the backend name, the backend API table
 * (api) and an opaque api_data pointer. notify and notify_data are pointers
 * to a callback and to its data pointer - presumably out-parameters that
 * receive the notification callback the backend should invoke for
 * recognized utterances; TODO confirm against the implementation.
 * Returns an int status; the success/failure convention is not visible here.
 */
103 int srs_register_srec(srs_context_t *srs, const char *name,
104 srs_srec_api_t *api, void *api_data,
105 srs_srec_notify_t *notify, void **notify_data);
107 /** Unregister the speech recognition backend identified by name. */
108 void srs_unregister_srec(srs_context_t *srs, const char *name);
110 /** Macro to refer to the default recognizer backend (expands to NULL);
 *  presumably passed as the backend name argument - TODO confirm. */
111 #define SRS_DEFAULT_RECOGNIZER NULL
113 /** Activate speech recognition using the backend specified by name. */
114 int srs_activate_srec(srs_context_t *srs, const char *name);
116 /** Deactivate the speech recognition backend specified by name. */
117 void srs_deactivate_srec(srs_context_t *srs, const char *name);
119 /** Check if a decoder (model/dictionary combination) exists for the backend given by name. */
120 int srs_check_decoder(srs_context_t *srs, const char *name,
121 const char *decoder);
123 /** Select the given decoder for the backend given by name. */
124 int srs_set_decoder(srs_context_t *srs, const char *name, const char *decoder);
128 * speech recognition disambiguator interface
131 /** Type for disambiguated speech recognition results (struct srs_srec_result_s). */
132 typedef struct srs_srec_result_s srs_srec_result_t;
135 * disambiguation result
139 SRS_DISAMB_UNKNOWN = 0, /* unknown (zero value) */
140 SRS_DISAMB_MATCH, /* full match */
141 SRS_DISAMB_RESCAN, /* rescan (after dictionary switch) */
142 SRS_DISAMB_AMBIGUOUS, /* failed to (fully) disambiguate */
146 SRS_SREC_RESULT_UNKNOWN = 0, /* unknown result (zero value) */
147 SRS_SREC_RESULT_MATCH, /* full command match */
148 SRS_SREC_RESULT_DICT, /* dictionary switch required */
149 SRS_SREC_RESULT_AMBIGUOUS, /* further disambiguation needed */
150 SRS_SREC_RESULT_UNRECOGNIZED, /* utterance was not recognized */
151 } srs_srec_result_type_t;
154 mrp_list_hook_t hook; /* list hook linking to more commands */
155 srs_client_t *client; /* actual client */
156 int index; /* index of the command within the client */
157 double score; /* backend score */
158 int fuzz; /* disambiguation fuzz */
159 char **tokens; /* command tokens */
162 struct srs_srec_result_s {
163 srs_srec_result_type_t type; /* result type */
164 mrp_list_hook_t hook; /* to list of results */
165 srs_audiobuf_t *samplebuf; /* audio sample buffer */
166 uint32_t sampleoffs; /* extra audio sample offset */
167 char **tokens; /* matched tokens */
168 uint32_t *start; /* token start offsets */
169 uint32_t *end; /* token end offsets */
170 int ntoken; /* number of tokens */
171 char **dicts; /* dictionary stack */
172 int ndict; /* depth of the dictionary stack */
174 union { /* type specific data */
175 mrp_list_hook_t matches; /* full match(es) */
177 srs_dict_op_t op; /* dictionary operation: push/pop/switch */
178 char *dict; /* dictionary for switch/push */
179 int rescan; /* rescan starting at this token */
180 void *state; /* disambiguator continuation */
187 * API to a disambiguator implementation.
191 /** Register the commands of a client with the disambiguator.
 *  api_data is an opaque pointer - presumably the one given to
 *  srs_register_disambiguator(); TODO confirm. */
192 int (*add_client)(srs_client_t *client, void *api_data);
193 /** Unregister the commands of a client from the disambiguator. */
194 void (*del_client)(srs_client_t *client, void *api_data);
195 /** Disambiguate an utterance with candidates. */
196 int (*disambiguate)(srs_srec_utterance_t *utt, srs_srec_result_t **result,
201 /** Register a disambiguator implementation.
 *
 * Takes the daemon context (srs), the implementation name, its API table
 * (api) and an opaque api_data pointer. Returns an int status; the
 * success/failure convention is not visible in this header.
 */
202 int srs_register_disambiguator(srs_context_t *srs, const char *name,
203 srs_disamb_api_t *api, void *api_data);
205 /** Unregister the disambiguator implementation identified by name. */
206 void srs_unregister_disambiguator(srs_context_t *srs, const char *name);
208 /** Register the given client for speech recognition. */
209 int srs_srec_add_client(srs_context_t *srs, srs_client_t *client);
211 /** Unregister the given client from speech recognition. */
212 void srs_srec_del_client(srs_context_t *srs, srs_client_t *client);
215 /** Macro to refer to the default disambiguator (expands to NULL);
 *  presumably passed as the disambiguator name argument - TODO confirm. */
216 #define SRS_DEFAULT_DISAMBIGUATOR NULL
218 #endif /* __SRS_DAEMON_RECOGNIZER_H__ */