// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module media.mojom;

import "media/mojo/mojom/audio_data.mojom";
import "mojo/public/mojom/base/time.mojom";
import "mojo/public/mojom/base/unguessable_token.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";

// Corresponds to the LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum ConfidenceLevel {
  [Default] kUnknown,
  kNotConfident,
  kConfident,
  kHighlyConfident,
};

// Corresponds to the LangIdEvent.AsrSwitchResult defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum AsrSwitchResult {
  [Default] kDefaultNoSwitch,
  kSwitchSucceeded,
  kSwitchFailed,
  kSwitchSkippedNoLp,
};

// The main interface a renderer client uses to interact with a speech
// recognition service process. In web Live Caption, every renderer can own
// one or more Remote<SpeechRecognitionContext>, with the receiver bound
// through the BrowserInterfaceBroker. This is a stable interface that is used
// across the LaCrOS/Ash boundary.
[Stable]
interface SpeechRecognitionContext {
  // Bind the recognizer to the speech recognition service. Returns a flag
  // indicating whether multichannel audio is supported by the speech
  // recognition service.
  BindRecognizer@0(pending_receiver<SpeechRecognitionRecognizer> receiver,
                   pending_remote<SpeechRecognitionRecognizerClient> client,
                   SpeechRecognitionOptions options)
      => (bool is_multichannel_supported);
};
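
// A minimal, illustrative renderer-side C++ sketch of binding a recognizer
// (variable names and surrounding setup are hypothetical; |context| is
// assumed to be a Remote<SpeechRecognitionContext> bound through the
// BrowserInterfaceBroker as described above):
//
//   mojo::Remote<media::mojom::SpeechRecognitionRecognizer> recognizer;
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client;  // Bound to a SpeechRecognitionRecognizerClient impl.
//   context->BindRecognizer(
//       recognizer.BindNewPipeAndPassReceiver(), std::move(client),
//       std::move(options),
//       base::BindOnce([](bool is_multichannel_supported) {}));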

// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives either in the renderer process (for
// web Live Caption) or the browser process (for CrOS features like system
// Live Caption and dictation) and the receiver lives in the speech
// recognition process.
[Stable]
interface SpeechRecognitionRecognizer {
  // Send audio to the speech recognition instance. The speech recognition
  // client will return the recognition events containing the transcribed
  // audio back to the originating media.
  SendAudioToSpeechRecognitionService@0(AudioDataS16 buffer);

  // Mark the audio stream done. This informs the speech recognition client to
  // stop speech recognition after it finishes processing the audio it has
  // already received. This will eventually trigger the
  // SpeechRecognitionRecognizerClient::OnSpeechRecognitionStopped callback.
  MarkDone@1();

  // Notify the speech recognition recognizer that the language changed. Takes
  // in the locale string (e.g. "en-US").
  OnLanguageChanged@2(string language);

  // Notify the speech recognition recognizer that the mask offensive words
  // setting has changed.
  [MinVersion=2] OnMaskOffensiveWordsChanged@3(bool mask_offensive_words);
};
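
// Illustrative call sequence for a caller feeding audio (a sketch;
// |recognizer| and |buffer| are assumed to be set up as above, with
// |buffer| a media::mojom::AudioDataS16Ptr):
//
//   recognizer->SendAudioToSpeechRecognitionService(std::move(buffer));
//   ...                      // Repeat for each captured buffer.
//   recognizer->MarkDone();  // End of stream; after remaining audio is
//                            // processed, the client's
//                            // OnSpeechRecognitionStopped will fire.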

// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the
// user. The remote lives in the speech recognition process and the receiver
// lives in the browser process.
[Stable]
interface SpeechRecognitionRecognizerClient {
  // Triggered by the speech recognition process on a speech recognition
  // event.
  //
  // Returns false if the client wants to halt speech recognition, e.g. in
  // response to user input or in the case of an error.
  OnSpeechRecognitionRecognitionEvent@0(SpeechRecognitionResult result)
      => (bool continue_recognition);

  // Called when speech recognition stops.
  OnSpeechRecognitionStopped@1();

  // Triggered by an error within the speech recognition service.
  OnSpeechRecognitionError@2();

  // Triggered by the speech recognition process on a language identification
  // event.
  OnLanguageIdentificationEvent@3(LanguageIdentificationEvent event);
};
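
// A sketch of a client-side implementation (class and helper are
// hypothetical; exact generated C++ parameter types depend on this
// interface's typemaps):
//
//   class CaptionClient
//       : public media::mojom::SpeechRecognitionRecognizerClient {
//     void OnSpeechRecognitionRecognitionEvent(
//         media::mojom::SpeechRecognitionResultPtr result,
//         OnSpeechRecognitionRecognitionEventCallback reply) override {
//       bool keep_going = DisplayToUser(*result);  // Hypothetical helper.
//       std::move(reply).Run(/*continue_recognition=*/keep_going);
//     }
//     ...
//   };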

// The hypothesis parts that provide timing information for each word in the
// recognized speech.
[Stable]
struct HypothesisParts {
  // A section of the final transcription text. Either an entire word or a
  // single character (depending on the language) with adjacent punctuation.
  // There will usually only be one value here. If formatting is enabled in
  // the speech recognition, then the raw text will be included as the second
  // element.
  array<string> text;

  // Time offset from this event's |audio_start_time| defined below. We
  // enforce the following invariant:
  // 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
  mojo_base.mojom.TimeDelta hypothesis_part_offset;
};

// The timing information for the transcript.
[Stable]
struct TimingInformation {
  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  mojo_base.mojom.TimeDelta audio_start_time;

  // Elapsed processed audio from the first frame after the preamble.
  mojo_base.mojom.TimeDelta audio_end_time;

  // The timing information for each word/letter in the transcription.
  // HypothesisPartsInResult was introduced in min version 1 in
  // chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
  // must be optional. The hypothesis parts may be a non-empty optional
  // containing a zero-length vector if no words were spoken during the
  // event's time span.
  array<HypothesisParts>? hypothesis_parts;
};
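
// Worked example of the invariant above (illustrative): the absolute audio
// time of a hypothesis part is its offset added to the event's start time,
// which the invariant guarantees lands in [audio_start_time, audio_end_time):
//
//   base::TimeDelta word_time =
//       timing->audio_start_time + part->hypothesis_part_offset;
//   DCHECK(word_time >= timing->audio_start_time);
//   DCHECK(word_time < timing->audio_end_time);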

// A speech recognition result created by the speech service and passed to
// the browser.
[Stable]
struct SpeechRecognitionResult {
  string transcription;

  // A flag indicating whether the result is final. If true, the result is
  // locked in and the next result returned will not overlap with the previous
  // final result.
  bool is_final;

  // Timing information for the current transcription. |timing_information| is
  // expected to be valid if:
  //   1. speech recognition is provided by |CrosSodaClient| and
  //   2. |is_final| is true.
  TimingInformation? timing_information;
};

// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
[Stable]
struct LanguageIdentificationEvent {
  // The locale of the language with the highest confidence.
  string language;

  // The confidence level of the language identification.
  ConfidenceLevel confidence_level;

  // If multilang is enabled, describes the actions Automatic Speech
  // Recognition took as a result of this event.
  [MinVersion=1] AsrSwitchResult? asr_switch_result;
};

// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives either in the renderer process (for web Live Caption) or
// the browser process (for CrOS system Live Caption).
[Stable]
interface SpeechRecognitionBrowserObserver {
  // Notify the speech recognition client when speech recognition availability
  // changes.
  SpeechRecognitionAvailabilityChanged@0(bool is_speech_recognition_available);

  // Notify the speech recognition client when the speech recognition language
  // changes.
  SpeechRecognitionLanguageChanged@1(string language);

  // Notify the speech recognition client when the mask offensive words pref
  // changes.
  [MinVersion=2] SpeechRecognitionMaskOffensiveWordsChanged@2(
      bool mask_offensive_words);
};

// The user-facing source of recognized speech; typically a tab. The remote
// lives in the Ash browser process and is used to trigger behavior in lacros
// (like focusing the tab). The receiver lives in the lacros browser process.
[Stable]
interface SpeechRecognitionSurface {
  // "Activate" the surface - i.e. bring it to the front and focus it.
  Activate@0();

  // Fetch the bounds of the surface in screen coordinates. A nullopt is
  // returned if no bounds could be fetched.
  GetBounds@1() => (gfx.mojom.Rect? bounds);
};

// The OS-side observer of a lacros-side speech surface. Used to close or
// re-render a live caption bubble based on user interaction with the
// lacros-side surface. The remote lives in the lacros browser process, and
// the receiver lives in the Ash browser process.
[Stable]
interface SpeechRecognitionSurfaceClient {
  // Called when the user navigates away or refreshes the current tab. This
  // comprises the end of a live caption "session", after which the caption
  // bubble can be shown even if it was explicitly dismissed by the user.
  OnSessionEnded@0();

  // Called when the user fullscreens or un-fullscreens the speech surface.
  OnFullscreenToggled@1();
};

// Static metadata about a remote speech surface. Used by the speech service
// client in the Ash browser process.
[Stable]
struct SpeechRecognitionSurfaceMetadata {
  // A unique identifier for the "session" (i.e. tab) of the surface. Is used
  // to hide the caption bubble for all streams in a tab if the bubble is
  // closed for any one of them.
  mojo_base.mojom.UnguessableToken session_id;
};

// The interface between the speech recognition client and the browser. The
// remote lives in the renderer process and the receiver lives in the browser
// process. Not necessary for browser-side features (e.g. CrOS system Live
// Caption), which can access browser functionality directly.
[Stable]
interface SpeechRecognitionClientBrowserInterface {
  // Bind the speech recognition availability observer.
  BindSpeechRecognitionBrowserObserver@0(
      pending_remote<SpeechRecognitionBrowserObserver> observer);

  // Requests that a remote speech recognition client be instantiated and
  // bound in the Ash browser process. The instantiated client should use the
  // surface and surface client bindings to perform tasks (such as refocusing)
  // that require coordination with the current lacros tab.
  [MinVersion=1] BindRecognizerToRemoteClient@1(
      pending_receiver<SpeechRecognitionRecognizerClient> client,
      pending_receiver<SpeechRecognitionSurfaceClient> surface_client,
      pending_remote<SpeechRecognitionSurface> surface,
      SpeechRecognitionSurfaceMetadata metadata);
};
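
// An illustrative sketch of the lacros-side wiring for the call above (all
// variable names hypothetical). One way to plumb this: mint a client pipe,
// hand its receiver end to Ash here, and pass its remote end to the speech
// service via SpeechRecognitionContext::BindRecognizer:
//
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client_remote;
//   browser_interface->BindRecognizerToRemoteClient(
//       client_remote.InitWithNewPipeAndPassReceiver(),
//       surface_client.BindNewPipeAndPassReceiver(),
//       std::move(surface_remote), std::move(metadata));
//   context->BindRecognizer(recognizer.BindNewPipeAndPassReceiver(),
//                           std::move(client_remote), std::move(options),
//                           std::move(callback));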

// Corresponds to ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
[Stable, Extensible]
enum SpeechRecognitionMode {
  [Default] kUnknown,
  // Intended for voice input for keyboard usage.
  kIme,
  // Intended to caption a stream of audio.
  kCaption,
};

// Which Chrome/ChromeOS application is triggering the speech recognition
// session to start.
[Stable, Extensible]
enum RecognizerClientType {
  [Default] kUnknown,
  // Dictation on ChromeOS.
  kDictation,
  // LiveCaption on Chrome/ChromeOS.
  kLiveCaption,
  // Projector on ChromeOS.
  kProjector,
  // CastModerator on ChromeOS.
  kCastModerator,
};

// Options for speech recognition.
[Stable]
struct SpeechRecognitionOptions {
  // What kind of recognition to use.
  // In the case of web fallback (not for launch, used for development only),
  // this option will be ignored.
  SpeechRecognitionMode recognition_mode;

  // Whether to enable formatting and punctuation in the recognition results.
  bool enable_formatting;

  // The BCP-47 localized language code to use (e.g. "en-US").
  // TODO(crbug.com/1161569): Language needs to be required when multiple
  // languages are supported by SODA, so that each SpeechRecognitionRecognizer
  // can use its own language. Right now language is only used by Projector
  // and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
  string? language;

  // Whether the recognition is happening on-device or remotely on a server.
  [MinVersion=1] bool is_server_based;

  // Which client is requesting the speech recognition session.
  [MinVersion=1] RecognizerClientType recognizer_client_type;

  // When true, if the incoming audio buffer is zero for an extended period
  // (e.g. 10 seconds), audio won't be fed to the captioning model until
  // nonzero audio is received.
  // When false, even empty audio is captioned indefinitely.
  // Set to false if accurate TimingInfo relative to the start of captioning
  // is needed.
  [MinVersion=4] bool skip_continuously_empty_audio = false;
};
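
// A sketch of constructing these options from C++ (values illustrative):
//
//   auto options = media::mojom::SpeechRecognitionOptions::New();
//   options->recognition_mode =
//       media::mojom::SpeechRecognitionMode::kCaption;
//   options->enable_formatting = true;
//   options->language = "en-US";
//   options->recognizer_client_type =
//       media::mojom::RecognizerClientType::kLiveCaption;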