// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module media.mojom;

import "media/mojo/mojom/audio_data.mojom";
import "mojo/public/mojom/base/time.mojom";
import "mojo/public/mojom/base/unguessable_token.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";

// Corresponds to the LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum ConfidenceLevel {
  [Default] kUnknown,
  kNotConfident,
  kConfident,
  kHighlyConfident,
};

// Corresponds to the LangIdEvent.AsrSwitchResult defined in
// http://google3/speech/soda/public/soda_event.proto.
[Stable, Extensible]
enum AsrSwitchResult {
  [Default] kDefaultNoSwitch,
  kSwitchSucceeded,
  kSwitchFailed,
  kSwitchSkippedNoLp,
};

// The main interface a renderer client uses to interact with a speech
// recognition service process. In web Live Caption, every renderer can own
// one or more Remote<SpeechRecognitionContext>, with the receiver bound
// through the BrowserInterfaceBroker. This is a stable interface that is used
// across the LaCrOS/Ash boundary.
[Stable]
interface SpeechRecognitionContext {
  // Bind the recognizer to the speech recognition service. Returns a flag
  // indicating whether multichannel audio is supported by the speech
  // recognition service.
  BindRecognizer@0(pending_receiver<SpeechRecognitionRecognizer> receiver,
                   pending_remote<SpeechRecognitionRecognizerClient> client,
                   SpeechRecognitionOptions options)
      => (bool is_multichannel_supported);
};
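
// A minimal, illustrative renderer-side C++ sketch of binding a recognizer
// (variable names and surrounding setup are hypothetical; |context| is
// assumed to be a Remote<SpeechRecognitionContext> bound through the
// BrowserInterfaceBroker as described above):
//
//   mojo::Remote<media::mojom::SpeechRecognitionRecognizer> recognizer;
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client;  // Bound to a SpeechRecognitionRecognizerClient impl.
//   context->BindRecognizer(
//       recognizer.BindNewPipeAndPassReceiver(), std::move(client),
//       std::move(options),
//       base::BindOnce([](bool is_multichannel_supported) {}));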

// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives either in the renderer process (for
// web Live Caption) or the browser process (for CrOS features like system
// Live Caption and dictation) and the receiver lives in the speech
// recognition process.
[Stable]
interface SpeechRecognitionRecognizer {
  // Send audio to the speech recognition instance. The speech recognition
  // client will return the recognition events containing the transcribed
  // audio back to the originating media.
  SendAudioToSpeechRecognitionService@0(AudioDataS16 buffer);

  // Mark the audio stream done. This informs the speech recognition client to
  // stop speech recognition after it finishes processing the audio it has
  // already received. This will eventually trigger the
  // SpeechRecognitionRecognizerClient::OnSpeechRecognitionStopped callback.
  MarkDone@1();

  // Notify the speech recognition recognizer that the language changed. Takes
  // in the locale string (e.g. "en-US").
  OnLanguageChanged@2(string language);

  // Notify the speech recognition recognizer that the mask offensive words
  // setting has changed.
  [MinVersion=2] OnMaskOffensiveWordsChanged@3(bool mask_offensive_words);
};
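
// Illustrative call sequence for a caller feeding audio (a sketch;
// |recognizer| and |buffer| are assumed to be set up as above, with
// |buffer| a media::mojom::AudioDataS16Ptr):
//
//   recognizer->SendAudioToSpeechRecognitionService(std::move(buffer));
//   ...                      // Repeat for each captured buffer.
//   recognizer->MarkDone();  // End of stream; after remaining audio is
//                            // processed, the client's
//                            // OnSpeechRecognitionStopped will fire.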

// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the
// user. The remote lives in the speech recognition process and the receiver
// lives in the browser process.
[Stable]
interface SpeechRecognitionRecognizerClient {
  // Triggered by the speech recognition process on a speech recognition
  // event.
  //
  // Returns false if the client wants to halt speech recognition, e.g. in
  // response to user input or in the case of an error.
  OnSpeechRecognitionRecognitionEvent@0(SpeechRecognitionResult result)
      => (bool continue_recognition);

  // Called when speech recognition stops.
  OnSpeechRecognitionStopped@1();

  // Triggered by an error within the speech recognition service.
  OnSpeechRecognitionError@2();

  // Triggered by the speech recognition process on a language identification
  // event.
  OnLanguageIdentificationEvent@3(LanguageIdentificationEvent event);
};
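
// A sketch of a client-side implementation (class and helper are
// hypothetical; exact generated C++ parameter types depend on this
// interface's typemaps):
//
//   class CaptionClient
//       : public media::mojom::SpeechRecognitionRecognizerClient {
//     void OnSpeechRecognitionRecognitionEvent(
//         media::mojom::SpeechRecognitionResultPtr result,
//         OnSpeechRecognitionRecognitionEventCallback reply) override {
//       bool keep_going = DisplayToUser(*result);  // Hypothetical helper.
//       std::move(reply).Run(/*continue_recognition=*/keep_going);
//     }
//     ...
//   };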

// The hypothesis parts that provide timing information for each word in the
// recognized speech.
[Stable]
struct HypothesisParts {
  // A section of the final transcription text. Either an entire word or a
  // single character (depending on the language) with adjacent punctuation.
  // There will usually only be one value here. If formatting is enabled in
  // the speech recognition, then the raw text will be included as the second
  // element.
  array<string> text;

  // Time offset from this event's |audio_start_time| defined below. We
  // enforce the following invariant:
  // 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
  mojo_base.mojom.TimeDelta hypothesis_part_offset;
};

// The timing information for the transcript.
[Stable]
struct TimingInformation {
  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  mojo_base.mojom.TimeDelta audio_start_time;

  // Elapsed processed audio from the first frame after the preamble.
  mojo_base.mojom.TimeDelta audio_end_time;

  // The timing information for each word/letter in the transcription.
  // HypothesisPartsInResult was introduced in min version 1 in
  // chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
  // must be optional. The hypothesis parts may be a non-empty optional
  // containing a zero-length vector if no words were spoken during the
  // event's time span.
  array<HypothesisParts>? hypothesis_parts;
};
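
// Worked example of the invariant above (illustrative): the absolute audio
// time of a hypothesis part is its offset added to the event's start time,
// which the invariant guarantees lands in [audio_start_time, audio_end_time):
//
//   base::TimeDelta word_time =
//       timing->audio_start_time + part->hypothesis_part_offset;
//   DCHECK(word_time >= timing->audio_start_time);
//   DCHECK(word_time < timing->audio_end_time);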

// A speech recognition result created by the speech service and passed to
// the browser.
[Stable]
struct SpeechRecognitionResult {
  string transcription;

  // A flag indicating whether the result is final. If true, the result is
  // locked in and the next result returned will not overlap with the previous
  // final result.
  bool is_final;

  // Timing information for the current transcription. |timing_information| is
  // expected to be valid if:
  //   1. speech recognition is provided by |CrosSodaClient| and
  //   2. |is_final| is true.
  TimingInformation? timing_information;
};

// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
[Stable]
struct LanguageIdentificationEvent {
  // The locale of the language with the highest confidence.
  string language;

  // The confidence level of the language identification.
  ConfidenceLevel confidence_level;

  // If multilang is enabled, describes the actions Automatic Speech
  // Recognition took as a result of this event.
  [MinVersion=1] AsrSwitchResult? asr_switch_result;
};

// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives either in the renderer process (for web Live Caption) or
// the browser process (for CrOS system Live Caption).
[Stable]
interface SpeechRecognitionBrowserObserver {
  // Notify the speech recognition client when speech recognition availability
  // changes.
  SpeechRecognitionAvailabilityChanged@0(bool is_speech_recognition_available);

  // Notify the speech recognition client when the speech recognition language
  // changes.
  SpeechRecognitionLanguageChanged@1(string language);

  // Notify the speech recognition client when the mask offensive words pref
  // changes.
  [MinVersion=2] SpeechRecognitionMaskOffensiveWordsChanged@2(
      bool mask_offensive_words);
};

// The user-facing source of recognized speech; typically a tab. The remote
// lives in the Ash browser process and is used to trigger behavior in lacros
// (like focusing the tab). The receiver lives in the lacros browser process.
[Stable]
interface SpeechRecognitionSurface {
  // "Activate" the surface - i.e. bring it to the front and focus it.
  Activate@0();

  // Fetch the bounds of the surface in screen coordinates. A nullopt is
  // returned if no bounds could be fetched.
  GetBounds@1() => (gfx.mojom.Rect? bounds);
};

// The OS-side observer of a lacros-side speech surface. Used to close or
// re-render a live caption bubble based on user interaction with the
// lacros-side surface. The remote lives in the lacros browser process, and
// the receiver lives in the Ash browser process.
[Stable]
interface SpeechRecognitionSurfaceClient {
  // Called when the user navigates away or refreshes the current tab. This
  // comprises the end of a live caption "session", after which the caption
  // bubble can be shown even if it was explicitly dismissed by the user.
  OnSessionEnded@0();

  // Called when the user fullscreens or un-fullscreens the speech surface.
  OnFullscreenToggled@1();
};

// Static metadata about a remote speech surface. Used by the speech service
// client in the Ash browser process.
[Stable]
struct SpeechRecognitionSurfaceMetadata {
  // A unique identifier for the "session" (i.e. tab) of the surface. Is used
  // to hide the caption bubble for all streams in a tab if the bubble is
  // closed for any one of them.
  mojo_base.mojom.UnguessableToken session_id;
};

// The interface between the speech recognition client and the browser. The
// remote lives in the renderer process and the receiver lives in the browser
// process. Not necessary for browser-side features (e.g. CrOS system Live
// Caption), which can access browser functionality directly.
[Stable]
interface SpeechRecognitionClientBrowserInterface {
  // Bind the speech recognition availability observer.
  BindSpeechRecognitionBrowserObserver@0(
      pending_remote<SpeechRecognitionBrowserObserver> observer);

  // Requests that a remote speech recognition client be instantiated and
  // bound in the Ash browser process. The instantiated client should use the
  // surface and surface client bindings to perform tasks (such as refocusing)
  // that require coordination with the current lacros tab.
  [MinVersion=1] BindRecognizerToRemoteClient@1(
      pending_receiver<SpeechRecognitionRecognizerClient> client,
      pending_receiver<SpeechRecognitionSurfaceClient> surface_client,
      pending_remote<SpeechRecognitionSurface> surface,
      SpeechRecognitionSurfaceMetadata metadata);
};
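
// An illustrative sketch of the lacros-side wiring for the call above (all
// variable names hypothetical). One way to plumb this: mint a client pipe,
// hand its receiver end to Ash here, and pass its remote end to the speech
// service via SpeechRecognitionContext::BindRecognizer:
//
//   mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
//       client_remote;
//   browser_interface->BindRecognizerToRemoteClient(
//       client_remote.InitWithNewPipeAndPassReceiver(),
//       surface_client.BindNewPipeAndPassReceiver(),
//       std::move(surface_remote), std::move(metadata));
//   context->BindRecognizer(recognizer.BindNewPipeAndPassReceiver(),
//                           std::move(client_remote), std::move(options),
//                           std::move(callback));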

// Corresponds to ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
[Stable, Extensible]
enum SpeechRecognitionMode {
  [Default] kUnknown,
  // Intended for voice input for keyboard usage.
  kIme,
  // Intended to caption a stream of audio.
  kCaption,
};

// Which Chrome/ChromeOS application is triggering the speech recognition
// session to start.
[Stable, Extensible]
enum RecognizerClientType {
  [Default] kUnknown,
  // Dictation on ChromeOS.
  kDictation,
  // LiveCaption on Chrome/ChromeOS.
  kLiveCaption,
  // Projector on ChromeOS.
  kProjector,
  // CastModerator on ChromeOS.
  kCastModerator,
};

// Options for speech recognition.
[Stable]
struct SpeechRecognitionOptions {
  // What kind of recognition to use.
  // In the case of web fallback (not for launch, used for development only),
  // this option will be ignored.
  SpeechRecognitionMode recognition_mode;

  // Whether to enable formatting and punctuation in the recognition results.
  bool enable_formatting;

  // The BCP-47 localized language code to use (e.g. "en-US").
  // TODO(crbug.com/1161569): Language needs to be required when multiple
  // languages are supported by SODA, so that each SpeechRecognitionRecognizer
  // can use its own language. Right now language is only used by Projector
  // and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
  string? language;

  // Whether the recognition is happening on-device or remotely on a server.
  [MinVersion=1] bool is_server_based;

  // Which client is requesting the speech recognition session.
  [MinVersion=1] RecognizerClientType recognizer_client_type;

  // When true, if the incoming audio buffer is zero for an extended period
  // (e.g. 10 seconds), audio won't be fed to the captioning model until
  // nonzero audio is received.
  // When false, even empty audio is captioned indefinitely.
  // Set to false if accurate TimingInfo relative to the start of captioning
  // is needed.
  [MinVersion=4] bool skip_continuously_empty_audio = false;
};
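
// A sketch of constructing these options from C++ (values illustrative):
//
//   auto options = media::mojom::SpeechRecognitionOptions::New();
//   options->recognition_mode =
//       media::mojom::SpeechRecognitionMode::kCaption;
//   options->enable_formatting = true;
//   options->language = "en-US";
//   options->recognizer_client_type =
//       media::mojom::RecognizerClientType::kLiveCaption;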