1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/browser/speech/google_streaming_remote_engine.h"
10 #include "base/rand_util.h"
11 #include "base/strings/string_number_conversions.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "base/time/time.h"
15 #include "content/browser/speech/audio_buffer.h"
16 #include "content/browser/speech/proto/google_streaming_api.pb.h"
17 #include "content/public/common/content_switches.h"
18 #include "content/public/common/speech_recognition_error.h"
19 #include "content/public/common/speech_recognition_result.h"
20 #include "google_apis/google_api_keys.h"
21 #include "net/base/escape.h"
22 #include "net/base/load_flags.h"
23 #include "net/url_request/http_user_agent_settings.h"
24 #include "net/url_request/url_fetcher.h"
25 #include "net/url_request/url_request_context.h"
26 #include "net/url_request/url_request_context_getter.h"
27 #include "net/url_request/url_request_status.h"
29 using net::URLFetcher;
// Endpoints of the Google full-duplex speech web service: a shared base URL
// plus separate downstream (results) and upstream (audio) query paths, tied
// together by a "pair=" request key.
// NOTE(review): these constants presumably live in an anonymous namespace
// whose braces are elided from this listing — confirm against the original.
34 const char kWebServiceBaseUrl[] =
35 "https://www.google.com/speech-api/full-duplex/v1";
36 const char kDownstreamUrl[] = "/down?";
37 const char kUpstreamUrl[] = "/up?";
// Audio sent upstream is FLAC-encoded by default (see ConnectBothStreams).
38 const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
40 // This matches the maximum maxAlternatives value supported by the server.
41 const uint32 kMaxMaxAlternatives = 30;
43 // TODO(hans): Remove this and other logging when we don't need it anymore.
// Debug helper: parses |response| as a proto::SpeechRecognitionEvent and
// dumps its status, per-result final/stability flags and per-alternative
// confidence/transcript via DVLOG(1). No-op in release logging configs.
// NOTE(review): the embedded numbering has gaps (49-50, 56, 62, 67+) — an
// early return on parse failure, a has_final() guard, the alternative(j)
// accessor and the closing braces are elided from this listing; confirm
// against the original file.
44 void DumpResponse(const std::string& response) {
45 DVLOG(1) << "------------";
46 proto::SpeechRecognitionEvent event;
47 if (!event.ParseFromString(response)) {
48 DVLOG(1) << "Parse failed!";
51 if (event.has_status())
52 DVLOG(1) << "STATUS\t" << event.status();
53 for (int i = 0; i < event.result_size(); ++i) {
54 DVLOG(1) << "RESULT #" << i << ":";
55 const proto::SpeechRecognitionResult& res = event.result(i);
57 DVLOG(1) << " FINAL:\t" << res.final();
58 if (res.has_stability())
59 DVLOG(1) << " STABILITY:\t" << res.stability();
60 for (int j = 0; j < res.alternative_size(); ++j) {
61 const proto::SpeechRecognitionAlternative& alt =
63 if (alt.has_confidence())
64 DVLOG(1) << " CONFIDENCE:\t" << alt.confidence();
65 if (alt.has_transcript())
66 DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript();
// Out-of-line definitions for static class constants declared in the header.
73 const int GoogleStreamingRemoteEngine::kAudioPacketIntervalMs = 100;
// Distinct URLFetcher ids let tests tell the two fetchers apart.
74 const int GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting = 0;
75 const int GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting = 1;
// Status codes reported by the webservice in SpeechRecognitionEvent.status.
76 const int GoogleStreamingRemoteEngine::kWebserviceStatusNoError = 0;
77 const int GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch = 5;
// Constructor: |context| supplies the URLRequestContext shared by the
// upstream and downstream URL fetchers; bookkeeping state starts cleared.
// NOTE(review): the embedded numbering jumps 84 -> 87, so the final
// initializer(s) (presumably state_) and the constructor body are elided
// from this listing — confirm against the original file.
79 GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
80 net::URLRequestContextGetter* context)
81 : url_context_(context),
82 previous_response_length_(0),
83 got_last_definitive_result_(false),
84 is_dispatching_event_(false),
// Trivial destructor; the scoped fetcher/encoder members clean up themselves.
87 GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {}
// Stores the recognition configuration used by subsequent sessions.
// NOTE(review): the body (embedded lines 91-93) is elided from this listing;
// presumably it just assigns |config| to config_ — confirm.
89 void GoogleStreamingRemoteEngine::SetConfig(
90 const SpeechRecognitionEngineConfig& config) {
// Public API: feeds a START_RECOGNITION event into the FSM.
94 void GoogleStreamingRemoteEngine::StartRecognition() {
95 FSMEventArgs event_args(EVENT_START_RECOGNITION);
96 DispatchEvent(event_args);
// Public API: feeds an END_RECOGNITION event into the FSM.
99 void GoogleStreamingRemoteEngine::EndRecognition() {
100 FSMEventArgs event_args(EVENT_END_RECOGNITION);
101 DispatchEvent(event_args);
// Public API: wraps a captured audio chunk in an AUDIO_CHUNK event. The
// chunk is referenced (not copied) via event_args.audio_data; the FSM
// contract below guarantees event_args stays stable during dispatch.
104 void GoogleStreamingRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
105 FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
106 event_args.audio_data = &data;
107 DispatchEvent(event_args);
// Public API: signals end of audio capture via an AUDIO_CHUNKS_ENDED event.
110 void GoogleStreamingRemoteEngine::AudioChunksEnded() {
111 FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
112 DispatchEvent(event_args);
// URLFetcherDelegate: the fetch finished; forward as a complete response.
115 void GoogleStreamingRemoteEngine::OnURLFetchComplete(const URLFetcher* source) {
116 const bool kResponseComplete = true;
117 DispatchHTTPResponse(source, kResponseComplete);
// URLFetcherDelegate: partial download data arrived; forward as a partial
// response so downstream chunks can be processed incrementally.
120 void GoogleStreamingRemoteEngine::OnURLFetchDownloadProgress(
121 const URLFetcher* source, int64 current, int64 total) {
122 const bool kPartialResponse = false;
123 DispatchHTTPResponse(source, kPartialResponse);
// Common handler for both fetchers: classifies the HTTP response, strips
// already-processed bytes, raises error events on failure, and splits good
// downstream data into protocol chunks, dispatching one FSM event per chunk.
// NOTE(review): several `return;` statements are elided from this listing
// (embedded numbering gaps 155-156, 162-164, 167-168) — the error branches
// and the upstream-data branch presumably return early; confirm.
126 void GoogleStreamingRemoteEngine::DispatchHTTPResponse(const URLFetcher* source,
127 bool end_of_response) {
128 DCHECK(CalledOnValidThread());
130 const bool response_is_good = source->GetStatus().is_success() &&
131 source->GetResponseCode() == 200;
132 std::string response;
133 if (response_is_good)
134 source->GetResponseAsString(&response);
135 const size_t current_response_length = response.size();
137 DVLOG(1) << (source == downstream_fetcher_.get() ? "Downstream" : "Upstream")
138 << "HTTP, code: " << source->GetResponseCode()
139 << " length: " << current_response_length
140 << " eor: " << end_of_response;
142 // URLFetcher always provides the entire response buffer, but we are only
143 // interested in the fresh data introduced by the last chunk. Therefore, we
144 // drop the previous content we have already processed.
145 if (current_response_length != 0) {
146 DCHECK_GE(current_response_length, previous_response_length_);
147 response.erase(0, previous_response_length_);
148 previous_response_length_ = current_response_length;
151 if (!response_is_good && source == downstream_fetcher_.get()) {
152 DVLOG(1) << "Downstream error " << source->GetResponseCode();
153 FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
154 DispatchEvent(event_args);
157 if (!response_is_good && source == upstream_fetcher_.get()) {
158 DVLOG(1) << "Upstream error " << source->GetResponseCode()
159 << " EOR " << end_of_response;
160 FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
161 DispatchEvent(event_args);
165 // Ignore incoming data on the upstream connection.
166 if (source == upstream_fetcher_.get())
169 DCHECK(response_is_good && source == downstream_fetcher_.get());
171 // The downstream response is organized in chunks, whose size is determined
172 // by a 4 bytes prefix, transparently handled by the ChunkedByteBuffer class.
173 // Such chunks are sent by the speech recognition webservice over the HTTP
174 // downstream channel using HTTP chunked transfer (unrelated to our chunks).
175 // This function is called every time an HTTP chunk is received by the
176 // url fetcher. However there isn't any particular matching between our
177 // protocol chunks and HTTP chunks, in the sense that a single HTTP chunk can
178 // contain a portion of one chunk or even more chunks together.
179 chunked_byte_buffer_.Append(response);
181 // A single HTTP chunk can contain more than one data chunk, thus the while.
182 while (chunked_byte_buffer_.HasChunks()) {
183 FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
184 event_args.response = chunked_byte_buffer_.PopChunk();
185 DCHECK(event_args.response.get());
186 DumpResponse(std::string(event_args.response->begin(),
187 event_args.response->end()));
188 DispatchEvent(event_args);
190 if (end_of_response) {
191 FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
192 DispatchEvent(event_args);
// A recognition session is pending whenever the FSM has left STATE_IDLE.
196 bool GoogleStreamingRemoteEngine::IsRecognitionPending() const {
197 DCHECK(CalledOnValidThread());
198 return state_ != STATE_IDLE;
// Callers should feed audio in packets of this duration (100 ms).
201 int GoogleStreamingRemoteEngine::GetDesiredAudioChunkDurationMs() const {
202 return kAudioPacketIntervalMs;
205 // ----------------------- Core FSM implementation ---------------------------
// Single entry point of the FSM: runs one transition for |event_args| and
// stores the resulting state. The is_dispatching_event_ flag (checked by the
// DCHECK) enforces that dispatching is never re-entered.
207 void GoogleStreamingRemoteEngine::DispatchEvent(
208 const FSMEventArgs& event_args) {
209 DCHECK(CalledOnValidThread());
210 DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
211 DCHECK_LE(state_, STATE_MAX_VALUE);
213 // Event dispatching must be sequential, otherwise it will break all the rules
214 // and the assumptions of the finite state automata model.
215 DCHECK(!is_dispatching_event_);
216 is_dispatching_event_ = true;
218 state_ = ExecuteTransitionAndGetNextState(event_args);
220 is_dispatching_event_ = false;
// FSM transition table: maps (current state, event) to an action function,
// returning the next state. Unexpected pairs fall through to NotFeasible.
// NOTE(review): the outer `switch (state_)`, the STATE_IDLE label, the inner
// `switch (event)` headers and the closing braces/default are elided from
// this listing (embedded numbering gaps 227-229, 244-247, 262-263, 265,
// 279-281, 283) — confirm against the original file.
223 GoogleStreamingRemoteEngine::FSMState
224 GoogleStreamingRemoteEngine::ExecuteTransitionAndGetNextState(
225 const FSMEventArgs& event_args) {
226 const FSMEvent event = event_args.event;
230 case EVENT_START_RECOGNITION:
231 return ConnectBothStreams(event_args);
232 case EVENT_END_RECOGNITION:
233 // Note AUDIO_CHUNK and AUDIO_END events can remain enqueued in case of
234 // abort, so we just silently drop them here.
235 case EVENT_AUDIO_CHUNK:
236 case EVENT_AUDIO_CHUNKS_ENDED:
237 // DOWNSTREAM_CLOSED can be received if we end up here due to an error.
238 case EVENT_DOWNSTREAM_CLOSED:
239 return DoNothing(event_args);
240 case EVENT_UPSTREAM_ERROR:
241 case EVENT_DOWNSTREAM_ERROR:
242 case EVENT_DOWNSTREAM_RESPONSE:
243 return NotFeasible(event_args);
246 case STATE_BOTH_STREAMS_CONNECTED:
248 case EVENT_AUDIO_CHUNK:
249 return TransmitAudioUpstream(event_args);
250 case EVENT_DOWNSTREAM_RESPONSE:
251 return ProcessDownstreamResponse(event_args);
252 case EVENT_AUDIO_CHUNKS_ENDED:
253 return CloseUpstreamAndWaitForResults(event_args);
254 case EVENT_END_RECOGNITION:
255 return AbortSilently(event_args);
256 case EVENT_UPSTREAM_ERROR:
257 case EVENT_DOWNSTREAM_ERROR:
258 case EVENT_DOWNSTREAM_CLOSED:
259 return AbortWithError(event_args);
260 case EVENT_START_RECOGNITION:
261 return NotFeasible(event_args);
264 case STATE_WAITING_DOWNSTREAM_RESULTS:
266 case EVENT_DOWNSTREAM_RESPONSE:
267 return ProcessDownstreamResponse(event_args);
268 case EVENT_DOWNSTREAM_CLOSED:
269 return RaiseNoMatchErrorIfGotNoResults(event_args);
270 case EVENT_END_RECOGNITION:
271 return AbortSilently(event_args);
272 case EVENT_UPSTREAM_ERROR:
273 case EVENT_DOWNSTREAM_ERROR:
274 return AbortWithError(event_args);
275 case EVENT_START_RECOGNITION:
276 case EVENT_AUDIO_CHUNK:
277 case EVENT_AUDIO_CHUNKS_ENDED:
278 return NotFeasible(event_args);
282 return NotFeasible(event_args);
285 // ----------- Contract for all the FSM evolution functions below -------------
286 // - Are guaranteed to be executed in the same thread (IO, except for tests);
287 // - Are guaranteed to be not reentrant (themselves and each other);
288 // - event_args members are guaranteed to be stable during the call;
// Starts a session: creates the audio encoder, then opens the downstream
// (GET, results) and upstream (chunked POST, audio) fetchers against the
// full-duplex webservice, paired via a shared randomly-generated key.
// NOTE(review): embedded line 313 (presumably `this));` closing the
// downstream URLFetcher::Create call) and some closing braces (336, 341)
// are elided from this listing — confirm against the original file.
// NOTE(review): line 332 mixes uint32 kMaxMaxAlternatives with
// config_.max_hypotheses in std::min and narrows to int — verify the
// operand types agree in the original.
290 GoogleStreamingRemoteEngine::FSMState
291 GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
292 DCHECK(!upstream_fetcher_.get());
293 DCHECK(!downstream_fetcher_.get());
295 encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
296 config_.audio_sample_rate,
297 config_.audio_num_bits_per_sample));
298 DCHECK(encoder_.get());
299 const std::string request_key = GenerateRequestKey();
301 // Setup downstream fetcher.
302 std::vector<std::string> downstream_args;
303 downstream_args.push_back(
304 "key=" + net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
305 downstream_args.push_back("pair=" + request_key);
306 downstream_args.push_back("output=pb");
307 GURL downstream_url(std::string(kWebServiceBaseUrl) +
308 std::string(kDownstreamUrl) +
309 JoinString(downstream_args, '&'));
311 downstream_fetcher_.reset(URLFetcher::Create(
312 kDownstreamUrlFetcherIdForTesting, downstream_url, URLFetcher::GET,
314 downstream_fetcher_->SetRequestContext(url_context_.get());
315 downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
316 net::LOAD_DO_NOT_SEND_COOKIES |
317 net::LOAD_DO_NOT_SEND_AUTH_DATA);
318 downstream_fetcher_->Start();
320 // Setup upstream fetcher.
321 // TODO(hans): Support for user-selected grammars.
322 std::vector<std::string> upstream_args;
323 upstream_args.push_back("key=" +
324 net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
325 upstream_args.push_back("pair=" + request_key);
326 upstream_args.push_back("output=pb");
327 upstream_args.push_back(
328 "lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true));
329 upstream_args.push_back(
330 config_.filter_profanities ? "pFilter=2" : "pFilter=0");
331 if (config_.max_hypotheses > 0U) {
332 int max_alternatives = std::min(kMaxMaxAlternatives,
333 config_.max_hypotheses);
334 upstream_args.push_back("maxAlternatives=" +
335 base::UintToString(max_alternatives));
337 upstream_args.push_back("client=chromium");
338 if (!config_.hardware_info.empty()) {
339 upstream_args.push_back(
340 "xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true));
342 if (config_.continuous)
343 upstream_args.push_back("continuous");
344 if (config_.interim_results)
345 upstream_args.push_back("interim");
347 GURL upstream_url(std::string(kWebServiceBaseUrl) +
348 std::string(kUpstreamUrl) +
349 JoinString(upstream_args, '&'));
351 upstream_fetcher_.reset(URLFetcher::Create(
352 kUpstreamUrlFetcherIdForTesting, upstream_url, URLFetcher::POST, this));
353 upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
354 upstream_fetcher_->SetRequestContext(url_context_.get());
355 upstream_fetcher_->SetReferrer(config_.origin_url);
356 upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
357 net::LOAD_DO_NOT_SEND_COOKIES |
358 net::LOAD_DO_NOT_SEND_AUTH_DATA);
359 upstream_fetcher_->Start();
360 previous_response_length_ = 0;
361 return STATE_BOTH_STREAMS_CONNECTED;
// Encodes one captured audio chunk and appends it to the chunked upstream
// upload (non-final chunk).
// NOTE(review): the trailing `return STATE_BOTH_STREAMS_CONNECTED;` and
// closing brace (embedded lines 375-376) are elided from this listing.
364 GoogleStreamingRemoteEngine::FSMState
365 GoogleStreamingRemoteEngine::TransmitAudioUpstream(
366 const FSMEventArgs& event_args) {
367 DCHECK(upstream_fetcher_.get());
368 DCHECK(event_args.audio_data.get());
369 const AudioChunk& audio = *(event_args.audio_data.get());
371 DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
372 encoder_->Encode(audio);
373 scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
374 upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
// Parses one downstream protocol chunk as a SpeechRecognitionEvent proto.
// Maps webservice status codes to SpeechRecognitionErrorCode aborts, then
// converts proto results/alternatives into SpeechRecognitionResults and
// forwards them to the delegate.
// NOTE(review): several lines are elided from this listing (embedded gaps
// 393-395, 399, 419-421, 428, 444, 446-448, 450-453) — presumably the
// early `return state_;` for empty events, `break;` after STATUS_SUCCESS,
// the switch default/closing braces, and the final `return state_;`;
// confirm against the original file.
378 GoogleStreamingRemoteEngine::FSMState
379 GoogleStreamingRemoteEngine::ProcessDownstreamResponse(
380 const FSMEventArgs& event_args) {
381 DCHECK(event_args.response.get());
383 proto::SpeechRecognitionEvent ws_event;
384 if (!ws_event.ParseFromString(std::string(event_args.response->begin(),
385 event_args.response->end())))
386 return AbortWithError(event_args);
388 // An empty (default) event is used to notify us that the upstream has
389 // been connected. Ignore.
390 if (!ws_event.result_size() && (!ws_event.has_status() ||
391 ws_event.status() == proto::SpeechRecognitionEvent::STATUS_SUCCESS)) {
392 DVLOG(1) << "Received empty response";
396 if (ws_event.has_status()) {
397 switch (ws_event.status()) {
398 case proto::SpeechRecognitionEvent::STATUS_SUCCESS:
400 case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH:
401 return Abort(SPEECH_RECOGNITION_ERROR_NO_SPEECH);
402 case proto::SpeechRecognitionEvent::STATUS_ABORTED:
403 return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
404 case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE:
405 return Abort(SPEECH_RECOGNITION_ERROR_AUDIO);
406 case proto::SpeechRecognitionEvent::STATUS_NETWORK:
407 return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
408 case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED:
409 // TODO(hans): We need a better error code for this.
410 return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
411 case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED:
412 // TODO(hans): We need a better error code for this.
413 return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
414 case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR:
415 return Abort(SPEECH_RECOGNITION_ERROR_BAD_GRAMMAR);
416 case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED:
417 // TODO(hans): We need a better error code for this.
418 return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
422 SpeechRecognitionResults results;
423 for (int i = 0; i < ws_event.result_size(); ++i) {
424 const proto::SpeechRecognitionResult& ws_result = ws_event.result(i);
425 results.push_back(SpeechRecognitionResult());
426 SpeechRecognitionResult& result = results.back();
// A result lacking final (or with final == false) is provisional (interim).
427 result.is_provisional = !(ws_result.has_final() && ws_result.final());
429 if (!result.is_provisional)
430 got_last_definitive_result_ = true;
432 for (int j = 0; j < ws_result.alternative_size(); ++j) {
433 const proto::SpeechRecognitionAlternative& ws_alternative =
434 ws_result.alternative(j);
435 SpeechRecognitionHypothesis hypothesis;
// Per-alternative confidence wins; otherwise fall back to result stability.
436 if (ws_alternative.has_confidence())
437 hypothesis.confidence = ws_alternative.confidence();
438 else if (ws_result.has_stability())
439 hypothesis.confidence = ws_result.stability();
440 DCHECK(ws_alternative.has_transcript());
441 // TODO(hans): Perhaps the transcript should be required in the proto?
442 if (ws_alternative.has_transcript())
443 hypothesis.utterance = base::UTF8ToUTF16(ws_alternative.transcript());
445 result.hypotheses.push_back(hypothesis);
449 delegate()->OnSpeechRecognitionEngineResults(results);
// Called when the downstream closes: if no definitive result was ever
// received, notify the delegate with an empty result set, then abort the
// session without raising an error.
454 GoogleStreamingRemoteEngine::FSMState
455 GoogleStreamingRemoteEngine::RaiseNoMatchErrorIfGotNoResults(
456 const FSMEventArgs& event_args) {
457 if (!got_last_definitive_result_) {
458 // Provide an empty result to notify that recognition is ended with no
459 // errors, yet neither any further results.
460 delegate()->OnSpeechRecognitionEngineResults(SpeechRecognitionResults());
462 return AbortSilently(event_args);
// Audio capture ended: flush the encoder with one packet of silence (the
// samples vector is zero-initialized), send it as the final upstream chunk,
// and move to the state where only downstream results are awaited.
465 GoogleStreamingRemoteEngine::FSMState
466 GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
467 const FSMEventArgs&) {
468 DCHECK(upstream_fetcher_.get());
469 DCHECK(encoder_.get());
471 DVLOG(1) << "Closing upstream.";
473 // The encoder requires a non-empty final buffer. So we encode a packet
474 // of silence in case encoder had no data already.
475 std::vector<short> samples(
476 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
477 scoped_refptr<AudioChunk> dummy_chunk =
478 new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
479 samples.size() * sizeof(short),
480 encoder_->bits_per_sample() / 8);
481 encoder_->Encode(*dummy_chunk.get());
// NOTE(review): embedded line 482 is elided here — presumably
// encoder_->Flush(); confirm against the original file.
483 scoped_refptr<AudioChunk> encoded_dummy_data =
484 encoder_->GetEncodedDataAndClear();
485 DCHECK(!encoded_dummy_data->IsEmpty());
// `true` marks this as the last chunk of the chunked upload.
488 upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
489 got_last_definitive_result_ = false;
490 return STATE_WAITING_DOWNSTREAM_RESULTS;
// Tears down the downstream fetcher once the upstream is already gone.
// NOTE(review): the trailing return (presumably STATE_IDLE, embedded lines
// 500-501) is elided from this listing.
493 GoogleStreamingRemoteEngine::FSMState
494 GoogleStreamingRemoteEngine::CloseDownstream(const FSMEventArgs&) {
495 DCHECK(!upstream_fetcher_.get());
496 DCHECK(downstream_fetcher_.get());
498 DVLOG(1) << "Closing downstream.";
499 downstream_fetcher_.reset();
// Aborts the session without reporting any error to the delegate.
503 GoogleStreamingRemoteEngine::FSMState
504 GoogleStreamingRemoteEngine::AbortSilently(const FSMEventArgs&) {
505 return Abort(SPEECH_RECOGNITION_ERROR_NONE);
// Aborts the session reporting a generic network error to the delegate.
508 GoogleStreamingRemoteEngine::FSMState
509 GoogleStreamingRemoteEngine::AbortWithError(const FSMEventArgs&) {
510 return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
// Common abort path: optionally notifies the delegate of |error_code|, then
// destroys both fetchers (cancelling any in-flight requests).
// NOTE(review): the trailing return (presumably STATE_IDLE, embedded lines
// 523-525) is elided from this listing.
513 GoogleStreamingRemoteEngine::FSMState GoogleStreamingRemoteEngine::Abort(
514 SpeechRecognitionErrorCode error_code) {
515 DVLOG(1) << "Aborting with error " << error_code;
517 if (error_code != SPEECH_RECOGNITION_ERROR_NONE) {
518 delegate()->OnSpeechRecognitionEngineError(
519 SpeechRecognitionError(error_code));
521 downstream_fetcher_.reset();
522 upstream_fetcher_.reset();
// No-op transition: keeps the FSM in its current state.
// NOTE(review): the body (presumably `return state_;`, embedded lines
// 529-530) is elided from this listing.
527 GoogleStreamingRemoteEngine::FSMState
528 GoogleStreamingRemoteEngine::DoNothing(const FSMEventArgs&) {
// Transition for (state, event) pairs that should never occur; trips
// NOTREACHED in debug builds.
532 GoogleStreamingRemoteEngine::FSMState
533 GoogleStreamingRemoteEngine::NotFeasible(const FSMEventArgs& event_args) {
534 NOTREACHED() << "Unfeasible event " << event_args.event
535 << " in state " << state_;
// Returns the recognition language: the configured one, or — if empty — the
// first entry of the request context's Accept-Language list.
// NOTE(review): the trailing lines (presumably `return langs;` and closing
// braces, embedded 557-562) are elided from this listing.
539 std::string GoogleStreamingRemoteEngine::GetAcceptedLanguages() const {
540 std::string langs = config_.language;
541 if (langs.empty() && url_context_.get()) {
542 // If no language is provided then we use the first from the accepted
543 // language list. If this list is empty then it defaults to "en-US".
544 // Example of the contents of this list: "es,en-GB;q=0.8", ""
545 net::URLRequestContext* request_context =
546 url_context_->GetURLRequestContext();
547 DCHECK(request_context);
548 // TODO(pauljensen): GoogleStreamingRemoteEngine should be constructed with
549 // a reference to the HttpUserAgentSettings rather than accessing the
550 // accept language through the URLRequestContext.
551 if (request_context->http_user_agent_settings()) {
552 std::string accepted_language_list =
553 request_context->http_user_agent_settings()->GetAcceptLanguage();
// Cut at the first ',' (next language) or ';' (quality factor) to keep
// only the first language tag.
554 size_t separator = accepted_language_list.find_first_of(",;");
555 if (separator != std::string::npos)
556 langs = accepted_language_list.substr(0, separator);
564 // TODO(primiano): Is there any utility in the codebase that already does this?
// Builds the 16-hex-char "pair" key correlating the up/down streams: low 32
// bits from the current timestamp, high 32 bits from a random number, hex
// encoded.
// NOTE(review): assigning 0xFFFFFFFF00000000LL to a signed int64 relies on
// implementation-defined narrowing of an out-of-range value — worth
// confirming intent in the original.
565 std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
566 const int64 kKeepLowBytes = 0x00000000FFFFFFFFLL;
567 const int64 kKeepHighBytes = 0xFFFFFFFF00000000LL;
569 // Just keep the least significant bits of timestamp, in order to reduce
570 // probability of collisions.
571 int64 key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
572 (base::RandUint64() & kKeepHighBytes);
573 return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
// FSMEventArgs: value object carrying an event plus optional audio/response
// payloads into DispatchEvent. Trivial constructor and destructor.
// NOTE(review): the closing braces (embedded lines 578, 581) are elided
// from this listing.
576 GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
577 : event(event_value) {
580 GoogleStreamingRemoteEngine::FSMEventArgs::~FSMEventArgs() {
583 } // namespace content