src/content/browser/speech/google_streaming_remote_engine_unittest.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <queue>
   6
   7 #include "base/memory/scoped_ptr.h"
   8 #include "base/message_loop/message_loop.h"
   9 #include "base/numerics/safe_conversions.h"
  10 #include "base/strings/utf_string_conversions.h"
  11 #include "base/sys_byteorder.h"
  12 #include "content/browser/speech/audio_buffer.h"
  13 #include "content/browser/speech/google_streaming_remote_engine.h"
  14 #include "content/browser/speech/proto/google_streaming_api.pb.h"
  15 #include "content/public/common/speech_recognition_error.h"
  16 #include "content/public/common/speech_recognition_result.h"
  17 #include "net/url_request/test_url_fetcher_factory.h"
  18 #include "net/url_request/url_request_context_getter.h"
  19 #include "net/url_request/url_request_status.h"
  20 #include "testing/gtest/include/gtest/gtest.h"
  21
  22 using base::HostToNet32;
  23 using base::checked_cast;
  24 using net::URLRequestStatus;
  25 using net::TestURLFetcher;
  26 using net::TestURLFetcherFactory;
  27
  28 namespace content {
  29
  30 // Note: the terms upstream and downstream are from the point-of-view of the
  31 // client (engine_under_test_).
  32
  33 class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
  34                                         public testing::Test {
  35  public:
  36   GoogleStreamingRemoteEngineTest()
  37       : last_number_of_upstream_chunks_seen_(0U),
  38         error_(SPEECH_RECOGNITION_ERROR_NONE) { }
  39
  40   // Creates a speech recognition request and invokes its URL fetcher delegate
  41   // with the given test data.
  42   void CreateAndTestRequest(bool success, const std::string& http_response);
  43
  44   // SpeechRecognitionRequestDelegate methods.
  45   void OnSpeechRecognitionEngineResults(
  46       const SpeechRecognitionResults& results) override {
  47     results_.push(results);
  48   }
  49   void OnSpeechRecognitionEngineError(
  50       const SpeechRecognitionError& error) override {
  51     error_ = error.code;
  52   }
  53
  54   // testing::Test methods.
  55   void SetUp() override;
  56   void TearDown() override;
  57
  58  protected:
  59   enum DownstreamError {
  60     DOWNSTREAM_ERROR_NONE,
  61     DOWNSTREAM_ERROR_HTTP500,
  62     DOWNSTREAM_ERROR_NETWORK,
  63     DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
  64   };
  65   static bool ResultsAreEqual(const SpeechRecognitionResults& a,
  66                               const SpeechRecognitionResults& b);
  67   static std::string SerializeProtobufResponse(
  68       const proto::SpeechRecognitionEvent& msg);
  69
  70   TestURLFetcher* GetUpstreamFetcher();
  71   TestURLFetcher* GetDownstreamFetcher();
  72   void StartMockRecognition();
  73   void EndMockRecognition();
  74   void InjectDummyAudioChunk();
  75   size_t UpstreamChunksUploadedFromLastCall();
  76   void ProvideMockProtoResultDownstream(
  77       const proto::SpeechRecognitionEvent& result);
  78   void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
  79   void ExpectResultsReceived(const SpeechRecognitionResults& result);
  80   void CloseMockDownstream(DownstreamError error);
  81
  82   scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
  83   TestURLFetcherFactory url_fetcher_factory_;
  84   size_t last_number_of_upstream_chunks_seen_;
  85   base::MessageLoop message_loop_;
  86   std::string response_buffer_;
  87   SpeechRecognitionErrorCode error_;
  88   std::queue<SpeechRecognitionResults> results_;
  89 };
  90
  91 TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
  92   StartMockRecognition();
  93   ASSERT_TRUE(GetUpstreamFetcher());
  94   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
  95
  96   // Inject some dummy audio chunks and check a corresponding chunked upload
  97   // is performed every time on the server.
  98   for (int i = 0; i < 3; ++i) {
  99     InjectDummyAudioChunk();
 100     ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 101   }
 102
 103   // Ensure that a final (empty) audio chunk is uploaded on chunks end.
 104   engine_under_test_->AudioChunksEnded();
 105   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 106   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 107
 108   // Simulate a protobuf message streamed from the server containing a single
 109   // result with two hypotheses.
 110   SpeechRecognitionResults results;
 111   results.push_back(SpeechRecognitionResult());
 112   SpeechRecognitionResult& result = results.back();
 113   result.is_provisional = false;
 114   result.hypotheses.push_back(
 115       SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 1"), 0.1F));
 116   result.hypotheses.push_back(
 117       SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 2"), 0.2F));
 118
 119   ProvideMockResultDownstream(result);
 120   ExpectResultsReceived(results);
 121   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 122
 123   // Ensure everything is closed cleanly after the downstream is closed.
 124   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
 125   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 126   EndMockRecognition();
 127   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
 128   ASSERT_EQ(0U, results_.size());
 129 }
 130
 131 TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
 132   StartMockRecognition();
 133   ASSERT_TRUE(GetUpstreamFetcher());
 134   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
 135
 136   for (int i = 0; i < 4; ++i) {
 137     InjectDummyAudioChunk();
 138     ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 139
 140     SpeechRecognitionResults results;
 141     results.push_back(SpeechRecognitionResult());
 142     SpeechRecognitionResult& result = results.back();
 143     result.is_provisional = (i % 2 == 0);  // Alternate result types.
 144     float confidence = result.is_provisional ? 0.0F : (i * 0.1F);
 145     result.hypotheses.push_back(SpeechRecognitionHypothesis(
 146         base::UTF8ToUTF16("hypothesis"), confidence));
 147
 148     ProvideMockResultDownstream(result);
 149     ExpectResultsReceived(results);
 150     ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 151   }
 152
 153   // Ensure that a final (empty) audio chunk is uploaded on chunks end.
 154   engine_under_test_->AudioChunksEnded();
 155   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 156   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 157
 158   // Simulate a final definitive result.
 159   SpeechRecognitionResults results;
 160   results.push_back(SpeechRecognitionResult());
 161   SpeechRecognitionResult& result = results.back();
 162   result.is_provisional = false;
 163   result.hypotheses.push_back(
 164       SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 1.0F));
 165   ProvideMockResultDownstream(result);
 166   ExpectResultsReceived(results);
 167   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 168
 169   // Ensure everything is closed cleanly after the downstream is closed.
 170   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
 171   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 172   EndMockRecognition();
 173   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
 174   ASSERT_EQ(0U, results_.size());
 175 }
 176
 177 TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
 178   StartMockRecognition();
 179   ASSERT_TRUE(GetUpstreamFetcher());
 180   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
 181
 182   // Simulate one pushed audio chunk.
 183   InjectDummyAudioChunk();
 184   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 185
 186   // Simulate the corresponding definitive result.
 187   SpeechRecognitionResults results;
 188   results.push_back(SpeechRecognitionResult());
 189   SpeechRecognitionResult& result = results.back();
 190   result.hypotheses.push_back(
 191       SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis"), 1.0F));
 192   ProvideMockResultDownstream(result);
 193   ExpectResultsReceived(results);
 194   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 195
 196   // Simulate a silent downstream closure after |AudioChunksEnded|.
 197   engine_under_test_->AudioChunksEnded();
 198   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 199   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 200   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
 201
 202   // Expect an empty result, aimed at notifying recognition ended with no
 203   // actual results nor errors.
 204   SpeechRecognitionResults empty_results;
 205   ExpectResultsReceived(empty_results);
 206
 207   // Ensure everything is closed cleanly after the downstream is closed.
 208   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 209   EndMockRecognition();
 210   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
 211   ASSERT_EQ(0U, results_.size());
 212 }
 213
 214 TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
 215   StartMockRecognition();
 216   ASSERT_TRUE(GetUpstreamFetcher());
 217   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
 218
 219   for (int i = 0; i < 3; ++i)
 220     InjectDummyAudioChunk();
 221   engine_under_test_->AudioChunksEnded();
 222   ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
 223   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 224
 225   // Simulate only a provisional result.
 226   SpeechRecognitionResults results;
 227   results.push_back(SpeechRecognitionResult());
 228   SpeechRecognitionResult& result = results.back();
 229   result.is_provisional = true;
 230   result.hypotheses.push_back(
 231       SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 0.0F));
 232   ProvideMockResultDownstream(result);
 233   ExpectResultsReceived(results);
 234   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 235
 236   CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);
 237
 238   // Expect an empty result.
 239   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 240   EndMockRecognition();
 241   SpeechRecognitionResults empty_result;
 242   ExpectResultsReceived(empty_result);
 243 }
 244
 245 TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
 246   StartMockRecognition();
 247   ASSERT_TRUE(GetUpstreamFetcher());
 248   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
 249
 250   InjectDummyAudioChunk();
 251   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 252
 253   // Close the downstream with a HTTP 500 error.
 254   CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);
 255
 256   // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
 257   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 258   EndMockRecognition();
 259   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
 260   ASSERT_EQ(0U, results_.size());
 261 }
 262
 263 TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
 264   StartMockRecognition();
 265   ASSERT_TRUE(GetUpstreamFetcher());
 266   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
 267
 268   InjectDummyAudioChunk();
 269   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 270
 271   // Close the downstream fetcher simulating a network failure.
 272   CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);
 273
 274   // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
 275   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 276   EndMockRecognition();
 277   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
 278   ASSERT_EQ(0U, results_.size());
 279 }
 280
 281 TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
 282   StartMockRecognition();
 283   ASSERT_TRUE(GetUpstreamFetcher());
 284   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
 285
 286   // Upload a dummy audio chunk.
 287   InjectDummyAudioChunk();
 288   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
 289   engine_under_test_->AudioChunksEnded();
 290
 291   // Simulate a protobuf message with an intermediate result without confidence,
 292   // but with stability.
 293   proto::SpeechRecognitionEvent proto_event;
 294   proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
 295   proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
 296   proto_result->set_stability(0.5);
 297   proto::SpeechRecognitionAlternative *proto_alternative =
 298       proto_result->add_alternative();
 299   proto_alternative->set_transcript("foo");
 300   ProvideMockProtoResultDownstream(proto_event);
 301
 302   // Set up expectations.
 303   SpeechRecognitionResults results;
 304   results.push_back(SpeechRecognitionResult());
 305   SpeechRecognitionResult& result = results.back();
 306   result.is_provisional = true;
 307   result.hypotheses.push_back(
 308       SpeechRecognitionHypothesis(base::UTF8ToUTF16("foo"), 0.5));
 309
 310   // Check that the protobuf generated the expected result.
 311   ExpectResultsReceived(results);
 312
 313   // Since it was a provisional result, recognition is still pending.
 314   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 315
 316   // Shut down.
 317   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
 318   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 319   EndMockRecognition();
 320
 321   // Since there was no final result, we get an empty "no match" result.
 322   SpeechRecognitionResults empty_result;
 323   ExpectResultsReceived(empty_result);
 324   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
 325   ASSERT_EQ(0U, results_.size());
 326 }
 327
 328 void GoogleStreamingRemoteEngineTest::SetUp() {
 329   engine_under_test_.reset(
 330       new  GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
 331   engine_under_test_->set_delegate(this);
 332 }
 333
 334 void GoogleStreamingRemoteEngineTest::TearDown() {
 335   engine_under_test_.reset();
 336 }
 337
 338 TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
 339   return url_fetcher_factory_.GetFetcherByID(
 340         GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting);
 341 }
 342
 343 TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
 344   return url_fetcher_factory_.GetFetcherByID(
 345         GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting);
 346 }
 347
 348 // Starts recognition on the engine, ensuring that both stream fetchers are
 349 // created.
 350 void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
 351   DCHECK(engine_under_test_.get());
 352
 353   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 354
 355   engine_under_test_->StartRecognition();
 356   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
 357
 358   TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
 359   ASSERT_TRUE(upstream_fetcher);
 360   upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());
 361
 362   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
 363   ASSERT_TRUE(downstream_fetcher);
 364   downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
 365 }
 366
 367 void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
 368   DCHECK(engine_under_test_.get());
 369   engine_under_test_->EndRecognition();
 370   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
 371
 372   // TODO(primiano): In order to be very pedantic we should check that both the
 373   // upstream and downstream URL fetchers have been disposed at this time.
 374   // Unfortunately it seems that there is no direct way to detect (in tests)
 375   // if a url_fetcher has been freed or not, since they are not automatically
 376   // de-registered from the TestURLFetcherFactory on destruction.
 377 }
 378
 379 void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
 380   unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
 381   scoped_refptr<AudioChunk> dummy_audio_chunk(
 382       new AudioChunk(&dummy_audio_buffer_data[0],
 383                      sizeof(dummy_audio_buffer_data),
 384                      2 /* bytes per sample */));
 385   DCHECK(engine_under_test_.get());
 386   engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get());
 387 }
 388
 389 size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
 390   TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
 391   DCHECK(upstream_fetcher);
 392   const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
 393   DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
 394   const size_t new_chunks = number_of_chunks -
 395                             last_number_of_upstream_chunks_seen_;
 396   last_number_of_upstream_chunks_seen_ = number_of_chunks;
 397   return new_chunks;
 398 }
 399
 400 void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
 401     const proto::SpeechRecognitionEvent& result) {
 402   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
 403
 404   ASSERT_TRUE(downstream_fetcher);
 405   downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
 406   downstream_fetcher->set_response_code(200);
 407
 408   std::string response_string = SerializeProtobufResponse(result);
 409   response_buffer_.append(response_string);
 410   downstream_fetcher->SetResponseString(response_buffer_);
 411   downstream_fetcher->delegate()->OnURLFetchDownloadProgress(
 412       downstream_fetcher,
 413       response_buffer_.size(),
 414       -1 /* total response length not used */);
 415 }
 416
 417 void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
 418     const SpeechRecognitionResult& result) {
 419   proto::SpeechRecognitionEvent proto_event;
 420   proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
 421   proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
 422   proto_result->set_final(!result.is_provisional);
 423   for (size_t i = 0; i < result.hypotheses.size(); ++i) {
 424     proto::SpeechRecognitionAlternative* proto_alternative =
 425         proto_result->add_alternative();
 426     const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
 427     proto_alternative->set_confidence(hypothesis.confidence);
 428     proto_alternative->set_transcript(base::UTF16ToUTF8(hypothesis.utterance));
 429   }
 430   ProvideMockProtoResultDownstream(proto_event);
 431 }
 432
 433 void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
 434     DownstreamError error) {
 435   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
 436   ASSERT_TRUE(downstream_fetcher);
 437
 438   const URLRequestStatus::Status fetcher_status =
 439       (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED :
 440                                             URLRequestStatus::SUCCESS;
 441   downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0));
 442   downstream_fetcher->set_response_code(
 443       (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200);
 444
 445   if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
 446     // Send empty response.
 447     proto::SpeechRecognitionEvent response;
 448     response_buffer_.append(SerializeProtobufResponse(response));
 449   }
 450   downstream_fetcher->SetResponseString(response_buffer_);
 451   downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher);
 452 }
 453
 454 void GoogleStreamingRemoteEngineTest::ExpectResultsReceived(
 455     const SpeechRecognitionResults& results) {
 456   ASSERT_GE(1U, results_.size());
 457   ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
 458   results_.pop();
 459 }
 460
 461 bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
 462     const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) {
 463   if (a.size() != b.size())
 464     return false;
 465
 466   SpeechRecognitionResults::const_iterator it_a = a.begin();
 467   SpeechRecognitionResults::const_iterator it_b = b.begin();
 468   for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
 469     if (it_a->is_provisional != it_b->is_provisional ||
 470         it_a->hypotheses.size() != it_b->hypotheses.size()) {
 471       return false;
 472     }
 473     for (size_t i = 0; i < it_a->hypotheses.size(); ++i) {
 474       const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i];
 475       const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i];
 476       if (hyp_a.utterance != hyp_b.utterance ||
 477           hyp_a.confidence != hyp_b.confidence) {
 478         return false;
 479       }
 480     }
 481   }
 482
 483   return true;
 484 }
 485
 486 std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
 487     const proto::SpeechRecognitionEvent& msg) {
 488   std::string msg_string;
 489   msg.SerializeToString(&msg_string);
 490
 491   // Prepend 4 byte prefix length indication to the protobuf message as
 492   // envisaged by the google streaming recognition webservice protocol.
 493   uint32 prefix = HostToNet32(checked_cast<uint32>(msg_string.size()));
 494   msg_string.insert(0, reinterpret_cast<char*>(&prefix), sizeof(prefix));
 495
 496   return msg_string;
 497 }
 498
 499 }  // namespace content