Update To 11.40.268.0
[platform/framework/web/crosswalk.git] / src / content / browser / speech / google_streaming_remote_engine_unittest.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <queue>
6
7 #include "base/memory/scoped_ptr.h"
8 #include "base/message_loop/message_loop.h"
9 #include "base/numerics/safe_conversions.h"
10 #include "base/strings/utf_string_conversions.h"
11 #include "base/sys_byteorder.h"
12 #include "content/browser/speech/audio_buffer.h"
13 #include "content/browser/speech/google_streaming_remote_engine.h"
14 #include "content/browser/speech/proto/google_streaming_api.pb.h"
15 #include "content/public/common/speech_recognition_error.h"
16 #include "content/public/common/speech_recognition_result.h"
17 #include "net/url_request/test_url_fetcher_factory.h"
18 #include "net/url_request/url_request_context_getter.h"
19 #include "net/url_request/url_request_status.h"
20 #include "testing/gtest/include/gtest/gtest.h"
21
22 using base::HostToNet32;
23 using base::checked_cast;
24 using net::URLRequestStatus;
25 using net::TestURLFetcher;
26 using net::TestURLFetcherFactory;
27
28 namespace content {
29
30 // Note: the terms upstream and downstream are from the point-of-view of the
31 // client (engine_under_test_).
32
33 class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
34                                         public testing::Test {
35  public:
36   GoogleStreamingRemoteEngineTest()
37       : last_number_of_upstream_chunks_seen_(0U),
38         error_(SPEECH_RECOGNITION_ERROR_NONE) { }
39
40   // Creates a speech recognition request and invokes its URL fetcher delegate
41   // with the given test data.
42   void CreateAndTestRequest(bool success, const std::string& http_response);
43
44   // SpeechRecognitionRequestDelegate methods.
45   void OnSpeechRecognitionEngineResults(
46       const SpeechRecognitionResults& results) override {
47     results_.push(results);
48   }
49   void OnSpeechRecognitionEngineError(
50       const SpeechRecognitionError& error) override {
51     error_ = error.code;
52   }
53
54   // testing::Test methods.
55   void SetUp() override;
56   void TearDown() override;
57
58  protected:
59   enum DownstreamError {
60     DOWNSTREAM_ERROR_NONE,
61     DOWNSTREAM_ERROR_HTTP500,
62     DOWNSTREAM_ERROR_NETWORK,
63     DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
64   };
65   static bool ResultsAreEqual(const SpeechRecognitionResults& a,
66                               const SpeechRecognitionResults& b);
67   static std::string SerializeProtobufResponse(
68       const proto::SpeechRecognitionEvent& msg);
69
70   TestURLFetcher* GetUpstreamFetcher();
71   TestURLFetcher* GetDownstreamFetcher();
72   void StartMockRecognition();
73   void EndMockRecognition();
74   void InjectDummyAudioChunk();
75   size_t UpstreamChunksUploadedFromLastCall();
76   void ProvideMockProtoResultDownstream(
77       const proto::SpeechRecognitionEvent& result);
78   void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
79   void ExpectResultsReceived(const SpeechRecognitionResults& result);
80   void CloseMockDownstream(DownstreamError error);
81
82   scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
83   TestURLFetcherFactory url_fetcher_factory_;
84   size_t last_number_of_upstream_chunks_seen_;
85   base::MessageLoop message_loop_;
86   std::string response_buffer_;
87   SpeechRecognitionErrorCode error_;
88   std::queue<SpeechRecognitionResults> results_;
89 };
90
91 TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
92   StartMockRecognition();
93   ASSERT_TRUE(GetUpstreamFetcher());
94   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
95
96   // Inject some dummy audio chunks and check a corresponding chunked upload
97   // is performed every time on the server.
98   for (int i = 0; i < 3; ++i) {
99     InjectDummyAudioChunk();
100     ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
101   }
102
103   // Ensure that a final (empty) audio chunk is uploaded on chunks end.
104   engine_under_test_->AudioChunksEnded();
105   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
106   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
107
108   // Simulate a protobuf message streamed from the server containing a single
109   // result with two hypotheses.
110   SpeechRecognitionResults results;
111   results.push_back(SpeechRecognitionResult());
112   SpeechRecognitionResult& result = results.back();
113   result.is_provisional = false;
114   result.hypotheses.push_back(
115       SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 1"), 0.1F));
116   result.hypotheses.push_back(
117       SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 2"), 0.2F));
118
119   ProvideMockResultDownstream(result);
120   ExpectResultsReceived(results);
121   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
122
123   // Ensure everything is closed cleanly after the downstream is closed.
124   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
125   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
126   EndMockRecognition();
127   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
128   ASSERT_EQ(0U, results_.size());
129 }
130
131 TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
132   StartMockRecognition();
133   ASSERT_TRUE(GetUpstreamFetcher());
134   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
135
136   for (int i = 0; i < 4; ++i) {
137     InjectDummyAudioChunk();
138     ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
139
140     SpeechRecognitionResults results;
141     results.push_back(SpeechRecognitionResult());
142     SpeechRecognitionResult& result = results.back();
143     result.is_provisional = (i % 2 == 0);  // Alternate result types.
144     float confidence = result.is_provisional ? 0.0F : (i * 0.1F);
145     result.hypotheses.push_back(SpeechRecognitionHypothesis(
146         base::UTF8ToUTF16("hypothesis"), confidence));
147
148     ProvideMockResultDownstream(result);
149     ExpectResultsReceived(results);
150     ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
151   }
152
153   // Ensure that a final (empty) audio chunk is uploaded on chunks end.
154   engine_under_test_->AudioChunksEnded();
155   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
156   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
157
158   // Simulate a final definitive result.
159   SpeechRecognitionResults results;
160   results.push_back(SpeechRecognitionResult());
161   SpeechRecognitionResult& result = results.back();
162   result.is_provisional = false;
163   result.hypotheses.push_back(
164       SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 1.0F));
165   ProvideMockResultDownstream(result);
166   ExpectResultsReceived(results);
167   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
168
169   // Ensure everything is closed cleanly after the downstream is closed.
170   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
171   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
172   EndMockRecognition();
173   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
174   ASSERT_EQ(0U, results_.size());
175 }
176
177 TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
178   StartMockRecognition();
179   ASSERT_TRUE(GetUpstreamFetcher());
180   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
181
182   // Simulate one pushed audio chunk.
183   InjectDummyAudioChunk();
184   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
185
186   // Simulate the corresponding definitive result.
187   SpeechRecognitionResults results;
188   results.push_back(SpeechRecognitionResult());
189   SpeechRecognitionResult& result = results.back();
190   result.hypotheses.push_back(
191       SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis"), 1.0F));
192   ProvideMockResultDownstream(result);
193   ExpectResultsReceived(results);
194   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
195
196   // Simulate a silent downstream closure after |AudioChunksEnded|.
197   engine_under_test_->AudioChunksEnded();
198   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
199   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
200   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
201
202   // Expect an empty result, aimed at notifying recognition ended with no
203   // actual results nor errors.
204   SpeechRecognitionResults empty_results;
205   ExpectResultsReceived(empty_results);
206
207   // Ensure everything is closed cleanly after the downstream is closed.
208   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
209   EndMockRecognition();
210   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
211   ASSERT_EQ(0U, results_.size());
212 }
213
214 TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
215   StartMockRecognition();
216   ASSERT_TRUE(GetUpstreamFetcher());
217   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
218
219   for (int i = 0; i < 3; ++i)
220     InjectDummyAudioChunk();
221   engine_under_test_->AudioChunksEnded();
222   ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
223   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
224
225   // Simulate only a provisional result.
226   SpeechRecognitionResults results;
227   results.push_back(SpeechRecognitionResult());
228   SpeechRecognitionResult& result = results.back();
229   result.is_provisional = true;
230   result.hypotheses.push_back(
231       SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 0.0F));
232   ProvideMockResultDownstream(result);
233   ExpectResultsReceived(results);
234   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
235
236   CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);
237
238   // Expect an empty result.
239   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
240   EndMockRecognition();
241   SpeechRecognitionResults empty_result;
242   ExpectResultsReceived(empty_result);
243 }
244
245 TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
246   StartMockRecognition();
247   ASSERT_TRUE(GetUpstreamFetcher());
248   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
249
250   InjectDummyAudioChunk();
251   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
252
253   // Close the downstream with a HTTP 500 error.
254   CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);
255
256   // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
257   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
258   EndMockRecognition();
259   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
260   ASSERT_EQ(0U, results_.size());
261 }
262
263 TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
264   StartMockRecognition();
265   ASSERT_TRUE(GetUpstreamFetcher());
266   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
267
268   InjectDummyAudioChunk();
269   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
270
271   // Close the downstream fetcher simulating a network failure.
272   CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);
273
274   // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
275   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
276   EndMockRecognition();
277   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
278   ASSERT_EQ(0U, results_.size());
279 }
280
281 TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
282   StartMockRecognition();
283   ASSERT_TRUE(GetUpstreamFetcher());
284   ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
285
286   // Upload a dummy audio chunk.
287   InjectDummyAudioChunk();
288   ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
289   engine_under_test_->AudioChunksEnded();
290
291   // Simulate a protobuf message with an intermediate result without confidence,
292   // but with stability.
293   proto::SpeechRecognitionEvent proto_event;
294   proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
295   proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
296   proto_result->set_stability(0.5);
297   proto::SpeechRecognitionAlternative *proto_alternative =
298       proto_result->add_alternative();
299   proto_alternative->set_transcript("foo");
300   ProvideMockProtoResultDownstream(proto_event);
301
302   // Set up expectations.
303   SpeechRecognitionResults results;
304   results.push_back(SpeechRecognitionResult());
305   SpeechRecognitionResult& result = results.back();
306   result.is_provisional = true;
307   result.hypotheses.push_back(
308       SpeechRecognitionHypothesis(base::UTF8ToUTF16("foo"), 0.5));
309
310   // Check that the protobuf generated the expected result.
311   ExpectResultsReceived(results);
312
313   // Since it was a provisional result, recognition is still pending.
314   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
315
316   // Shut down.
317   CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
318   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
319   EndMockRecognition();
320
321   // Since there was no final result, we get an empty "no match" result.
322   SpeechRecognitionResults empty_result;
323   ExpectResultsReceived(empty_result);
324   ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
325   ASSERT_EQ(0U, results_.size());
326 }
327
328 void GoogleStreamingRemoteEngineTest::SetUp() {
329   engine_under_test_.reset(
330       new  GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
331   engine_under_test_->set_delegate(this);
332 }
333
334 void GoogleStreamingRemoteEngineTest::TearDown() {
335   engine_under_test_.reset();
336 }
337
338 TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
339   return url_fetcher_factory_.GetFetcherByID(
340         GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting);
341 }
342
343 TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
344   return url_fetcher_factory_.GetFetcherByID(
345         GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting);
346 }
347
348 // Starts recognition on the engine, ensuring that both stream fetchers are
349 // created.
350 void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
351   DCHECK(engine_under_test_.get());
352
353   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
354
355   engine_under_test_->StartRecognition();
356   ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
357
358   TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
359   ASSERT_TRUE(upstream_fetcher);
360   upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());
361
362   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
363   ASSERT_TRUE(downstream_fetcher);
364   downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
365 }
366
367 void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
368   DCHECK(engine_under_test_.get());
369   engine_under_test_->EndRecognition();
370   ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
371
372   // TODO(primiano): In order to be very pedantic we should check that both the
373   // upstream and downstream URL fetchers have been disposed at this time.
374   // Unfortunately it seems that there is no direct way to detect (in tests)
375   // if a url_fetcher has been freed or not, since they are not automatically
376   // de-registered from the TestURLFetcherFactory on destruction.
377 }
378
379 void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
380   unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
381   scoped_refptr<AudioChunk> dummy_audio_chunk(
382       new AudioChunk(&dummy_audio_buffer_data[0],
383                      sizeof(dummy_audio_buffer_data),
384                      2 /* bytes per sample */));
385   DCHECK(engine_under_test_.get());
386   engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get());
387 }
388
389 size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
390   TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
391   DCHECK(upstream_fetcher);
392   const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
393   DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
394   const size_t new_chunks = number_of_chunks -
395                             last_number_of_upstream_chunks_seen_;
396   last_number_of_upstream_chunks_seen_ = number_of_chunks;
397   return new_chunks;
398 }
399
400 void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
401     const proto::SpeechRecognitionEvent& result) {
402   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
403
404   ASSERT_TRUE(downstream_fetcher);
405   downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
406   downstream_fetcher->set_response_code(200);
407
408   std::string response_string = SerializeProtobufResponse(result);
409   response_buffer_.append(response_string);
410   downstream_fetcher->SetResponseString(response_buffer_);
411   downstream_fetcher->delegate()->OnURLFetchDownloadProgress(
412       downstream_fetcher,
413       response_buffer_.size(),
414       -1 /* total response length not used */);
415 }
416
417 void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
418     const SpeechRecognitionResult& result) {
419   proto::SpeechRecognitionEvent proto_event;
420   proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
421   proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
422   proto_result->set_final(!result.is_provisional);
423   for (size_t i = 0; i < result.hypotheses.size(); ++i) {
424     proto::SpeechRecognitionAlternative* proto_alternative =
425         proto_result->add_alternative();
426     const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
427     proto_alternative->set_confidence(hypothesis.confidence);
428     proto_alternative->set_transcript(base::UTF16ToUTF8(hypothesis.utterance));
429   }
430   ProvideMockProtoResultDownstream(proto_event);
431 }
432
433 void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
434     DownstreamError error) {
435   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
436   ASSERT_TRUE(downstream_fetcher);
437
438   const URLRequestStatus::Status fetcher_status =
439       (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED :
440                                             URLRequestStatus::SUCCESS;
441   downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0));
442   downstream_fetcher->set_response_code(
443       (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200);
444
445   if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
446     // Send empty response.
447     proto::SpeechRecognitionEvent response;
448     response_buffer_.append(SerializeProtobufResponse(response));
449   }
450   downstream_fetcher->SetResponseString(response_buffer_);
451   downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher);
452 }
453
454 void GoogleStreamingRemoteEngineTest::ExpectResultsReceived(
455     const SpeechRecognitionResults& results) {
456   ASSERT_GE(1U, results_.size());
457   ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
458   results_.pop();
459 }
460
461 bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
462     const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) {
463   if (a.size() != b.size())
464     return false;
465
466   SpeechRecognitionResults::const_iterator it_a = a.begin();
467   SpeechRecognitionResults::const_iterator it_b = b.begin();
468   for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
469     if (it_a->is_provisional != it_b->is_provisional ||
470         it_a->hypotheses.size() != it_b->hypotheses.size()) {
471       return false;
472     }
473     for (size_t i = 0; i < it_a->hypotheses.size(); ++i) {
474       const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i];
475       const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i];
476       if (hyp_a.utterance != hyp_b.utterance ||
477           hyp_a.confidence != hyp_b.confidence) {
478         return false;
479       }
480     }
481   }
482
483   return true;
484 }
485
486 std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
487     const proto::SpeechRecognitionEvent& msg) {
488   std::string msg_string;
489   msg.SerializeToString(&msg_string);
490
491   // Prepend 4 byte prefix length indication to the protobuf message as
492   // envisaged by the google streaming recognition webservice protocol.
493   uint32 prefix = HostToNet32(checked_cast<uint32>(msg_string.size()));
494   msg_string.insert(0, reinterpret_cast<char*>(&prefix), sizeof(prefix));
495
496   return msg_string;
497 }
498
499 }  // namespace content