- add sources.
[platform/framework/web/crosswalk.git] / src / content / browser / speech / speech_recognizer_impl_unittest.cc
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <vector>
6
7 #include "content/browser/browser_thread_impl.h"
8 #include "content/browser/speech/google_one_shot_remote_engine.h"
9 #include "content/browser/speech/speech_recognizer_impl.h"
10 #include "content/public/browser/speech_recognition_event_listener.h"
11 #include "media/audio/audio_manager_base.h"
12 #include "media/audio/fake_audio_input_stream.h"
13 #include "media/audio/fake_audio_output_stream.h"
14 #include "media/audio/mock_audio_manager.h"
15 #include "media/audio/test_audio_input_controller_factory.h"
16 #include "net/base/net_errors.h"
17 #include "net/url_request/test_url_fetcher_factory.h"
18 #include "net/url_request/url_request_status.h"
19 #include "testing/gtest/include/gtest/gtest.h"
20
21 using base::MessageLoopProxy;
22 using media::AudioInputController;
23 using media::AudioInputStream;
24 using media::AudioManager;
25 using media::AudioOutputStream;
26 using media::AudioParameters;
27 using media::TestAudioInputController;
28 using media::TestAudioInputControllerFactory;
29
30 namespace content {
31
32 class SpeechRecognizerImplTest : public SpeechRecognitionEventListener,
33                                  public testing::Test {
34  public:
35   SpeechRecognizerImplTest()
36       : io_thread_(BrowserThread::IO, &message_loop_),
37         recognition_started_(false),
38         recognition_ended_(false),
39         result_received_(false),
40         audio_started_(false),
41         audio_ended_(false),
42         sound_started_(false),
43         sound_ended_(false),
44         error_(SPEECH_RECOGNITION_ERROR_NONE),
45         volume_(-1.0f) {
46     // SpeechRecognizer takes ownership of sr_engine.
47     SpeechRecognitionEngine* sr_engine =
48         new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */);
49     SpeechRecognitionEngineConfig config;
50     config.audio_num_bits_per_sample =
51         SpeechRecognizerImpl::kNumBitsPerAudioSample;
52     config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
53     config.filter_profanities = false;
54     sr_engine->SetConfig(config);
55
56     const int kTestingSessionId = 1;
57     const bool kOneShotMode = true;
58     recognizer_ = new SpeechRecognizerImpl(
59         this, kTestingSessionId, kOneShotMode, sr_engine);
60     audio_manager_.reset(new media::MockAudioManager(
61         base::MessageLoop::current()->message_loop_proxy().get()));
62     recognizer_->SetAudioManagerForTesting(audio_manager_.get());
63
64     int audio_packet_length_bytes =
65         (SpeechRecognizerImpl::kAudioSampleRate *
66          GoogleOneShotRemoteEngine::kAudioPacketIntervalMs *
67          ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) *
68          SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000);
69     audio_packet_.resize(audio_packet_length_bytes);
70   }
71
72   void CheckEventsConsistency() {
73     // Note: "!x || y" == "x implies y".
74     EXPECT_TRUE(!recognition_ended_ || recognition_started_);
75     EXPECT_TRUE(!audio_ended_ || audio_started_);
76     EXPECT_TRUE(!sound_ended_ || sound_started_);
77     EXPECT_TRUE(!audio_started_ || recognition_started_);
78     EXPECT_TRUE(!sound_started_ || audio_started_);
79     EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
80     EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
81   }
82
83   void CheckFinalEventsConsistency() {
84     // Note: "!(x ^ y)" == "(x && y) || (!x && !x)".
85     EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
86     EXPECT_FALSE(audio_started_ ^ audio_ended_);
87     EXPECT_FALSE(sound_started_ ^ sound_ended_);
88   }
89
90   // Overridden from SpeechRecognitionEventListener:
91   virtual void OnAudioStart(int session_id) OVERRIDE {
92     audio_started_ = true;
93     CheckEventsConsistency();
94   }
95
96   virtual void OnAudioEnd(int session_id) OVERRIDE {
97     audio_ended_ = true;
98     CheckEventsConsistency();
99   }
100
101   virtual void OnRecognitionResults(
102       int session_id, const SpeechRecognitionResults& results) OVERRIDE {
103     result_received_ = true;
104   }
105
106   virtual void OnRecognitionError(
107       int session_id, const SpeechRecognitionError& error) OVERRIDE {
108     EXPECT_TRUE(recognition_started_);
109     EXPECT_FALSE(recognition_ended_);
110     error_ = error.code;
111   }
112
113   virtual void OnAudioLevelsChange(int session_id, float volume,
114                                    float noise_volume) OVERRIDE {
115     volume_ = volume;
116     noise_volume_ = noise_volume;
117   }
118
119   virtual void OnRecognitionEnd(int session_id) OVERRIDE {
120     recognition_ended_ = true;
121     CheckEventsConsistency();
122   }
123
124   virtual void OnRecognitionStart(int session_id) OVERRIDE {
125     recognition_started_ = true;
126     CheckEventsConsistency();
127   }
128
129   virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {}
130
131   virtual void OnSoundStart(int session_id) OVERRIDE {
132     sound_started_ = true;
133     CheckEventsConsistency();
134   }
135
136   virtual void OnSoundEnd(int session_id) OVERRIDE {
137     sound_ended_ = true;
138     CheckEventsConsistency();
139   }
140
141   // testing::Test methods.
142   virtual void SetUp() OVERRIDE {
143     AudioInputController::set_factory_for_testing(
144         &audio_input_controller_factory_);
145   }
146
147   virtual void TearDown() OVERRIDE {
148     AudioInputController::set_factory_for_testing(NULL);
149   }
150
151   void FillPacketWithTestWaveform() {
152     // Fill the input with a simple pattern, a 125Hz sawtooth waveform.
153     for (size_t i = 0; i < audio_packet_.size(); ++i)
154       audio_packet_[i] = static_cast<uint8>(i);
155   }
156
157   void FillPacketWithNoise() {
158     int value = 0;
159     int factor = 175;
160     for (size_t i = 0; i < audio_packet_.size(); ++i) {
161       value += factor;
162       audio_packet_[i] = value % 100;
163     }
164   }
165
166  protected:
167   base::MessageLoopForIO message_loop_;
168   BrowserThreadImpl io_thread_;
169   scoped_refptr<SpeechRecognizerImpl> recognizer_;
170   scoped_ptr<AudioManager> audio_manager_;
171   bool recognition_started_;
172   bool recognition_ended_;
173   bool result_received_;
174   bool audio_started_;
175   bool audio_ended_;
176   bool sound_started_;
177   bool sound_ended_;
178   SpeechRecognitionErrorCode error_;
179   net::TestURLFetcherFactory url_fetcher_factory_;
180   TestAudioInputControllerFactory audio_input_controller_factory_;
181   std::vector<uint8> audio_packet_;
182   float volume_;
183   float noise_volume_;
184 };
185
186 TEST_F(SpeechRecognizerImplTest, StopNoData) {
187   // Check for callbacks when stopping record before any audio gets recorded.
188   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
189   recognizer_->StopAudioCapture();
190   base::MessageLoop::current()->RunUntilIdle();
191   EXPECT_TRUE(recognition_started_);
192   EXPECT_FALSE(audio_started_);
193   EXPECT_FALSE(result_received_);
194   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
195   CheckFinalEventsConsistency();
196 }
197
198 TEST_F(SpeechRecognizerImplTest, CancelNoData) {
199   // Check for callbacks when canceling recognition before any audio gets
200   // recorded.
201   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
202   recognizer_->AbortRecognition();
203   base::MessageLoop::current()->RunUntilIdle();
204   EXPECT_TRUE(recognition_started_);
205   EXPECT_FALSE(audio_started_);
206   EXPECT_FALSE(result_received_);
207   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
208   CheckFinalEventsConsistency();
209 }
210
211 TEST_F(SpeechRecognizerImplTest, StopWithData) {
212   // Start recording, give some data and then stop. This should wait for the
213   // network callback to arrive before completion.
214   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
215   base::MessageLoop::current()->RunUntilIdle();
216   TestAudioInputController* controller =
217       audio_input_controller_factory_.controller();
218   ASSERT_TRUE(controller);
219
220   // Try sending 5 chunks of mock audio data and verify that each of them
221   // resulted immediately in a packet sent out via the network. This verifies
222   // that we are streaming out encoded data as chunks without waiting for the
223   // full recording to complete.
224   const size_t kNumChunks = 5;
225   for (size_t i = 0; i < kNumChunks; ++i) {
226     controller->event_handler()->OnData(controller, &audio_packet_[0],
227                                         audio_packet_.size());
228     base::MessageLoop::current()->RunUntilIdle();
229     net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
230     ASSERT_TRUE(fetcher);
231     EXPECT_EQ(i + 1, fetcher->upload_chunks().size());
232   }
233
234   recognizer_->StopAudioCapture();
235   base::MessageLoop::current()->RunUntilIdle();
236   EXPECT_TRUE(audio_started_);
237   EXPECT_TRUE(audio_ended_);
238   EXPECT_FALSE(recognition_ended_);
239   EXPECT_FALSE(result_received_);
240   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
241
242   // Issue the network callback to complete the process.
243   net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
244   ASSERT_TRUE(fetcher);
245
246   fetcher->set_url(fetcher->GetOriginalURL());
247   net::URLRequestStatus status;
248   status.set_status(net::URLRequestStatus::SUCCESS);
249   fetcher->set_status(status);
250   fetcher->set_response_code(200);
251   fetcher->SetResponseString(
252       "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}");
253   fetcher->delegate()->OnURLFetchComplete(fetcher);
254   base::MessageLoop::current()->RunUntilIdle();
255   EXPECT_TRUE(recognition_ended_);
256   EXPECT_TRUE(result_received_);
257   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
258   CheckFinalEventsConsistency();
259 }
260
261 TEST_F(SpeechRecognizerImplTest, CancelWithData) {
262   // Start recording, give some data and then cancel.
263   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
264   base::MessageLoop::current()->RunUntilIdle();
265   TestAudioInputController* controller =
266       audio_input_controller_factory_.controller();
267   ASSERT_TRUE(controller);
268   controller->event_handler()->OnData(controller, &audio_packet_[0],
269                                       audio_packet_.size());
270   base::MessageLoop::current()->RunUntilIdle();
271   recognizer_->AbortRecognition();
272   base::MessageLoop::current()->RunUntilIdle();
273   ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
274   EXPECT_TRUE(recognition_started_);
275   EXPECT_TRUE(audio_started_);
276   EXPECT_FALSE(result_received_);
277   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
278   CheckFinalEventsConsistency();
279 }
280
281 TEST_F(SpeechRecognizerImplTest, ConnectionError) {
282   // Start recording, give some data and then stop. Issue the network callback
283   // with a connection error and verify that the recognizer bubbles the error up
284   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
285   base::MessageLoop::current()->RunUntilIdle();
286   TestAudioInputController* controller =
287       audio_input_controller_factory_.controller();
288   ASSERT_TRUE(controller);
289   controller->event_handler()->OnData(controller, &audio_packet_[0],
290                                       audio_packet_.size());
291   base::MessageLoop::current()->RunUntilIdle();
292   net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
293   ASSERT_TRUE(fetcher);
294
295   recognizer_->StopAudioCapture();
296   base::MessageLoop::current()->RunUntilIdle();
297   EXPECT_TRUE(audio_started_);
298   EXPECT_TRUE(audio_ended_);
299   EXPECT_FALSE(recognition_ended_);
300   EXPECT_FALSE(result_received_);
301   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
302
303   // Issue the network callback to complete the process.
304   fetcher->set_url(fetcher->GetOriginalURL());
305   net::URLRequestStatus status;
306   status.set_status(net::URLRequestStatus::FAILED);
307   status.set_error(net::ERR_CONNECTION_REFUSED);
308   fetcher->set_status(status);
309   fetcher->set_response_code(0);
310   fetcher->SetResponseString(std::string());
311   fetcher->delegate()->OnURLFetchComplete(fetcher);
312   base::MessageLoop::current()->RunUntilIdle();
313   EXPECT_TRUE(recognition_ended_);
314   EXPECT_FALSE(result_received_);
315   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
316   CheckFinalEventsConsistency();
317 }
318
319 TEST_F(SpeechRecognizerImplTest, ServerError) {
320   // Start recording, give some data and then stop. Issue the network callback
321   // with a 500 error and verify that the recognizer bubbles the error up
322   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
323   base::MessageLoop::current()->RunUntilIdle();
324   TestAudioInputController* controller =
325       audio_input_controller_factory_.controller();
326   ASSERT_TRUE(controller);
327   controller->event_handler()->OnData(controller, &audio_packet_[0],
328                                       audio_packet_.size());
329   base::MessageLoop::current()->RunUntilIdle();
330   net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
331   ASSERT_TRUE(fetcher);
332
333   recognizer_->StopAudioCapture();
334   base::MessageLoop::current()->RunUntilIdle();
335   EXPECT_TRUE(audio_started_);
336   EXPECT_TRUE(audio_ended_);
337   EXPECT_FALSE(recognition_ended_);
338   EXPECT_FALSE(result_received_);
339   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
340
341   // Issue the network callback to complete the process.
342   fetcher->set_url(fetcher->GetOriginalURL());
343   net::URLRequestStatus status;
344   status.set_status(net::URLRequestStatus::SUCCESS);
345   fetcher->set_status(status);
346   fetcher->set_response_code(500);
347   fetcher->SetResponseString("Internal Server Error");
348   fetcher->delegate()->OnURLFetchComplete(fetcher);
349   base::MessageLoop::current()->RunUntilIdle();
350   EXPECT_TRUE(recognition_ended_);
351   EXPECT_FALSE(result_received_);
352   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
353   CheckFinalEventsConsistency();
354 }
355
356 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) {
357   // Check if things tear down properly if AudioInputController threw an error.
358   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
359   base::MessageLoop::current()->RunUntilIdle();
360   TestAudioInputController* controller =
361       audio_input_controller_factory_.controller();
362   ASSERT_TRUE(controller);
363   controller->event_handler()->OnError(controller);
364   base::MessageLoop::current()->RunUntilIdle();
365   EXPECT_TRUE(recognition_started_);
366   EXPECT_FALSE(audio_started_);
367   EXPECT_FALSE(result_received_);
368   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_);
369   CheckFinalEventsConsistency();
370 }
371
372 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) {
373   // Check if things tear down properly if AudioInputController threw an error
374   // after giving some audio data.
375   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
376   base::MessageLoop::current()->RunUntilIdle();
377   TestAudioInputController* controller =
378       audio_input_controller_factory_.controller();
379   ASSERT_TRUE(controller);
380   controller->event_handler()->OnData(controller, &audio_packet_[0],
381                                       audio_packet_.size());
382   controller->event_handler()->OnError(controller);
383   base::MessageLoop::current()->RunUntilIdle();
384   ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
385   EXPECT_TRUE(recognition_started_);
386   EXPECT_TRUE(audio_started_);
387   EXPECT_FALSE(result_received_);
388   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_);
389   CheckFinalEventsConsistency();
390 }
391
392 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) {
393   // Start recording and give a lot of packets with audio samples set to zero.
394   // This should trigger the no-speech detector and issue a callback.
395   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
396   base::MessageLoop::current()->RunUntilIdle();
397   TestAudioInputController* controller =
398       audio_input_controller_factory_.controller();
399   ASSERT_TRUE(controller);
400
401   int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
402                      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1;
403   // The vector is already filled with zero value samples on create.
404   for (int i = 0; i < num_packets; ++i) {
405     controller->event_handler()->OnData(controller, &audio_packet_[0],
406                                         audio_packet_.size());
407   }
408   base::MessageLoop::current()->RunUntilIdle();
409   EXPECT_TRUE(recognition_started_);
410   EXPECT_TRUE(audio_started_);
411   EXPECT_FALSE(result_received_);
412   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_);
413   CheckFinalEventsConsistency();
414 }
415
416 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
417   // Start recording and give a lot of packets with audio samples set to zero
418   // and then some more with reasonably loud audio samples. This should be
419   // treated as normal speech input and the no-speech detector should not get
420   // triggered.
421   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
422   base::MessageLoop::current()->RunUntilIdle();
423   TestAudioInputController* controller =
424       audio_input_controller_factory_.controller();
425   ASSERT_TRUE(controller);
426   controller = audio_input_controller_factory_.controller();
427   ASSERT_TRUE(controller);
428
429   int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
430                      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
431
432   // The vector is already filled with zero value samples on create.
433   for (int i = 0; i < num_packets / 2; ++i) {
434     controller->event_handler()->OnData(controller, &audio_packet_[0],
435                                         audio_packet_.size());
436   }
437
438   FillPacketWithTestWaveform();
439   for (int i = 0; i < num_packets / 2; ++i) {
440     controller->event_handler()->OnData(controller, &audio_packet_[0],
441                                         audio_packet_.size());
442   }
443
444   base::MessageLoop::current()->RunUntilIdle();
445   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
446   EXPECT_TRUE(audio_started_);
447   EXPECT_FALSE(audio_ended_);
448   EXPECT_FALSE(recognition_ended_);
449   recognizer_->AbortRecognition();
450   base::MessageLoop::current()->RunUntilIdle();
451   CheckFinalEventsConsistency();
452 }
453
454 TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
455   // Start recording and give a lot of packets with audio samples set to zero
456   // and then some more with reasonably loud audio samples. Check that we don't
457   // get the callback during estimation phase, then get zero for the silence
458   // samples and proper volume for the loud audio.
459   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
460   base::MessageLoop::current()->RunUntilIdle();
461   TestAudioInputController* controller =
462       audio_input_controller_factory_.controller();
463   ASSERT_TRUE(controller);
464   controller = audio_input_controller_factory_.controller();
465   ASSERT_TRUE(controller);
466
467   // Feed some samples to begin with for the endpointer to do noise estimation.
468   int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs /
469                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
470   FillPacketWithNoise();
471   for (int i = 0; i < num_packets; ++i) {
472     controller->event_handler()->OnData(controller, &audio_packet_[0],
473                                         audio_packet_.size());
474   }
475   base::MessageLoop::current()->RunUntilIdle();
476   EXPECT_EQ(-1.0f, volume_);  // No audio volume set yet.
477
478   // The vector is already filled with zero value samples on create.
479   controller->event_handler()->OnData(controller, &audio_packet_[0],
480                                       audio_packet_.size());
481   base::MessageLoop::current()->RunUntilIdle();
482   EXPECT_FLOAT_EQ(0.74939233f, volume_);
483
484   FillPacketWithTestWaveform();
485   controller->event_handler()->OnData(controller, &audio_packet_[0],
486                                       audio_packet_.size());
487   base::MessageLoop::current()->RunUntilIdle();
488   EXPECT_NEAR(0.89926866f, volume_, 0.00001f);
489   EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);
490
491   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
492   EXPECT_FALSE(audio_ended_);
493   EXPECT_FALSE(recognition_ended_);
494   recognizer_->AbortRecognition();
495   base::MessageLoop::current()->RunUntilIdle();
496   CheckFinalEventsConsistency();
497 }
498
499 }  // namespace content