src/content/browser/speech/endpointer/energy_endpointer.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4 //
   5 // To know more about the algorithm used and the original code which this is
   6 // based on, see
   7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
   8
   9 #include "content/browser/speech/endpointer/energy_endpointer.h"
  10
  11 #include <math.h>
  12
  13 #include "base/logging.h"
  14
  15 namespace {
  16
  17 // Returns the RMS (quadratic mean) of the input signal.
  18 float RMS(const int16* samples, int num_samples) {
  19   int64 ssq_int64 = 0;
  20   int64 sum_int64 = 0;
  21   for (int i = 0; i < num_samples; ++i) {
  22     sum_int64 += samples[i];
  23     ssq_int64 += samples[i] * samples[i];
  24   }
  25   // now convert to floats.
  26   double sum = static_cast<double>(sum_int64);
  27   sum /= num_samples;
  28   double ssq = static_cast<double>(ssq_int64);
  29   return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
  30 }
  31
  32 int64 Secs2Usecs(float seconds) {
  33   return static_cast<int64>(0.5 + (1.0e6 * seconds));
  34 }
  35
  36 float GetDecibel(float value) {
  37   if (value > 1.0e-100)
  38     return 20 * log10(value);
  39   return -2000.0;
  40 }
  41
  42 }  // namespace
  43
  44 namespace content {
  45
  46 // Stores threshold-crossing histories for making decisions about the speech
  47 // state.
  48 class EnergyEndpointer::HistoryRing {
  49  public:
  50   HistoryRing() : insertion_index_(0) {}
  51
  52   // Resets the ring to |size| elements each with state |initial_state|
  53   void SetRing(int size, bool initial_state);
  54
  55   // Inserts a new entry into the ring and drops the oldest entry.
  56   void Insert(int64 time_us, bool decision);
  57
  58   // Returns the time in microseconds of the most recently added entry.
  59   int64 EndTime() const;
  60
  61   // Returns the sum of all intervals during which 'decision' is true within
  62   // the time in seconds specified by 'duration'. The returned interval is
  63   // in seconds.
  64   float RingSum(float duration_sec);
  65
  66  private:
  67   struct DecisionPoint {
  68     int64 time_us;
  69     bool decision;
  70   };
  71
  72   std::vector<DecisionPoint> decision_points_;
  73   int insertion_index_;  // Index at which the next item gets added/inserted.
  74
  75   DISALLOW_COPY_AND_ASSIGN(HistoryRing);
  76 };
  77
  78 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
  79   insertion_index_ = 0;
  80   decision_points_.clear();
  81   DecisionPoint init = { -1, initial_state };
  82   decision_points_.resize(size, init);
  83 }
  84
  85 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {
  86   decision_points_[insertion_index_].time_us = time_us;
  87   decision_points_[insertion_index_].decision = decision;
  88   insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
  89 }
  90
  91 int64 EnergyEndpointer::HistoryRing::EndTime() const {
  92   int ind = insertion_index_ - 1;
  93   if (ind < 0)
  94     ind = decision_points_.size() - 1;
  95   return decision_points_[ind].time_us;
  96 }
  97
  98 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
  99   if (!decision_points_.size())
 100     return 0.0;
 101
 102   int64 sum_us = 0;
 103   int ind = insertion_index_ - 1;
 104   if (ind < 0)
 105     ind = decision_points_.size() - 1;
 106   int64 end_us = decision_points_[ind].time_us;
 107   bool is_on = decision_points_[ind].decision;
 108   int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));
 109   if (start_us < 0)
 110     start_us = 0;
 111   size_t n_summed = 1;  // n points ==> (n-1) intervals
 112   while ((decision_points_[ind].time_us > start_us) &&
 113          (n_summed < decision_points_.size())) {
 114     --ind;
 115     if (ind < 0)
 116       ind = decision_points_.size() - 1;
 117     if (is_on)
 118       sum_us += end_us - decision_points_[ind].time_us;
 119     is_on = decision_points_[ind].decision;
 120     end_us = decision_points_[ind].time_us;
 121     n_summed++;
 122   }
 123
 124   return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
 125 }
 126
 127 EnergyEndpointer::EnergyEndpointer()
 128     : status_(EP_PRE_SPEECH),
 129       offset_confirm_dur_sec_(0),
 130       endpointer_time_us_(0),
 131       fast_update_frames_(0),
 132       frame_counter_(0),
 133       max_window_dur_(4.0),
 134       sample_rate_(0),
 135       history_(new HistoryRing()),
 136       decision_threshold_(0),
 137       estimating_environment_(false),
 138       noise_level_(0),
 139       rms_adapt_(0),
 140       start_lag_(0),
 141       end_lag_(0),
 142       user_input_start_time_us_(0) {
 143 }
 144
 145 EnergyEndpointer::~EnergyEndpointer() {
 146 }
 147
 148 int EnergyEndpointer::TimeToFrame(float time) const {
 149   return static_cast<int32>(0.5 + (time / params_.frame_period()));
 150 }
 151
 152 void EnergyEndpointer::Restart(bool reset_threshold) {
 153   status_ = EP_PRE_SPEECH;
 154   user_input_start_time_us_ = 0;
 155
 156   if (reset_threshold) {
 157     decision_threshold_ = params_.decision_threshold();
 158     rms_adapt_ = decision_threshold_;
 159     noise_level_ = params_.decision_threshold() / 2.0f;
 160     frame_counter_ = 0;  // Used for rapid initial update of levels.
 161   }
 162
 163   // Set up the memories to hold the history windows.
 164   history_->SetRing(TimeToFrame(max_window_dur_), false);
 165
 166   // Flag that indicates that current input should be used for
 167   // estimating the environment. The user has not yet started input
 168   // by e.g. pressed the push-to-talk button. By default, this is
 169   // false for backward compatibility.
 170   estimating_environment_ = false;
 171 }
 172
 173 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
 174   params_ = params;
 175
 176   // Find the longest history interval to be used, and make the ring
 177   // large enough to accommodate that number of frames.  NOTE: This
 178   // depends upon ep_frame_period being set correctly in the factory
 179   // that did this instantiation.
 180   max_window_dur_ = params_.onset_window();
 181   if (params_.speech_on_window() > max_window_dur_)
 182     max_window_dur_ = params_.speech_on_window();
 183   if (params_.offset_window() > max_window_dur_)
 184     max_window_dur_ = params_.offset_window();
 185   Restart(true);
 186
 187   offset_confirm_dur_sec_ = params_.offset_window() -
 188                             params_.offset_confirm_dur();
 189   if (offset_confirm_dur_sec_ < 0.0)
 190     offset_confirm_dur_sec_ = 0.0;
 191
 192   user_input_start_time_us_ = 0;
 193
 194   // Flag that indicates that  current input should be used for
 195   // estimating the environment. The user has not yet started input
 196   // by e.g. pressed the push-to-talk button. By default, this is
 197   // false for backward compatibility.
 198   estimating_environment_ = false;
 199   // The initial value of the noise and speech levels is inconsequential.
 200   // The level of the first frame will overwrite these values.
 201   noise_level_ = params_.decision_threshold() / 2.0f;
 202   fast_update_frames_ =
 203       static_cast<int64>(params_.fast_update_dur() / params_.frame_period());
 204
 205   frame_counter_ = 0;  // Used for rapid initial update of levels.
 206
 207   sample_rate_ = params_.sample_rate();
 208   start_lag_ = static_cast<int>(sample_rate_ /
 209                                 params_.max_fundamental_frequency());
 210   end_lag_ = static_cast<int>(sample_rate_ /
 211                               params_.min_fundamental_frequency());
 212 }
 213
 214 void EnergyEndpointer::StartSession() {
 215   Restart(true);
 216 }
 217
 218 void EnergyEndpointer::EndSession() {
 219   status_ = EP_POST_SPEECH;
 220 }
 221
 222 void EnergyEndpointer::SetEnvironmentEstimationMode() {
 223   Restart(true);
 224   estimating_environment_ = true;
 225 }
 226
 227 void EnergyEndpointer::SetUserInputMode() {
 228   estimating_environment_ = false;
 229   user_input_start_time_us_ = endpointer_time_us_;
 230 }
 231
 232 void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
 233                                          const int16* samples,
 234                                          int num_samples,
 235                                          float* rms_out) {
 236   endpointer_time_us_ = time_us;
 237   float rms = RMS(samples, num_samples);
 238
 239   // Check that this is user input audio vs. pre-input adaptation audio.
 240   // Input audio starts when the user indicates start of input, by e.g.
 241   // pressing push-to-talk. Audio received prior to that is used to update
 242   // noise and speech level estimates.
 243   if (!estimating_environment_) {
 244     bool decision = false;
 245     if ((endpointer_time_us_ - user_input_start_time_us_) <
 246         Secs2Usecs(params_.contamination_rejection_period())) {
 247       decision = false;
 248       DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_;
 249     } else {
 250       decision = (rms > decision_threshold_);
 251     }
 252
 253     history_->Insert(endpointer_time_us_, decision);
 254
 255     switch (status_) {
 256       case EP_PRE_SPEECH:
 257         if (history_->RingSum(params_.onset_window()) >
 258             params_.onset_detect_dur()) {
 259           status_ = EP_POSSIBLE_ONSET;
 260         }
 261         break;
 262
 263       case EP_POSSIBLE_ONSET: {
 264         float tsum = history_->RingSum(params_.onset_window());
 265         if (tsum > params_.onset_confirm_dur()) {
 266           status_ = EP_SPEECH_PRESENT;
 267         } else {  // If signal is not maintained, drop back to pre-speech.
 268           if (tsum <= params_.onset_detect_dur())
 269             status_ = EP_PRE_SPEECH;
 270         }
 271         break;
 272       }
 273
 274       case EP_SPEECH_PRESENT: {
 275         // To induce hysteresis in the state residency, we allow a
 276         // smaller residency time in the on_ring, than was required to
 277         // enter the SPEECH_PERSENT state.
 278         float on_time = history_->RingSum(params_.speech_on_window());
 279         if (on_time < params_.on_maintain_dur())
 280           status_ = EP_POSSIBLE_OFFSET;
 281         break;
 282       }
 283
 284       case EP_POSSIBLE_OFFSET:
 285         if (history_->RingSum(params_.offset_window()) <=
 286             offset_confirm_dur_sec_) {
 287           // Note that this offset time may be beyond the end
 288           // of the input buffer in a real-time system.  It will be up
 289           // to the RecognizerSession to decide what to do.
 290           status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
 291         } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
 292           if (history_->RingSum(params_.speech_on_window()) >=
 293               params_.on_maintain_dur())
 294             status_ = EP_SPEECH_PRESENT;
 295         }
 296         break;
 297
 298       default:
 299         LOG(WARNING) << "Invalid case in switch: " << status_;
 300         break;
 301     }
 302
 303     // If this is a quiet, non-speech region, slowly adapt the detection
 304     // threshold to be about 6dB above the average RMS.
 305     if ((!decision) && (status_ == EP_PRE_SPEECH)) {
 306       decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
 307       rms_adapt_ = decision_threshold_;
 308     } else {
 309       // If this is in a speech region, adapt the decision threshold to
 310       // be about 10dB below the average RMS. If the noise level is high,
 311       // the threshold is pushed up.
 312       // Adaptation up to a higher level is 5 times faster than decay to
 313       // a lower level.
 314       if ((status_ == EP_SPEECH_PRESENT) && decision) {
 315         if (rms_adapt_ > rms) {
 316           rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
 317         } else {
 318           rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
 319         }
 320         float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
 321         decision_threshold_ = (.90f * decision_threshold_) +
 322                               (0.10f * target_threshold);
 323       }
 324     }
 325
 326     // Set a floor
 327     if (decision_threshold_ < params_.min_decision_threshold())
 328       decision_threshold_ = params_.min_decision_threshold();
 329   }
 330
 331   // Update speech and noise levels.
 332   UpdateLevels(rms);
 333   ++frame_counter_;
 334
 335   if (rms_out)
 336     *rms_out = GetDecibel(rms);
 337 }
 338
 339 float EnergyEndpointer::GetNoiseLevelDb() const {
 340   return GetDecibel(noise_level_);
 341 }
 342
 343 void EnergyEndpointer::UpdateLevels(float rms) {
 344   // Update quickly initially. We assume this is noise and that
 345   // speech is 6dB above the noise.
 346   if (frame_counter_ < fast_update_frames_) {
 347     // Alpha increases from 0 to (k-1)/k where k is the number of time
 348     // steps in the initial adaptation period.
 349     float alpha = static_cast<float>(frame_counter_) /
 350         static_cast<float>(fast_update_frames_);
 351     noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
 352     DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
 353              << ", fast_update_frames_ " << fast_update_frames_;
 354   } else {
 355     // Update Noise level. The noise level adapts quickly downward, but
 356     // slowly upward. The noise_level_ parameter is not currently used
 357     // for threshold adaptation. It is used for UI feedback.
 358     if (noise_level_ < rms)
 359       noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
 360     else
 361       noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
 362   }
 363   if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
 364     decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
 365     // Set a floor
 366     if (decision_threshold_ < params_.min_decision_threshold())
 367       decision_threshold_ = params_.min_decision_threshold();
 368   }
 369 }
 370
 371 EpStatus EnergyEndpointer::Status(int64* status_time)  const {
 372   *status_time = history_->EndTime();
 373   return status_;
 374 }
 375
 376 }  // namespace content