src/third_party/webrtc/modules/audio_coding/neteq4/time_stretch.cc

   1 /*
   2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "webrtc/modules/audio_coding/neteq4/time_stretch.h"
  12
  13 #include <algorithm>  // min, max
  14
  15 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
  16 #include "webrtc/modules/audio_coding/neteq4/background_noise.h"
  17 #include "webrtc/modules/audio_coding/neteq4/dsp_helper.h"
  18 #include "webrtc/system_wrappers/interface/scoped_ptr.h"
  19
  20 namespace webrtc {
  21
  22 TimeStretch::ReturnCodes TimeStretch::Process(
  23     const int16_t* input,
  24     size_t input_len,
  25     AudioMultiVector* output,
  26     int16_t* length_change_samples) {
  27
  28   // Pre-calculate common multiplication with |fs_mult_|.
  29   int fs_mult_120 = fs_mult_ * 120;  // Corresponds to 15 ms.
  30
  31   const int16_t* signal;
  32   scoped_ptr<int16_t[]> signal_array;
  33   size_t signal_len;
  34   if (num_channels_ == 1) {
  35     signal = input;
  36     signal_len = input_len;
  37   } else {
  38     // We want |signal| to be only the first channel of |input|, which is
  39     // interleaved. Thus, we take the first sample, skip forward |num_channels|
  40     // samples, and continue like that.
  41     signal_len = input_len / num_channels_;
  42     signal_array.reset(new int16_t[signal_len]);
  43     signal = signal_array.get();
  44     size_t j = master_channel_;
  45     for (size_t i = 0; i < signal_len; ++i) {
  46       signal_array[i] = input[j];
  47       j += num_channels_;
  48     }
  49   }
  50
  51   // Find maximum absolute value of input signal.
  52   max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal,
  53                                               static_cast<int>(signal_len));
  54
  55   // Downsample to 4 kHz sample rate and calculate auto-correlation.
  56   DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
  57                               sample_rate_hz_, true /* compensate delay*/,
  58                               downsampled_input_);
  59   AutoCorrelation();
  60
  61   // Find the strongest correlation peak.
  62   static const int kNumPeaks = 1;
  63   int peak_index;
  64   int16_t peak_value;
  65   DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
  66                            fs_mult_, &peak_index, &peak_value);
  67   // Assert that |peak_index| stays within boundaries.
  68   assert(peak_index >= 0);
  69   assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);
  70
  71   // Compensate peak_index for displaced starting position. The displacement
  72   // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
  73   // domain, while the |peak_index| is in the original sample rate; hence, the
  74   // multiplication by fs_mult_ * 2.
  75   peak_index += kMinLag * fs_mult_ * 2;
  76   // Assert that |peak_index| stays within boundaries.
  77   assert(peak_index >= 20 * fs_mult_);
  78   assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
  79
  80   // Calculate scaling to ensure that |peak_index| samples can be square-summed
  81   // without overflowing.
  82   int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
  83       WebRtcSpl_NormW32(peak_index);
  84   scaling = std::max(0, scaling);
  85
  86   // |vec1| starts at 15 ms minus one pitch period.
  87   const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
  88   // |vec2| start at 15 ms.
  89   const int16_t* vec2 = &signal[fs_mult_120];
  90   // Calculate energies for |vec1| and |vec2|, assuming they both contain
  91   // |peak_index| samples.
  92   int32_t vec1_energy =
  93       WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
  94   int32_t vec2_energy =
  95       WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
  96
  97   // Calculate cross-correlation between |vec1| and |vec2|.
  98   int32_t cross_corr =
  99       WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
 100
 101   // Check if the signal seems to be active speech or not (simple VAD).
 102   bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index,
 103                                        scaling);
 104
 105   int16_t best_correlation;
 106   if (!active_speech) {
 107     SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
 108   } else {
 109     // Calculate correlation:
 110     // cross_corr / sqrt(vec1_energy * vec2_energy).
 111
 112     // Start with calculating scale values.
 113     int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
 114     int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
 115
 116     // Make sure total scaling is even (to simplify scale factor after sqrt).
 117     if ((energy1_scale + energy2_scale) & 1) {
 118       // The sum is odd.
 119       energy1_scale += 1;
 120     }
 121
 122     // Scale energies to int16_t.
 123     int16_t vec1_energy_int16 =
 124         static_cast<int16_t>(vec1_energy >> energy1_scale);
 125     int16_t vec2_energy_int16 =
 126         static_cast<int16_t>(vec2_energy >> energy2_scale);
 127
 128     // Calculate square-root of energy product.
 129     int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 *
 130                                                    vec2_energy_int16);
 131
 132     // Calculate cross_corr / sqrt(en1*en2) in Q14.
 133     int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
 134     cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
 135     cross_corr = std::max(0, cross_corr);  // Don't use if negative.
 136     best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
 137     // Make sure |best_correlation| is no larger than 1 in Q14.
 138     best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
 139   }
 140
 141
 142   // Check accelerate criteria and stretch the signal.
 143   ReturnCodes return_value = CheckCriteriaAndStretch(
 144       input, input_len, peak_index, best_correlation, active_speech, output);
 145   switch (return_value) {
 146     case kSuccess:
 147       *length_change_samples = peak_index;
 148       break;
 149     case kSuccessLowEnergy:
 150       *length_change_samples = peak_index;
 151       break;
 152     case kNoStretch:
 153     case kError:
 154       *length_change_samples = 0;
 155       break;
 156   }
 157   return return_value;
 158 }
 159
 160 void TimeStretch::AutoCorrelation() {
 161   // Set scaling factor for cross correlation to protect against overflow.
 162   int scaling = kLogCorrelationLen - WebRtcSpl_NormW32(
 163       max_input_value_ * max_input_value_);
 164   scaling = std::max(0, scaling);
 165
 166   // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
 167   int32_t auto_corr[kCorrelationLen];
 168   WebRtcSpl_CrossCorrelation(auto_corr, &downsampled_input_[kMaxLag],
 169                              &downsampled_input_[kMaxLag - kMinLag],
 170                              kCorrelationLen, kMaxLag - kMinLag, scaling, -1);
 171
 172   // Normalize correlation to 14 bits and write to |auto_correlation_|.
 173   int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
 174   scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
 175   WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
 176                                    auto_corr, scaling);
 177 }
 178
 179 bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy,
 180                                   int peak_index, int scaling) const {
 181   // Check if the signal seems to be active speech or not (simple VAD).
 182   // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
 183   // 8 * background_noise_energy, then we say that the signal contains no
 184   // active speech.
 185   // Rewrite the inequality as:
 186   // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
 187   // The two sides of the inequality will be denoted |left_side| and
 188   // |right_side|.
 189   int32_t left_side = (vec1_energy + vec2_energy) / 16;
 190   int32_t right_side;
 191   if (background_noise_.initialized()) {
 192     right_side = background_noise_.Energy(master_channel_);
 193   } else {
 194     // If noise parameters have not been estimated, use a fixed threshold.
 195     right_side = 75000;
 196   }
 197   int right_scale = 16 - WebRtcSpl_NormW32(right_side);
 198   right_scale = std::max(0, right_scale);
 199   left_side = left_side >> right_scale;
 200   right_side = peak_index * (right_side >> right_scale);
 201
 202   // Scale |left_side| properly before comparing with |right_side|.
 203   // (|scaling| is the scale factor before energy calculation, thus the scale
 204   // factor for the energy is 2 * scaling.)
 205   if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
 206     // Cannot scale only |left_side|, must scale |right_side| too.
 207     int temp_scale = WebRtcSpl_NormW32(left_side);
 208     left_side = left_side << temp_scale;
 209     right_side = right_side >> (2 * scaling - temp_scale);
 210   } else {
 211     left_side = left_side << 2 * scaling;
 212   }
 213   return left_side > right_side;
 214 }
 215
 216 }  // namespace webrtc