src/third_party/webrtc/common_audio/vad/vad_core.c

   1 /*
   2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "webrtc/common_audio/vad/vad_core.h"
  12
  13 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
  14 #include "webrtc/common_audio/vad/vad_filterbank.h"
  15 #include "webrtc/common_audio/vad/vad_gmm.h"
  16 #include "webrtc/common_audio/vad/vad_sp.h"
  17 #include "webrtc/typedefs.h"
  18
  19 // Spectrum Weighting
  20 static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
  21 static const int16_t kNoiseUpdateConst = 655; // Q15
  22 static const int16_t kSpeechUpdateConst = 6554; // Q15
  23 static const int16_t kBackEta = 154; // Q8
  24 // Minimum difference between the two models, Q5
  25 static const int16_t kMinimumDifference[kNumChannels] = {
  26     544, 544, 576, 576, 576, 576 };
  27 // Upper limit of mean value for speech model, Q7
  28 static const int16_t kMaximumSpeech[kNumChannels] = {
  29     11392, 11392, 11520, 11520, 11520, 11520 };
  30 // Minimum value for mean value
  31 static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
  32 // Upper limit of mean value for noise model, Q7
  33 static const int16_t kMaximumNoise[kNumChannels] = {
  34     9216, 9088, 8960, 8832, 8704, 8576 };
  35 // Start values for the Gaussian models, Q7
  36 // Weights for the two Gaussians for the six channels (noise)
  37 static const int16_t kNoiseDataWeights[kTableSize] = {
  38     34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
  39 // Weights for the two Gaussians for the six channels (speech)
  40 static const int16_t kSpeechDataWeights[kTableSize] = {
  41     48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
  42 // Means for the two Gaussians for the six channels (noise)
  43 static const int16_t kNoiseDataMeans[kTableSize] = {
  44     6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
  45 // Means for the two Gaussians for the six channels (speech)
  46 static const int16_t kSpeechDataMeans[kTableSize] = {
  47     8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
  48 };
  49 // Stds for the two Gaussians for the six channels (noise)
  50 static const int16_t kNoiseDataStds[kTableSize] = {
  51     378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
  52 // Stds for the two Gaussians for the six channels (speech)
  53 static const int16_t kSpeechDataStds[kTableSize] = {
  54     555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
  55
  56 // Constants used in GmmProbability().
  57 //
  58 // Maximum number of counted speech (VAD = 1) frames in a row.
  59 static const int16_t kMaxSpeechFrames = 6;
  60 // Minimum standard deviation for both speech and noise.
  61 static const int16_t kMinStd = 384;
  62
  63 // Constants in WebRtcVad_InitCore().
  64 // Default aggressiveness mode.
  65 static const short kDefaultMode = 0;
  66 static const int kInitCheck = 42;
  67
  68 // Constants used in WebRtcVad_set_mode_core().
  69 //
  70 // Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
  71 //
  72 // Mode 0, Quality.
  73 static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
  74 static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
  75 static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
  76 static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
  77 // Mode 1, Low bitrate.
  78 static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
  79 static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
  80 static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
  81 static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
  82 // Mode 2, Aggressive.
  83 static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
  84 static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
  85 static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
  86 static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
  87 // Mode 3, Very aggressive.
  88 static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
  89 static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
  90 static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
  91 static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
  92
  93 // Calculates the weighted average w.r.t. number of Gaussians. The |data| are
  94 // updated with an |offset| before averaging.
  95 //
  96 // - data     [i/o] : Data to average.
  97 // - offset   [i]   : An offset added to |data|.
  98 // - weights  [i]   : Weights used for averaging.
  99 //
 100 // returns          : The weighted average.
 101 static int32_t WeightedAverage(int16_t* data, int16_t offset,
 102                                const int16_t* weights) {
 103   int k;
 104   int32_t weighted_average = 0;
 105
 106   for (k = 0; k < kNumGaussians; k++) {
 107     data[k * kNumChannels] += offset;
 108     weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
 109   }
 110   return weighted_average;
 111 }
 112
 113 // Calculates the probabilities for both speech and background noise using
 114 // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
 115 // type of signal is most probable.
 116 //
 117 // - self           [i/o] : Pointer to VAD instance
 118 // - features       [i]   : Feature vector of length |kNumChannels|
 119 //                          = log10(energy in frequency band)
 120 // - total_power    [i]   : Total power in audio frame.
 121 // - frame_length   [i]   : Number of input samples
 122 //
 123 // - returns              : the VAD decision (0 - noise, 1 - speech).
 124 static int16_t GmmProbability(VadInstT* self, int16_t* features,
 125                               int16_t total_power, int frame_length) {
 126   int channel, k;
 127   int16_t feature_minimum;
 128   int16_t h0, h1;
 129   int16_t log_likelihood_ratio;
 130   int16_t vadflag = 0;
 131   int16_t shifts_h0, shifts_h1;
 132   int16_t tmp_s16, tmp1_s16, tmp2_s16;
 133   int16_t diff;
 134   int gaussian;
 135   int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
 136   int16_t delt, ndelt;
 137   int16_t maxspe, maxmu;
 138   int16_t deltaN[kTableSize], deltaS[kTableSize];
 139   int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
 140   int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
 141   int32_t h0_test, h1_test;
 142   int32_t tmp1_s32, tmp2_s32;
 143   int32_t sum_log_likelihood_ratios = 0;
 144   int32_t noise_global_mean, speech_global_mean;
 145   int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
 146   int16_t overhead1, overhead2, individualTest, totalTest;
 147
 148   // Set various thresholds based on frame lengths (80, 160 or 240 samples).
 149   if (frame_length == 80) {
 150     overhead1 = self->over_hang_max_1[0];
 151     overhead2 = self->over_hang_max_2[0];
 152     individualTest = self->individual[0];
 153     totalTest = self->total[0];
 154   } else if (frame_length == 160) {
 155     overhead1 = self->over_hang_max_1[1];
 156     overhead2 = self->over_hang_max_2[1];
 157     individualTest = self->individual[1];
 158     totalTest = self->total[1];
 159   } else {
 160     overhead1 = self->over_hang_max_1[2];
 161     overhead2 = self->over_hang_max_2[2];
 162     individualTest = self->individual[2];
 163     totalTest = self->total[2];
 164   }
 165
 166   if (total_power > kMinEnergy) {
 167     // The signal power of current frame is large enough for processing. The
 168     // processing consists of two parts:
 169     // 1) Calculating the likelihood of speech and thereby a VAD decision.
 170     // 2) Updating the underlying model, w.r.t., the decision made.
 171
 172     // The detection scheme is an LRT with hypothesis
 173     // H0: Noise
 174     // H1: Speech
 175     //
 176     // We combine a global LRT with local tests, for each frequency sub-band,
 177     // here defined as |channel|.
 178     for (channel = 0; channel < kNumChannels; channel++) {
 179       // For each channel we model the probability with a GMM consisting of
 180       // |kNumGaussians|, with different means and standard deviations depending
 181       // on H0 or H1.
 182       h0_test = 0;
 183       h1_test = 0;
 184       for (k = 0; k < kNumGaussians; k++) {
 185         gaussian = channel + k * kNumChannels;
 186         // Probability under H0, that is, probability of frame being noise.
 187         // Value given in Q27 = Q7 * Q20.
 188         tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
 189                                                  self->noise_means[gaussian],
 190                                                  self->noise_stds[gaussian],
 191                                                  &deltaN[gaussian]);
 192         noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
 193         h0_test += noise_probability[k];  // Q27
 194
 195         // Probability under H1, that is, probability of frame being speech.
 196         // Value given in Q27 = Q7 * Q20.
 197         tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
 198                                                  self->speech_means[gaussian],
 199                                                  self->speech_stds[gaussian],
 200                                                  &deltaS[gaussian]);
 201         speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
 202         h1_test += speech_probability[k];  // Q27
 203       }
 204
 205       // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}).
 206       // Approximation:
 207       // log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q)
 208       //                           = log2(h1_test) - log2(h0_test)
 209       //                           = log2(2^(31-shifts_h1)*(1+b1))
 210       //                             - log2(2^(31-shifts_h0)*(1+b0))
 211       //                           = shifts_h0 - shifts_h1
 212       //                             + log2(1+b1) - log2(1+b0)
 213       //                          ~= shifts_h0 - shifts_h1
 214       //
 215       // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
 216       // Further, b0 and b1 are independent and on the average the two terms
 217       // cancel.
 218       shifts_h0 = WebRtcSpl_NormW32(h0_test);
 219       shifts_h1 = WebRtcSpl_NormW32(h1_test);
 220       if (h0_test == 0) {
 221         shifts_h0 = 31;
 222       }
 223       if (h1_test == 0) {
 224         shifts_h1 = 31;
 225       }
 226       log_likelihood_ratio = shifts_h0 - shifts_h1;
 227
 228       // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
 229       // used for the global VAD decision.
 230       sum_log_likelihood_ratios +=
 231           (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
 232
 233       // Local VAD decision.
 234       if ((log_likelihood_ratio << 2) > individualTest) {
 235         vadflag = 1;
 236       }
 237
 238       // TODO(bjornv): The conditional probabilities below are applied on the
 239       // hard coded number of Gaussians set to two. Find a way to generalize.
 240       // Calculate local noise probabilities used later when updating the GMM.
 241       h0 = (int16_t) (h0_test >> 12);  // Q15
 242       if (h0 > 0) {
 243         // High probability of noise. Assign conditional probabilities for each
 244         // Gaussian in the GMM.
 245         tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
 246         ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
 247         ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
 248       } else {
 249         // Low noise probability. Assign conditional probability 1 to the first
 250         // Gaussian and 0 to the rest (which is already set at initialization).
 251         ngprvec[channel] = 16384;
 252       }
 253
 254       // Calculate local speech probabilities used later when updating the GMM.
 255       h1 = (int16_t) (h1_test >> 12);  // Q15
 256       if (h1 > 0) {
 257         // High probability of speech. Assign conditional probabilities for each
 258         // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
 259         tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
 260         sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
 261         sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
 262       }
 263     }
 264
 265     // Make a global VAD decision.
 266     vadflag |= (sum_log_likelihood_ratios >= totalTest);
 267
 268     // Update the model parameters.
 269     maxspe = 12800;
 270     for (channel = 0; channel < kNumChannels; channel++) {
 271
 272       // Get minimum value in past which is used for long term correction in Q4.
 273       feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
 274
 275       // Compute the "global" mean, that is the sum of the two means weighted.
 276       noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
 277                                           &kNoiseDataWeights[channel]);
 278       tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8
 279
 280       for (k = 0; k < kNumGaussians; k++) {
 281         gaussian = channel + k * kNumChannels;
 282
 283         nmk = self->noise_means[gaussian];
 284         smk = self->speech_means[gaussian];
 285         nsk = self->noise_stds[gaussian];
 286         ssk = self->speech_stds[gaussian];
 287
 288         // Update noise mean vector if the frame consists of noise only.
 289         nmk2 = nmk;
 290         if (!vadflag) {
 291           // deltaN = (x-mu)/sigma^2
 292           // ngprvec[k] = |noise_probability[k]| /
 293           //   (|noise_probability[0]| + |noise_probability[1]|)
 294
 295           // (Q14 * Q11 >> 11) = Q14.
 296           delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[gaussian],
 297                                                      deltaN[gaussian],
 298                                                      11);
 299           // Q7 + (Q14 * Q15 >> 22) = Q7.
 300           nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
 301                                                            kNoiseUpdateConst,
 302                                                            22);
 303         }
 304
 305         // Long term correction of the noise mean.
 306         // Q8 - Q8 = Q8.
 307         ndelt = (feature_minimum << 4) - tmp1_s16;
 308         // Q7 + (Q8 * Q8) >> 9 = Q7.
 309         nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9);
 310
 311         // Control that the noise mean does not drift to much.
 312         tmp_s16 = (int16_t) ((k + 5) << 7);
 313         if (nmk3 < tmp_s16) {
 314           nmk3 = tmp_s16;
 315         }
 316         tmp_s16 = (int16_t) ((72 + k - channel) << 7);
 317         if (nmk3 > tmp_s16) {
 318           nmk3 = tmp_s16;
 319         }
 320         self->noise_means[gaussian] = nmk3;
 321
 322         if (vadflag) {
 323           // Update speech mean vector:
 324           // |deltaS| = (x-mu)/sigma^2
 325           // sgprvec[k] = |speech_probability[k]| /
 326           //   (|speech_probability[0]| + |speech_probability[1]|)
 327
 328           // (Q14 * Q11) >> 11 = Q14.
 329           delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[gaussian],
 330                                                      deltaS[gaussian],
 331                                                      11);
 332           // Q14 * Q15 >> 21 = Q8.
 333           tmp_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
 334                                                         kSpeechUpdateConst,
 335                                                         21);
 336           // Q7 + (Q8 >> 1) = Q7. With rounding.
 337           smk2 = smk + ((tmp_s16 + 1) >> 1);
 338
 339           // Control that the speech mean does not drift to much.
 340           maxmu = maxspe + 640;
 341           if (smk2 < kMinimumMean[k]) {
 342             smk2 = kMinimumMean[k];
 343           }
 344           if (smk2 > maxmu) {
 345             smk2 = maxmu;
 346           }
 347           self->speech_means[gaussian] = smk2;  // Q7.
 348
 349           // (Q7 >> 3) = Q4. With rounding.
 350           tmp_s16 = ((smk + 4) >> 3);
 351
 352           tmp_s16 = features[channel] - tmp_s16;  // Q4
 353           // (Q11 * Q4 >> 3) = Q12.
 354           tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[gaussian], tmp_s16, 3);
 355           tmp2_s32 = tmp1_s32 - 4096;
 356           tmp_s16 = sgprvec[gaussian] >> 2;
 357           // (Q14 >> 2) * Q12 = Q24.
 358           tmp1_s32 = tmp_s16 * tmp2_s32;
 359
 360           tmp2_s32 = tmp1_s32 >> 4;  // Q20
 361
 362           // 0.1 * Q20 / Q7 = Q13.
 363           if (tmp2_s32 > 0) {
 364             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
 365           } else {
 366             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
 367             tmp_s16 = -tmp_s16;
 368           }
 369           // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
 370           // Note that division by 4 equals shift by 2, hence,
 371           // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
 372           tmp_s16 += 128;  // Rounding.
 373           ssk += (tmp_s16 >> 8);
 374           if (ssk < kMinStd) {
 375             ssk = kMinStd;
 376           }
 377           self->speech_stds[gaussian] = ssk;
 378         } else {
 379           // Update GMM variance vectors.
 380           // deltaN * (features[channel] - nmk) - 1
 381           // Q4 - (Q7 >> 3) = Q4.
 382           tmp_s16 = features[channel] - (nmk >> 3);
 383           // (Q11 * Q4 >> 3) = Q12.
 384           tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[gaussian], tmp_s16, 3);
 385           tmp1_s32 -= 4096;
 386
 387           // (Q14 >> 2) * Q12 = Q24.
 388           tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
 389           tmp2_s32 = tmp_s16 * tmp1_s32;
 390           // Q20  * approx 0.001 (2^-10=0.0009766), hence,
 391           // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
 392           tmp1_s32 = tmp2_s32 >> 14;
 393
 394           // Q20 / Q7 = Q13.
 395           if (tmp1_s32 > 0) {
 396             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
 397           } else {
 398             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
 399             tmp_s16 = -tmp_s16;
 400           }
 401           tmp_s16 += 32;  // Rounding
 402           nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
 403           if (nsk < kMinStd) {
 404             nsk = kMinStd;
 405           }
 406           self->noise_stds[gaussian] = nsk;
 407         }
 408       }
 409
 410       // Separate models if they are too close.
 411       // |noise_global_mean| in Q14 (= Q7 * Q7).
 412       noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
 413                                           &kNoiseDataWeights[channel]);
 414
 415       // |speech_global_mean| in Q14 (= Q7 * Q7).
 416       speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
 417                                            &kSpeechDataWeights[channel]);
 418
 419       // |diff| = "global" speech mean - "global" noise mean.
 420       // (Q14 >> 9) - (Q14 >> 9) = Q5.
 421       diff = (int16_t) (speech_global_mean >> 9) -
 422           (int16_t) (noise_global_mean >> 9);
 423       if (diff < kMinimumDifference[channel]) {
 424         tmp_s16 = kMinimumDifference[channel] - diff;
 425
 426         // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
 427         // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
 428         tmp1_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp_s16, 2);
 429         tmp2_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp_s16, 2);
 430
 431         // Move Gaussian means for speech model by |tmp1_s16| and update
 432         // |speech_global_mean|. Note that |self->speech_means[channel]| is
 433         // changed after the call.
 434         speech_global_mean = WeightedAverage(&self->speech_means[channel],
 435                                              tmp1_s16,
 436                                              &kSpeechDataWeights[channel]);
 437
 438         // Move Gaussian means for noise model by -|tmp2_s16| and update
 439         // |noise_global_mean|. Note that |self->noise_means[channel]| is
 440         // changed after the call.
 441         noise_global_mean = WeightedAverage(&self->noise_means[channel],
 442                                             -tmp2_s16,
 443                                             &kNoiseDataWeights[channel]);
 444       }
 445
 446       // Control that the speech & noise means do not drift to much.
 447       maxspe = kMaximumSpeech[channel];
 448       tmp2_s16 = (int16_t) (speech_global_mean >> 7);
 449       if (tmp2_s16 > maxspe) {
 450         // Upper limit of speech model.
 451         tmp2_s16 -= maxspe;
 452
 453         for (k = 0; k < kNumGaussians; k++) {
 454           self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
 455         }
 456       }
 457
 458       tmp2_s16 = (int16_t) (noise_global_mean >> 7);
 459       if (tmp2_s16 > kMaximumNoise[channel]) {
 460         tmp2_s16 -= kMaximumNoise[channel];
 461
 462         for (k = 0; k < kNumGaussians; k++) {
 463           self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
 464         }
 465       }
 466     }
 467     self->frame_counter++;
 468   }
 469
 470   // Smooth with respect to transition hysteresis.
 471   if (!vadflag) {
 472     if (self->over_hang > 0) {
 473       vadflag = 2 + self->over_hang;
 474       self->over_hang--;
 475     }
 476     self->num_of_speech = 0;
 477   } else {
 478     self->num_of_speech++;
 479     if (self->num_of_speech > kMaxSpeechFrames) {
 480       self->num_of_speech = kMaxSpeechFrames;
 481       self->over_hang = overhead2;
 482     } else {
 483       self->over_hang = overhead1;
 484     }
 485   }
 486   return vadflag;
 487 }
 488
 489 // Initialize the VAD. Set aggressiveness mode to default value.
 490 int WebRtcVad_InitCore(VadInstT* self) {
 491   int i;
 492
 493   if (self == NULL) {
 494     return -1;
 495   }
 496
 497   // Initialization of general struct variables.
 498   self->vad = 1;  // Speech active (=1).
 499   self->frame_counter = 0;
 500   self->over_hang = 0;
 501   self->num_of_speech = 0;
 502
 503   // Initialization of downsampling filter state.
 504   memset(self->downsampling_filter_states, 0,
 505          sizeof(self->downsampling_filter_states));
 506
 507   // Initialization of 48 to 8 kHz downsampling.
 508   WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
 509
 510   // Read initial PDF parameters.
 511   for (i = 0; i < kTableSize; i++) {
 512     self->noise_means[i] = kNoiseDataMeans[i];
 513     self->speech_means[i] = kSpeechDataMeans[i];
 514     self->noise_stds[i] = kNoiseDataStds[i];
 515     self->speech_stds[i] = kSpeechDataStds[i];
 516   }
 517
 518   // Initialize Index and Minimum value vectors.
 519   for (i = 0; i < 16 * kNumChannels; i++) {
 520     self->low_value_vector[i] = 10000;
 521     self->index_vector[i] = 0;
 522   }
 523
 524   // Initialize splitting filter states.
 525   memset(self->upper_state, 0, sizeof(self->upper_state));
 526   memset(self->lower_state, 0, sizeof(self->lower_state));
 527
 528   // Initialize high pass filter states.
 529   memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
 530
 531   // Initialize mean value memory, for WebRtcVad_FindMinimum().
 532   for (i = 0; i < kNumChannels; i++) {
 533     self->mean_value[i] = 1600;
 534   }
 535
 536   // Set aggressiveness mode to default (=|kDefaultMode|).
 537   if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
 538     return -1;
 539   }
 540
 541   self->init_flag = kInitCheck;
 542
 543   return 0;
 544 }
 545
 546 // Set aggressiveness mode
 547 int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
 548   int return_value = 0;
 549
 550   switch (mode) {
 551     case 0:
 552       // Quality mode.
 553       memcpy(self->over_hang_max_1, kOverHangMax1Q,
 554              sizeof(self->over_hang_max_1));
 555       memcpy(self->over_hang_max_2, kOverHangMax2Q,
 556              sizeof(self->over_hang_max_2));
 557       memcpy(self->individual, kLocalThresholdQ,
 558              sizeof(self->individual));
 559       memcpy(self->total, kGlobalThresholdQ,
 560              sizeof(self->total));
 561       break;
 562     case 1:
 563       // Low bitrate mode.
 564       memcpy(self->over_hang_max_1, kOverHangMax1LBR,
 565              sizeof(self->over_hang_max_1));
 566       memcpy(self->over_hang_max_2, kOverHangMax2LBR,
 567              sizeof(self->over_hang_max_2));
 568       memcpy(self->individual, kLocalThresholdLBR,
 569              sizeof(self->individual));
 570       memcpy(self->total, kGlobalThresholdLBR,
 571              sizeof(self->total));
 572       break;
 573     case 2:
 574       // Aggressive mode.
 575       memcpy(self->over_hang_max_1, kOverHangMax1AGG,
 576              sizeof(self->over_hang_max_1));
 577       memcpy(self->over_hang_max_2, kOverHangMax2AGG,
 578              sizeof(self->over_hang_max_2));
 579       memcpy(self->individual, kLocalThresholdAGG,
 580              sizeof(self->individual));
 581       memcpy(self->total, kGlobalThresholdAGG,
 582              sizeof(self->total));
 583       break;
 584     case 3:
 585       // Very aggressive mode.
 586       memcpy(self->over_hang_max_1, kOverHangMax1VAG,
 587              sizeof(self->over_hang_max_1));
 588       memcpy(self->over_hang_max_2, kOverHangMax2VAG,
 589              sizeof(self->over_hang_max_2));
 590       memcpy(self->individual, kLocalThresholdVAG,
 591              sizeof(self->individual));
 592       memcpy(self->total, kGlobalThresholdVAG,
 593              sizeof(self->total));
 594       break;
 595     default:
 596       return_value = -1;
 597       break;
 598   }
 599
 600   return return_value;
 601 }
 602
 603 // Calculate VAD decision by first extracting feature values and then calculate
 604 // probability for both speech and background noise.
 605
 606 int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
 607                            int frame_length) {
 608   int vad;
 609   int i;
 610   int16_t speech_nb[240];  // 30 ms in 8 kHz.
 611   // |tmp_mem| is a temporary memory used by resample function, length is
 612   // frame length in 10 ms (480 samples) + 256 extra.
 613   int32_t tmp_mem[480 + 256] = { 0 };
 614   const int kFrameLen10ms48khz = 480;
 615   const int kFrameLen10ms8khz = 80;
 616   int num_10ms_frames = frame_length / kFrameLen10ms48khz;
 617
 618   for (i = 0; i < num_10ms_frames; i++) {
 619     WebRtcSpl_Resample48khzTo8khz(speech_frame,
 620                                   &speech_nb[i * kFrameLen10ms8khz],
 621                                   &inst->state_48_to_8,
 622                                   tmp_mem);
 623   }
 624
 625   // Do VAD on an 8 kHz signal
 626   vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
 627
 628   return vad;
 629 }
 630
 631 int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
 632                            int frame_length)
 633 {
 634     int len, vad;
 635     int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
 636     int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
 637
 638
 639     // Downsample signal 32->16->8 before doing VAD
 640     WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
 641                            frame_length);
 642     len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
 643
 644     WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
 645     len = WEBRTC_SPL_RSHIFT_W16(len, 1);
 646
 647     // Do VAD on an 8 kHz signal
 648     vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
 649
 650     return vad;
 651 }
 652
 653 int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
 654                            int frame_length)
 655 {
 656     int len, vad;
 657     int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
 658
 659     // Wideband: Downsample signal before doing VAD
 660     WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
 661                            frame_length);
 662
 663     len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
 664     vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
 665
 666     return vad;
 667 }
 668
 669 int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
 670                           int frame_length)
 671 {
 672     int16_t feature_vector[kNumChannels], total_power;
 673
 674     // Get power in the bands
 675     total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
 676                                               feature_vector);
 677
 678     // Make a VAD
 679     inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
 680
 681     return inst->vad;
 682 }