media/filters/wsola_internals.cc

   1 // Copyright 2013 The Chromium Authors
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "media/filters/wsola_internals.h"
   6
   7 #include <algorithm>
   8 #include <cmath>
   9 #include <cstring>
  10 #include <limits>
  11 #include <memory>
  12
  13 #include "base/check_op.h"
  14 #include "base/numerics/math_constants.h"
  15 #include "build/build_config.h"
  16 #include "media/base/audio_bus.h"
  17
  18 #if defined(ARCH_CPU_X86_FAMILY)
  19 #define USE_SIMD 1
  20 #include <xmmintrin.h>
  21 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
  22 #define USE_SIMD 1
  23 #include <arm_neon.h>
  24 #endif
  25
  26 namespace media {
  27
  28 namespace internal {
  29
  30 bool InInterval(int n, Interval q) {
  31   return n >= q.first && n <= q.second;
  32 }
  33
  34 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,
  35                                     const float* energy_a,
  36                                     const float* energy_b,
  37                                     int channels) {
  38   const float kEpsilon = 1e-12f;
  39   float similarity_measure = 0.0f;
  40   for (int n = 0; n < channels; ++n) {
  41     similarity_measure +=
  42         dot_prod_a_b[n] / std::sqrt(energy_a[n] * energy_b[n] + kEpsilon);
  43   }
  44   return similarity_measure;
  45 }
  46
  47 void MultiChannelDotProduct(const AudioBus* a,
  48                             int frame_offset_a,
  49                             const AudioBus* b,
  50                             int frame_offset_b,
  51                             int num_frames,
  52                             float* dot_product) {
  53   DCHECK_EQ(a->channels(), b->channels());
  54   DCHECK_GE(frame_offset_a, 0);
  55   DCHECK_GE(frame_offset_b, 0);
  56   DCHECK_LE(frame_offset_a + num_frames, a->frames());
  57   DCHECK_LE(frame_offset_b + num_frames, b->frames());
  58
  59 // SIMD optimized variants can provide a massive speedup to this operation.
  60 #if defined(USE_SIMD)
  61   const int rem = num_frames % 4;
  62   const int last_index = num_frames - rem;
  63   const int channels = a->channels();
  64   for (int ch = 0; ch < channels; ++ch) {
  65     const float* a_src = a->channel(ch) + frame_offset_a;
  66     const float* b_src = b->channel(ch) + frame_offset_b;
  67
  68 #if defined(ARCH_CPU_X86_FAMILY)
  69     // First sum all components.
  70     __m128 m_sum = _mm_setzero_ps();
  71     for (int s = 0; s < last_index; s += 4) {
  72       m_sum = _mm_add_ps(
  73           m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));
  74     }
  75
  76     // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a
  77     // horizontal sum function, so we have to condense manually.
  78     m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
  79     _mm_store_ss(dot_product + ch,
  80                  _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
  81 #elif defined(ARCH_CPU_ARM_FAMILY)
  82     // First sum all components.
  83     float32x4_t m_sum = vmovq_n_f32(0);
  84     for (int s = 0; s < last_index; s += 4)
  85       m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));
  86
  87     // Reduce to a single float for this channel.
  88     float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
  89     dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
  90 #endif
  91   }
  92
  93   if (!rem)
  94     return;
  95   num_frames = rem;
  96   frame_offset_a += last_index;
  97   frame_offset_b += last_index;
  98 #else
  99   memset(dot_product, 0, sizeof(*dot_product) * a->channels());
 100 #endif  // defined(USE_SIMD)
 101
 102   // C version is required to handle remainder of frames (% 4 != 0)
 103   for (int k = 0; k < a->channels(); ++k) {
 104     const float* ch_a = a->channel(k) + frame_offset_a;
 105     const float* ch_b = b->channel(k) + frame_offset_b;
 106     for (int n = 0; n < num_frames; ++n)
 107       dot_product[k] += *ch_a++ * *ch_b++;
 108   }
 109 }
 110
 111 void MultiChannelMovingBlockEnergies(const AudioBus* input,
 112                                      int frames_per_block,
 113                                      float* energy) {
 114   int num_blocks = input->frames() - (frames_per_block - 1);
 115   int channels = input->channels();
 116
 117   for (int k = 0; k < input->channels(); ++k) {
 118     const float* input_channel = input->channel(k);
 119
 120     energy[k] = 0;
 121
 122     // First block of channel |k|.
 123     for (int m = 0; m < frames_per_block; ++m) {
 124       energy[k] += input_channel[m] * input_channel[m];
 125     }
 126
 127     const float* slide_out = input_channel;
 128     const float* slide_in = input_channel + frames_per_block;
 129     for (int n = 1; n < num_blocks; ++n, ++slide_in, ++slide_out) {
 130       energy[k + n * channels] = energy[k + (n - 1) * channels] - *slide_out *
 131           *slide_out + *slide_in * *slide_in;
 132     }
 133   }
 134 }
 135
 136 // Fit the curve f(x) = a * x^2 + b * x + c such that
 137 //   f(-1) = y[0]
 138 //   f(0) = y[1]
 139 //   f(1) = y[2]
 140 // and return the maximum, assuming that y[0] <= y[1] >= y[2].
 141 void QuadraticInterpolation(const float* y_values,
 142                             float* extremum,
 143                             float* extremum_value) {
 144   float a = 0.5f * (y_values[2] + y_values[0]) - y_values[1];
 145   float b = 0.5f * (y_values[2] - y_values[0]);
 146   float c = y_values[1];
 147
 148   if (a == 0.f) {
 149     // The coordinates are colinear (within floating-point error).
 150     *extremum = 0;
 151     *extremum_value = y_values[1];
 152   } else {
 153     *extremum = -b / (2.f * a);
 154     *extremum_value = a * (*extremum) * (*extremum) + b * (*extremum) + c;
 155   }
 156 }
 157
 158 int DecimatedSearch(int decimation,
 159                     Interval exclude_interval,
 160                     const AudioBus* target_block,
 161                     const AudioBus* search_segment,
 162                     const float* energy_target_block,
 163                     const float* energy_candidate_blocks) {
 164   int channels = search_segment->channels();
 165   int block_size = target_block->frames();
 166   int num_candidate_blocks = search_segment->frames() - (block_size - 1);
 167   std::unique_ptr<float[]> dot_prod(new float[channels]);
 168   float similarity[3];  // Three elements for cubic interpolation.
 169
 170   int n = 0;
 171   MultiChannelDotProduct(target_block, 0, search_segment, n, block_size,
 172                          dot_prod.get());
 173   similarity[0] = MultiChannelSimilarityMeasure(
 174       dot_prod.get(), energy_target_block,
 175       &energy_candidate_blocks[n * channels], channels);
 176
 177   // Set the starting point as optimal point.
 178   float best_similarity = similarity[0];
 179   int optimal_index = 0;
 180
 181   n += decimation;
 182   if (n >= num_candidate_blocks) {
 183     return 0;
 184   }
 185
 186   MultiChannelDotProduct(target_block, 0, search_segment, n, block_size,
 187                          dot_prod.get());
 188   similarity[1] = MultiChannelSimilarityMeasure(
 189       dot_prod.get(), energy_target_block,
 190       &energy_candidate_blocks[n * channels], channels);
 191
 192   n += decimation;
 193   if (n >= num_candidate_blocks) {
 194     // We cannot do any more sampling. Compare these two values and return the
 195     // optimal index.
 196     return similarity[1] > similarity[0] ? decimation : 0;
 197   }
 198
 199   for (; n < num_candidate_blocks; n += decimation) {
 200     MultiChannelDotProduct(target_block, 0, search_segment, n, block_size,
 201                            dot_prod.get());
 202
 203     similarity[2] = MultiChannelSimilarityMeasure(
 204         dot_prod.get(), energy_target_block,
 205         &energy_candidate_blocks[n * channels], channels);
 206
 207     if ((similarity[1] > similarity[0] && similarity[1] >= similarity[2]) ||
 208         (similarity[1] >= similarity[0] && similarity[1] > similarity[2])) {
 209       // A local maximum is found. Do a cubic interpolation for a better
 210       // estimate of candidate maximum.
 211       float normalized_candidate_index;
 212       float candidate_similarity;
 213       QuadraticInterpolation(similarity, &normalized_candidate_index,
 214                              &candidate_similarity);
 215
 216       int candidate_index = n - decimation + static_cast<int>(
 217           normalized_candidate_index * decimation +  0.5f);
 218       if (candidate_similarity > best_similarity &&
 219           !InInterval(candidate_index, exclude_interval)) {
 220         optimal_index = candidate_index;
 221         best_similarity = candidate_similarity;
 222       }
 223     } else if (n + decimation >= num_candidate_blocks &&
 224                similarity[2] > best_similarity &&
 225                !InInterval(n, exclude_interval)) {
 226       // If this is the end-point and has a better similarity-measure than
 227       // optimal, then we accept it as optimal point.
 228       optimal_index = n;
 229       best_similarity = similarity[2];
 230     }
 231     memmove(similarity, &similarity[1], 2 * sizeof(*similarity));
 232   }
 233   return optimal_index;
 234 }
 235
 236 int FullSearch(int low_limit,
 237                int high_limit,
 238                Interval exclude_interval,
 239                const AudioBus* target_block,
 240                const AudioBus* search_block,
 241                const float* energy_target_block,
 242                const float* energy_candidate_blocks) {
 243   int channels = search_block->channels();
 244   int block_size = target_block->frames();
 245   std::unique_ptr<float[]> dot_prod(new float[channels]);
 246
 247   float best_similarity = std::numeric_limits<float>::min();
 248   int optimal_index = 0;
 249
 250   for (int n = low_limit; n <= high_limit; ++n) {
 251     if (InInterval(n, exclude_interval)) {
 252       continue;
 253     }
 254     MultiChannelDotProduct(target_block, 0, search_block, n, block_size,
 255                            dot_prod.get());
 256
 257     float similarity = MultiChannelSimilarityMeasure(
 258         dot_prod.get(), energy_target_block,
 259         &energy_candidate_blocks[n * channels], channels);
 260
 261     if (similarity > best_similarity) {
 262       best_similarity = similarity;
 263       optimal_index = n;
 264     }
 265   }
 266
 267   return optimal_index;
 268 }
 269
 270 int OptimalIndex(const AudioBus* search_block,
 271                  const AudioBus* target_block,
 272                  Interval exclude_interval) {
 273   int channels = search_block->channels();
 274   DCHECK_EQ(channels, target_block->channels());
 275   int target_size = target_block->frames();
 276   int num_candidate_blocks = search_block->frames() - (target_size - 1);
 277
 278   // This is a compromise between complexity reduction and search accuracy. I
 279   // don't have a proof that down sample of order 5 is optimal. One can compute
 280   // a decimation factor that minimizes complexity given the size of
 281   // |search_block| and |target_block|. However, my experiments show the rate of
 282   // missing the optimal index is significant. This value is chosen
 283   // heuristically based on experiments.
 284   const int kSearchDecimation = 5;
 285
 286   std::unique_ptr<float[]> energy_target_block(new float[channels]);
 287   std::unique_ptr<float[]> energy_candidate_blocks(
 288       new float[channels * num_candidate_blocks]);
 289
 290   // Energy of all candid frames.
 291   MultiChannelMovingBlockEnergies(search_block, target_size,
 292                                   energy_candidate_blocks.get());
 293
 294   // Energy of target frame.
 295   MultiChannelDotProduct(target_block, 0, target_block, 0,
 296                          target_size, energy_target_block.get());
 297
 298   int optimal_index = DecimatedSearch(kSearchDecimation,
 299                                       exclude_interval, target_block,
 300                                       search_block, energy_target_block.get(),
 301                                       energy_candidate_blocks.get());
 302
 303   int lim_low = std::max(0, optimal_index - kSearchDecimation);
 304   int lim_high = std::min(num_candidate_blocks - 1,
 305                           optimal_index + kSearchDecimation);
 306   return FullSearch(lim_low, lim_high, exclude_interval, target_block,
 307                     search_block, energy_target_block.get(),
 308                     energy_candidate_blocks.get());
 309 }
 310
 311 void GetPeriodicHanningWindow(int window_length, float* window) {
 312   const float scale = 2.0f * base::kPiFloat / window_length;
 313   for (int n = 0; n < window_length; ++n)
 314     window[n] = 0.5f * (1.0f - std::cos(n * scale));
 315 }
 316
 317 }  // namespace internal
 318
 319 }  // namespace media
 320