1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "media/filters/wsola_internals.h"
13 #include "base/check_op.h"
14 #include "base/numerics/math_constants.h"
15 #include "build/build_config.h"
16 #include "media/base/audio_bus.h"
18 #if defined(ARCH_CPU_X86_FAMILY)
20 #include <xmmintrin.h>
21 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
30 bool InInterval(int n, Interval q) {
31 return n >= q.first && n <= q.second;
34 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,
35 const float* energy_a,
36 const float* energy_b,
38 const float kEpsilon = 1e-12f;
39 float similarity_measure = 0.0f;
40 for (int n = 0; n < channels; ++n) {
42 dot_prod_a_b[n] / std::sqrt(energy_a[n] * energy_b[n] + kEpsilon);
44 return similarity_measure;
47 void MultiChannelDotProduct(const AudioBus* a,
53 DCHECK_EQ(a->channels(), b->channels());
54 DCHECK_GE(frame_offset_a, 0);
55 DCHECK_GE(frame_offset_b, 0);
56 DCHECK_LE(frame_offset_a + num_frames, a->frames());
57 DCHECK_LE(frame_offset_b + num_frames, b->frames());
59 // SIMD optimized variants can provide a massive speedup to this operation.
61 const int rem = num_frames % 4;
62 const int last_index = num_frames - rem;
63 const int channels = a->channels();
64 for (int ch = 0; ch < channels; ++ch) {
65 const float* a_src = a->channel(ch) + frame_offset_a;
66 const float* b_src = b->channel(ch) + frame_offset_b;
68 #if defined(ARCH_CPU_X86_FAMILY)
69 // First sum all components.
70 __m128 m_sum = _mm_setzero_ps();
71 for (int s = 0; s < last_index; s += 4) {
73 m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));
76 // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a
77 // horizontal sum function, so we have to condense manually.
78 m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
79 _mm_store_ss(dot_product + ch,
80 _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
81 #elif defined(ARCH_CPU_ARM_FAMILY)
82 // First sum all components.
83 float32x4_t m_sum = vmovq_n_f32(0);
84 for (int s = 0; s < last_index; s += 4)
85 m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));
87 // Reduce to a single float for this channel.
88 float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
89 dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
96 frame_offset_a += last_index;
97 frame_offset_b += last_index;
99 memset(dot_product, 0, sizeof(*dot_product) * a->channels());
100 #endif // defined(USE_SIMD)
102 // C version is required to handle remainder of frames (% 4 != 0)
103 for (int k = 0; k < a->channels(); ++k) {
104 const float* ch_a = a->channel(k) + frame_offset_a;
105 const float* ch_b = b->channel(k) + frame_offset_b;
106 for (int n = 0; n < num_frames; ++n)
107 dot_product[k] += *ch_a++ * *ch_b++;
111 void MultiChannelMovingBlockEnergies(const AudioBus* input,
112 int frames_per_block,
114 int num_blocks = input->frames() - (frames_per_block - 1);
115 int channels = input->channels();
117 for (int k = 0; k < input->channels(); ++k) {
118 const float* input_channel = input->channel(k);
122 // First block of channel |k|.
123 for (int m = 0; m < frames_per_block; ++m) {
124 energy[k] += input_channel[m] * input_channel[m];
127 const float* slide_out = input_channel;
128 const float* slide_in = input_channel + frames_per_block;
129 for (int n = 1; n < num_blocks; ++n, ++slide_in, ++slide_out) {
130 energy[k + n * channels] = energy[k + (n - 1) * channels] - *slide_out *
131 *slide_out + *slide_in * *slide_in;
136 // Fit the curve f(x) = a * x^2 + b * x + c such that
140 // and return the maximum, assuming that y[0] <= y[1] >= y[2].
141 void QuadraticInterpolation(const float* y_values,
143 float* extremum_value) {
144 float a = 0.5f * (y_values[2] + y_values[0]) - y_values[1];
145 float b = 0.5f * (y_values[2] - y_values[0]);
146 float c = y_values[1];
149 // The coordinates are colinear (within floating-point error).
151 *extremum_value = y_values[1];
153 *extremum = -b / (2.f * a);
154 *extremum_value = a * (*extremum) * (*extremum) + b * (*extremum) + c;
158 int DecimatedSearch(int decimation,
159 Interval exclude_interval,
160 const AudioBus* target_block,
161 const AudioBus* search_segment,
162 const float* energy_target_block,
163 const float* energy_candidate_blocks) {
164 int channels = search_segment->channels();
165 int block_size = target_block->frames();
166 int num_candidate_blocks = search_segment->frames() - (block_size - 1);
167 std::unique_ptr<float[]> dot_prod(new float[channels]);
168 float similarity[3]; // Three elements for cubic interpolation.
171 MultiChannelDotProduct(target_block, 0, search_segment, n, block_size,
173 similarity[0] = MultiChannelSimilarityMeasure(
174 dot_prod.get(), energy_target_block,
175 &energy_candidate_blocks[n * channels], channels);
177 // Set the starting point as optimal point.
178 float best_similarity = similarity[0];
179 int optimal_index = 0;
182 if (n >= num_candidate_blocks) {
186 MultiChannelDotProduct(target_block, 0, search_segment, n, block_size,
188 similarity[1] = MultiChannelSimilarityMeasure(
189 dot_prod.get(), energy_target_block,
190 &energy_candidate_blocks[n * channels], channels);
193 if (n >= num_candidate_blocks) {
194 // We cannot do any more sampling. Compare these two values and return the
196 return similarity[1] > similarity[0] ? decimation : 0;
199 for (; n < num_candidate_blocks; n += decimation) {
200 MultiChannelDotProduct(target_block, 0, search_segment, n, block_size,
203 similarity[2] = MultiChannelSimilarityMeasure(
204 dot_prod.get(), energy_target_block,
205 &energy_candidate_blocks[n * channels], channels);
207 if ((similarity[1] > similarity[0] && similarity[1] >= similarity[2]) ||
208 (similarity[1] >= similarity[0] && similarity[1] > similarity[2])) {
209 // A local maximum is found. Do a cubic interpolation for a better
210 // estimate of candidate maximum.
211 float normalized_candidate_index;
212 float candidate_similarity;
213 QuadraticInterpolation(similarity, &normalized_candidate_index,
214 &candidate_similarity);
216 int candidate_index = n - decimation + static_cast<int>(
217 normalized_candidate_index * decimation + 0.5f);
218 if (candidate_similarity > best_similarity &&
219 !InInterval(candidate_index, exclude_interval)) {
220 optimal_index = candidate_index;
221 best_similarity = candidate_similarity;
223 } else if (n + decimation >= num_candidate_blocks &&
224 similarity[2] > best_similarity &&
225 !InInterval(n, exclude_interval)) {
226 // If this is the end-point and has a better similarity-measure than
227 // optimal, then we accept it as optimal point.
229 best_similarity = similarity[2];
231 memmove(similarity, &similarity[1], 2 * sizeof(*similarity));
233 return optimal_index;
236 int FullSearch(int low_limit,
238 Interval exclude_interval,
239 const AudioBus* target_block,
240 const AudioBus* search_block,
241 const float* energy_target_block,
242 const float* energy_candidate_blocks) {
243 int channels = search_block->channels();
244 int block_size = target_block->frames();
245 std::unique_ptr<float[]> dot_prod(new float[channels]);
247 float best_similarity = std::numeric_limits<float>::min();
248 int optimal_index = 0;
250 for (int n = low_limit; n <= high_limit; ++n) {
251 if (InInterval(n, exclude_interval)) {
254 MultiChannelDotProduct(target_block, 0, search_block, n, block_size,
257 float similarity = MultiChannelSimilarityMeasure(
258 dot_prod.get(), energy_target_block,
259 &energy_candidate_blocks[n * channels], channels);
261 if (similarity > best_similarity) {
262 best_similarity = similarity;
267 return optimal_index;
270 int OptimalIndex(const AudioBus* search_block,
271 const AudioBus* target_block,
272 Interval exclude_interval) {
273 int channels = search_block->channels();
274 DCHECK_EQ(channels, target_block->channels());
275 int target_size = target_block->frames();
276 int num_candidate_blocks = search_block->frames() - (target_size - 1);
278 // This is a compromise between complexity reduction and search accuracy. I
279 // don't have a proof that down sample of order 5 is optimal. One can compute
280 // a decimation factor that minimizes complexity given the size of
281 // |search_block| and |target_block|. However, my experiments show the rate of
282 // missing the optimal index is significant. This value is chosen
283 // heuristically based on experiments.
284 const int kSearchDecimation = 5;
286 std::unique_ptr<float[]> energy_target_block(new float[channels]);
287 std::unique_ptr<float[]> energy_candidate_blocks(
288 new float[channels * num_candidate_blocks]);
290 // Energy of all candid frames.
291 MultiChannelMovingBlockEnergies(search_block, target_size,
292 energy_candidate_blocks.get());
294 // Energy of target frame.
295 MultiChannelDotProduct(target_block, 0, target_block, 0,
296 target_size, energy_target_block.get());
298 int optimal_index = DecimatedSearch(kSearchDecimation,
299 exclude_interval, target_block,
300 search_block, energy_target_block.get(),
301 energy_candidate_blocks.get());
303 int lim_low = std::max(0, optimal_index - kSearchDecimation);
304 int lim_high = std::min(num_candidate_blocks - 1,
305 optimal_index + kSearchDecimation);
306 return FullSearch(lim_low, lim_high, exclude_interval, target_block,
307 search_block, energy_target_block.get(),
308 energy_candidate_blocks.get());
311 void GetPeriodicHanningWindow(int window_length, float* window) {
312 const float scale = 2.0f * base::kPiFloat / window_length;
313 for (int n = 0; n < window_length; ++n)
314 window[n] = 0.5f * (1.0f - std::cos(n * scale));
317 } // namespace internal