2 * Copyright (C) 2011 Google Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14 * its contributors may be used to endorse or promote products derived
15 * from this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include "platform/audio/SincResampler.h"
35 #include "platform/audio/AudioBus.h"
37 #include "wtf/MathExtras.h"
39 #if CPU(X86) || CPU(X86_64)
40 #include <emmintrin.h>
45 // Input buffer layout, dividing the total buffer into regions (r0 - r5):
47 // |----------------|----------------------------------------------------------------|----------------|
49 // blockSize + kernelSize / 2
50 // <-------------------------------------------------------------------------------->
53 // kernelSize / 2 kernelSize / 2 kernelSize / 2 kernelSize / 2
54 // <---------------> <---------------> <---------------> <--------------->
58 // <-------------------------------------------------------------->
63 // 1) Consume input frames into r0 (r1 is zero-initialized).
64 // 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4,
65 // or we've finished generating all the output frames.
66 // 3) Copy r3 to r1 and r4 to r2.
67 // 4) Consume input frames into r5 (zero-pad if we run out of input).
68 // 5) Goto (2) until all of input is consumed.
70 // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
// Stores the resampling parameters and sizes the internal storage from them.
//   scaleFactor           - amount m_virtualSourceIndex advances per output frame (see process()).
//   kernelSize            - number of taps in each windowed-sinc kernel; process() requires it to be even.
//   numberOfKernelOffsets - number of sub-sample steps between 0.0 and 1.0 for which kernels are stored.
74 SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
75 : m_scaleFactor(scaleFactor)
76 , m_kernelSize(kernelSize)
77 , m_numberOfKernelOffsets(numberOfKernelOffsets)
78 , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1)) // One kernel per offset index 0..m_numberOfKernelOffsets inclusive.
79 , m_virtualSourceIndex(0)
// NOTE(review): the next initializer reads m_blockSize, so m_blockSize must be declared
// (and therefore initialized) before m_inputBuffer in the class - confirm in the header.
81 , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
83 , m_sourceFramesAvailable(0)
85 , m_isBufferPrimed(false) // process() primes the input buffer on its first call.
// Fills m_kernelStorage with (m_numberOfKernelOffsets + 1) Blackman-windowed sinc kernels of
// m_kernelSize taps each. Kernel `offsetIndex` is stored at m_kernelStorage[offsetIndex * m_kernelSize]
// and corresponds to a sub-sample offset of offsetIndex / m_numberOfKernelOffsets.
90 void SincResampler::initializeKernel()
92 // Blackman window parameters.
94 double a0 = 0.5 * (1.0 - alpha);
96 double a2 = 0.5 * alpha;
98 // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
// When m_scaleFactor > 1.0 the cutoff is lowered to 1 / m_scaleFactor; otherwise it stays at 1.0.
99 double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
101 // The sinc function is an idealized brick-wall filter, but since we're windowing it the
102 // transition from pass to stop does not happen right away. So we should adjust the
103 // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
104 // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
105 sincScaleFactor *= 0.9;
107 int n = m_kernelSize;
108 int halfSize = n / 2;
110 // Generates a set of windowed sinc() kernels.
111 // We generate a range of sub-sample offsets from 0.0 to 1.0.
// The loop bound is inclusive so that a kernel for offset 1.0 exists; process() reads the
// kernel following the selected one (k2 = k1 + m_kernelSize).
112 for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
// NOTE(review): if m_numberOfKernelOffsets were 0 this would evaluate 0.0 / 0 - confirm callers never pass 0.
113 double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
115 for (int i = 0; i < n; ++i) {
116 // Compute the sinc() with offset.
117 double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
118 double sinc = !s ? 1.0 : sin(s) / s; // Avoid dividing by zero: use 1.0 when s == 0.
119 sinc *= sincScaleFactor;
121 // Compute Blackman window, matching the offset of the sinc().
122 double x = (i - subsampleOffset) / n;
123 double window = a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);
125 // Window the sinc() function and store at the correct offset.
126 m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
// Asks m_sourceProvider to supply numberOfSourceFrames frames, using `buffer` as the memory
// of channel 0 of a temporary one-channel AudioBus.
131 void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
133 ASSERT(m_sourceProvider);
// Runtime check for a missing provider, in addition to the ASSERT above.
134 if (!m_sourceProvider)
137 // Wrap the provided buffer by an AudioBus for use by the source provider.
// NOTE(review): the `false` argument presumably tells the bus not to allocate its own channel memory - confirm against AudioBus::create.
138 RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false);
140 // FIXME: Find a way to make the following const-correct:
141 bus->setChannelMemory(0, buffer, numberOfSourceFrames);
143 m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
148 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
// Each provideInput() call hands out the next frames from that buffer and zero-fills
// whatever part of the request the buffer can no longer satisfy.
150 class BufferSourceProvider FINAL : public AudioSourceProvider {
// numberOfSourceFrames is the number of frames that may be read starting at `source`.
152 BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
154 , m_sourceFramesAvailable(numberOfSourceFrames)
158 // Consumes samples from the in-memory buffer.
// Writes framesToProcess frames into channel 0 of `bus`.
159 virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE
161 ASSERT(m_source && bus);
162 if (!m_source || !bus)
165 float* buffer = bus->channel(0)->mutableData();
167 // Clamp to number of frames available and zero-pad.
168 size_t framesToCopy = min(m_sourceFramesAvailable, framesToProcess);
169 memcpy(buffer, m_source, sizeof(float) * framesToCopy);
171 // Zero-pad if necessary.
172 if (framesToCopy < framesToProcess)
173 memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
// Advance the read position past the frames just copied.
175 m_sourceFramesAvailable -= framesToCopy;
176 m_source += framesToCopy;
180 const float* m_source; // Next unread frame of the wrapped buffer.
181 size_t m_sourceFramesAvailable; // Frames remaining at m_source.
// Resamples an in-memory buffer: reads numberOfSourceFrames frames from `source` and writes
// static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor) frames to `destination`.
// `destination` must have room for that many frames.
186 void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
188 // Resample an in-memory buffer using an AudioSourceProvider.
189 BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
// The output frame count is truncated toward zero by the cast.
191 unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
192 unsigned remaining = numberOfDestinationFrames;
// Hand work to the provider-based process() in chunks of at most m_blockSize output frames.
195 unsigned framesThisTime = min(remaining, m_blockSize);
196 process(&sourceProvider, destination, framesThisTime);
198 destination += framesThisTime;
199 remaining -= framesThisTime;
// Generates framesToProcess output frames into `destination`, pulling input from
// `sourceProvider` through consumeSource() as the input buffer is used up.
// Each output frame is a linear blend of two kernel "convolutions" taken around
// m_virtualSourceIndex, which then advances by m_scaleFactor.
203 void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
// Preconditions: a non-null provider, a block larger than the kernel, an input buffer of at
// least m_blockSize + m_kernelSize frames, and an even kernel size.
205 bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
210 m_sourceProvider = sourceProvider; // consumeSource() reads from this member.
212 unsigned numberOfDestinationFrames = framesToProcess; // NOTE(review): size_t is converted to unsigned here.
214 // Set up various region pointers in the buffer (see diagram above).
215 float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
216 float* r1 = m_inputBuffer.data();
218 float* r3 = r0 + m_blockSize - m_kernelSize / 2;
219 float* r4 = r0 + m_blockSize;
220 float* r5 = r0 + m_kernelSize / 2;
223 // Prime the input buffer at the start of the input stream.
224 if (!m_isBufferPrimed) {
225 consumeSource(r0, m_blockSize + m_kernelSize / 2);
226 m_isBufferPrimed = true;
// Outer loop: runs until every requested output frame has been written.
231 while (numberOfDestinationFrames) {
// Inner loop: emit output frames while the virtual read position is inside the current block.
232 while (m_virtualSourceIndex < m_blockSize) {
233 // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
234 int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
235 double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
// Scale the fractional position to kernel-offset units; its integer part selects the kernel.
237 double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
238 int offsetIndex = static_cast<int>(virtualOffsetIndex);
// k1 is the kernel at the selected offset, k2 the kernel stored immediately after it.
240 float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
241 float* k2 = k1 + m_kernelSize;
243 // Initialize input pointer based on quantized m_virtualSourceIndex.
244 float* inputP = r1 + sourceIndexI;
246 // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
250 // Figure out how much to weight each kernel's "convolution".
251 double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
253 // Generate a single output sample.
254 int n = m_kernelSize;
// Scalar step: accumulates one input sample against both kernels.
256 #define CONVOLVE_ONE_SAMPLE \
258 sum1 += input * *k1; \
259 sum2 += input * *k2; \
266 #if CPU(X86) || CPU(X86_64)
267 // If the inputP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
268 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
273 // Now inputP is aligned, so start applying SSE.
// endP marks the end of the last whole group of four frames reachable from here.
274 float* endP = inputP + n - n % 4;
281 __m128 sums1 = _mm_setzero_ps();
282 __m128 sums2 = _mm_setzero_ps();
// The kernel pointers may or may not be 16-byte aligned, so pick the load variant per kernel.
283 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
284 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
// LOAD_DATA(l1, l2): inputP always uses the aligned load; l1/l2 name the load flavour (load or loadu) for k1/k2.
286 #define LOAD_DATA(l1, l2) \
287 mInput = _mm_load_ps(inputP); \
288 mK1 = _mm_##l1##_ps(k1); \
289 mK2 = _mm_##l2##_ps(k2);
// CONVOLVE_4_SAMPLES: multiplies four input frames by both kernels and accumulates into sums1/sums2.
291 #define CONVOLVE_4_SAMPLES \
292 mul1 = _mm_mul_ps(mInput, mK1); \
293 mul2 = _mm_mul_ps(mInput, mK2); \
294 sums1 = _mm_add_ps(sums1, mul1); \
295 sums2 = _mm_add_ps(sums2, mul2); \
300 if (k1Aligned && k2Aligned) { // both aligned
301 while (inputP < endP) {
302 LOAD_DATA(load, load)
305 } else if (!k1Aligned && k2Aligned) { // only k2 aligned
306 while (inputP < endP) {
307 LOAD_DATA(loadu, load)
310 } else if (k1Aligned && !k2Aligned) { // only k1 aligned
311 while (inputP < endP) {
312 LOAD_DATA(load, loadu)
315 } else { // both non-aligned
316 while (inputP < endP) {
317 LOAD_DATA(loadu, loadu)
322 // Summarize the SSE results to sum1 and sum2.
// Each accumulator holds four partial sums; add the four lanes into the scalar totals.
323 float* groupSumP = reinterpret_cast<float*>(&sums1);
324 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
325 groupSumP = reinterpret_cast<float*>(&sums2);
326 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
334 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
336 // Optimize size 32 and size 64 kernels by unrolling the while loop.
337 // A 20 - 30% speed improvement was measured in some cases by using this approach.
340 CONVOLVE_ONE_SAMPLE // 1
341 CONVOLVE_ONE_SAMPLE // 2
342 CONVOLVE_ONE_SAMPLE // 3
343 CONVOLVE_ONE_SAMPLE // 4
344 CONVOLVE_ONE_SAMPLE // 5
345 CONVOLVE_ONE_SAMPLE // 6
346 CONVOLVE_ONE_SAMPLE // 7
347 CONVOLVE_ONE_SAMPLE // 8
348 CONVOLVE_ONE_SAMPLE // 9
349 CONVOLVE_ONE_SAMPLE // 10
350 CONVOLVE_ONE_SAMPLE // 11
351 CONVOLVE_ONE_SAMPLE // 12
352 CONVOLVE_ONE_SAMPLE // 13
353 CONVOLVE_ONE_SAMPLE // 14
354 CONVOLVE_ONE_SAMPLE // 15
355 CONVOLVE_ONE_SAMPLE // 16
356 CONVOLVE_ONE_SAMPLE // 17
357 CONVOLVE_ONE_SAMPLE // 18
358 CONVOLVE_ONE_SAMPLE // 19
359 CONVOLVE_ONE_SAMPLE // 20
360 CONVOLVE_ONE_SAMPLE // 21
361 CONVOLVE_ONE_SAMPLE // 22
362 CONVOLVE_ONE_SAMPLE // 23
363 CONVOLVE_ONE_SAMPLE // 24
364 CONVOLVE_ONE_SAMPLE // 25
365 CONVOLVE_ONE_SAMPLE // 26
366 CONVOLVE_ONE_SAMPLE // 27
367 CONVOLVE_ONE_SAMPLE // 28
368 CONVOLVE_ONE_SAMPLE // 29
369 CONVOLVE_ONE_SAMPLE // 30
370 CONVOLVE_ONE_SAMPLE // 31
371 CONVOLVE_ONE_SAMPLE // 32
372 } else if (n == 64) {
373 CONVOLVE_ONE_SAMPLE // 1
374 CONVOLVE_ONE_SAMPLE // 2
375 CONVOLVE_ONE_SAMPLE // 3
376 CONVOLVE_ONE_SAMPLE // 4
377 CONVOLVE_ONE_SAMPLE // 5
378 CONVOLVE_ONE_SAMPLE // 6
379 CONVOLVE_ONE_SAMPLE // 7
380 CONVOLVE_ONE_SAMPLE // 8
381 CONVOLVE_ONE_SAMPLE // 9
382 CONVOLVE_ONE_SAMPLE // 10
383 CONVOLVE_ONE_SAMPLE // 11
384 CONVOLVE_ONE_SAMPLE // 12
385 CONVOLVE_ONE_SAMPLE // 13
386 CONVOLVE_ONE_SAMPLE // 14
387 CONVOLVE_ONE_SAMPLE // 15
388 CONVOLVE_ONE_SAMPLE // 16
389 CONVOLVE_ONE_SAMPLE // 17
390 CONVOLVE_ONE_SAMPLE // 18
391 CONVOLVE_ONE_SAMPLE // 19
392 CONVOLVE_ONE_SAMPLE // 20
393 CONVOLVE_ONE_SAMPLE // 21
394 CONVOLVE_ONE_SAMPLE // 22
395 CONVOLVE_ONE_SAMPLE // 23
396 CONVOLVE_ONE_SAMPLE // 24
397 CONVOLVE_ONE_SAMPLE // 25
398 CONVOLVE_ONE_SAMPLE // 26
399 CONVOLVE_ONE_SAMPLE // 27
400 CONVOLVE_ONE_SAMPLE // 28
401 CONVOLVE_ONE_SAMPLE // 29
402 CONVOLVE_ONE_SAMPLE // 30
403 CONVOLVE_ONE_SAMPLE // 31
404 CONVOLVE_ONE_SAMPLE // 32
405 CONVOLVE_ONE_SAMPLE // 33
406 CONVOLVE_ONE_SAMPLE // 34
407 CONVOLVE_ONE_SAMPLE // 35
408 CONVOLVE_ONE_SAMPLE // 36
409 CONVOLVE_ONE_SAMPLE // 37
410 CONVOLVE_ONE_SAMPLE // 38
411 CONVOLVE_ONE_SAMPLE // 39
412 CONVOLVE_ONE_SAMPLE // 40
413 CONVOLVE_ONE_SAMPLE // 41
414 CONVOLVE_ONE_SAMPLE // 42
415 CONVOLVE_ONE_SAMPLE // 43
416 CONVOLVE_ONE_SAMPLE // 44
417 CONVOLVE_ONE_SAMPLE // 45
418 CONVOLVE_ONE_SAMPLE // 46
419 CONVOLVE_ONE_SAMPLE // 47
420 CONVOLVE_ONE_SAMPLE // 48
421 CONVOLVE_ONE_SAMPLE // 49
422 CONVOLVE_ONE_SAMPLE // 50
423 CONVOLVE_ONE_SAMPLE // 51
424 CONVOLVE_ONE_SAMPLE // 52
425 CONVOLVE_ONE_SAMPLE // 53
426 CONVOLVE_ONE_SAMPLE // 54
427 CONVOLVE_ONE_SAMPLE // 55
428 CONVOLVE_ONE_SAMPLE // 56
429 CONVOLVE_ONE_SAMPLE // 57
430 CONVOLVE_ONE_SAMPLE // 58
431 CONVOLVE_ONE_SAMPLE // 59
432 CONVOLVE_ONE_SAMPLE // 60
433 CONVOLVE_ONE_SAMPLE // 61
434 CONVOLVE_ONE_SAMPLE // 62
435 CONVOLVE_ONE_SAMPLE // 63
436 CONVOLVE_ONE_SAMPLE // 64
439 // Non-optimized using actual while loop.
446 // Linearly interpolate the two "convolutions".
447 double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
449 *destination++ = result; // The double result is narrowed to float on store.
451 // Advance the virtual index.
452 m_virtualSourceIndex += m_scaleFactor;
454 --numberOfDestinationFrames;
455 if (!numberOfDestinationFrames)
459 // Wrap back around to the start.
// Subtracting (rather than zeroing) keeps the amount by which the index overshot the block.
460 m_virtualSourceIndex -= m_blockSize;
462 // Step (3) Copy r3 to r1 and r4 to r2.
463 // This wraps the last input frames back to the start of the buffer.
464 memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
465 memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));
468 // Refresh the buffer with more input.
469 consumeSource(r5, m_blockSize);
475 #endif // ENABLE(WEB_AUDIO)