src/third_party/WebKit/Source/platform/audio/SincResampler.cpp

   1 /*
   2  * Copyright (C) 2011 Google Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  *
   8  * 1.  Redistributions of source code must retain the above copyright
   9  *     notice, this list of conditions and the following disclaimer.
  10  * 2.  Redistributions in binary form must reproduce the above copyright
  11  *     notice, this list of conditions and the following disclaimer in the
  12  *     documentation and/or other materials provided with the distribution.
  13  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  14  *     its contributors may be used to endorse or promote products derived
  15  *     from this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
  18  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  20  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
  21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 #include "config.h"
  30
  31 #if ENABLE(WEB_AUDIO)
  32
  33 #include "platform/audio/SincResampler.h"
  34
  35 #include "platform/audio/AudioBus.h"
  36 #include "wtf/CPU.h"
  37 #include "wtf/MathExtras.h"
  38
  39 #if CPU(X86) || CPU(X86_64)
  40 #include <emmintrin.h>
  41 #endif
  42
  43 using namespace std;
  44
  45 // Input buffer layout, dividing the total buffer into regions (r0 - r5):
  46 //
  47 // |----------------|----------------------------------------------------------------|----------------|
  48 //
  49 //                                              blockSize + kernelSize / 2
  50 //                   <-------------------------------------------------------------------------------->
  51 //                                                  r0
  52 //
  53 //   kernelSize / 2   kernelSize / 2                                 kernelSize / 2     kernelSize / 2
  54 // <---------------> <--------------->                              <---------------> <--------------->
  55 //         r1                r2                                             r3                r4
  56 //
  57 //                                              blockSize
  58 //                                     <-------------------------------------------------------------->
  59 //                                                  r5
  60
  61 // The Algorithm:
  62 //
  63 // 1) Consume input frames into r0 (r1 is zero-initialized).
// 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4,
//    or until we've finished generating all the output frames.
  66 // 3) Copy r3 to r1 and r4 to r2.
  67 // 4) Consume input frames into r5 (zero-pad if we run out of input).
  68 // 5) Goto (2) until all of input is consumed.
  69 //
  70 // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
  71
  72 namespace blink {
  73
// Constructs a resampler.
//
// scaleFactor: amount the virtual source index advances per output frame
//     (see process()); values > 1.0 also lower the filter cutoff in initializeKernel().
// kernelSize: number of taps in each windowed-sinc kernel. process() requires it to be
//     even and smaller than the block size.
// numberOfKernelOffsets: number of sub-sample subdivisions; initializeKernel() generates
//     (numberOfKernelOffsets + 1) kernels, which is why the kernel storage has one extra row.
//
// NOTE(review): the sizes of m_kernelStorage and m_inputBuffer are computed from other
// members (m_kernelSize, m_numberOfKernelOffsets, m_blockSize), so this relies on those
// members being declared before the arrays in the header — confirm.
SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
    : m_scaleFactor(scaleFactor)
    , m_kernelSize(kernelSize)
    , m_numberOfKernelOffsets(numberOfKernelOffsets)
    , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1))
    , m_virtualSourceIndex(0)
    , m_blockSize(512)
    , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
    , m_source(0)
    , m_sourceFramesAvailable(0)
    , m_sourceProvider(0)
    , m_isBufferPrimed(false)
{
    // Precompute the windowed-sinc kernels once; process() only reads them.
    initializeKernel();
}
  89
  90 void SincResampler::initializeKernel()
  91 {
  92     // Blackman window parameters.
  93     double alpha = 0.16;
  94     double a0 = 0.5 * (1.0 - alpha);
  95     double a1 = 0.5;
  96     double a2 = 0.5 * alpha;
  97
  98     // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
  99     double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
 100
 101     // The sinc function is an idealized brick-wall filter, but since we're windowing it the
 102     // transition from pass to stop does not happen right away. So we should adjust the
 103     // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
 104     // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
 105     sincScaleFactor *= 0.9;
 106
 107     int n = m_kernelSize;
 108     int halfSize = n / 2;
 109
 110     // Generates a set of windowed sinc() kernels.
 111     // We generate a range of sub-sample offsets from 0.0 to 1.0.
 112     for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
 113         double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
 114
 115         for (int i = 0; i < n; ++i) {
 116             // Compute the sinc() with offset.
 117             double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
 118             double sinc = !s ? 1.0 : sin(s) / s;
 119             sinc *= sincScaleFactor;
 120
 121             // Compute Blackman window, matching the offset of the sinc().
 122             double x = (i - subsampleOffset) / n;
 123             double window = a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);
 124
 125             // Window the sinc() function and store at the correct offset.
 126             m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
 127         }
 128     }
 129 }
 130
 131 void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
 132 {
 133     ASSERT(m_sourceProvider);
 134     if (!m_sourceProvider)
 135         return;
 136
 137     // Wrap the provided buffer by an AudioBus for use by the source provider.
 138     RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false);
 139
 140     // FIXME: Find a way to make the following const-correct:
 141     bus->setChannelMemory(0, buffer, numberOfSourceFrames);
 142
 143     m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
 144 }
 145
 146 namespace {
 147
 148 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
 149
 150 class BufferSourceProvider FINAL : public AudioSourceProvider {
 151 public:
 152     BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
 153         : m_source(source)
 154         , m_sourceFramesAvailable(numberOfSourceFrames)
 155     {
 156     }
 157
 158     // Consumes samples from the in-memory buffer.
 159     virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE
 160     {
 161         ASSERT(m_source && bus);
 162         if (!m_source || !bus)
 163             return;
 164
 165         float* buffer = bus->channel(0)->mutableData();
 166
 167         // Clamp to number of frames available and zero-pad.
 168         size_t framesToCopy = min(m_sourceFramesAvailable, framesToProcess);
 169         memcpy(buffer, m_source, sizeof(float) * framesToCopy);
 170
 171         // Zero-pad if necessary.
 172         if (framesToCopy < framesToProcess)
 173             memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
 174
 175         m_sourceFramesAvailable -= framesToCopy;
 176         m_source += framesToCopy;
 177     }
 178
 179 private:
 180     const float* m_source;
 181     size_t m_sourceFramesAvailable;
 182 };
 183
 184 } // namespace
 185
 186 void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
 187 {
 188     // Resample an in-memory buffer using an AudioSourceProvider.
 189     BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
 190
 191     unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
 192     unsigned remaining = numberOfDestinationFrames;
 193
 194     while (remaining) {
 195         unsigned framesThisTime = min(remaining, m_blockSize);
 196         process(&sourceProvider, destination, framesThisTime);
 197
 198         destination += framesThisTime;
 199         remaining -= framesThisTime;
 200     }
 201 }
 202
// Generates |framesToProcess| resampled output frames into |destination|, pulling input
// from |sourceProvider| on demand through consumeSource().
//
// Each output frame is produced by convolving the input around m_virtualSourceIndex with the
// two precomputed kernels whose sub-sample offsets straddle the fractional part of the index,
// then linearly interpolating between the two results. The index advances by m_scaleFactor
// per output frame.
//
// The input buffer contents, m_virtualSourceIndex and m_isBufferPrimed are members, so the
// stream position carries over from one call to the next.
//
// If the sanity check at the top fails, the function asserts and returns without writing
// anything to |destination|.
void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
{
    // Requires a provider, a block larger than the kernel, an input buffer big enough for the
    // layout described at the top of the file, and an even kernel size.
    bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
    ASSERT(isGood);
    if (!isGood)
        return;

    // consumeSource() reads from this member.
    m_sourceProvider = sourceProvider;

    unsigned numberOfDestinationFrames = framesToProcess;

    // Setup various region pointers in the buffer (see diagram above).
    float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
    float* r1 = m_inputBuffer.data();
    float* r2 = r0;
    float* r3 = r0 + m_blockSize - m_kernelSize / 2;
    float* r4 = r0 + m_blockSize;
    float* r5 = r0 + m_kernelSize / 2;

    // Step (1)
    // Prime the input buffer at the start of the input stream.
    if (!m_isBufferPrimed) {
        consumeSource(r0, m_blockSize + m_kernelSize / 2);
        m_isBufferPrimed = true;
    }

    // Step (2)

    while (numberOfDestinationFrames) {
        // Produce output until the virtual index leaves the current block.
        while (m_virtualSourceIndex < m_blockSize) {
            // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
            int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
            double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;

            double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
            int offsetIndex = static_cast<int>(virtualOffsetIndex);

            // k1 and k2 are the adjacent kernels at offsetIndex and offsetIndex + 1.
            float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
            float* k2 = k1 + m_kernelSize;

            // Initialize input pointer based on quantized m_virtualSourceIndex.
            // Starting from r1 (kernelSize / 2 frames before r0) lines the middle of the kernel
            // up with r0 + sourceIndexI.
            float* inputP = r1 + sourceIndexI;

            // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
            float sum1 = 0;
            float sum2 = 0;

            // Figure out how much to weight each kernel's "convolution".
            double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;

            // Generate a single output sample.
            int n = m_kernelSize;

// Multiplies one input sample by the current tap of both kernels, accumulates into
// sum1/sum2, and advances all three pointers.
#define CONVOLVE_ONE_SAMPLE      \
            input = *inputP++;   \
            sum1 += input * *k1; \
            sum2 += input * *k2; \
            ++k1;                \
            ++k2;

            {
                float input;

#if CPU(X86) || CPU(X86_64)
                // If the inputP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
                while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
                    CONVOLVE_ONE_SAMPLE
                    n--;
                }

                // Now the inputP is aligned and start to apply SSE.
                // endP marks the end of the largest multiple-of-4 run of remaining taps.
                float* endP = inputP + n - n % 4;
                __m128 mInput;
                __m128 mK1;
                __m128 mK2;
                __m128 mul1;
                __m128 mul2;

                __m128 sums1 = _mm_setzero_ps();
                __m128 sums2 = _mm_setzero_ps();
                // The kernel pointers may or may not be 16-byte aligned at this point, so pick
                // aligned or unaligned loads for each of them independently.
                bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
                bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);

// Loads four input samples (aligned load) and four taps from each kernel, using the load
// flavor (load / loadu) named by l1 for k1 and l2 for k2.
#define LOAD_DATA(l1, l2)                        \
                mInput = _mm_load_ps(inputP);    \
                mK1 = _mm_##l1##_ps(k1);         \
                mK2 = _mm_##l2##_ps(k2);

// Multiply-accumulates the four loaded samples into sums1/sums2 and advances all pointers by four.
#define CONVOLVE_4_SAMPLES                       \
                mul1 = _mm_mul_ps(mInput, mK1);  \
                mul2 = _mm_mul_ps(mInput, mK2);  \
                sums1 = _mm_add_ps(sums1, mul1); \
                sums2 = _mm_add_ps(sums2, mul2); \
                inputP += 4;                     \
                k1 += 4;                         \
                k2 += 4;

                if (k1Aligned && k2Aligned) { // both aligned
                    while (inputP < endP) {
                        LOAD_DATA(load, load)
                        CONVOLVE_4_SAMPLES
                    }
                } else if (!k1Aligned && k2Aligned) { // only k2 aligned
                    while (inputP < endP) {
                        LOAD_DATA(loadu, load)
                        CONVOLVE_4_SAMPLES
                    }
                } else if (k1Aligned && !k2Aligned) { // only k1 aligned
                    while (inputP < endP) {
                        LOAD_DATA(load, loadu)
                        CONVOLVE_4_SAMPLES
                    }
                } else { // both non-aligned
                    while (inputP < endP) {
                        LOAD_DATA(loadu, loadu)
                        CONVOLVE_4_SAMPLES
                    }
                }

                // Summarize the SSE results to sum1 and sum2.
                float* groupSumP = reinterpret_cast<float*>(&sums1);
                sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
                groupSumP = reinterpret_cast<float*>(&sums2);
                sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

                // Handle the (at most three) taps left over after the 4-wide loop.
                n %= 4;
                while (n) {
                    CONVOLVE_ONE_SAMPLE
                    n--;
                }
#else
                // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.

                // Optimize size 32 and size 64 kernels by unrolling the while loop.
                // A 20 - 30% speed improvement was measured in some cases by using this approach.

                if (n == 32) {
                    CONVOLVE_ONE_SAMPLE // 1
                    CONVOLVE_ONE_SAMPLE // 2
                    CONVOLVE_ONE_SAMPLE // 3
                    CONVOLVE_ONE_SAMPLE // 4
                    CONVOLVE_ONE_SAMPLE // 5
                    CONVOLVE_ONE_SAMPLE // 6
                    CONVOLVE_ONE_SAMPLE // 7
                    CONVOLVE_ONE_SAMPLE // 8
                    CONVOLVE_ONE_SAMPLE // 9
                    CONVOLVE_ONE_SAMPLE // 10
                    CONVOLVE_ONE_SAMPLE // 11
                    CONVOLVE_ONE_SAMPLE // 12
                    CONVOLVE_ONE_SAMPLE // 13
                    CONVOLVE_ONE_SAMPLE // 14
                    CONVOLVE_ONE_SAMPLE // 15
                    CONVOLVE_ONE_SAMPLE // 16
                    CONVOLVE_ONE_SAMPLE // 17
                    CONVOLVE_ONE_SAMPLE // 18
                    CONVOLVE_ONE_SAMPLE // 19
                    CONVOLVE_ONE_SAMPLE // 20
                    CONVOLVE_ONE_SAMPLE // 21
                    CONVOLVE_ONE_SAMPLE // 22
                    CONVOLVE_ONE_SAMPLE // 23
                    CONVOLVE_ONE_SAMPLE // 24
                    CONVOLVE_ONE_SAMPLE // 25
                    CONVOLVE_ONE_SAMPLE // 26
                    CONVOLVE_ONE_SAMPLE // 27
                    CONVOLVE_ONE_SAMPLE // 28
                    CONVOLVE_ONE_SAMPLE // 29
                    CONVOLVE_ONE_SAMPLE // 30
                    CONVOLVE_ONE_SAMPLE // 31
                    CONVOLVE_ONE_SAMPLE // 32
                } else if (n == 64) {
                    CONVOLVE_ONE_SAMPLE // 1
                    CONVOLVE_ONE_SAMPLE // 2
                    CONVOLVE_ONE_SAMPLE // 3
                    CONVOLVE_ONE_SAMPLE // 4
                    CONVOLVE_ONE_SAMPLE // 5
                    CONVOLVE_ONE_SAMPLE // 6
                    CONVOLVE_ONE_SAMPLE // 7
                    CONVOLVE_ONE_SAMPLE // 8
                    CONVOLVE_ONE_SAMPLE // 9
                    CONVOLVE_ONE_SAMPLE // 10
                    CONVOLVE_ONE_SAMPLE // 11
                    CONVOLVE_ONE_SAMPLE // 12
                    CONVOLVE_ONE_SAMPLE // 13
                    CONVOLVE_ONE_SAMPLE // 14
                    CONVOLVE_ONE_SAMPLE // 15
                    CONVOLVE_ONE_SAMPLE // 16
                    CONVOLVE_ONE_SAMPLE // 17
                    CONVOLVE_ONE_SAMPLE // 18
                    CONVOLVE_ONE_SAMPLE // 19
                    CONVOLVE_ONE_SAMPLE // 20
                    CONVOLVE_ONE_SAMPLE // 21
                    CONVOLVE_ONE_SAMPLE // 22
                    CONVOLVE_ONE_SAMPLE // 23
                    CONVOLVE_ONE_SAMPLE // 24
                    CONVOLVE_ONE_SAMPLE // 25
                    CONVOLVE_ONE_SAMPLE // 26
                    CONVOLVE_ONE_SAMPLE // 27
                    CONVOLVE_ONE_SAMPLE // 28
                    CONVOLVE_ONE_SAMPLE // 29
                    CONVOLVE_ONE_SAMPLE // 30
                    CONVOLVE_ONE_SAMPLE // 31
                    CONVOLVE_ONE_SAMPLE // 32
                    CONVOLVE_ONE_SAMPLE // 33
                    CONVOLVE_ONE_SAMPLE // 34
                    CONVOLVE_ONE_SAMPLE // 35
                    CONVOLVE_ONE_SAMPLE // 36
                    CONVOLVE_ONE_SAMPLE // 37
                    CONVOLVE_ONE_SAMPLE // 38
                    CONVOLVE_ONE_SAMPLE // 39
                    CONVOLVE_ONE_SAMPLE // 40
                    CONVOLVE_ONE_SAMPLE // 41
                    CONVOLVE_ONE_SAMPLE // 42
                    CONVOLVE_ONE_SAMPLE // 43
                    CONVOLVE_ONE_SAMPLE // 44
                    CONVOLVE_ONE_SAMPLE // 45
                    CONVOLVE_ONE_SAMPLE // 46
                    CONVOLVE_ONE_SAMPLE // 47
                    CONVOLVE_ONE_SAMPLE // 48
                    CONVOLVE_ONE_SAMPLE // 49
                    CONVOLVE_ONE_SAMPLE // 50
                    CONVOLVE_ONE_SAMPLE // 51
                    CONVOLVE_ONE_SAMPLE // 52
                    CONVOLVE_ONE_SAMPLE // 53
                    CONVOLVE_ONE_SAMPLE // 54
                    CONVOLVE_ONE_SAMPLE // 55
                    CONVOLVE_ONE_SAMPLE // 56
                    CONVOLVE_ONE_SAMPLE // 57
                    CONVOLVE_ONE_SAMPLE // 58
                    CONVOLVE_ONE_SAMPLE // 59
                    CONVOLVE_ONE_SAMPLE // 60
                    CONVOLVE_ONE_SAMPLE // 61
                    CONVOLVE_ONE_SAMPLE // 62
                    CONVOLVE_ONE_SAMPLE // 63
                    CONVOLVE_ONE_SAMPLE // 64
                } else {
                    while (n--) {
                        // Non-optimized using actual while loop.
                        CONVOLVE_ONE_SAMPLE
                    }
                }
#endif
            }

            // Linearly interpolate the two "convolutions".
            double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;

            *destination++ = result;

            // Advance the virtual index.
            m_virtualSourceIndex += m_scaleFactor;

            // Stop as soon as the requested number of frames has been written; the buffer
            // wrap below is then left for a later call.
            --numberOfDestinationFrames;
            if (!numberOfDestinationFrames)
                return;
        }

        // Wrap back around to the start.
        m_virtualSourceIndex -= m_blockSize;

        // Step (3) Copy r3 to r1 and r4 to r2.
        // This wraps the last input frames back to the start of the buffer.
        memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
        memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));

        // Step (4)
        // Refresh the buffer with more input.
        consumeSource(r5, m_blockSize);
    }
}
 472
 473 } // namespace blink
 474
 475 #endif // ENABLE(WEB_AUDIO)