f50e68f97400ed8c9ff6ef4786f07f445c0da698
[platform/framework/web/crosswalk.git] / src / third_party / WebKit / Source / platform / audio / DirectConvolver.cpp
1 /*
2  * Copyright (C) 2012 Intel Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1.  Redistributions of source code must retain the above copyright
9  *     notice, this list of conditions and the following disclaimer.
10  * 2.  Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer in the
12  *     documentation and/or other materials provided with the distribution.
13  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14  *     its contributors may be used to endorse or promote products derived
15  *     from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28
29 #include "config.h"
30
31 #if ENABLE(WEB_AUDIO)
32
33 #include "platform/audio/DirectConvolver.h"
34
35 #if OS(MACOSX)
36 #include <Accelerate/Accelerate.h>
37 #endif
38
39 #include "platform/audio/VectorMath.h"
40 #include "wtf/CPU.h"
41
42 #if (CPU(X86) || CPU(X86_64)) && !(OS(MACOSX) || USE(WEBAUDIO_IPP))
43 #include <emmintrin.h>
44 #endif
45
46 namespace blink {
47
48 using namespace VectorMath;
49
50 DirectConvolver::DirectConvolver(size_t inputBlockSize)
51     : m_inputBlockSize(inputBlockSize)
52 #if USE(WEBAUDIO_IPP)
53     , m_overlayBuffer(inputBlockSize)
54 #endif // USE(WEBAUDIO_IPP)
55     , m_buffer(inputBlockSize * 2)
56 {
57 }
58
59 void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* sourceP, float* destP, size_t framesToProcess)
60 {
61     ASSERT(framesToProcess == m_inputBlockSize);
62     if (framesToProcess != m_inputBlockSize)
63         return;
64
65     // Only support kernelSize <= m_inputBlockSize
66     size_t kernelSize = convolutionKernel->size();
67     ASSERT(kernelSize <= m_inputBlockSize);
68     if (kernelSize > m_inputBlockSize)
69         return;
70
71     float* kernelP = convolutionKernel->data();
72
73     // Sanity check
74     bool isCopyGood = kernelP && sourceP && destP && m_buffer.data();
75     ASSERT(isCopyGood);
76     if (!isCopyGood)
77         return;
78
79 #if USE(WEBAUDIO_IPP)
80     float* outputBuffer = m_buffer.data();
81     float* overlayBuffer = m_overlayBuffer.data();
82     bool isCopyGood2 = overlayBuffer && m_overlayBuffer.size() >= kernelSize && m_buffer.size() == m_inputBlockSize * 2;
83     ASSERT(isCopyGood2);
84     if (!isCopyGood2)
85         return;
86
87     ippsConv_32f(static_cast<const Ipp32f*>(sourceP), framesToProcess, static_cast<Ipp32f*>(kernelP), kernelSize, static_cast<Ipp32f*>(outputBuffer));
88
89     vadd(outputBuffer, 1, overlayBuffer, 1, destP, 1, framesToProcess);
90     memcpy(overlayBuffer, outputBuffer + m_inputBlockSize, sizeof(float) * kernelSize);
91 #else
92     float* inputP = m_buffer.data() + m_inputBlockSize;
93
94     // Copy samples to 2nd half of input buffer.
95     memcpy(inputP, sourceP, sizeof(float) * framesToProcess);
96
97 #if OS(MACOSX)
98 #if CPU(X86)
99     conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);
100 #else
101     vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);
102 #endif // CPU(X86)
103 #else
104     size_t i = 0;
105 #if CPU(X86) || CPU(X86_64)
106     // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess|
107     // are multiples of 4. If not, use the straightforward loop below.
108
109     if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {
110         // AudioFloatArray's are always aligned on at least a 16-byte boundary.
111         AudioFloatArray kernelBuffer(4 * kernelSize);
112         __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());
113
114         // Reverse the kernel and repeat each value across a vector
115         for (i = 0; i < kernelSize; ++i) {
116             kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]);
117         }
118
119         float* inputStartP = inputP - kernelSize + 1;
120
121         // Do convolution with 4 inputs at a time.
122         for (i = 0; i < framesToProcess; i += 4) {
123             __m128 convolutionSum;
124
125             convolutionSum = _mm_setzero_ps();
126
127             // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually.
128             for (size_t k = 0; k < kernelSize; k += 4) {
129                 size_t dataOffset = i + k;
130
131                 for (size_t m = 0; m < 4; ++m) {
132                     __m128 sourceBlock;
133                     __m128 product;
134
135                     sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);
136                     product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);
137                     convolutionSum = _mm_add_ps(convolutionSum, product);
138                 }
139             }
140             _mm_storeu_ps(destP + i, convolutionSum);
141         }
142     } else {
143 #endif
144
145     // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.
146 #define CONVOLVE_ONE_SAMPLE                 \
147     do {                                    \
148         sum += inputP[i - j] * kernelP[j];  \
149         j++;                                \
150     } while (0)
151
152     while (i < framesToProcess) {
153         size_t j = 0;
154         float sum = 0;
155
156         // FIXME: SSE optimization may be applied here.
157         if (kernelSize == 32) {
158             CONVOLVE_ONE_SAMPLE; // 1
159             CONVOLVE_ONE_SAMPLE; // 2
160             CONVOLVE_ONE_SAMPLE; // 3
161             CONVOLVE_ONE_SAMPLE; // 4
162             CONVOLVE_ONE_SAMPLE; // 5
163             CONVOLVE_ONE_SAMPLE; // 6
164             CONVOLVE_ONE_SAMPLE; // 7
165             CONVOLVE_ONE_SAMPLE; // 8
166             CONVOLVE_ONE_SAMPLE; // 9
167             CONVOLVE_ONE_SAMPLE; // 10
168
169             CONVOLVE_ONE_SAMPLE; // 11
170             CONVOLVE_ONE_SAMPLE; // 12
171             CONVOLVE_ONE_SAMPLE; // 13
172             CONVOLVE_ONE_SAMPLE; // 14
173             CONVOLVE_ONE_SAMPLE; // 15
174             CONVOLVE_ONE_SAMPLE; // 16
175             CONVOLVE_ONE_SAMPLE; // 17
176             CONVOLVE_ONE_SAMPLE; // 18
177             CONVOLVE_ONE_SAMPLE; // 19
178             CONVOLVE_ONE_SAMPLE; // 20
179
180             CONVOLVE_ONE_SAMPLE; // 21
181             CONVOLVE_ONE_SAMPLE; // 22
182             CONVOLVE_ONE_SAMPLE; // 23
183             CONVOLVE_ONE_SAMPLE; // 24
184             CONVOLVE_ONE_SAMPLE; // 25
185             CONVOLVE_ONE_SAMPLE; // 26
186             CONVOLVE_ONE_SAMPLE; // 27
187             CONVOLVE_ONE_SAMPLE; // 28
188             CONVOLVE_ONE_SAMPLE; // 29
189             CONVOLVE_ONE_SAMPLE; // 30
190
191             CONVOLVE_ONE_SAMPLE; // 31
192             CONVOLVE_ONE_SAMPLE; // 32
193
194         } else if (kernelSize == 64) {
195             CONVOLVE_ONE_SAMPLE; // 1
196             CONVOLVE_ONE_SAMPLE; // 2
197             CONVOLVE_ONE_SAMPLE; // 3
198             CONVOLVE_ONE_SAMPLE; // 4
199             CONVOLVE_ONE_SAMPLE; // 5
200             CONVOLVE_ONE_SAMPLE; // 6
201             CONVOLVE_ONE_SAMPLE; // 7
202             CONVOLVE_ONE_SAMPLE; // 8
203             CONVOLVE_ONE_SAMPLE; // 9
204             CONVOLVE_ONE_SAMPLE; // 10
205
206             CONVOLVE_ONE_SAMPLE; // 11
207             CONVOLVE_ONE_SAMPLE; // 12
208             CONVOLVE_ONE_SAMPLE; // 13
209             CONVOLVE_ONE_SAMPLE; // 14
210             CONVOLVE_ONE_SAMPLE; // 15
211             CONVOLVE_ONE_SAMPLE; // 16
212             CONVOLVE_ONE_SAMPLE; // 17
213             CONVOLVE_ONE_SAMPLE; // 18
214             CONVOLVE_ONE_SAMPLE; // 19
215             CONVOLVE_ONE_SAMPLE; // 20
216
217             CONVOLVE_ONE_SAMPLE; // 21
218             CONVOLVE_ONE_SAMPLE; // 22
219             CONVOLVE_ONE_SAMPLE; // 23
220             CONVOLVE_ONE_SAMPLE; // 24
221             CONVOLVE_ONE_SAMPLE; // 25
222             CONVOLVE_ONE_SAMPLE; // 26
223             CONVOLVE_ONE_SAMPLE; // 27
224             CONVOLVE_ONE_SAMPLE; // 28
225             CONVOLVE_ONE_SAMPLE; // 29
226             CONVOLVE_ONE_SAMPLE; // 30
227
228             CONVOLVE_ONE_SAMPLE; // 31
229             CONVOLVE_ONE_SAMPLE; // 32
230             CONVOLVE_ONE_SAMPLE; // 33
231             CONVOLVE_ONE_SAMPLE; // 34
232             CONVOLVE_ONE_SAMPLE; // 35
233             CONVOLVE_ONE_SAMPLE; // 36
234             CONVOLVE_ONE_SAMPLE; // 37
235             CONVOLVE_ONE_SAMPLE; // 38
236             CONVOLVE_ONE_SAMPLE; // 39
237             CONVOLVE_ONE_SAMPLE; // 40
238
239             CONVOLVE_ONE_SAMPLE; // 41
240             CONVOLVE_ONE_SAMPLE; // 42
241             CONVOLVE_ONE_SAMPLE; // 43
242             CONVOLVE_ONE_SAMPLE; // 44
243             CONVOLVE_ONE_SAMPLE; // 45
244             CONVOLVE_ONE_SAMPLE; // 46
245             CONVOLVE_ONE_SAMPLE; // 47
246             CONVOLVE_ONE_SAMPLE; // 48
247             CONVOLVE_ONE_SAMPLE; // 49
248             CONVOLVE_ONE_SAMPLE; // 50
249
250             CONVOLVE_ONE_SAMPLE; // 51
251             CONVOLVE_ONE_SAMPLE; // 52
252             CONVOLVE_ONE_SAMPLE; // 53
253             CONVOLVE_ONE_SAMPLE; // 54
254             CONVOLVE_ONE_SAMPLE; // 55
255             CONVOLVE_ONE_SAMPLE; // 56
256             CONVOLVE_ONE_SAMPLE; // 57
257             CONVOLVE_ONE_SAMPLE; // 58
258             CONVOLVE_ONE_SAMPLE; // 59
259             CONVOLVE_ONE_SAMPLE; // 60
260
261             CONVOLVE_ONE_SAMPLE; // 61
262             CONVOLVE_ONE_SAMPLE; // 62
263             CONVOLVE_ONE_SAMPLE; // 63
264             CONVOLVE_ONE_SAMPLE; // 64
265
266         } else if (kernelSize == 128) {
267             CONVOLVE_ONE_SAMPLE; // 1
268             CONVOLVE_ONE_SAMPLE; // 2
269             CONVOLVE_ONE_SAMPLE; // 3
270             CONVOLVE_ONE_SAMPLE; // 4
271             CONVOLVE_ONE_SAMPLE; // 5
272             CONVOLVE_ONE_SAMPLE; // 6
273             CONVOLVE_ONE_SAMPLE; // 7
274             CONVOLVE_ONE_SAMPLE; // 8
275             CONVOLVE_ONE_SAMPLE; // 9
276             CONVOLVE_ONE_SAMPLE; // 10
277
278             CONVOLVE_ONE_SAMPLE; // 11
279             CONVOLVE_ONE_SAMPLE; // 12
280             CONVOLVE_ONE_SAMPLE; // 13
281             CONVOLVE_ONE_SAMPLE; // 14
282             CONVOLVE_ONE_SAMPLE; // 15
283             CONVOLVE_ONE_SAMPLE; // 16
284             CONVOLVE_ONE_SAMPLE; // 17
285             CONVOLVE_ONE_SAMPLE; // 18
286             CONVOLVE_ONE_SAMPLE; // 19
287             CONVOLVE_ONE_SAMPLE; // 20
288
289             CONVOLVE_ONE_SAMPLE; // 21
290             CONVOLVE_ONE_SAMPLE; // 22
291             CONVOLVE_ONE_SAMPLE; // 23
292             CONVOLVE_ONE_SAMPLE; // 24
293             CONVOLVE_ONE_SAMPLE; // 25
294             CONVOLVE_ONE_SAMPLE; // 26
295             CONVOLVE_ONE_SAMPLE; // 27
296             CONVOLVE_ONE_SAMPLE; // 28
297             CONVOLVE_ONE_SAMPLE; // 29
298             CONVOLVE_ONE_SAMPLE; // 30
299
300             CONVOLVE_ONE_SAMPLE; // 31
301             CONVOLVE_ONE_SAMPLE; // 32
302             CONVOLVE_ONE_SAMPLE; // 33
303             CONVOLVE_ONE_SAMPLE; // 34
304             CONVOLVE_ONE_SAMPLE; // 35
305             CONVOLVE_ONE_SAMPLE; // 36
306             CONVOLVE_ONE_SAMPLE; // 37
307             CONVOLVE_ONE_SAMPLE; // 38
308             CONVOLVE_ONE_SAMPLE; // 39
309             CONVOLVE_ONE_SAMPLE; // 40
310
311             CONVOLVE_ONE_SAMPLE; // 41
312             CONVOLVE_ONE_SAMPLE; // 42
313             CONVOLVE_ONE_SAMPLE; // 43
314             CONVOLVE_ONE_SAMPLE; // 44
315             CONVOLVE_ONE_SAMPLE; // 45
316             CONVOLVE_ONE_SAMPLE; // 46
317             CONVOLVE_ONE_SAMPLE; // 47
318             CONVOLVE_ONE_SAMPLE; // 48
319             CONVOLVE_ONE_SAMPLE; // 49
320             CONVOLVE_ONE_SAMPLE; // 50
321
322             CONVOLVE_ONE_SAMPLE; // 51
323             CONVOLVE_ONE_SAMPLE; // 52
324             CONVOLVE_ONE_SAMPLE; // 53
325             CONVOLVE_ONE_SAMPLE; // 54
326             CONVOLVE_ONE_SAMPLE; // 55
327             CONVOLVE_ONE_SAMPLE; // 56
328             CONVOLVE_ONE_SAMPLE; // 57
329             CONVOLVE_ONE_SAMPLE; // 58
330             CONVOLVE_ONE_SAMPLE; // 59
331             CONVOLVE_ONE_SAMPLE; // 60
332
333             CONVOLVE_ONE_SAMPLE; // 61
334             CONVOLVE_ONE_SAMPLE; // 62
335             CONVOLVE_ONE_SAMPLE; // 63
336             CONVOLVE_ONE_SAMPLE; // 64
337             CONVOLVE_ONE_SAMPLE; // 65
338             CONVOLVE_ONE_SAMPLE; // 66
339             CONVOLVE_ONE_SAMPLE; // 67
340             CONVOLVE_ONE_SAMPLE; // 68
341             CONVOLVE_ONE_SAMPLE; // 69
342             CONVOLVE_ONE_SAMPLE; // 70
343
344             CONVOLVE_ONE_SAMPLE; // 71
345             CONVOLVE_ONE_SAMPLE; // 72
346             CONVOLVE_ONE_SAMPLE; // 73
347             CONVOLVE_ONE_SAMPLE; // 74
348             CONVOLVE_ONE_SAMPLE; // 75
349             CONVOLVE_ONE_SAMPLE; // 76
350             CONVOLVE_ONE_SAMPLE; // 77
351             CONVOLVE_ONE_SAMPLE; // 78
352             CONVOLVE_ONE_SAMPLE; // 79
353             CONVOLVE_ONE_SAMPLE; // 80
354
355             CONVOLVE_ONE_SAMPLE; // 81
356             CONVOLVE_ONE_SAMPLE; // 82
357             CONVOLVE_ONE_SAMPLE; // 83
358             CONVOLVE_ONE_SAMPLE; // 84
359             CONVOLVE_ONE_SAMPLE; // 85
360             CONVOLVE_ONE_SAMPLE; // 86
361             CONVOLVE_ONE_SAMPLE; // 87
362             CONVOLVE_ONE_SAMPLE; // 88
363             CONVOLVE_ONE_SAMPLE; // 89
364             CONVOLVE_ONE_SAMPLE; // 90
365
366             CONVOLVE_ONE_SAMPLE; // 91
367             CONVOLVE_ONE_SAMPLE; // 92
368             CONVOLVE_ONE_SAMPLE; // 93
369             CONVOLVE_ONE_SAMPLE; // 94
370             CONVOLVE_ONE_SAMPLE; // 95
371             CONVOLVE_ONE_SAMPLE; // 96
372             CONVOLVE_ONE_SAMPLE; // 97
373             CONVOLVE_ONE_SAMPLE; // 98
374             CONVOLVE_ONE_SAMPLE; // 99
375             CONVOLVE_ONE_SAMPLE; // 100
376
377             CONVOLVE_ONE_SAMPLE; // 101
378             CONVOLVE_ONE_SAMPLE; // 102
379             CONVOLVE_ONE_SAMPLE; // 103
380             CONVOLVE_ONE_SAMPLE; // 104
381             CONVOLVE_ONE_SAMPLE; // 105
382             CONVOLVE_ONE_SAMPLE; // 106
383             CONVOLVE_ONE_SAMPLE; // 107
384             CONVOLVE_ONE_SAMPLE; // 108
385             CONVOLVE_ONE_SAMPLE; // 109
386             CONVOLVE_ONE_SAMPLE; // 110
387
388             CONVOLVE_ONE_SAMPLE; // 111
389             CONVOLVE_ONE_SAMPLE; // 112
390             CONVOLVE_ONE_SAMPLE; // 113
391             CONVOLVE_ONE_SAMPLE; // 114
392             CONVOLVE_ONE_SAMPLE; // 115
393             CONVOLVE_ONE_SAMPLE; // 116
394             CONVOLVE_ONE_SAMPLE; // 117
395             CONVOLVE_ONE_SAMPLE; // 118
396             CONVOLVE_ONE_SAMPLE; // 119
397             CONVOLVE_ONE_SAMPLE; // 120
398
399             CONVOLVE_ONE_SAMPLE; // 121
400             CONVOLVE_ONE_SAMPLE; // 122
401             CONVOLVE_ONE_SAMPLE; // 123
402             CONVOLVE_ONE_SAMPLE; // 124
403             CONVOLVE_ONE_SAMPLE; // 125
404             CONVOLVE_ONE_SAMPLE; // 126
405             CONVOLVE_ONE_SAMPLE; // 127
406             CONVOLVE_ONE_SAMPLE; // 128
407         } else {
408             while (j < kernelSize) {
409                 // Non-optimized using actual while loop.
410                 CONVOLVE_ONE_SAMPLE;
411             }
412         }
413         destP[i++] = sum;
414     }
415 #if CPU(X86) || CPU(X86_64)
416     }
417 #endif
418 #endif // OS(MACOSX)
419
420     // Copy 2nd half of input buffer to 1st half.
421     memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);
422 #endif
423 }
424
425 void DirectConvolver::reset()
426 {
427     m_buffer.zero();
428 #if USE(WEBAUDIO_IPP)
429     m_overlayBuffer.zero();
430 #endif // USE(WEBAUDIO_IPP)
431 }
432
433 } // namespace blink
434
435 #endif // ENABLE(WEB_AUDIO)