2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
3 * Copyright Takuya OOURA, 1996-2001
5 * You may use, copy, modify and distribute this code for any purpose (include
6 * commercial use) and without fee. Please refer to this package when you modify
9 * Changes by the WebRTC authors:
10 * - Trivial type modifications.
11 * - Minimal code subset to do rdft of length 128.
12 * - Optimizations because of known length.
14 * All changes are covered by the WebRTC license and IP grant:
15 * Use of this source code is governed by a BSD-style license
16 * that can be found in the LICENSE file in the root of the source
17 * tree. An additional intellectual property rights grant can be found
18 * in the file PATENTS. All contributing project authors may
19 * be found in the AUTHORS file in the root of the source tree.
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"
26 #include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
27 #include "webrtc/typedefs.h"
29 // These tables used to be computed at run-time. For example, refer to:
30 // https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
31 // to see the initialization code.
// Precomputed coefficient table used by the C routines in this file.
// cft1st_128_C and cftmdl_128_C read it in consecutive pairs
// (rdft_w[k + 0], rdft_w[k + 1], ...) as the real/imaginary parts of the
// wk1/wk2 factors; rftfsub_128_C and rftbsub_128_C address the second half
// through `c = rdft_w + 32`.
// NOTE(review): the closing `};` of this initializer is not visible in this
// excerpt.
32 const float rdft_w[64] = {
33 1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
34 0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
35 0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
36 0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
37 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
38 0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
39 0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
40 0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
41 0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
42 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
43 0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
44 0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
45 0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
46 0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
47 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
48 0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
// wk3 factors for the first half of each group: cft1st_128_C and cftmdl_128_C
// load wk3r from index k1 + 0 and wk3i from index k1 + 1 of this table.
50 const float rdft_wk3ri_first[16] = {
51 1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
52 0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
53 0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
54 0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
// wk3 factors for the second half of each group: read the same way as
// rdft_wk3ri_first (wk3r at k1 + 0, wk3i at k1 + 1) after the first-half
// stores in cft1st_128_C and cftmdl_128_C.
56 const float rdft_wk3ri_second[16] = {
57 -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
58 -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
59 -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
60 -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
// wk1 real-part table in which every value is stored twice in a row.
// No routine visible in this file reads it; presumably it is consumed by the
// platform-specific implementations selected in aec_rdft_init() - TODO confirm.
// ALIGN16_BEG/ALIGN16_END are defined elsewhere; presumably they request
// 16-byte alignment - confirm.
62 ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
63 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
64 0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
65 0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
66 0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
67 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
68 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
69 0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
70 0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
// wk2 real-part table; every value is stored twice in a row.
// Not referenced by the C routines visible in this file; presumably used by
// the platform-specific implementations - TODO confirm.
72 ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
73 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
74 0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
75 0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
76 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
77 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
78 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
79 0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
80 0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
// wk3 real-part table; every value is stored twice in a row.
// Not referenced by the C routines visible in this file; presumably used by
// the platform-specific implementations - TODO confirm.
82 ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
83 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
84 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
85 0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
86 -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
87 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
88 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
89 0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
90 -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
// wk1 imaginary-part table; each magnitude is stored as a (-x, +x) pair.
// Not referenced by the C routines visible in this file; presumably used by
// the platform-specific implementations - TODO confirm.
92 ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
93 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
94 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
95 -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
96 -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
97 -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
98 -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
99 -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
100 -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
// wk2 imaginary-part table; each magnitude is stored as a (-x, +x) pair.
// Not referenced by the C routines visible in this file; presumably used by
// the platform-specific implementations - TODO confirm.
102 ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
103 -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
104 -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
105 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
106 -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
107 -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
108 -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
109 -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
110 -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
// wk3 imaginary-part table; each magnitude is stored twice in a row with
// opposite signs (the order of the signs varies from pair to pair).
// Not referenced by the C routines visible in this file; presumably used by
// the platform-specific implementations - TODO confirm.
112 ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
113 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
114 -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
115 -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
116 -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
117 -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
118 -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
119 -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
120 -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
// Four-element constant holding 0.707106769 three times followed by its
// negation. Not referenced by the C routines visible in this file; the name
// suggests it is the wk1r constant of a vectorized cftmdl - TODO confirm.
122 ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
123 0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
// C implementation installed as bitrv2_128 by aec_rdft_init().
// Reorders elements of `a` in place (the name suggests the bit-reversal
// permutation of the FFT - confirm). The text right after the signature is the
// original note listing optimizations that were tried and found no faster.
// NOTE(review): most of the body (the swap statements and the remaining index
// computations) is not visible in this excerpt.
126 static void bitrv2_128_C(float* a) {
128 Following things have been attempted but are no faster:
129 (a) Storing the swap indexes in a LUT (index calculations are done
130 for 'free' while waiting on memory/L1).
131 (b) Consolidate the load/store of two consecutive floats by a 64 bit
132 integer (execution is memory/L1 bound).
133 (c) Do a mix of floats and 64 bit integer to maximize register
134 utilization (execution is memory/L1 bound).
135 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
136 (e) Hard-coding of the offsets to completely eliminates index
140 unsigned int j, j1, k, k1;
141 float xr, xi, yr, yi;
// Per-k base offsets into `a`, indexed by the outer loop counter.
143 static const int ip[4] = {0, 64, 32, 96};
144 for (k = 0; k < 4; k++) {
// The inner loop only visits j < k, i.e. it runs 0, 1, 2, 3 times for k = 0..3.
145 for (j = 0; j < k; j++) {
// Index derived from k alone; how it is used is on lines not shown here.
187 j1 = 2 * k + 8 + ip[k];
// C implementation installed as cft1st_128 by aec_rdft_init().
// Works in place on `a`: a[0..15] are handled by a simplified special case,
// then the loop processes the remaining data in groups of 16 floats using
// factors from rdft_w, rdft_wk3ri_first and rdft_wk3ri_second.
// The r/i suffixes and the even/odd index pairs suggest interleaved
// real/imaginary parts - confirm against the header.
// NOTE(review): this excerpt omits lines of the body (the declarations of n,
// j, k1 and k2, and the updates of x0r/x0i between stores), so the statements
// below are not the complete function.
200 static void cft1st_128_C(float* a) {
203 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
204 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
206 // The processing of the first set of elements was simplified in C to avoid
207 // some operations (multiplication by zero or one, addition of two elements
208 // multiplied by the same weight, ...).
// In the special case a single real weight wk1r scales sum/difference terms;
// x0r/x0i are assigned on lines not shown here.
240 a[10] = wk1r * (x0r - x0i);
241 a[11] = wk1r * (x0r + x0i);
244 a[14] = wk1r * (x0i - x0r);
245 a[15] = wk1r * (x0i + x0r);
// General case: each iteration covers a[j .. j + 15].
247 for (j = 16; j < n; j += 16) {
// Factors for the first half of the group (a[j .. j + 7]).
250 wk2r = rdft_w[k1 + 0];
251 wk2i = rdft_w[k1 + 1];
252 wk1r = rdft_w[k2 + 0];
253 wk1i = rdft_w[k2 + 1];
254 wk3r = rdft_wk3ri_first[k1 + 0];
255 wk3i = rdft_wk3ri_first[k1 + 1];
// Sums and differences of the four (r, i) pairs in the first half.
256 x0r = a[j + 0] + a[j + 2];
257 x0i = a[j + 1] + a[j + 3];
258 x1r = a[j + 0] - a[j + 2];
259 x1i = a[j + 1] - a[j + 3];
260 x2r = a[j + 4] + a[j + 6];
261 x2i = a[j + 5] + a[j + 7];
262 x3r = a[j + 4] - a[j + 6];
263 x3i = a[j + 5] - a[j + 7];
264 a[j + 0] = x0r + x2r;
265 a[j + 1] = x0i + x2i;
// Each store pair below is a complex multiply of (x0r, x0i) by a (wkNr, wkNi)
// factor; x0r/x0i are presumably recomputed before each pair on lines not
// shown - confirm.
268 a[j + 4] = wk2r * x0r - wk2i * x0i;
269 a[j + 5] = wk2r * x0i + wk2i * x0r;
272 a[j + 2] = wk1r * x0r - wk1i * x0i;
273 a[j + 3] = wk1r * x0i + wk1i * x0r;
276 a[j + 6] = wk3r * x0r - wk3i * x0i;
277 a[j + 7] = wk3r * x0i + wk3i * x0r;
// Second half of the group (a[j + 8 .. j + 15]): new wk1/wk3 factors are
// loaded, while wk2r/wk2i are reused with swapped roles and a sign change.
278 wk1r = rdft_w[k2 + 2];
279 wk1i = rdft_w[k2 + 3];
280 wk3r = rdft_wk3ri_second[k1 + 0];
281 wk3i = rdft_wk3ri_second[k1 + 1];
282 x0r = a[j + 8] + a[j + 10];
283 x0i = a[j + 9] + a[j + 11];
284 x1r = a[j + 8] - a[j + 10];
285 x1i = a[j + 9] - a[j + 11];
286 x2r = a[j + 12] + a[j + 14];
287 x2i = a[j + 13] + a[j + 15];
288 x3r = a[j + 12] - a[j + 14];
289 x3i = a[j + 13] - a[j + 15];
290 a[j + 8] = x0r + x2r;
291 a[j + 9] = x0i + x2i;
294 a[j + 12] = -wk2i * x0r - wk2r * x0i;
295 a[j + 13] = -wk2i * x0i + wk2r * x0r;
298 a[j + 10] = wk1r * x0r - wk1i * x0i;
299 a[j + 11] = wk1r * x0i + wk1i * x0r;
302 a[j + 14] = wk3r * x0r - wk3i * x0i;
303 a[j + 15] = wk3r * x0i + wk3i * x0r;
// C implementation installed as cftmdl_128 by aec_rdft_init().
// Works in place on `a`, combining four (r, i) pairs located at indices
// j0, j1, j2 and j3. Three phases are visible: a first loop without
// multiplications by table factors, a second loop that scales by a single real
// weight wk1r, and an outer loop over k that applies wk1/wk2/wk3 factors from
// rdft_w, rdft_wk3ri_first and rdft_wk3ri_second.
// NOTE(review): this excerpt omits lines of the body (the assignments of l, m,
// n, m2, k1, k2, j1, j2, j3 and the updates of x0r/x0i between stores), so the
// statements below are not the complete function.
307 static void cftmdl_128_C(float* a) {
311 int j0, j1, j2, j3, k, k1, k2, m2;
312 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
313 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
// Phase 1: only additions and subtractions, no table factors.
315 for (j0 = 0; j0 < l; j0 += 2) {
319 x0r = a[j0 + 0] + a[j1 + 0];
320 x0i = a[j0 + 1] + a[j1 + 1];
321 x1r = a[j0 + 0] - a[j1 + 0];
322 x1i = a[j0 + 1] - a[j1 + 1];
323 x2r = a[j2 + 0] + a[j3 + 0];
324 x2i = a[j2 + 1] + a[j3 + 1];
325 x3r = a[j2 + 0] - a[j3 + 0];
326 x3i = a[j2 + 1] - a[j3 + 1];
327 a[j0 + 0] = x0r + x2r;
328 a[j0 + 1] = x0i + x2i;
329 a[j2 + 0] = x0r - x2r;
330 a[j2 + 1] = x0i - x2i;
331 a[j1 + 0] = x1r - x3i;
332 a[j1 + 1] = x1i + x3r;
333 a[j3 + 0] = x1r + x3i;
334 a[j3 + 1] = x1i - x3r;
// Phase 2: the block starting at offset m; the j2 outputs are stored with the
// real and imaginary differences swapped, and j1/j3 are scaled by wk1r.
337 for (j0 = m; j0 < l + m; j0 += 2) {
341 x0r = a[j0 + 0] + a[j1 + 0];
342 x0i = a[j0 + 1] + a[j1 + 1];
343 x1r = a[j0 + 0] - a[j1 + 0];
344 x1i = a[j0 + 1] - a[j1 + 1];
345 x2r = a[j2 + 0] + a[j3 + 0];
346 x2i = a[j2 + 1] + a[j3 + 1];
347 x3r = a[j2 + 0] - a[j3 + 0];
348 x3i = a[j2 + 1] - a[j3 + 1];
349 a[j0 + 0] = x0r + x2r;
350 a[j0 + 1] = x0i + x2i;
351 a[j2 + 0] = x2i - x0i;
352 a[j2 + 1] = x0r - x2r;
// x0r/x0i are presumably reassigned before each of the next two store pairs
// on lines not shown - confirm.
355 a[j1 + 0] = wk1r * (x0r - x0i);
356 a[j1 + 1] = wk1r * (x0r + x0i);
359 a[j3 + 0] = wk1r * (x0i - x0r);
360 a[j3 + 1] = wk1r * (x0i + x0r);
// Phase 3: remaining blocks, stepping k by m2; each k handles two sub-blocks
// (starting at k and at k + m) with different wk1/wk3 factors.
364 for (k = m2; k < n; k += m2) {
367 wk2r = rdft_w[k1 + 0];
368 wk2i = rdft_w[k1 + 1];
369 wk1r = rdft_w[k2 + 0];
370 wk1i = rdft_w[k2 + 1];
371 wk3r = rdft_wk3ri_first[k1 + 0];
372 wk3i = rdft_wk3ri_first[k1 + 1];
// First sub-block: starts at offset k.
373 for (j0 = k; j0 < l + k; j0 += 2) {
377 x0r = a[j0 + 0] + a[j1 + 0];
378 x0i = a[j0 + 1] + a[j1 + 1];
379 x1r = a[j0 + 0] - a[j1 + 0];
380 x1i = a[j0 + 1] - a[j1 + 1];
381 x2r = a[j2 + 0] + a[j3 + 0];
382 x2i = a[j2 + 1] + a[j3 + 1];
383 x3r = a[j2 + 0] - a[j3 + 0];
384 x3i = a[j2 + 1] - a[j3 + 1];
385 a[j0 + 0] = x0r + x2r;
386 a[j0 + 1] = x0i + x2i;
// Complex multiplies of (x0r, x0i) by the wk2, wk1 and wk3 factors.
389 a[j2 + 0] = wk2r * x0r - wk2i * x0i;
390 a[j2 + 1] = wk2r * x0i + wk2i * x0r;
393 a[j1 + 0] = wk1r * x0r - wk1i * x0i;
394 a[j1 + 1] = wk1r * x0i + wk1i * x0r;
397 a[j3 + 0] = wk3r * x0r - wk3i * x0i;
398 a[j3 + 1] = wk3r * x0i + wk3i * x0r;
// Second sub-block: starts at offset k + m; wk2r/wk2i are reused with swapped
// roles and a sign change, wk1/wk3 are reloaded.
400 wk1r = rdft_w[k2 + 2];
401 wk1i = rdft_w[k2 + 3];
402 wk3r = rdft_wk3ri_second[k1 + 0];
403 wk3i = rdft_wk3ri_second[k1 + 1];
404 for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
408 x0r = a[j0 + 0] + a[j1 + 0];
409 x0i = a[j0 + 1] + a[j1 + 1];
410 x1r = a[j0 + 0] - a[j1 + 0];
411 x1i = a[j0 + 1] - a[j1 + 1];
412 x2r = a[j2 + 0] + a[j3 + 0];
413 x2i = a[j2 + 1] + a[j3 + 1];
414 x3r = a[j2 + 0] - a[j3 + 0];
415 x3i = a[j2 + 1] - a[j3 + 1];
416 a[j0 + 0] = x0r + x2r;
417 a[j0 + 1] = x0i + x2i;
420 a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
421 a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
424 a[j1 + 0] = wk1r * x0r - wk1i * x0i;
425 a[j1 + 1] = wk1r * x0i + wk1i * x0r;
428 a[j3 + 0] = wk3r * x0r - wk3i * x0i;
429 a[j3 + 1] = wk3r * x0i + wk3i * x0r;
// C implementation installed as cftfsub_128 by aec_rdft_init().
// The visible loop combines the odd-indexed (i-suffixed) components of four
// pairs at j, j1, j2 and j3 and writes the results back into `a`.
// NOTE(review): this excerpt omits lines of the body (the assignment of l, the
// j1/j2/j3 computations and all r-suffixed statements); presumably earlier
// stages are invoked on lines not shown - confirm.
434 static void cftfsub_128_C(float* a) {
435 int j, j1, j2, j3, l;
436 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
441 for (j = 0; j < l; j += 2) {
446 x0i = a[j + 1] + a[j1 + 1];
448 x1i = a[j + 1] - a[j1 + 1];
450 x2i = a[j2 + 1] + a[j3 + 1];
452 x3i = a[j2 + 1] - a[j3 + 1];
454 a[j + 1] = x0i + x2i;
456 a[j2 + 1] = x0i - x2i;
// The j1/j3 outputs mix in x3r, which is computed on a line not shown here.
458 a[j1 + 1] = x1i + x3r;
460 a[j3 + 1] = x1i - x3r;
// C implementation installed as cftbsub_128 by aec_rdft_init().
// Same loop shape as cftfsub_128_C, but the odd-indexed components read from
// a[j + 1] are negated on input and the signs of the x2i/x3r terms in the
// stores are flipped relative to cftfsub_128_C.
// NOTE(review): this excerpt omits lines of the body (the assignment of l, the
// j1/j2/j3 computations and all r-suffixed statements).
464 static void cftbsub_128_C(float* a) {
465 int j, j1, j2, j3, l;
466 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
472 for (j = 0; j < l; j += 2) {
477 x0i = -a[j + 1] - a[j1 + 1];
479 x1i = -a[j + 1] + a[j1 + 1];
481 x2i = a[j2 + 1] + a[j3 + 1];
483 x3i = a[j2 + 1] - a[j3 + 1];
485 a[j + 1] = x0i - x2i;
487 a[j2 + 1] = x0i + x2i;
// x3r is computed on a line not shown here.
489 a[j1 + 1] = x1i - x3r;
491 a[j3 + 1] = x1i + x3r;
// C implementation installed as rftfsub_128 by aec_rdft_init().
// Walks j2 = 2, 4, ..., 62 (j1 = 1, 2, ...) and pairs each position j2 with a
// second position k2, using coefficients taken from the second half of rdft_w.
// NOTE(review): the assignments of k2, wkr and wki and the stores back into
// `a` are not visible in this excerpt.
495 static void rftfsub_128_C(float* a) {
// Coefficients start at element 32 of rdft_w.
496 const float* c = rdft_w + 32;
498 float wkr, wki, xr, xi, yr, yi;
500 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
// Difference of the even-indexed and sum of the odd-indexed components.
505 xr = a[j2 + 0] - a[k2 + 0];
506 xi = a[j2 + 1] + a[k2 + 1];
// (yr, yi) = (wkr + i*wki) * (xr + i*xi).
507 yr = wkr * xr - wki * xi;
508 yi = wkr * xi + wki * xr;
// C implementation installed as rftbsub_128 by aec_rdft_init().
// Same traversal as rftfsub_128_C (j2 = 2, 4, ..., 62 paired with k2), but the
// wki terms enter yr/yi with the opposite sign, and the odd-indexed outputs
// are stored as yi minus the old value.
// NOTE(review): the assignments of k2, wkr and wki are not visible in this
// excerpt.
516 static void rftbsub_128_C(float* a) {
// Coefficients start at element 32 of rdft_w.
517 const float* c = rdft_w + 32;
519 float wkr, wki, xr, xi, yr, yi;
522 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
527 xr = a[j2 + 0] - a[k2 + 0];
528 xi = a[j2 + 1] + a[k2 + 1];
// (yr, yi) = (wkr - i*wki) * (xr + i*xi).
529 yr = wkr * xr + wki * xi;
530 yi = wkr * xi - wki * xr;
// In-place update of both positions of the pair.
531 a[j2 + 0] = a[j2 + 0] - yr;
532 a[j2 + 1] = yi - a[j2 + 1];
533 a[k2 + 0] = yr + a[k2 + 0];
534 a[k2 + 1] = yi - a[k2 + 1];
// Public entry point operating on the caller's buffer `a`. The file header
// describes this code as an rdft of length 128; the name indicates the forward
// direction.
// NOTE(review): the body is not visible in this excerpt.
539 void aec_rdft_forward_128(float* a) {
// Public entry point for the inverse direction, operating on the caller's
// buffer `a` in place.
// NOTE(review): only the first statement of the body is visible in this
// excerpt.
549 void aec_rdft_inverse_128(float* a) {
// a[1] is replaced by half the difference of the first two elements before
// any further processing.
550 a[1] = 0.5f * (a[0] - a[1]);
// One slot per transform stage, each of type rft_sub_128_t (declared outside
// this file's visible part). aec_rdft_init() points every slot at its *_C
// implementation and then calls the platform-specific init routines, which
// presumably override some of the slots - TODO confirm.
557 // code path selection
558 rft_sub_128_t cft1st_128;
559 rft_sub_128_t cftmdl_128;
560 rft_sub_128_t rftfsub_128;
561 rft_sub_128_t rftbsub_128;
562 rft_sub_128_t cftfsub_128;
563 rft_sub_128_t cftbsub_128;
564 rft_sub_128_t bitrv2_128;
// Selects the implementation used for each transform stage.
// All slots are first set to the portable C versions; the platform-specific
// init routines are then called under their build-time guards.
// NOTE(review): the closing braces and #endif lines are not visible in this
// excerpt.
566 void aec_rdft_init(void) {
// Portable defaults.
567 cft1st_128 = cft1st_128_C;
568 cftmdl_128 = cftmdl_128_C;
569 rftfsub_128 = rftfsub_128_C;
570 rftbsub_128 = rftbsub_128_C;
571 cftfsub_128 = cftfsub_128_C;
572 cftbsub_128 = cftbsub_128_C;
573 bitrv2_128 = bitrv2_128_C;
// x86: the SSE2 init is additionally gated on a run-time CPU feature check.
574 #if defined(WEBRTC_ARCH_X86_FAMILY)
575 if (WebRtc_GetCPUInfo(kSSE2)) {
576 aec_rdft_init_sse2();
// MIPS: called whenever MIPS_FPU_LE is defined; no run-time check is visible.
579 #if defined(MIPS_FPU_LE)
580 aec_rdft_init_mips();
// ARM NEON: called when either NEON macro is defined; any run-time check that
// may surround the call is on lines not shown here.
582 #if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
583 aec_rdft_init_neon();