2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
12 #include <emmintrin.h>
16 * Two data formats are used by the FFT routines, internally. The
17 * interface to the main external FFT routines use interleaved complex
18 * values where the real part is followed by the imaginary part.
20 * One is the split format where a complex vector of real and imaginary
21 * values are split such that all of the real values are placed in the
22 * first half of the vector and the corresponding imaginary values are
23 * placed in the second half, in the same order. The conversion from
24 * complex values to split format and back is transparent to the
25 * external FFT interface.
27 * VComplex uses split format.
/** VComplex holds 4 complex float elements, with the real parts stored
 * in |real| and the corresponding imaginary parts in |imag|.
 * (Split format: the two components travel in separate SSE registers.)
 */
typedef struct VComplex {
  __m128 real;
  __m128 imag;
} VC;
39 static __inline void VC_MUL(VC *out, VC *a, VC *b) {
40 out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
41 _mm_mul_ps(a->imag, b->imag));
42 out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
43 _mm_mul_ps(a->imag, b->real));
46 /* out = conj(a) * b */
47 static __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
48 out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
49 _mm_mul_ps(a->imag, b->imag));
50 out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
51 _mm_mul_ps(a->imag, b->real));
54 /* Scale complex by a real factor */
55 static __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
56 out->real = _mm_mul_ps(factor, a->real);
57 out->imag = _mm_mul_ps(factor, a->imag);
61 static __inline void VC_ADD(VC *out, VC *a, VC *b) {
62 out->real = _mm_add_ps(a->real, b->real);
63 out->imag = _mm_add_ps(a->imag, b->imag);
67 * out.real = a.real + b.imag
68 * out.imag = a.imag + b.real
70 static __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
71 out->real = _mm_add_ps(a->real, b->imag);
72 out->imag = _mm_add_ps(b->real, a->imag);
75 /* VC_ADD and store the result with Split format. */
76 static __inline void VC_ADD_STORE_SPLIT(
81 _mm_store_ps(out, _mm_add_ps(a->real, b->real));
82 _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
86 static __inline void VC_SUB(VC *out, VC *a, VC *b) {
87 out->real = _mm_sub_ps(a->real, b->real);
88 out->imag = _mm_sub_ps(a->imag, b->imag);
92 * out.real = a.real - b.imag
93 * out.imag = a.imag - b.real
95 static __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
96 out->real = _mm_sub_ps(a->real, b->imag);
97 out->imag = _mm_sub_ps(b->real, a->imag);
100 /* VC_SUB and store the result with Split format. */
101 static __inline void VC_SUB_STORE_SPLIT(
106 _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
107 _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
111 * out.real = a.real + b.real
112 * out.imag = a.imag - b.imag
114 static __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
115 out->real = _mm_add_ps(a->real, b->real);
116 out->imag = _mm_sub_ps(a->imag, b->imag);
120 * out.real = a.real + b.imag
121 * out.imag = a.imag - b.real
123 static __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
124 out->real = _mm_add_ps(a->real, b->imag);
125 out->imag = _mm_sub_ps(a->imag, b->real);
128 /* VC_ADD_SUB_X and store the result with Split format. */
129 static __inline void VC_ADD_SUB_X_STORE_SPLIT(
134 _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
135 _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
139 * out.real = a.real - b.real
140 * out.imag = a.imag + b.imag
142 static __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
143 out->real = _mm_sub_ps(a->real, b->real);
144 out->imag = _mm_add_ps(a->imag, b->imag);
148 * out.real = a.real - b.imag
149 * out.imag = a.imag + b.real
151 static __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
152 out->real = _mm_sub_ps(a->real, b->imag);
153 out->imag = _mm_add_ps(a->imag, b->real);
156 /* VC_SUB_ADD_X and store the result with Split format. */
157 static __inline void VC_SUB_ADD_X_STORE_SPLIT(
161 _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
162 _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
167 * out[offset] = in.imag
169 static __inline void VC_STORE_SPLIT(
173 _mm_store_ps(out, in->real);
174 _mm_store_ps(out + offset, in->imag);
179 * out.imag = in[offset];
181 static __inline void VC_LOAD_SPLIT(
185 out->real = _mm_load_ps(in);
186 out->imag = _mm_load_ps(in + offset);
189 /* Vector Complex Unpack from Split format to Interleaved format. */
190 static __inline void VC_UNPACK(VC *out, VC *in) {
191 out->real = _mm_unpacklo_ps(in->real, in->imag);
192 out->imag = _mm_unpackhi_ps(in->real, in->imag);
196 * Vector Complex load from interleaved complex array.
197 * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
198 * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
200 static __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
201 __m128 temp0 = _mm_load_ps(in);
202 __m128 temp1 = _mm_load_ps(in + 4);
203 out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
204 out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
207 * Vector Complex Load with Split format.
208 * The input address is not 16 byte aligned.
210 static __inline void VC_LOADU_SPLIT(
214 out->real = _mm_loadu_ps(in);
215 out->imag = _mm_loadu_ps(in + offset);
218 /* Reverse the order of the Complex Vector. */
219 static __inline void VC_REVERSE(VC *v) {
220 v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
221 v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
224 * Vector Complex store to interleaved complex array
225 * out[0] = in.real[0]
226 * out[1] = in.imag[0]
227 * out[2] = in.real[1]
228 * out[3] = in.imag[1]
229 * out[4] = in.real[2]
230 * out[5] = in.imag[2]
231 * out[6] = in.real[3]
232 * out[7] = in.imag[3]
234 static __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
235 _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
236 _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
240 * Vector Complex Store with Interleaved format.
241 * Address is not 16 byte aligned.
243 static __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
244 _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
245 _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
248 /* VC_ADD_X and store the result with Split format. */
249 static __inline void VC_ADD_X_STORE_SPLIT(
253 _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
254 _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
258 * VC_SUB_X and store the result with inverse order.
259 * Address is not 16 byte aligned.
261 static __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
267 t = _mm_sub_ps(a->real, b->imag);
268 _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
269 t = _mm_sub_ps(b->real, a->imag);
270 _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
274 * Vector Complex Load from Interleaved format to Split format.
275 * Store the result into two __m128 registers.
277 static __inline void VC_LOAD_SHUFFLE(
282 VC_LOAD_INTERLEAVE(&temp, in);
287 /* Finish the butterfly calculation of forward radix4 and store the outputs. */
288 static __inline void RADIX4_FWD_BUTTERFLY_STORE(
298 /* CADD out0, t0, t2 */
299 VC_ADD_STORE_SPLIT(out0, t0, t2, n);
301 /* CSUB out2, t0, t2 */
302 VC_SUB_STORE_SPLIT(out2, t0, t2, n);
304 /* CADD_SUB_X out1, t1, t3 */
305 VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);
307 /* CSUB_ADD_X out3, t1, t3 */
308 VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
311 /* Finish the butterfly calculation of inverse radix4 and store the outputs. */
312 static __inline void RADIX4_INV_BUTTERFLY_STORE(
322 /* CADD out0, t0, t2 */
323 VC_ADD_STORE_SPLIT(out0, t0, t2, n);
325 /* CSUB out2, t0, t2 */
326 VC_SUB_STORE_SPLIT(out2, t0, t2, n);
328 /* CSUB_ADD_X out1, t1, t3 */
329 VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);
331 /* CADD_SUB_X out3, t1, t3 */
332 VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
335 /* Radix4 forward butterfly */
336 static __inline void RADIX4_FWD_BUTTERFLY(
350 /* CMUL tt1, Tw1, T1 */
351 VC_MUL(&tt1, Tw1, T1);
353 /* CMUL tt2, Tw2, T2 */
354 VC_MUL(&tt2, Tw2, T2);
356 /* CMUL tt3, Tw3, T3 */
357 VC_MUL(&tt3, Tw3, T3);
359 /* CADD t0, T0, tt2 */
360 VC_ADD(t0, T0, &tt2);
362 /* CSUB t1, T0, tt2 */
363 VC_SUB(t1, T0, &tt2);
365 /* CADD t2, tt1, tt3 */
366 VC_ADD(t2, &tt1, &tt3);
368 /* CSUB t3, tt1, tt3 */
369 VC_SUB(t3, &tt1, &tt3);
372 /* Radix4 inverse butterfly */
373 static __inline void RADIX4_INV_BUTTERFLY(
387 /* CMUL tt1, Tw1, T1 */
388 VC_CONJ_MUL(&tt1, Tw1, T1);
390 /* CMUL tt2, Tw2, T2 */
391 VC_CONJ_MUL(&tt2, Tw2, T2);
393 /* CMUL tt3, Tw3, T3 */
394 VC_CONJ_MUL(&tt3, Tw3, T3);
396 /* CADD t0, T0, tt2 */
397 VC_ADD(t0, T0, &tt2);
399 /* CSUB t1, T0, tt2 */
400 VC_SUB(t1, T0, &tt2);
402 /* CADD t2, tt1, tt3 */
403 VC_ADD(t2, &tt1, &tt3);
405 /* CSUB t3, tt1, tt3 */
406 VC_SUB(t3, &tt1, &tt3);
409 /* Radix4 butterfly in first stage for both forward and inverse */
410 static __inline void RADIX4_BUTTERFLY_FS(
419 /* CADD t0, T0, T2 */
422 /* CSUB t1, T0, T2 */
425 /* CADD t2, T1, T3 */
428 /* CSUB t3, T1, T3 */
433 * Load 16 float elements (4 sse registers) which is a 4 * 4 matrix.
434 * Then Do transpose on the matrix.
435 * 3, 2, 1, 0 12, 8, 4, 0
436 * 7, 6, 5, 4 =====> 13, 9, 5, 1
437 * 11, 10, 9, 8 14, 10, 6, 2
438 * 15, 14, 13, 12 15, 11, 7, 3
440 static __inline void VC_LOAD_MATRIX_TRANSPOSE(
459 xmm0 = _mm_load_ps(pT0);
460 xmm1 = _mm_load_ps(pT1);
461 xmm2 = _mm_load_ps(pT2);
462 xmm3 = _mm_load_ps(pT3);
464 /* Matrix transpose */
465 xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
466 xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
467 xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
468 xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
469 T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
470 T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
471 T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
472 T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
474 xmm0 = _mm_load_ps(pT0 + n);
475 xmm1 = _mm_load_ps(pT1 + n);
476 xmm2 = _mm_load_ps(pT2 + n);
477 xmm3 = _mm_load_ps(pT3 + n);
479 /* Matrix transpose */
480 xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
481 xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
482 xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
483 xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
484 T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
485 T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
486 T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
487 T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));