2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
14 #include "dl/api/omxtypes.h"
15 #include "dl/sp/api/mipsSP.h"
18 * Forward real FFT for FFT sizes larger than 16. Computed using the complex
19 * FFT for half the size.
21 OMXResult mips_FFTFwd_RToCCS_F32_complex(const OMX_F32* pSrc,
23 const MIPSFFTSpec_R_FC32* pFFTSpec) {
24 OMX_U32 n1_4, num_transforms, step;
25 const OMX_F32* w_re_ptr;
26 const OMX_F32* w_im_ptr;
27 OMX_U32 fft_size = 1 << pFFTSpec->order;
28 OMX_FC32* p_dst = (OMX_FC32*)pDst;
29 OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
30 OMX_F32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
34 * Loop performing sub-transforms of size 4,
35 * which contain 2 butterfly operations.
37 num_transforms = (SUBTRANSFORM_CONST >> (17 - pFFTSpec->order)) | 1;
38 for (uint32_t n = 0; n < num_transforms; ++n) {
40 * n is in the range (0 .. num_transforms - 1).
41 * The size of the pFFTSpec->pOffset is (((SUBTRANSFORM_CONST >>
42 * (16 - TWIDDLE_TABLE_ORDER)) | 1)).
44 OMX_U32 offset = pFFTSpec->pOffset[n] << 2;
46 * Offset takes it's value from pFFTSpec->pOffset table which is initialized
47 * in the omxSP_FFTInit_R_F32 function, and is constant afterwards.
49 OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset;
50 OMX_FC32* p_tmp = p_buf + offset;
51 /* Treating the input as a complex vector. */
52 const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
54 tmp1 = p_src[p_bitrev[0]].Re + p_src[p_bitrev[1]].Re;
55 tmp2 = p_src[p_bitrev[2]].Re + p_src[p_bitrev[3]].Re;
56 tmp3 = p_src[p_bitrev[0]].Im + p_src[p_bitrev[1]].Im;
57 tmp4 = p_src[p_bitrev[2]].Im + p_src[p_bitrev[3]].Im;
59 p_tmp[0].Re = tmp1 + tmp2;
60 p_tmp[2].Re = tmp1 - tmp2;
61 p_tmp[0].Im = tmp3 + tmp4;
62 p_tmp[2].Im = tmp3 - tmp4;
64 tmp1 = p_src[p_bitrev[0]].Re - p_src[p_bitrev[1]].Re;
65 tmp2 = p_src[p_bitrev[2]].Re - p_src[p_bitrev[3]].Re;
66 tmp3 = p_src[p_bitrev[0]].Im - p_src[p_bitrev[1]].Im;
67 tmp4 = p_src[p_bitrev[2]].Im - p_src[p_bitrev[3]].Im;
69 p_tmp[1].Re = tmp1 + tmp4;
70 p_tmp[3].Re = tmp1 - tmp4;
71 p_tmp[1].Im = tmp3 - tmp2;
72 p_tmp[3].Im = tmp3 + tmp2;
76 * Loop performing sub-transforms of size 8,
77 * which contain four butterfly operations.
79 num_transforms = (num_transforms >> 1) | 1;
80 for (uint32_t n = 0; n < num_transforms; ++n) {
81 OMX_U32 offset = pFFTSpec->pOffset[n] << 3;
82 OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset;
83 OMX_FC32* p_tmp = p_buf + offset;
84 const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
86 tmp1 = p_src[p_bitrev[4]].Re + p_src[p_bitrev[5]].Re;
87 tmp2 = p_src[p_bitrev[6]].Re + p_src[p_bitrev[7]].Re;
88 tmp3 = p_src[p_bitrev[4]].Im + p_src[p_bitrev[5]].Im;
89 tmp4 = p_src[p_bitrev[6]].Im + p_src[p_bitrev[7]].Im;
96 p_tmp[4].Re = p_tmp[0].Re - tmp5;
97 p_tmp[0].Re = p_tmp[0].Re + tmp5;
98 p_tmp[4].Im = p_tmp[0].Im - tmp2;
99 p_tmp[0].Im = p_tmp[0].Im + tmp2;
100 p_tmp[6].Re = p_tmp[2].Re - tmp3;
101 p_tmp[2].Re = p_tmp[2].Re + tmp3;
102 p_tmp[6].Im = p_tmp[2].Im + tmp1;
103 p_tmp[2].Im = p_tmp[2].Im - tmp1;
105 tmp1 = p_src[p_bitrev[4]].Re - p_src[p_bitrev[5]].Re;
106 tmp2 = p_src[p_bitrev[6]].Re - p_src[p_bitrev[7]].Re;
107 tmp3 = p_src[p_bitrev[4]].Im - p_src[p_bitrev[5]].Im;
108 tmp4 = p_src[p_bitrev[6]].Im - p_src[p_bitrev[7]].Im;
110 tmp5 = SQRT1_2 * (tmp1 + tmp3);
111 tmp1 = SQRT1_2 * (tmp3 - tmp1);
112 tmp3 = SQRT1_2 * (tmp2 - tmp4);
113 tmp2 = SQRT1_2 * (tmp2 + tmp4);
120 p_tmp[5].Re = p_tmp[1].Re - tmp4;
121 p_tmp[1].Re = p_tmp[1].Re + tmp4;
122 p_tmp[5].Im = p_tmp[1].Im - tmp3;
123 p_tmp[1].Im = p_tmp[1].Im + tmp3;
124 p_tmp[7].Re = p_tmp[3].Re - tmp1;
125 p_tmp[3].Re = p_tmp[3].Re + tmp1;
126 p_tmp[7].Im = p_tmp[3].Im + tmp5;
127 p_tmp[3].Im = p_tmp[3].Im - tmp5;
130 step = 1 << (pFFTSpec->order - 4);
131 n1_4 = 4; /* Quarter of the sub-transform size. */
132 /* Outer loop that loops over FFT stages. */
133 for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 1; ++fft_stage) {
134 OMX_U32 n1_2 = 2 * n1_4;
135 OMX_U32 n3_4 = 3 * n1_4;
136 num_transforms = (num_transforms >> 1) | 1;
138 * Loop performing sub-transforms of size 16 and higher.
139 * The actual size depends on the stage.
141 for (uint32_t n = 0; n < num_transforms; ++n) {
142 OMX_U32 offset = pFFTSpec->pOffset[n] << fft_stage;
143 OMX_FC32* p_tmp = p_buf + offset;
145 tmp1 = p_tmp[n1_2].Re + p_tmp[n3_4].Re;
146 tmp2 = p_tmp[n1_2].Re - p_tmp[n3_4].Re;
147 tmp3 = p_tmp[n1_2].Im + p_tmp[n3_4].Im;
148 tmp4 = p_tmp[n1_2].Im - p_tmp[n3_4].Im;
150 p_tmp[n1_2].Re = p_tmp[0].Re - tmp1;
151 p_tmp[n1_2].Im = p_tmp[0].Im - tmp3;
152 p_tmp[0].Re = p_tmp[0].Re + tmp1;
153 p_tmp[0].Im = p_tmp[0].Im + tmp3;
154 p_tmp[n3_4].Re = p_tmp[n1_4].Re - tmp4;
155 p_tmp[n3_4].Im = p_tmp[n1_4].Im + tmp2;
156 p_tmp[n1_4].Re = p_tmp[n1_4].Re + tmp4;
157 p_tmp[n1_4].Im = p_tmp[n1_4].Im - tmp2;
159 /* Twiddle table is initialized for the maximal FFT size. */
160 w_re_ptr = pFFTSpec->pTwiddle + step;
162 pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
165 * Loop performing split-radix butterfly operations for
168 for (uint32_t i = 1; i < n1_4; ++i) {
172 tmp1 = w_re * p_tmp[n1_2 + i].Re + w_im * p_tmp[n1_2 + i].Im;
173 tmp2 = w_re * p_tmp[n1_2 + i].Im - w_im * p_tmp[n1_2 + i].Re;
174 tmp3 = w_re * p_tmp[n3_4 + i].Re - w_im * p_tmp[n3_4 + i].Im;
175 tmp4 = w_re * p_tmp[n3_4 + i].Im + w_im * p_tmp[n3_4 + i].Re;
182 p_tmp[n1_2 + i].Re = p_tmp[i].Re - tmp5;
183 p_tmp[n1_2 + i].Im = p_tmp[i].Im - tmp6;
184 p_tmp[i].Re = p_tmp[i].Re + tmp5;
185 p_tmp[i].Im = p_tmp[i].Im + tmp6;
186 p_tmp[n3_4 + i].Re = p_tmp[n1_4 + i].Re - tmp2;
187 p_tmp[n3_4 + i].Im = p_tmp[n1_4 + i].Im + tmp1;
188 p_tmp[n1_4 + i].Re = p_tmp[n1_4 + i].Re + tmp2;
189 p_tmp[n1_4 + i].Im = p_tmp[n1_4 + i].Im - tmp1;
199 /* Additional computation to get the output for full FFT size. */
200 w_re_ptr = pFFTSpec->pTwiddle + step;
201 w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
203 for (uint32_t i = 1; i < fft_size / 8; ++i) {
206 tmp3 = p_buf[fft_size / 2 - i].Re;
207 tmp4 = p_buf[fft_size / 2 - i].Im;
214 tmp1 = p_buf[i + fft_size / 4].Re;
215 tmp2 = p_buf[i + fft_size / 4].Im;
216 tmp3 = p_buf[fft_size / 4 - i].Re;
217 tmp4 = p_buf[fft_size / 4 - i].Im;
222 p_dst[i].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6);
223 p_dst[i].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7);
224 p_dst[fft_size / 2 - i].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6);
225 p_dst[fft_size / 2 - i].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7);
232 p_dst[i + fft_size / 4].Re = 0.5f * (tmp5 - w_im * tmp7 - w_re * tmp6);
233 p_dst[i + fft_size / 4].Im = 0.5f * (tmp8 + w_im * tmp6 - w_re * tmp7);
234 p_dst[fft_size / 4 - i].Re = 0.5f * (tmp5 + w_im * tmp7 + w_re * tmp6);
235 p_dst[fft_size / 4 - i].Im = 0.5f * (-tmp8 + w_im * tmp6 - w_re * tmp7);
240 tmp1 = p_buf[fft_size / 8].Re;
241 tmp2 = p_buf[fft_size / 8].Im;
242 tmp3 = p_buf[3 * fft_size / 8].Re;
243 tmp4 = p_buf[3 * fft_size / 8].Im;
253 p_dst[fft_size / 8].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6);
254 p_dst[fft_size / 8].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7);
255 p_dst[3 * fft_size / 8].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6);
256 p_dst[3 * fft_size / 8].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7);
258 p_dst[0].Re = p_buf[0].Re + p_buf[0].Im;
259 p_dst[fft_size / 4].Re = p_buf[fft_size / 4].Re;
260 p_dst[fft_size / 2].Re = p_buf[0].Re - p_buf[0].Im;
262 p_dst[fft_size / 4].Im = -p_buf[fft_size / 4].Im;
263 p_dst[fft_size / 2].Im = 0.0f;
265 return OMX_Sts_NoErr;