2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
10 @// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
11 @// to support float instead of SC32.
16 @// Compute a Radix 4 FFT stage for an N-point complex signal
21 @// Include standard headers
23 #include "dl/api/arm/armCOMM_s.h"
24 #include "dl/api/arm/omxtypes_s.h"
26 @// M_VARIANTS ARM1136JS
28 @// Import symbols required from other files
29 @// (For example tables)
34 @// Set debugging level
35 @//DEBUG_ON SETL {TRUE}
39 @// Guarding implementation by the processor name
56 @// Local scratch registers: preprocessor aliases for core registers. Note that outPointStep and t1 both name r3, so only one of them can be live at a time.
59 #define step r12 /*@// Reuse grpCount*/
60 #define outPointStep r3 /*@// Offset added to pDst between the output stores of one butterfly*/
65 #define t1 r3 /*@// Reuse outPointStep*/
67 @// Real and Imaginary parts used in the inner grp loop
77 @// Temporary reg to hold the twiddle multiplies
89 .macro FFTSTAGE scaled, inverse , name
91 @// Define stack arguments. Overview: this macro expands to one radix-4 FFT stage; the inverse argument ("TRUE"/"FALSE") selects the sign pattern of the twiddle multiplies and the y1/y3 store order. scaled and name are not referenced in the visible lines.
94 @// Update grpCount and grpSize right away in order to reuse
95 @// pGrpCount and pGrpSize regs
97 LSL grpCount,subFFTSize,#2 @// grpCount = 4*subFFTSize
98 lsr subFFTNum, subFFTNum, #2 @// subFFTNum = subFFTNum/4
99 mov subFFTSize, grpCount @// subFFTSize now holds the multiplied-by-4 value
102 @// pT0+1 increments pT0 by 8 bytes
103 @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
104 mov pointStep, subFFTNum, lsl #1 @// pointStep = 2*subFFTNum (subFFTNum already divided by 4 above)
107 @// pOut0+1 increments pOut0 by 8 bytes
108 @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size
111 @// Use setCount as dummy. It's set correctly below.
112 smull outPointStep, setCount, grpCount, pointStep @// outPointStep = low 32 bits of grpCount*pointStep; the high word written to setCount is discarded
114 LSL pointStep,pointStep,#2 @// 2*grpSize
117 MOV setCount,pointStep,LSR #3 @// setCount = pointStep/8 (overwrites the dummy high word from smull)
119 @// Interchange grpLoop and setLoop
124 @// Set pSrc and pDst for the grpLoop
126 SUB diff,outPointStep,pointStep @// diff = outPointStep - pointStep
128 @// Save setCount on stack to reuse the reg
130 ADD pSrc,pSrc,diff,LSL #2 @// pSrc += (grpCount-1)*grpStep
131 ADD pDst,pDst,diff @// pDst += (grpCount-1)*setCount
132 ADD step,step,diff @// step += (grpCount-1)*setCount
144 vldm.f32 pSrc, {x3r, x3i} @// data[1]
146 vldm.f32 pTwiddle, {x1r, x1i} @// coef[1]
148 vldm.f32 pTwiddle, {x2r, x2i} @// coef[2]
150 vldm.f32 pSrc, {x0r, x0i} @// data[2]
152 @// do first complex multiply
153 vmul.f32 t0r, x3r, x1r @// t0r = x3r*x1r
154 vmul.f32 t0i, x3i, x1r @// t0i = x3i*x1r
156 .ifeqs "\inverse", "TRUE"
157 vmla.f32 t0r, x3i, x1i @// inverse: t0r += x3i*x1i
158 vmls.f32 t0i, x3r, x1i @// inverse: t0i -= x3r*x1i, giving t0 = x3 * conj(x1)
162 vmls.f32 t0r, x3i, x1i @// other arm (presumably the forward case; the .else/.endif directives are not in the visible lines): t0r -= x3i*x1i
163 vmla.f32 t0i, x3r, x1i @// t0i += x3r*x1i, giving t0 = x3 * x1
168 add pTwiddle, pTwiddle, step @// temporarily advance pTwiddle by step to reach coef[3]
169 vldm pTwiddle, {x3r, x3i} @// coef[3]
170 sub pTwiddle, pTwiddle, step @// undo the temporary advance
172 @// do second complex multiply
173 vmul.f32 t0r, x0r, x2r @// t0r = x0r*x2r (x0 = data[2], x2 = coef[2])
174 vmul.f32 t0i, x0i, x2r @// t0i = x0i*x2r
176 .ifeqs "\inverse", "TRUE"
177 vmla.f32 t0r, x0i, x2i @// inverse: t0r += x0i*x2i
178 vmls.f32 t0i, x0r, x2i @// inverse: t0i -= x0r*x2i, giving t0 = x0 * conj(x2)
182 vmls.f32 t0r, x0i, x2i @// other arm (presumably forward): t0r -= x0i*x2i
183 vmla.f32 t0i, x0r, x2i @// t0i += x0r*x2i, giving t0 = x0 * x2
189 vldm pSrc, {x0r, x0i} @// data[3]
192 SUB pTwiddle,pTwiddle,step,LSL #1 @// reset pTwiddle
193 SUBS step,step,pointStep @// decrement loop counter; this is the flag-setting instruction the SUBGE updates further down appear to rely on
195 @// do third complex multiply
196 SUB pSrc,pSrc,pointStep,LSL #1 @// reset pSrc to data[0]
197 vmul.f32 t0r, x0r, x3r @// t0r = x0r*x3r (x0 = data[3], x3 = coef[3])
198 vmul.f32 t0i, x0i, x3r @// t0i = x0i*x3r
200 .ifeqs "\inverse", "TRUE"
201 vmla.f32 t0r, x0i, x3i @// inverse: t0r += x0i*x3i
202 vmls.f32 t0i, x0r, x3i @// inverse: t0i -= x0r*x3i, giving t0 = x0 * conj(x3)
206 vmls.f32 t0r, x0i, x3i @// other arm (presumably forward): t0r -= x0i*x3i
207 vmla.f32 t0i, x0r, x3i @// t0i += x0r*x3i, giving t0 = x0 * x3
212 vldm pSrc, {x0r, x0i} @// data[0]
214 @// finish first stage of 4 point FFT
215 vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2 (u0)
218 vadd.f32 sr, x2r, x2r @// sr = 2*x2r
219 vadd.f32 si, x2i, x2i @// si = 2*x2i
220 vsub.f32 x2r,x0r,sr @// x2 = x0 - x2 (u1), formed as (x0 + x2) - 2*x2
223 vadd.f32 x1r,x1r,x3r @// x1 = x1 + x3 (u2); plain add, no halving is performed here
226 vadd.f32 sr, x3r, x3r @// sr = 2*x3r
227 vadd.f32 si, x3i, x3i @// si = 2*x3i
228 vsub.f32 x3r,x1r,sr @// x3 = x1 - x3 (u3), formed as (x1 + x3) - 2*x3; no halving is performed here
232 @// finish second stage of 4 point FFT
234 @// y0 = u1-u2 since twiddles are stored as -ve values
238 vadd.f32 sr, x1r, x1r @// sr = 2*x1r
239 vadd.f32 si, x1i, x1i @// si = 2*x1i
240 vadd.f32 x1r,x2r,sr @// y2 = u1+u2
242 vstm pDst, {x2r, x2i} @// store y0
244 vsub.f32 x0r,x0r,x3i @// y3 = u0+ju3
247 vadd.f32 sr, x3r, x3r @// sr = 2*x3r
248 vadd.f32 si, x3i, x3i @// si = 2*x3i
249 vadd.f32 t2r,x0r,si @// y1 = u0-ju3
250 vsub.f32 t2i,x0i,sr @// t2 will be same as x2r reg
252 .ifeqs "\inverse", "TRUE"
253 add pDst, outPointStep @// inverse: advance pDst by outPointStep to the next output slot
254 vstm pDst, {t2r, t2i} @// store y1
255 add pDst, outPointStep
256 vstm pDst, {x1r, x1i} @// store y2
257 add pDst, outPointStep
258 vstm pDst, {x0r, x0i} @// store y3
259 sub pDst, outPointStep @// step back one of the three advances
261 add pDst, outPointStep @// other arm (presumably forward; the .else is not visible): same three advances, with x0 and t2 swapped between the y1 and y3 slots
262 vstm pDst, {x0r, x0i} @// store y1
263 add pDst, outPointStep
264 vstm pDst, {x1r, x1i} @// store y2
265 add pDst, outPointStep
266 vstm pDst, {t2r, t2i} @// store y3
267 sub pDst, outPointStep @// step back one of the three advances
270 SUB pDst,pDst,outPointStep, LSL #1 @// reset pDst: together with the single sub above this undoes all three advances
271 @// update the pDst for the next grp (GE presumably tests the flags left by "SUBS step" above; no visible instruction in between sets flags)
272 SUBGE pDst,pDst,pointStep
273 @// update the pSrc for the next grp
274 SUBGE pSrc,pSrc,pointStep,LSL #2
279 ADD pSrc,pSrc,#8 @// pSrc += 1; for the next set
280 ADD pDst,pDst,#8 @// pDst += 1; for the next set
282 SUBS setCount,setCount,#1 @// decrement loop counter
287 @// Reset and Swap pSrc and pDst for the next stage
289 SUB pDst,pSrc,subFFTNum,LSL #3 @// pDst = pSrc - 8*subFFTNum
290 SUB pSrc,t1,subFFTNum,LSL #3 @// pSrc = t1 - 8*subFFTNum; t1 aliases r3 and is presumably loaded with the saved destination pointer in a line not shown (TODO confirm)
295 M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4 @// forward-transform entry point; M_START is not defined in this file (presumably armCOMM_s.h) and the meaning of its r4 argument should be confirmed there
296 FFTSTAGE "FALSE","FALSE",FWD @// scaled="FALSE", inverse="FALSE", name=FWD: expands the stage body with the non-inverse multiply signs and store order
299 M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4 @// inverse-transform entry point; M_START is not defined in this file (presumably armCOMM_s.h) and the meaning of its r4 argument should be confirmed there
300 FFTSTAGE "FALSE","TRUE",INV @// scaled="FALSE", inverse="TRUE", name=INV: expands the stage body with the conjugate-twiddle multiplies and the inverse store order
304 @// ENDIF @//ARM1136JS
308 @// Guarding implementation by the processor name