2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 // Use of this source code is governed by a BSD-style license
5 // that can be found in the LICENSE file in the root of the source
6 // tree. An additional intellectual property rights grant can be found
7 // in the file PATENTS. All contributing project authors may
8 // be found in the AUTHORS file in the root of the source tree.
11 // This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
12 // to support float instead of SC32.
17 // Compute a radix-4 FFT stage for an N-point complex signal
22 // Include standard headers
24 #include "dl/api/arm/arm64COMM_s.h"
25 #include "dl/api/arm/omxtypes_s.h"
28 // Import symbols required from other files
29 // (For example tables)
34 // Set debugging level
35 //DEBUG_ON SETL {TRUE}
39 // Guarding implementation by the processor name
44 // Guarding implementation by the processor name
47 // Import symbols required from other files
48 // (For example tables)
// Register aliases used by the FFTSTAGE macro below.
// pSubFFTSize (x4): address register; FFTSTAGE loads subFFTSize through it
// on entry and stores the updated value back through it on exit.
57 #define pSubFFTSize x4
64 //Local Scratch Registers
// pointStep32 (w8): 32-bit register used as an operand of the smull that
// computes outPointStep.
// NOTE(review): presumably the 32-bit view of pointStep; the pointStep alias
// itself is not visible in this part of the file - confirm.
71 #define pointStep32 w8
// outPointStep (x9): post-increment applied to pDst after each st2.
72 #define outPointStep x9
// stepTwiddle (x10): byte offset used to advance pTwiddle between twiddle loads.
73 #define stepTwiddle x10
// FFTSTAGE scaled, inverse, name
//
// Body of one out-of-place radix-4 FFT stage on complex float data.
//
// Macro parameters:
//   scaled  - not referenced by any instruction visible in this macro body.
//   inverse - the string "TRUE" selects the inverse-transform code paths
//             through the .ifeqs tests below.
//   name    - suffix appended to the branch labels (radix4GrpLoop\name,
//             radix4SetLoop\name, radix4SkipRead\name) so that every
//             expansion of the macro refers to its own set of labels.
//
// Memory effects visible here: reads subFFTNum and subFFTSize through
// pSubFFTNum / pSubFFTSize, reads input through pSrc and twiddles through
// pTwiddle, writes results through pDst, and finally stores the updated
// subFFTNum and subFFTSize back.
//
// NOTE(review): the three labels above are branched to but never defined in
// the lines visible here, and no .else / .endif / .endm directives are
// visible either. Confirm against the complete file before relying on the
// control-flow notes below.
111 .macro FFTSTAGE scaled, inverse , name
113 // Define stack arguments
115 // Move args values into our work registers
116 ldr subFFTNum, [pSubFFTNum]
117 ldr subFFTSize, [pSubFFTSize]
119 // Update grpCount and grpSize right away in order to reuse the
120 // pGrpCount and pGrpSize regs
// grpCount = 4 * subFFTSize, subFFTNum = subFFTNum / 4, and subFFTSize takes
// the new grpCount value (this is what gets stored back at the end).
122 LSL grpCount,subFFTSize,#2
123 LSR subFFTNum,subFFTNum,#2
124 MOV subFFTSize,grpCount
// dW1, dW2 and dW3 are all loaded from the same address here: none of the
// three ld1 instructions in this setup section post-increments pTwiddle.
126 ld1 {dW1},[pTwiddle] //[wi | wr]
127 // pT0+1 increments pT0 by 8 bytes
128 // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
129 lsl pointStep,subFFTNum, #1
131 // pOut0+1 increments pOut0 by 8 bytes
132 // pOut0+outPointStep == increment of 8*outPointStep bytes
136 ld1 {dW2},[pTwiddle] //[wi | wr]
// outPointStep = grpCount32 * pointStep32 (signed 32x32 -> 64-bit product).
// NOTE(review): assumes grpCount32 / pointStep32 are the 32-bit views of
// grpCount / pointStep - confirm against the register #defines.
137 smull outPointStep,grpCount32,pointStep32
139 LSL pointStep,pointStep,#2 // 2*grpSize
141 ld1 {dW3},[pTwiddle] //[wi | wr]
142 lsl srcStep,pointStep, #1 // srcStep = 2*pointStep
144 ADD setStep,srcStep,pointStep // setStep = 3*pointStep
// NOTE(review): rsb is not an AArch64 instruction, so it is presumably a
// helper macro from one of the included headers - confirm.
146 rsb setStep,setStep,#0 // setStep = - 3*pointStep
147 SUB srcStep,srcStep,#16 // srcStep = 2*pointStep-16
149 lsl dstStep,outPointStep, #1
151 ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
152 // dstStep = - 3*outPointStep+16
153 rsb dstStep,dstStep,#16
// Loads of data[0..3] for the first set of a group. Each ld2 post-increments
// pSrc by pointStep, except the data[3] load, which applies setStep
// (-3*pointStep) and so returns pSrc to the address used for data[0].
// NOTE(review): this is presumably where radix4GrpLoop\name is defined; the
// label line is not visible here.
158 ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
// stepTwiddle accumulates pointStep each time this point is reached.
// NOTE(review): the initial value of stepTwiddle is not set in the lines
// visible here - confirm it is initialised before this point.
159 ADD stepTwiddle,stepTwiddle,pointStep
160 ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
161 // set pTwiddle to the first point
162 ADD pTwiddle,pTwiddle,stepTwiddle
163 ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
// twStep is built in two steps: 4*stepTwiddle here, then
// stepTwiddle - 4*stepTwiddle = -3*stepTwiddle at the SUB below.
164 lsl twStep,stepTwiddle, #2
166 // data[3] & update pSrc for the next set
167 ld2 {dXr3,dXi3},[pSrc],setStep
168 SUB twStep,stepTwiddle,twStep // twStep = -3*stepTwiddle
// setCount = pointStep / 8; it is decremented by 2 on each pass through the
// set loop (see the SUBS below).
170 lsr setCount,pointStep, #3
172 // set pSrc to data[0] of the next set
174 // increment to data[1] of the next set
175 ADD pSrc,pSrc,pointStep
// Twiddle multiplication of data[1..3]. In this first sequence:
//   Zr = Xr*W[0] + Xi*W[1]
//   Zi = Xi*W[0] - Xr*W[1]
// which is X * conj(W) if lane 0 of each dW register holds the real part
// and lane 1 the imaginary part (assumption - confirm the lane order).
// The loads of the next iteration's data[1] and data[2] are interleaved with
// the arithmetic.
184 .ifeqs "\inverse", "TRUE"
185 fmul dZr1,dXr1,dW1[0]
186 fmul dZi1,dXi1,dW1[0]
187 fmul dZr2,dXr2,dW2[0]
188 fmul dZi2,dXi2,dW2[0]
189 fmul dZr3,dXr3,dW3[0]
190 fmul dZi3,dXi3,dW3[0]
192 fmla dZr1,dXi1,dW1[1] // real part
193 fmls dZi1,dXr1,dW1[1] // imag part
195 // data[1] for next iteration
196 ld2 {dXr1,dXi1},[pSrc],pointStep
198 fmla dZr2,dXi2,dW2[1] // real part
199 fmls dZi2,dXr2,dW2[1] // imag part
201 // data[2] for next iteration
202 ld2 {dXr2,dXi2},[pSrc],pointStep
204 fmla dZr3,dXi3,dW3[1] // real part
205 fmls dZi3,dXr3,dW3[1] // imag part
// Second sequence, with the fmla/fmls signs swapped:
//   Zr = Xr*W[0] - Xi*W[1]
//   Zi = Xi*W[0] + Xr*W[1]
// NOTE(review): no .else is visible between the two sequences; this one
// looks like the non-inverse alternative of the .ifeqs above - confirm.
207 fmul dZr1,dXr1,dW1[0]
208 fmul dZi1,dXi1,dW1[0]
209 fmul dZr2,dXr2,dW2[0]
210 fmul dZi2,dXi2,dW2[0]
211 fmul dZr3,dXr3,dW3[0]
212 fmul dZi3,dXi3,dW3[0]
214 fmls dZr1,dXi1,dW1[1] // real part
215 fmla dZi1,dXr1,dW1[1] // imag part
217 // data[1] for next iteration
218 ld2 {dXr1,dXi1},[pSrc],pointStep
220 fmls dZr2,dXi2,dW2[1] // real part
221 fmla dZi2,dXr2,dW2[1] // imag part
223 // data[2] for next iteration
224 ld2 {dXr2,dXi2},[pSrc],pointStep
226 fmls dZr3,dXi3,dW3[1] // real part
227 fmla dZi3,dXr3,dW3[1] // imag part
230 // data[3] & update pSrc to data[0]
231 // But don't read on the very last iteration because that reads past
232 // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
// NOTE(review): the comparison that sets the flags tested by beq is not
// visible here. The add below does not set flags, so the condition must come
// from an earlier instruction.
238 add pSrc, pSrc, setStep
239 beq radix4SkipRead\name
241 ld2 {dXr3,dXi3},[pSrc],setStep
// Set-loop counter update. None of the instructions visible between here
// and BGT radix4SetLoop\name (ld2, st2, ADD) modifies the flags, so these
// are presumably the flags that branch tests.
244 SUBS setCount,setCount,#2
246 // finish first stage of 4 point FFT
254 // data[0] for next iteration
255 ld2 {dXr0,dXi0},[pSrc], #16
263 // finish second stage of 4 point FFT
// Result stores. The first three st2 of each sequence advance pDst by
// outPointStep; the fourth applies dstStep (16 - 3*outPointStep).
// The first sequence stores Z0, Z3, Z2, Z1; the second stores Z0, Z1, Z2, Z3.
// NOTE(review): no .else / .endif is visible between the two sequences;
// the first is presumably the inverse ordering - confirm.
269 .ifeqs "\inverse", "TRUE"
272 st2 {dZr0,dZi0},[pDst],outPointStep
279 st2 {dZr3,dZi3},[pDst],outPointStep
282 st2 {dZr2,dZi2},[pDst],outPointStep
285 st2 {dZr1,dZi1},[pDst],dstStep
291 st2 {dZr0,dZi0},[pDst],outPointStep
298 st2 {dZr1,dZi1},[pDst],outPointStep
301 st2 {dZr2,dZi2},[pDst],outPointStep
304 st2 {dZr3,dZi3},[pDst],dstStep
309 // increment to data[1] of the next set
310 ADD pSrc,pSrc,pointStep
311 BGT radix4SetLoop\name
// Group-loop tail: load the three twiddles for the next group. pTwiddle is
// post-incremented by stepTwiddle, stepTwiddle and twStep (-3*stepTwiddle),
// a net displacement of -stepTwiddle over the three loads.
314 ld1 {dW1},[pTwiddle],stepTwiddle //[wi | wr]
315 // subtract 4 since grpCount multiplied by 4
// The flags from this SUBS are consumed by BGT radix4GrpLoop\name below; the
// ld1 and ADD instructions in between do not modify them.
316 SUBS grpCount,grpCount,#4
317 ld1 {dW2},[pTwiddle],stepTwiddle //[wi | wr]
318 // increment pSrc for the next grp
319 ADD pSrc,pSrc,srcStep
320 ld1 {dW3},[pTwiddle],twStep //[wi | wr]
321 BGT radix4GrpLoop\name
// Write the updated stage parameters back: subFFTNum was divided by 4 and
// subFFTSize multiplied by 4 at the top of the macro.
323 str subFFTNum, [pSubFFTNum]
324 str subFFTSize, [pSubFFTSize]
// Forward-transform entry point: expands FFTSTAGE with scaled = "FALSE",
// inverse = "FALSE" and the label suffix FWD.
// NOTE(review): M_START is not defined in this file (presumably it comes
// from arm64COMM_s.h), and the d15 argument presumably names the highest
// callee-saved FP register to preserve - confirm. No matching M_END is
// visible in this part of the file.
329 M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
330 FFTSTAGE "FALSE","FALSE",FWD
// Inverse-transform entry point: expands FFTSTAGE with scaled = "FALSE",
// inverse = "TRUE" and the label suffix INV, which selects the .ifeqs
// "\inverse", "TRUE" code paths inside the macro.
// NOTE(review): M_START is not defined in this file (presumably it comes
// from arm64COMM_s.h), and the d15 argument presumably names the highest
// callee-saved FP register to preserve - confirm. No matching M_END is
// visible in this part of the file.
334 M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
335 FFTSTAGE "FALSE","TRUE",INV