2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
10 @// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
11 @// to support float instead of SC32.
16 @// Compute a Radix 4 FFT stage for an N-point complex signal
21 @// Include standard headers
23 #include "dl/api/arm/armCOMM_s.h"
24 #include "dl/api/arm/omxtypes_s.h"
26 @// Import symbols required from other files
27 @// (For example tables)
32 @// Set debugging level
33 @//DEBUG_ON SETL {TRUE}
36 @// Guarding implementation by the processor name
39 @// Import symbols required from other files
40 @// (For example tables)
41 @//IMPORT armAAC_constTable
56 @//Local Scratch Registers
@// Symbolic names for core registers used by the FFTSTAGE macro.
@// outPointStep is written from subFFTSize (shifted left by 3) at the top of
@// FFTSTAGE and is used as the post-increment of the output stores.
58 #define outPointStep r3
@// stepTwiddle is advanced by 16 on every pass of the group loop and is the
@// base from which twStep and grpTwStep are derived.
62 #define stepTwiddle r9
@// NEON D-register aliases (32-bit float lanes) for the two sets of input
@// data fetched by the VLD4 loads: D0-D3 are filled by the first load,
@// D4-D7 by the second.
71 #define dButterfly1Real02 D0.F32
72 #define dButterfly1Imag02 D1.F32
73 #define dButterfly1Real13 D2.F32
74 #define dButterfly1Imag13 D3.F32
75 #define dButterfly2Real02 D4.F32
76 #define dButterfly2Imag02 D5.F32
77 #define dButterfly2Real13 D6.F32
78 #define dButterfly2Imag13 D7.F32
@// FFTSTAGE: code for one radix-4 "ls" FFT stage, expanded once per exported
@// function.
@//   scaled  - NOTE(review): no use of this argument is apparent in the macro
@//             body; confirm whether it is intentionally ignored.
@//   inverse - compared against "TRUE" with .ifeqs to select the sign of the
@//             twiddle-imaginary products (VMLA vs VMLS) and the order in
@//             which the four results are stored.
@//   name    - suffix appended to the local labels radix4lsGrpLoop and
@//             radix4lsSkipRead so that each expansion gets unique labels.
131 .macro FFTSTAGE scaled, inverse , name
133 @// Define stack arguments
136 @// pOut0+1 increments pOut0 by 8 bytes
137 @// pOut0+outPointStep == increment of 8*outPointStep bytes
138 MOV outPointStep,subFFTSize,LSL #3
140 @// Update grpCount and grpSize right away
142 VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr]
@// grpCount = 4*subFFTSize
144 LSL grpCount,subFFTSize,#2
146 VLD1 dW2r,[pTwiddle :64] @// [wi|wr]
147 MOV subFFTNum,#1 @//after the last stage
149 VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr]
152 VLD1 dW2i,[pTwiddle :64]! @// [wi|wr]
@// NOTE(review): the "-8" below holds only if stepTwiddle is 0 at this point;
@// confirm where stepTwiddle is initialised.
153 SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
155 @// update subFFTSize for the next stage
156 MOV subFFTSize,grpCount
157 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
@// dstStep is built in three steps: 2*outPointStep, then 3*outPointStep,
@// then 16 - 3*outPointStep. It is the post-increment of the fourth store in
@// each pass, undoing the three outPointStep advances and moving on 16 bytes.
158 MOV dstStep,outPointStep,LSL #1
160 @// AC.r AC.i BD.r BD.i
161 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
162 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
163 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
166 @// AC.r AC.i BD.r BD.i
167 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
170 @// Process two groups at a time
172 radix4lsGrpLoop\name :
@// Advance the twiddle stride by 16, then derive the two pointer steps used
@// by the twiddle loads in this pass:
@//   twStep    = stepTwiddle - 16
@//   grpTwStep = -(2*stepTwiddle + 8)
175 ADD stepTwiddle,stepTwiddle,#16
177 ADD grpTwStep,stepTwiddle,#4
178 VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
179 SUB twStep,stepTwiddle,#16 @// -16+stepTwiddle
180 VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
181 MOV grpTwStep,grpTwStep,LSL #1
182 VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r
183 RSB grpTwStep,grpTwStep,#0 @// -8-2*stepTwiddle
186 VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
189 @// grpCount is multiplied by 4
@// The flags set by this SUBS are the ones tested by the addeq/beq pair and
@// by the BGT that closes the loop.
@// NOTE(review): relies on no instruction in between updating the flags.
190 SUBS grpCount,grpCount,#8
192 .ifeqs "\inverse", "TRUE"
@// inverse: Zr1 += W1i*Xi1 ; Zi1 -= W1i*Xr1
194 VMLA dZr1,dW1i,dXi1 @// real part
196 VMLS dZi1,dW1i,dXr1 @// imag part
@// same products with the signs flipped: Zr1 -= W1i*Xi1 ; Zi1 += W1i*Xr1
201 VMLS dZr1,dW1i,dXi1 @// real part
203 VMLA dZi1,dW1i,dXr1 @// imag part
@// Reload the W1 twiddle pair for the next pass; pTwiddle advances by
@// stepTwiddle.
207 VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]
209 .ifeqs "\inverse", "TRUE"
211 VMLA dZr2,dW2i,dXi2 @// real part
213 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
214 VMLS dZi2,dW2i,dXr2 @// imag part
219 VMLS dZr2,dW2i,dXi2 @// real part
221 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
222 VMLA dZi2,dW2i,dXr2 @// imag part
227 VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr]
229 @// move qX0 so as to load for the next iteration
232 .ifeqs "\inverse", "TRUE"
234 VMLA dZr3,dW3i,dXi3 @// real part
236 VLD1 dW3r,[pTwiddle :64],step24
237 VMLS dZi3,dW3i,dXr3 @// imag part
242 VMLS dZr3,dW3i,dXi3 @// real part
244 VLD1 dW3r,[pTwiddle :64],step24
245 VMLA dZi3,dW3i,dXr3 @// imag part
249 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
251 @// Don't do the load on the last iteration so we don't read past the end
@// When grpCount has reached zero, skip the two VLD4 loads but still advance
@// pSrc by 64 bytes, which is what their two 32-byte write-backs would have
@// added. The code after the loop therefore sees the same pSrc either way.
253 addeq pSrc, pSrc, #64
254 beq radix4lsSkipRead\name
255 @// AC.r AC.i BD.r BD.i
256 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
258 @// AC.r AC.i BD.r BD.i
259 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
260 radix4lsSkipRead\name:
262 @// finish first stage of 4 point FFT
270 @// finish second stage of 4 point FFT
272 .ifeqs "\inverse", "TRUE"
@// Store order here is Z0, Z3, Z2, Z1.
277 VST2 {dZr0,dZi0},[pDst :128],outPointStep
281 VST2 {dZr3,dZi3},[pDst :128],outPointStep
284 VST2 {dZr2,dZi2},[pDst :128],outPointStep
287 @// dstStep = -3*outPointStep + 16
288 VST2 {dZr1,dZi1},[pDst :128],dstStep
@// Store order here is Z0, Z1, Z2, Z3.
296 VST2 {dZr0,dZi0},[pDst :128],outPointStep
300 VST2 {dZr1,dZi1},[pDst :128],outPointStep
303 VST2 {dZr2,dZi2},[pDst :128],outPointStep
306 @// dstStep = -3*outPointStep + 16
307 VST2 {dZr3,dZi3},[pDst :128],dstStep
@// Loop while the grpCount left by the SUBS above is still positive.
312 BGT radix4lsGrpLoop\name
315 @// Reset and Swap pSrc and pDst for the next stage
317 @// Extra increment done in final iteration of the loop
319 @// pDst -= 4*size; pSrc -= 8*size bytes
@// New pDst = pSrc - 4*outPointStep; new pSrc = pTmp - outPointStep.
@// NOTE(review): pTmp is presumably a copy of the old pDst; confirm where it
@// is written.
320 SUB pDst,pSrc,outPointStep,LSL #2
321 SUB pSrc,pTmp,outPointStep
@// Rewind pTwiddle by 2*subFFTSize bytes, then by a further 16.
322 SUB pTwiddle,pTwiddle,subFFTSize,LSL #1
323 @// Extra increment done in final iteration of the loop
324 SUB pTwiddle,pTwiddle,#16
@// Forward transform: expands FFTSTAGE with inverse = "FALSE" and the label
@// suffix "fwd".
@// NOTE(review): M_START is not defined in this file (presumably it comes from
@// armCOMM_s.h); confirm the meaning of its r4 argument there.
329 M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
330 FFTSTAGE "FALSE","FALSE",fwd
@// Inverse transform: expands FFTSTAGE with inverse = "TRUE" and the label
@// suffix "inv". This selects the .ifeqs "TRUE" arms of the macro, i.e. the
@// VMLA-real / VMLS-imag twiddle products.
@// NOTE(review): M_START is not defined in this file (presumably it comes from
@// armCOMM_s.h); confirm the meaning of its r4 argument there.
334 M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
335 FFTSTAGE "FALSE","TRUE",inv