2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 // Use of this source code is governed by a BSD-style license
5 // that can be found in the LICENSE file in the root of the source
6 // tree. An additional intellectual property rights grant can be found
7 // in the file PATENTS. All contributing project authors may
8 // be found in the AUTHORS file in the root of the source tree.
10 // This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
11 // to support float instead of SC32.
16 // Compute a Radix 4 FFT stage for an N-point complex signal
21 // Include standard headers
23 #include "dl/api/arm/arm64COMM_s.h"
24 #include "dl/api/arm/omxtypes_s.h"
26 // Import symbols required from other files
27 // (For example tables)
32 // Set debugging level
33 //DEBUG_ON SETL {TRUE}
36 // Guarding implementation by the processor name
39 // Import symbols required from other files
40 // (For example tables)
41 //IMPORT armAAC_constTable
// Address of the sub-FFT size; FFTSTAGE loads subFFTSize from it on entry.
49 #define pSubFFTSize x4
56 //Local Scratch Registers
// Output stride: set to subFFTSize << 3 and used as the post-increment of
// the first three st2 stores of each group.
60 #define outPointStep x8
// Post-increment for the W1 twiddle reload; the group loop adds 16 per pass.
64 #define stepTwiddle x14
// v0-v3 receive the first ld4 from pSrc, v4-v7 the second.
// The 8b names are byte-lane views of v0-v3; they are the mov destination
// when a uzp1 result is copied back from dZip8b.
72 #define dButterfly1Real02 v0.2s
73 #define dButterfly1Real028b v0.8b
74 #define dButterfly1Imag02 v1.2s
75 #define dButterfly1Imag028b v1.8b
76 #define dButterfly1Real13 v2.2s
77 #define dButterfly1Real138b v2.8b
78 #define dButterfly1Imag13 v3.2s
79 #define dButterfly1Imag138b v3.8b
80 #define dButterfly2Real02 v4.2s
81 #define dButterfly2Imag02 v5.2s
82 #define dButterfly2Real13 v6.2s
83 #define dButterfly2Imag13 v7.2s
// Byte-lane (8b) views of v10, v12, v14, v15 and v24.
107 #define dW2r8b v10.8b
110 #define dW3r8b v12.8b
115 #define dZr08b v14.8b
116 #define dZi08b v15.8b
// Source operand of the mov copies that follow each uzp1/uzp2 pair.
125 #define dZip8b v24.8b
127 .macro FFTSTAGE scaled, inverse , name
129 // Define stack arguments
131 // Move args values into our work registers
132 ldr subFFTNum, [pSubFFTNum]
133 ldr subFFTSize, [pSubFFTSize]
135 // pOut0+1 increments pOut0 by 8 bytes
136 // pOut0+outPointStep == increment of 8*outPointStep bytes
137 lsl outPointStep,subFFTSize, #3
139 // Update grpCount and grpSize right away
141 ld2 {dW1r,dW1i},[pTwiddle] // [wi|wr]
143 LSL grpCount,subFFTSize,#2
145 ld1 {dW2r},[pTwiddle] // [wi|wr]
146 MOV subFFTNum,#1 //after the last stage
148 ld1 {dW3r},[pTwiddle],step16 // [wi|wr]
151 ld1 {dW2i},[pTwiddle],#8 // [wi|wr]
152 SUB grpTwStep,stepTwiddle,#8 // grpTwStep = -8 to start with
154 // update subFFTSize for the next stage
155 MOV subFFTSize,grpCount
156 ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
157 lsl dstStep,outPointStep, #1
159 // AC.r AC.i BD.r BD.i
160 ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
161 ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
163 rsb dstStep,dstStep,#16 // dstStep = - 3*outPointStep+16
166 // AC.r AC.i BD.r BD.i
167 ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
170 // Process two groups at a time
172 radix4lsGrpLoop\name :
175 zip1 dZip, dW2r, dW2i
176 zip2 dW2i, dW2r, dW2i
179 ADD stepTwiddle,stepTwiddle,#16
183 zip2 dW3i, dW3r, dW3i
185 ADD grpTwStep,stepTwiddle,#4
187 // VUZP dButterfly1Real13, dButterfly2Real13 // B.r D.r
188 uzp1 dZip, dButterfly1Real13, dButterfly2Real13 // B.r D.r
189 uzp2 dButterfly2Real13, dButterfly1Real13, dButterfly2Real13 // B.r D.r
190 mov dButterfly1Real138b, dZip8b
192 SUB twStep,stepTwiddle,#16 // -16+stepTwiddle
194 // VUZP dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
195 uzp1 dZip, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
196 uzp2 dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
197 mov dButterfly1Imag138b, dZip8b
198 lsl grpTwStep,grpTwStep,#1
200 // VUZP dButterfly1Real02, dButterfly2Real02 // A.r C.r
201 uzp1 dZip, dButterfly1Real02, dButterfly2Real02 // A.r C.r
202 uzp2 dButterfly2Real02, dButterfly1Real02, dButterfly2Real02 // A.r C.r
203 mov dButterfly1Real028b, dZip8b
204 rsb grpTwStep,grpTwStep,#0 // -8-2*stepTwiddle
206 // VUZP dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
207 uzp1 dZip, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
208 uzp2 dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
209 mov dButterfly1Imag028b, dZip8b
212 // grpCount is multiplied by 4
213 SUBS grpCount,grpCount,#8
215 .ifeqs "\inverse", "TRUE"
217 fmla dZr1,dW1i,dXi1 // real part
219 fmls dZi1,dW1i,dXr1 // imag part
224 fmls dZr1,dW1i,dXi1 // real part
226 fmla dZi1,dW1i,dXr1 // imag part
230 ld2 {dW1r,dW1i},[pTwiddle],stepTwiddle // [wi|wr]
232 .ifeqs "\inverse", "TRUE"
234 fmla dZr2,dW2i,dXi2 // real part
236 ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
237 fmls dZi2,dW2i,dXr2 // imag part
242 fmls dZr2,dW2i,dXi2 // real part
244 ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
245 fmla dZi2,dW2i,dXr2 // imag part
250 ld1 {dW2i},[pTwiddle],twStep // [wi|wr]
252 // move qX0 so as to load for the next iteration
257 .ifeqs "\inverse", "TRUE"
259 fmla dZr3,dW3i,dXi3 // real part
261 ld1 {dW3r},[pTwiddle],step24
262 fmls dZi3,dW3i,dXr3 // imag part
267 fmls dZr3,dW3i,dXi3 // real part
269 ld1 {dW3r},[pTwiddle],step24
270 fmla dZi3,dW3i,dXr3 // imag part
274 ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
276 // Don't do the load on the last iteration so we don't read past the end
278 bne skipIncrement\name
281 beq radix4lsSkipRead\name
282 // AC.r AC.i BD.r BD.i
283 ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
285 // AC.r AC.i BD.r BD.i
286 ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
287 radix4lsSkipRead\name:
289 // finish first stage of 4 point FFT
305 // finish second stage of 4 point FFT
307 .ifeqs "\inverse", "TRUE"
313 st2 {dZr0,dZi0},[pDst],outPointStep
320 st2 {dZr3,dZi3},[pDst],outPointStep
323 st2 {dZr2,dZi2},[pDst],outPointStep
326 // dstStep = -3*outPointStep + 16
327 st2 {dZr1,dZi1},[pDst],dstStep
337 st2 {dZr0,dZi0},[pDst],outPointStep
344 st2 {dZr1,dZi1},[pDst],outPointStep
347 st2 {dZr2,dZi2},[pDst],outPointStep
350 // dstStep = -3*outPointStep + 16
351 st2 {dZr3,dZi3},[pDst],dstStep
356 BGT radix4lsGrpLoop\name
361 M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
362 FFTSTAGE "FALSE","FALSE",fwd
366 M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
367 FFTSTAGE "FALSE","TRUE",inv