2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 // Use of this source code is governed by a BSD-style license
5 // that can be found in the LICENSE file in the root of the source
6 // tree. An additional intellectual property rights grant can be found
7 // in the file PATENTS. All contributing project authors may
8 // be found in the AUTHORS file in the root of the source tree.
10 // This is a modification of
11 // armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
17 // Computes the "preTwiddleRadix2" stage that runs prior to the call to the complex FFT.
18 // It evaluates Z(k) = Feven(k) + j*W^(-k)*Fodd(k) for k = 0, 1, 2, ..., N/2-1.
23 // Include standard headers
25 #include "dl/api/arm/arm64COMM_s.h"
26 #include "dl/api/arm/omxtypes_s.h"
29 // Import symbols required from other files
30 // (For example tables)
33 // Set debugging level
34 //DEBUG_ON SETL {TRUE}
38 // Guarding implementation by the processor name
42 // Guarding implementation by the processor name
55 //Local Scratch Registers
68 #define pTwiddleTmp x11
69 #define argTwiddle1 x12
109 #define dW0Tmp v10.2s
110 #define dW1Neg v11.2s
113 #define dZip8b v19.8b
116 .macro FFTSTAGE scaled, inverse, name
120 asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum
121 lsl step, subFFTNum, #2 // step = N/2 * 8 bytes
124 // Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
125 // Note: W^(k) is stored as negated value and also need to
126 // conjugate the values from the table
128 // Z(0) : no need of twiddle multiply
129 // Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
131 ld1 {dX0},[pSrc],step
132 ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes
135 // twStep = 3N/8 * 8 bytes pointing to W^1
136 SUB twStep,step,size,LSL #1
138 lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes
139 SUB step1,step1,#8 // (N/4-1)*8 bytes
141 fadd dY0,dX0,dX1 // [b+d | a+c]
142 fsub dY1,dX0,dX1 // [b-d | a-c]
143 fmul dY0, dY0, half[0]
144 fmul dY1, dY1, half[0]
146 // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
158 st1 {dX0s}[0],[pOut1], #4
159 ADD pTwiddleTmp,pTwiddle,#8 // W^2
160 st1 {dX1s}[1],[pOut1], #4
161 ADD argTwiddle1,pTwiddle,twStep // W^1
164 BLT decrementScale\name
168 // Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
169 // Note: W^k is stored as negative values in the table and also
170 // need to conjugate the values from the table.
172 // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
173 // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
177 evenOddButterflyLoop\name :
180 ld1 {dW0r},[argTwiddle1],step1
181 ld1 {dW1r},[argTwiddle1], #8
183 ld2 {dX0r,dX0i},[pSrc],step
184 SUB argTwiddle1,argTwiddle1,step1
185 ld2 {dX1r,dX1i},[pSrc], #16
187 SUB step1,step1,#8 // (N/4-2)*8 bytes
188 ld1 {dW0i},[pTwiddleTmp],step1
189 ld1 {dW1i},[pTwiddleTmp], #8
192 SUB pTwiddleTmp,pTwiddleTmp,step1
198 fsub dT2,dX0r,dX1r // a-c
199 fadd dT3,dX0i,dX1i // b+d
200 fadd dT0,dX0r,dX1r // a+c
201 fsub dT1,dX0i,dX1i // b-d
204 fmul dT2, dT2, half[0]
205 fmul dT3, dT3, half[0]
207 fmul dT0, dT0, half[0]
208 fmul dT1, dT1, half[0]
231 fadd dY1r,dT0,dX1i // F(N/2 -1)
238 fadd dY0r,dT0,dX0i // F(1)
242 st2 {dY0r,dY0i},[pOut1],step
243 st2 {dY1r,dY1i},[pOut1], #16
245 SUB step,step,#32 // (N/2-4)*8 bytes
248 BGT evenOddButterflyLoop\name
251 // set both the ptrs to the last element
255 // Last element can be expanded as follows
256 // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
258 // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
259 // 1/2[2a+j0] - j (c-jd) [0+j2b]
261 // Since (c,d) = (0,1) for the last element, result is just (a,-b)
266 st1 {dX0rs}[0],[pOut1], #4
268 st1 {dX0rs}[1],[pOut1]
272 decrementScale\name :
276 M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
277 FFTSTAGE "FALSE","TRUE",Inv