2 @ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
4 @ Use of this source code is governed by a BSD-style license
5 @ that can be found in the LICENSE file in the root of the source
6 @ tree. An additional intellectual property rights grant can be found
7 @ in the file PATENTS. All contributing project authors may
8 @ be found in the AUTHORS file in the root of the source tree.
10 @ Some code in this file was originally from file
11 @ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
12 @ follows. It has been relicensed with permission from the copyright holders.
17 @ Last Modified Revision: 7485
18 @ Last Modified Date: Fri, 21 Sep 2007
20 @ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
25 @ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
26 @ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
27 @ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
31 #include "dl/api/arm/armCOMM_s.h"
32 #include "dl/api/arm/omxtypes_s.h"
43 @//Local Scratch Registers
55 @ Total number of radix stages to complete the FFT.
@ Register aliases (only a subset is visible in this excerpt).
@ These are C preprocessor macros, so comments are kept on separate lines:
@ a trailing comment on a define line would become part of the expansion.
@
@ ARM core register aliases. pTwiddleTmp and argTwiddle1 are used as
@ twiddle-table pointers inside FFTSTAGE.
59 #define diffMinusOne r2
67 #define pTwiddleTmp r11
68 #define argTwiddle1 r12
@ NEON D-register aliases that carry an explicit .S32 element type.
80 #define dX0rS32 D0.S32
89 #define dW0rS32 D4.S32
90 #define dW0iS32 D5.S32
91 #define dW1rS32 D6.S32
92 #define dW1iS32 D7.S32
@ NEON D-register aliases that carry an explicit .S16 element type.
110 #define dW0Tmp D10.S16
111 #define dW1Neg D11.S16
113 @ Structure offsets for the FFTSpec
@ Byte offsets used with [pFFTSpec, #offset] addressing in FFTSTAGE.
114 .set ARMsFFTSpec_N, 0 @ FFT size, loaded into N
115 .set ARMsFFTSpec_pBitRev, 4 @ not referenced in the visible part of this file
116 .set ARMsFFTSpec_pTwiddle, 8 @ twiddle table pointer, loaded into pTwiddle
117 .set ARMsFFTSpec_pBuf, 12 @ buffer pointer, loaded into pOut
@ FFTSTAGE scaled, inverse, name
@ Body of the preTwiddleRadix2 stage, expanded once per entry point.
@   scaled  - when it equals TRUE the .ifeqs blocks assemble the halving
@             VHADD forms; the VADD lines that follow them are presumably
@             the alternative branch (the .else/.endif lines are not visible
@             in this excerpt).
@   inverse - not referenced in the visible lines of this excerpt.
@   name    - suffix appended to every local label so that each expansion
@             gets its own set of labels.
@ Reads N, pTwiddle and pBuf from the structure addressed by pFFTSpec, reads
@ input through pSrc and writes results through pOut1, which is derived from
@ the pBuf pointer.
@ NOTE(review): several labels, flag-setting instructions and the closing
@ .endm are not visible in this excerpt; comments below say so where relevant.
119 .MACRO FFTSTAGE scaled, inverse, name
121 @ Read the size from structure and take log
122 LDR N, [pFFTSpec, #ARMsFFTSpec_N] @ N = word at pFFTSpec + ARMsFFTSpec_N
124 @ Read other structure parameters
125 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] @ twiddle table base
126 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] @ output base = spec buffer pointer
128 MOV size,N,ASR #1 @ size = N/2; preserve the contents of N
129 MOV step,N,LSL #1 @ step = N/2 * 4 bytes
131 @ Process different FFT sizes with different loops.
133 BLE smallFFTSize\name @ the compare that sets the flags is not visible in this excerpt
135 @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
136 @ Note: W^(k) is stored as negated value and also need to
137 @ conjugate the values from the table.
139 @ Z(0) : no need of twiddle multiply
140 @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
142 VLD1 dX0S32[0],[pSrc],step @ load lane 0, then pSrc += step
143 ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
145 VLD1 dX1S32[0],[pSrc]! @ load lane 0, pSrc advances past the element
146 SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
148 MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
149 SUB step1,step1,#4 @ (N/4-1)*4 bytes
151 VHADD dY0,dX0,dX1 @ [b+d | a+c]
152 VHSUB dY1,dX0,dX1 @ [b-d | a-c]
153 VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
155 .ifeqs "\scaled", "TRUE"
167 ADD pTwiddleTmp,pTwiddle,#4 @ W^2
169 ADD argTwiddle1,pTwiddle,twStep @ W^1
171 BLT decrementScale\name @ flag-setting instruction and target label are not visible in this excerpt
175 SUB step1,step1,#4 @ (N/4-1)*8 bytes -- NOTE(review): step1 was (N/4-1)*4 above, so this looks like (N/4-2)*4 bytes; confirm
178 @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
179 @ Note: W^k is stored as negative values in the table and also need to
180 @ conjugate the values from the table.
181 @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
182 @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
184 evenOddButterflyLoop\name:
185 VLD2 {dX0r,dX0i},[pSrc],step @ de-interleaving load, then pSrc += step
186 VLD2 {dX1r,dX1i},[pSrc]! @ de-interleaving load, pSrc += bytes loaded
189 VLD1 dW0r,[argTwiddle1],step1 @ load twiddles, then argTwiddle1 += step1
191 VLD1 dW1r,[argTwiddle1]! @ load twiddles, argTwiddle1 += bytes loaded
192 VHSUB dT2,dX0r,dX1r @ a-c
193 SUB argTwiddle1, argTwiddle1, step1 @ undo the +step1 post-increment
196 VLD1 dW0i,[pTwiddleTmp],step2 @ load twiddles, then pTwiddleTmp += step2
197 VHADD dT3,dX0i,dX1i @ b+d
198 VLD1 dW1i,[pTwiddleTmp]! @ load twiddles, pTwiddleTmp += bytes loaded
199 VHADD dT0,dX0r,dX1r @ a+c
200 VHSUB dT1,dX0i,dX1i @ b-d
201 SUB pTwiddleTmp, pTwiddleTmp, step2 @ undo the +step2 post-increment
208 VZIP dW1iS32, dW1rS32 @ interleave the 32-bit elements of D7 and D6
@ Scaled expansions use the halving add; the VADD line is presumably the
@ alternative branch (its .else/.endif are not visible in this excerpt).
224 .ifeqs "\scaled", "TRUE"
225 VHADD dY1r,dT0,dX1i @ F(N/2 -1)
228 VADD dY1r,dT0,dX1i @ F(N/2 -1)
232 .ifeqs "\scaled", "TRUE"
233 VHADD dY0r,dT0,dX0i @ F(1)
236 VADD dY0r,dT0,dX0i @ F(1)
242 VST2 {dY0r,dY0i},[pOut1],step @ interleaving store, then pOut1 += step
243 VST2 {dY1r,dY1i},[pOut1] @ interleaving store, no writeback
245 SUB pOut1, pOut1, step @ undo the +step post-increment
248 BGT evenOddButterflyLoop\name @ loop-counter update that sets the flags is not visible in this excerpt
250 SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
@ NOTE(review): the lines below repeat the Z(0) setup; presumably this is the
@ small-size path reached through the smallFFTSize label, whose definition is
@ not visible in this excerpt -- confirm against the full file.
255 @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
256 @ Note: W^(k) is stored as negated value and also need to
257 @ conjugate the values from the table.
259 @ Z(0) : no need of twiddle multiply
260 @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
262 VLD1 dX0S32[0],[pSrc],step @ load lane 0, then pSrc += step
263 ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
265 VLD1 dX1S32[0],[pSrc]! @ load lane 0, pSrc advances past the element
266 SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
268 MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
269 SUB step1,step1,#4 @ (N/4-1)*4 bytes
271 VHADD dY0,dX0,dX1 @ [b+d | a+c]
272 VHSUB dY1,dX0,dX1 @ [b-d | a-c]
273 VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
275 .ifeqs "\scaled", "TRUE"
287 ADD pTwiddleTmp,pTwiddle,#4 @ W^2
289 ADD argTwiddle1,pTwiddle,twStep @ W^1
291 BLT decrementScale\name @ flag-setting instruction and target label are not visible in this excerpt
294 @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
295 @ Note: W^k is stored as negative values in the table and also need to
296 @ conjugate the values from the table.
297 @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
298 @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
302 evenOddButterflyLoopSize4\name:
303 VLD1 dW0rS32[0],[argTwiddle1],step1 @ load lane 0 of D4, then argTwiddle1 += step1
304 VLD1 dW1rS32[0],[argTwiddle1]! @ load lane 0 of D6, argTwiddle1 += bytes loaded
306 VLD2 {dX0r[0],dX0i[0]},[pSrc]! @ lane 0 of each register from one 2-element structure
307 VLD2 {dX0r[1],dX0i[1]},[pSrc],step @ lane 1, then pSrc += step
309 SUB argTwiddle1,argTwiddle1,step1 @ undo the +step1 post-increment
310 VLD2 {dX1r[0],dX1i[0]},[pSrc]! @ lane 0 of each register from one 2-element structure
311 VLD2 {dX1r[1],dX1i[1]},[pSrc]! @ lane 1, pSrc += bytes loaded
313 SUB step1,step1,#4 @ (N/4-2)*4 bytes
314 VLD1 dW0iS32[0],[pTwiddleTmp],step1 @ load lane 0 of D5, then pTwiddleTmp += step1
315 VLD1 dW1iS32[0],[pTwiddleTmp]! @ load lane 0 of D7, pTwiddleTmp += bytes loaded
318 SUB pTwiddleTmp,pTwiddleTmp,step1 @ undo the +step1 post-increment
323 VHSUB dT2,dX0r,dX1r @ a-c
324 VHADD dT3,dX0i,dX1i @ b+d
326 VHADD dT0,dX0r,dX1r @ a+c
327 VHSUB dT1,dX0i,dX1i @ b-d
344 .ifeqs "\scaled", "TRUE"
345 VHADD dY1r,dT0,dX1i @ F(N/2 -1)
348 VADD dY1r,dT0,dX1i @ F(N/2 -1)
358 .ifeqs "\scaled", "TRUE"
359 VHADD dY0r,dT0,dX0i @ F(1)
362 VADD dY0r,dT0,dX0i @ F(1)
366 VST2 {dY0r[0],dY0i[0]},[pOut1]! @ store lane 0 pair, pOut1 += bytes stored
367 VST2 {dY0r[1],dY0i[1]},[pOut1],step @ store lane 1 pair, then pOut1 += step
369 VST2 {dY1r[0],dY1i[0]},[pOut1]! @ store lane 0 pair, pOut1 += bytes stored
370 VST2 {dY1r[1],dY1i[1]},[pOut1]! @ store lane 1 pair, pOut1 += bytes stored
372 SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
375 @ Last element can be expanded as follows
376 @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
377 @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
378 @ 1/2[2a+j0] - j (c-jd) [0+j2b]
380 @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
383 VLD1 dX0rS32[0],[pSrc] @ load the last element into lane 0 of D0, no writeback
385 .ifeqs "\scaled", "TRUE"
389 VST1 dX0r[0],[pOut1]! @ store lane 0 of dX0r, pOut1 += bytes stored
394 .ifeqs "\scaled", "TRUE"
@ Entry point for the unscaled variant of the pre-twiddle stage.
@ Expands FFTSTAGE with scaled=FALSE, inverse=TRUE and label suffix Inv.
@ NOTE(review): M_START is presumably provided by the included armCOMM_s.h;
@ the meaning of its r4 argument and the matching end-of-function macro are
@ not visible in this excerpt -- confirm there.
400 M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
401 FFTSTAGE "FALSE","TRUE",Inv
@ Entry point for the scaled (by 1/2) variant of the pre-twiddle stage.
@ Expands FFTSTAGE with scaled=TRUE, inverse=TRUE and label suffix InvSfs,
@ so its local labels do not clash with those of the other expansion.
@ NOTE(review): M_START is presumably provided by the included armCOMM_s.h;
@ the meaning of its r4 argument and the matching end-of-function macro are
@ not visible in this excerpt -- confirm there.
404 M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
405 FFTSTAGE "TRUE","TRUE",InvSfs