//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of
//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
//  instead of SC32.
//

//
// Description:
// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
//
// Target / syntax: AArch64 NEON, GNU assembler syntax, run through the C
// preprocessor (the register aliases below are plain #define macros).
//
// Register interface (taken from the aliases below):
//   In:     x0 = pSrc, x1 = pTwiddle, x2 = pOut, x3 = subFFTNum (N)
//   Writes: x0, x7-x13, v0-v11, v13, v19 and the condition flags.
//   Output: every store goes through pOut1, which starts at
//           pOut + 4*subFFTNum bytes.
//   NOTE(review): v8-v11 and v13 lie in the AAPCS64 callee-saved range
//   v8-v15.  M_START is given "d15", presumably so that the prologue and
//   epilogue macros save and restore them - confirm in arm64COMM_s.h.
//
//


// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

// Import symbols required from other files
// (For example tables)

// Set debugging level
//DEBUG_ON    SETL {TRUE}

// Guarding implementation by the processor name

// Guarding implementation by the processor name

//Input Registers

#define pSrc x0
#define pTwiddle x1
#define pOut x2
#define subFFTNum x3

// Output registers

//Local Scratch Registers
// Note: subFFTSize and size are both x7, and N is just another name for
// subFFTNum.  argTwiddle, argDst and subFFTSize are not referenced by the
// code in this file.

#define argTwiddle x5
#define argDst x6
#define subFFTSize x7
#define N subFFTNum
#define pOut1 x13
#define size x7
#define step x8
#define step1 x9
#define twStep x10
#define pTwiddleTmp x11
#define argTwiddle1 x12

// Neon registers
// Several aliases deliberately name the same physical register (for example
// dX0/dX0r = v0, dY0/dX1r = v2, dW0r/dY0r/dY2 = v4), so the live range of
// each alias has to be checked before instructions are moved.
// The "s" suffixed aliases (v0.s, v1.s) are used for single-lane stores and
// the "8b" aliases for whole-register (64-bit) moves.

#define dX0 v0.2s
#define dX0s v0.s
#define dShift v1.2s
#define dX1 v1.2s
#define dX1s v1.s
#define dY0 v2.2s
#define dY08b v2.8b
#define dY1 v3.2s
#define dX0r v0.2s
#define dX0rs v0.s
#define dX0i v1.2s
#define dX1r v2.2s
#define dX1i v3.2s
#define dW0r v4.2s
#define dW0r8b v4.8b
#define dW0i v5.2s
#define dW1r v6.2s
#define dW1r8b v6.8b
#define dW1i v7.2s
#define dT0 v8.2s
#define dT1 v9.2s
#define dT2 v10.2s
#define dT3 v11.2s
#define qT0 v12.2s
#define qT1 v14.2s
#define qT2 v16.2s
#define qT3 v18.2s
#define dY0r v4.2s
#define dY0i v5.2s
#define dY1r v6.2s
#define dY1i v7.2s
#define dY2 v4.2s
#define dY3 v5.2s
#define dW0 v6.2s
#define dW1 v7.2s
#define dW0Tmp v10.2s
#define dW1Neg v11.2s
#define dZip v19.2s
#define dZip8b v19.8b
#define half v13.2s

        // FFTSTAGE scaled, inverse, name
        //
        // Emits the complete pre-twiddle stage.  The "scaled" and "inverse"
        // parameters are not referenced anywhere in the body; "name" is only
        // appended to the local labels so that they are unique per expansion.
        //
        // Structure:
        //   1. Z(0) from F(0) and F(N/2), no twiddle multiply.
        //   2. evenOddButterflyLoop: four outputs per iteration, two from the
        //      low end and two from the high end of the remaining range.
        //   3. lastElement: the single middle element, stored as (a, -b).
        .macro FFTSTAGE scaled, inverse, name

        // Both lanes of "half" hold 0.5; it is the 1/2 factor of the formula.
        fmov half, 0.5

        asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum
        lsl step, subFFTNum, #2 // step = N/2 * 8 bytes

        // Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
        // Note: W^(k) is stored as negated value and also need to
        // conjugate the values from the table

        // Z(0) : no need of twiddle multiply
        // Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }

        // dX0 = (a, b) is read from pSrc and dX1 = (c, d) from pSrc + step.
        // After the two post-indexed loads pSrc = original pSrc + step + 8.
        ld1 {dX0},[pSrc],step
        ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes
        ld1 {dX1},[pSrc], #8
        // twStep = 3N/8 * 8 bytes pointing to W^1
        SUB twStep,step,size,LSL #1
        lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes
        SUB step1,step1,#8 // (N/4-1)*8 bytes

        fadd dY0,dX0,dX1 // [b+d | a+c]
        fsub dY1,dX0,dX1 // [b-d | a-c]
        fmul dY0, dY0, half[0]
        fmul dY1, dY1, half[0]

        // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
        // VZIP dY0,dY1
        // The zip1/zip2/mov triple below stands in for the in-place VZIP
        // named above: dZip keeps the zip1 result until zip2 has read the
        // original dY0, then it is copied into dY0.
        zip1 dZip,dY0,dY1
        zip2 dY1,dY0,dY1
        mov dY08b, dZip8b

        fsub dX0,dY0,dY1
        // size = N/2 - 2.  SUBS is the only flag-setting instruction before
        // the BLT/BEQ pair below, so those branches test this result.
        SUBS size,size,#2
        fadd dX1,dY0,dY1

        // Undo the first post-increment: pSrc = original pSrc + 8.
        SUB pSrc,pSrc,step
        // Z(0) is written as lane 0 of the difference followed by lane 1 of
        // the sum (two 4-byte stores, pOut1 advances by 8 in total).
        st1 {dX0s}[0],[pOut1], #4
        ADD pTwiddleTmp,pTwiddle,#8 // W^2
        st1 {dX1s}[1],[pOut1], #4
        ADD argTwiddle1,pTwiddle,twStep // W^1

        // N/2 < 2: nothing more to do.  N/2 == 2: only the last element.
        BLT decrementScale\name
        BEQ lastElement\name

        // Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
        // Note: W^k is stored as negative values in the table and also
        // need to conjugate the values from the table.
        //
        // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)

        // step becomes the byte distance from the low pair to the high pair.
        SUB step,step,#24

evenOddButterflyLoop\name :
        // Twiddle loads through argTwiddle1: 8 bytes at the current position
        // (dW0r) and 8 bytes at +step1 (dW1r).  Together with the SUB a few
        // lines down, argTwiddle1 advances by a net 8 bytes per iteration.
        ld1 {dW0r},[argTwiddle1],step1
        ld1 {dW1r},[argTwiddle1], #8

        // ld2 de-interleaves two consecutive (re, im) pairs: dX0r/dX0i come
        // from the low end, dX1r/dX1i from the high end (pSrc + step).
        ld2 {dX0r,dX0i},[pSrc],step
        SUB argTwiddle1,argTwiddle1,step1
        ld2 {dX1r,dX1i},[pSrc], #16

        SUB step1,step1,#8 // (N/4-2)*8 bytes
        // Same pattern through pTwiddleTmp, using the reduced step1; the
        // pointer advances by a net 8 bytes per iteration.
        ld1 {dW0i},[pTwiddleTmp],step1
        ld1 {dW1i},[pTwiddleTmp], #8
        // Net effect on pSrc for this iteration: +16 bytes.
        SUB pSrc,pSrc,step

        SUB pTwiddleTmp,pTwiddleTmp,step1
        // Swap the two lanes of the high-end data so that each lane pairs
        // element k with its mirror element.
        rev64 dX1r,dX1r
        rev64 dX1i,dX1i
        // Loop counter; these flags are consumed by the BGT at the bottom
        // (no instruction in between sets flags).
        SUBS size,size,#4

        fsub dT2,dX0r,dX1r // a-c
        fadd dT3,dX0i,dX1i // b+d
        fadd dT0,dX0r,dX1r // a+c
        fsub dT1,dX0i,dX1i // b-d
        // Second decrement: step1 shrinks by 16 bytes per iteration overall.
        SUB step1,step1,#8

        fmul dT2, dT2, half[0]
        fmul dT3, dT3, half[0]
        fmul dT0, dT0, half[0]
        fmul dT1, dT1, half[0]

        // VZIP dW1r,dW1i
        // VZIP dW0r,dW0i
        // As for Z(0): zip1/zip2/mov with dZip as temporary replaces each
        // in-place VZIP.
        zip1 dZip, dW1r,dW1i
        zip2 dW1i,dW1r,dW1i
        mov dW1r8b, dZip8b
        zip1 dZip,dW0r,dW0i
        zip2 dW0i,dW0r,dW0i
        mov dW0r8b, dZip8b

        // Products of the twiddles with the halved terms (dT2, dT3).  Each
        // fmul starts a result and the matching fmla/fmls completes it; the
        // W1 and W0 groups use opposite accumulate signs.
        fmul dX1r,dW1r,dT2
        fmul dX1i,dW1r,dT3
        fmul dX0r,dW0r,dT2
        fmul dX0i,dW0r,dT3

        fmls dX1r,dW1i,dT3
        fmla dX1i,dW1i,dT2

        fmla dX0r,dW0i,dT3
        fmls dX0i,dW0i,dT2

        // dY0r/dY0i/dY1r/dY1i reuse v4-v7; the twiddles held there are not
        // read again in this iteration.
        fadd dY1r,dT0,dX1i // F(N/2 -1)
        fsub dY1i,dX1r,dT1
        // Restore memory order for the high-end pair before storing it.
        rev64 dY1r,dY1r
        rev64 dY1i,dY1i

        fadd dY0r,dT0,dX0i // F(1)
        fsub dY0i,dT1,dX0r

        // st2 re-interleaves (re, im): low pair at pOut1, high pair at
        // pOut1 + step.  Net effect on pOut1 for this iteration: +16 bytes.
        st2 {dY0r,dY0i},[pOut1],step
        st2 {dY1r,dY1i},[pOut1], #16
        SUB pOut1,pOut1,step
        SUB step,step,#32 // (N/2-4)*8 bytes

        // Repeat while the counter from SUBS size,size,#4 is still positive.
        BGT evenOddButterflyLoop\name

        // set both the ptrs to the last element
        SUB pSrc,pSrc,#8
        SUB pOut1,pOut1,#8

        // Last element can be expanded as follows
        // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
        // -ve)
        // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        // 1/2[2a+j0] - j (c-jd) [0+j2b]
        // (a+bc, -bd)
        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
lastElement\name :
        // Store lane 0 unchanged, negate the register, then store lane 1.
        ld1 {dX0r},[pSrc]
        st1 {dX0rs}[0],[pOut1], #4
        fneg dX0r,dX0r
        st1 {dX0rs}[1],[pOut1]

        // Common exit label; no instructions follow it inside the macro.
decrementScale\name :

        .endm

        // Entry point.  M_START and M_END are not defined in this file
        // (presumably they come from arm64COMM_s.h and emit the function
        // prologue and epilogue - confirm there).
        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
        FFTSTAGE "FALSE","TRUE",Inv
        M_END

        .end