src/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S

   1 //
   2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
   3 //
   4 //  Use of this source code is governed by a BSD-style license
   5 //  that can be found in the LICENSE file in the root of the source
   6 //  tree. An additional intellectual property rights grant can be found
   7 //  in the file PATENTS.  All contributing project authors may
   8 //  be found in the AUTHORS file in the root of the source tree.
   9 //
  10 //  This is a modification of
  11 //  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
  12 //  instead of SC32.
  13 //
  14
  15 //
  16 // Description:
  17 // Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
  18 // It computes Z(k) = Feven(k) + jW^(-k) FOdd(k) for k = 0, 1, 2, ..., N/2-1.
  19 //
  20 //
  21
  22
  23 // Include standard headers
  24
  25 #include "dl/api/arm/arm64COMM_s.h"
  26 #include "dl/api/arm/omxtypes_s.h"
  27
  28
  29 // Import symbols required from other files
  30 // (For example tables)
  31
  32
  33 // Set debugging level
  34 //DEBUG_ON    SETL {TRUE}
  35
  36
  37
  38 // Guarding implementation by the processor name
  39
  40
  41
  42       // Guarding implementation by the processor name
  43
  44
  45
  46 // Input registers
     // All four are read by the FFTSTAGE macro in this file. pSrc is also
     // advanced and rewound by FFTSTAGE; the other three are only read.
  47
  48 #define pSrc            x0
  49 #define pTwiddle        x1
  50 #define pOut            x2
  51 #define subFFTNum       x3
  52
  53 // Output registers
     // (no output register names are defined in this file)
  54
  55 // Local scratch registers
     // argTwiddle, argDst, subFFTSize and N are defined here but are not
     // referenced by the FFTSTAGE macro in this file. subFFTSize and size
     // (defined further down) both name x7.
  56
  57 #define argTwiddle      x5
  58 #define argDst          x6
  59 #define subFFTSize      x7
  60 #define N               subFFTNum
  61 // Running output pointer used by FFTSTAGE for every store.
  62 #define pOut1           x13
  63 // size: element counter tested with SUBS; step/step1/twStep: byte offsets;
     // pTwiddleTmp/argTwiddle1: cursors into the twiddle table.
  64 #define size            x7
  65 #define step            x8
  66 #define step1           x9
  67 #define twStep          x10
  68 #define pTwiddleTmp     x11
  69 #define argTwiddle1     x12
  70
  71 // Neon registers
     // Several names below map to the same physical register (for example
     // dX1, dX0i and dShift are all v1; dX1r and dY0 are both v2; dW0r, dY0r
     // and dY2 are all v4), so a write through one name overwrites the value
     // seen through its aliases.
     // Names ending in "s" (v0.s, v1.s) are the single-lane views used by the
     // lane stores; names ending in "8b" are byte views used for register moves.
     // dShift and qT0..qT3 are not referenced by FFTSTAGE in this file; note
     // that the qT names are defined as 2s arrangements despite the q prefix.
  72
  73 #define dX0     v0.2s
  74 #define dX0s    v0.s
  75 #define dShift  v1.2s
  76 #define dX1     v1.2s
  77 #define dX1s    v1.s
  78 #define dY0     v2.2s
  79 #define dY08b   v2.8b
  80 #define dY1     v3.2s
  81 #define dX0r    v0.2s
  82 #define dX0rs   v0.s
  83 #define dX0i    v1.2s
  84 #define dX1r    v2.2s
  85 #define dX1i    v3.2s
  86 #define dW0r    v4.2s
  87 #define dW0r8b  v4.8b
  88 #define dW0i    v5.2s
  89 #define dW1r    v6.2s
  90 #define dW1r8b  v6.8b
  91 #define dW1i    v7.2s
  92 #define dT0     v8.2s
  93 #define dT1     v9.2s
  94 #define dT2     v10.2s
  95 #define dT3     v11.2s
  96 #define qT0     v12.2s
  97 #define qT1     v14.2s
  98 #define qT2     v16.2s
  99 #define qT3     v18.2s
 100 #define dY0r    v4.2s
 101 #define dY0i    v5.2s
 102 #define dY1r    v6.2s
 103 #define dY1i    v7.2s
 104 // The next six names are not referenced by FFTSTAGE in this file.
 105 #define dY2     v4.2s
 106 #define dY3     v5.2s
 107 #define dW0     v6.2s
 108 #define dW1     v7.2s
 109 #define dW0Tmp  v10.2s
 110 #define dW1Neg  v11.2s
 111 // dZip: temporary for the zip1/zip2 sequences; half: holds the 0.5 factor.
 112 #define dZip    v19.2s
 113 #define dZip8b  v19.8b
 114 #define half    v13.2s
 115 // FFTSTAGE scaled, inverse, name
     //   Emits the pre-twiddle stage described at the top of this file.
     //   Macro arguments:
     //     scaled, inverse - accepted but not referenced anywhere in the body.
     //     name            - suffix pasted onto every label so that each
     //                       expansion gets its own label names.
     //   Registers read:    pSrc, pTwiddle, pOut, subFFTNum.
     //   Registers written: pSrc, x7-x13, v0-v11, v13, v19 and the condition
     //                      flags. pTwiddle, pOut and subFFTNum are only read.
     //   All results are stored through pOut1, which starts at pOut + N*4 bytes.
 116         .macro FFTSTAGE scaled, inverse, name
 117
 118         fmov    half, 0.5                     // factor for the 1/2 in Z(k)
 119
 120         asr     size, subFFTNum, #1           // size = N/2; N (subFFTNum) itself is preserved
 121         lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes
 122
 123
 124         // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
 125         // Note: W^(k) is stored as negated value and also need to
 126         // conjugate the values from the table
 127
 128         // Z(0) : no need of twiddle multiply
 129         // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
 130
 131         ld1     {dX0},[pSrc],step             // dX0 = input element 0; pSrc += step
 132         ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes
 133
 134         ld1     {dX1},[pSrc], #8              // dX1 = input element N/2; pSrc += 8
 135         // twStep = 3N/8 * 8 bytes pointing to W^1
 136         SUB     twStep,step,size,LSL #1
 137
 138         lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
 139         SUB     step1,step1,#8                // (N/4-1)*8 bytes
 140
 141         fadd    dY0,dX0,dX1                   // [b+d | a+c]
 142         fsub    dY1,dX0,dX1                   // [b-d | a-c]
 143         fmul    dY0, dY0, half[0]             // scale both lanes by 0.5
 144         fmul    dY1, dY1, half[0]
 145
 146         // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
 147         // VZIP    dY0,dY1
             // Two-register interleave built from zip1/zip2. dZip holds the zip1
             // result because zip2 still has to read the original dY0.
 148         zip1    dZip,dY0,dY1
 149         zip2    dY1,dY0,dY1
 150         mov     dY08b, dZip8b
 151
 152         fsub   dX0,dY0,dY1
 153         SUBS   size,size,#2                   // size = N/2 - 2; flags feed the BLT/BEQ below
 154         fadd   dX1,dY0,dY1
 155
 156         SUB     pSrc,pSrc,step                // pSrc = original pSrc + 8 (input element 1)
 157
             // Z(0) is written as lane 0 of dX0 followed by lane 1 of dX1.
 158         st1     {dX0s}[0],[pOut1], #4
 159         ADD     pTwiddleTmp,pTwiddle,#8       // W^2
 160         st1     {dX1s}[1],[pOut1], #4
 161         ADD     argTwiddle1,pTwiddle,twStep   // W^1
 162
 163         // Flags are still those of the SUBS above: nothing in between sets them.
 164         BLT     decrementScale\name
 165         BEQ     lastElement\name
 166
 167
 168         // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
 169         // Note: W^k is stored as negative values in the table and also
 170         // need to conjugate the values from the table.
 171         //
 172         // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
 173         // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
 174
 175         // step = N*4 - 24: byte distance from element 1 to element N/2-2.
 176         SUB     step,step,#24
 177 evenOddButterflyLoop\name :
 178
 179         // Two twiddle loads step1 bytes apart; the cursor ends up 8 bytes further on.
 180         ld1     {dW0r},[argTwiddle1],step1
 181         ld1     {dW1r},[argTwiddle1], #8
 182
 183         ld2     {dX0r,dX0i},[pSrc],step       // low pair, de-interleaved; pSrc jumps to the mirrored pair
 184         SUB     argTwiddle1,argTwiddle1,step1
 185         ld2     {dX1r,dX1i},[pSrc], #16       // mirrored pair, de-interleaved
 186
 187         SUB     step1,step1,#8                // (N/4-2)*8 bytes
 188         ld1     {dW0i},[pTwiddleTmp],step1
 189         ld1     {dW1i},[pTwiddleTmp], #8
 190         SUB     pSrc,pSrc,step                // net effect: pSrc advanced by 16 bytes
 191
 192         SUB     pTwiddleTmp,pTwiddleTmp,step1 // net effect: pTwiddleTmp advanced by 8 bytes
 193         rev64   dX1r,dX1r                     // swap the two lanes of the mirrored pair
 194         rev64   dX1i,dX1i
 195         SUBS    size,size,#4                  // flags feed the BGT at the bottom of the loop
 196
 197
 198         fsub    dT2,dX0r,dX1r                 // a-c
 199         fadd    dT3,dX0i,dX1i                 // b+d
 200         fadd    dT0,dX0r,dX1r                 // a+c
 201         fsub    dT1,dX0i,dX1i                 // b-d
 202         SUB     step1,step1,#8                // step1 drops by 16 in total per iteration
 203
 204         fmul    dT2, dT2, half[0]
 205         fmul    dT3, dT3, half[0]
 206
 207         fmul    dT0, dT0, half[0]
 208         fmul    dT1, dT1, half[0]
 209
 210         // VZIP    dW1r,dW1i
 211         // VZIP    dW0r,dW0i
             // Same zip1/zip2/mov interleave as above, again with dZip as the temporary.
 212         zip1    dZip, dW1r,dW1i
 213         zip2    dW1i,dW1r,dW1i
 214         mov     dW1r8b, dZip8b
 215         zip1    dZip,dW0r,dW0i
 216         zip2    dW0i,dW0r,dW0i
 217         mov     dW0r8b, dZip8b
 218
             // From here dX0r/dX0i/dX1r/dX1i are reused for the twiddle products;
             // the loaded inputs they held are no longer needed.
 219         fmul   dX1r,dW1r,dT2
 220         fmul   dX1i,dW1r,dT3
 221         fmul   dX0r,dW0r,dT2
 222         fmul   dX0i,dW0r,dT3
 223
 224         fmls   dX1r,dW1i,dT3
 225         fmla   dX1i,dW1i,dT2
 226
 227         fmla   dX0r,dW0i,dT3
 228         fmls   dX0i,dW0i,dT2
 229
 230         // dY0r/dY0i/dY1r/dY1i share v4-v7 with the twiddle names read just above.
 231         fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
 232         fsub    dY1i,dX1r,dT1
 233
 234         rev64   dY1r,dY1r                     // swap lanes back before the mirrored store
 235         rev64   dY1i,dY1i
 236
 237
 238         fadd    dY0r,dT0,dX0i                 // F(1)
 239         fsub    dY0i,dT1,dX0r
 240
 241         // Stores use the same step / #16 / rewind pattern as the loads.
 242         st2     {dY0r,dY0i},[pOut1],step
 243         st2     {dY1r,dY1i},[pOut1], #16
 244         SUB     pOut1,pOut1,step              // net effect: pOut1 advanced by 16 bytes
 245         SUB     step,step,#32                 // low pair +16 bytes, mirrored pair -16 bytes
 246
 247         // Flags are still those of the SUBS size above.
 248         BGT     evenOddButterflyLoop\name
 249
 250
 251         // set both the ptrs to the last element
 252         SUB     pSrc,pSrc,#8
 253         SUB     pOut1,pOut1,#8
 254
 255         // Last element can be expanded as follows
 256         // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
 257         // -ve)
 258         // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
 259         // 1/2[2a+j0] - j (c-jd) [0+j2b]
 260         // (a+bc, -bd)
 261         // Since (c,d) = (0,1) for the last element, result is just (a,-b)
 262
 263 lastElement\name :
 264         ld1     {dX0r},[pSrc]
 265
 266         st1     {dX0rs}[0],[pOut1], #4        // store a
 267         fneg    dX0r,dX0r
 268         st1     {dX0rs}[1],[pOut1]            // store -b
 269
 270
 271         // Exit label: nothing follows it inside the macro.
 272 decrementScale\name :
 273
 274         .endm
 275 // Entry point armSP_FFTInv_CCSToR_F32_preTwiddleRadix2.
     // The body is a single FFTSTAGE expansion with name = Inv, so its labels
     // become lastElementInv, decrementScaleInv and so on. The "FALSE" and
     // "TRUE" arguments bind to scaled and inverse, which FFTSTAGE never reads.
     // NOTE(review): M_START and M_END are not defined in this file; presumably
     // they come from the included arm64COMM_s.h and emit the prologue and
     // epilogue, with d15 naming the highest callee-saved NEON register to
     // preserve - confirm against that header. FFTSTAGE writes v8-v11 and v13.
 276         M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
 277             FFTSTAGE "FALSE","TRUE",Inv
 278         M_END
 279
 280         .end