src/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S

   1 //
   2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
   3 //
   4 //  Use of this source code is governed by a BSD-style license
   5 //  that can be found in the LICENSE file in the root of the source
   6 //  tree. An additional intellectual property rights grant can be found
   7 //  in the file PATENTS.  All contributing project authors may
   8 //  be found in the AUTHORS file in the root of the source tree.
   9 //
  10 //
  11 //  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
  12 //  to support float instead of SC32.
  13 //
  14
  15 //
  16 // Description:
  17 // Compute a Radix 4 FFT stage for a N point complex signal
  18 //
  19 //
  20
  21
  22 // Include standard headers
  23
  24 #include "dl/api/arm/arm64COMM_s.h"
  25 #include "dl/api/arm/omxtypes_s.h"
  26
  27
  28 // Import symbols required from other files
  29 // (For example tables)
  30
  31
  32
  33
  34 // Set debugging level
  35 //DEBUG_ON    SETL {TRUE}
  36
  37
  38
  39 // Guarding implementation by the processor name
  40
  41
  42
  43
  44 // Guarding implementation by the processor name
  45
  46
  47 // Import symbols required from other files
  48 // (For example tables)
  49
  50
  51 //Input Registers
  52
  53 #define pSrc            x0
  54 #define pDst            x1
  55 #define pTwiddle        x2
  56 #define pSubFFTNum      x3
  57 #define pSubFFTSize     x4
  58
  59
  60
  61 //Output Registers
  62
  63
  64 //Local Scratch Registers
  65
  66 #define subFFTNum       x5
  67 #define subFFTSize      x6
  68 #define grpCount        x7
  69 #define grpCount32      w7
  70 #define pointStep       x8
  71 #define pointStep32     w8
  72 #define outPointStep    x9
  73 #define stepTwiddle     x10
  74 #define setCount        x11
  75 #define srcStep         x12
  76 #define setStep         x13
  77 #define dstStep         x14
  78 #define twStep          x15
  79
  80 // Neon Registers
  81
  82 #define dW1     v0.2s
  83 #define dW2     v1.2s
  84 #define dW3     v2.2s
  85
  86 #define dXr0    v4.2s
  87 #define dXi0    v5.2s
  88 #define dXr1    v6.2s
  89 #define dXi1    v7.2s
  90 #define dXr2    v8.2s
  91 #define dXi2    v9.2s
  92 #define dXr3    v10.2s
  93 #define dXi3    v11.2s
  94 #define dYr0    v12.2s
  95 #define dYi0    v13.2s
  96 #define dYr1    v14.2s
  97 #define dYi1    v15.2s
  98 #define dYr2    v16.2s
  99 #define dYi2    v17.2s
 100 #define dYr3    v18.2s
 101 #define dYi3    v19.2s
 102 #define dZr0    v20.2s
 103 #define dZi0    v21.2s
 104 #define dZr1    v22.2s
 105 #define dZi1    v23.2s
 106 #define dZr2    v24.2s
 107 #define dZi2    v25.2s
 108 #define dZr3    v26.2s
 109 #define dZi3    v27.2s
 110
 111         .macro FFTSTAGE scaled, inverse , name
 112
 113         // Define stack arguments
 114
 115         // Move args values into our work registers
 116         ldr     subFFTNum, [pSubFFTNum]
 117         ldr     subFFTSize, [pSubFFTSize]
 118
 119         // Update grpCount and grpSize rightaway inorder to reuse
 120         // pGrpCount and pGrpSize regs
 121
 122         LSL     grpCount,subFFTSize,#2
 123         LSR     subFFTNum,subFFTNum,#2
 124         MOV     subFFTSize,grpCount
 125
 126         ld1      {dW1},[pTwiddle]                    //[wi | wr]
 127         // pT0+1 increments pT0 by 8 bytes
 128         // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
 129         lsl     pointStep,subFFTNum, #1
 130
 131         // pOut0+1 increments pOut0 by 8 bytes
 132         // pOut0+outPointStep == increment of 8*outPointStep bytes
 133         //   = 2*size bytes
 134
 135         MOV     stepTwiddle,#0
 136         ld1      {dW2},[pTwiddle]                    //[wi | wr]
 137         smull   outPointStep,grpCount32,pointStep32
 138
 139         LSL     pointStep,pointStep,#2             // 2*grpSize
 140
 141         ld1      {dW3},[pTwiddle]                  //[wi | wr]
 142         lsl     srcStep,pointStep, #1              // srcStep = 2*pointStep
 143
 144         ADD     setStep,srcStep,pointStep          // setStep = 3*pointStep
 145
 146         rsb     setStep,setStep,#0                 // setStep = - 3*pointStep
 147         SUB     srcStep,srcStep,#16                // srcStep = 2*pointStep-16
 148
 149         lsl     dstStep,outPointStep, #1
 150
 151         ADD     dstStep,dstStep,outPointStep       // dstStep = 3*outPointStep
 152         // dstStep = - 3*outPointStep+16
 153         rsb     dstStep,dstStep,#16
 154
 155
 156 radix4GrpLoop\name :
 157
 158         ld2     {dXr0,dXi0},[pSrc],pointStep       //  data[0]
 159         ADD      stepTwiddle,stepTwiddle,pointStep
 160         ld2     {dXr1,dXi1},[pSrc],pointStep       //  data[1]
 161         // set pTwiddle to the first point
 162         ADD      pTwiddle,pTwiddle,stepTwiddle
 163         ld2     {dXr2,dXi2},[pSrc],pointStep       //  data[2]
 164         lsl      twStep,stepTwiddle, #2
 165
 166         //  data[3] & update pSrc for the next set
 167         ld2     {dXr3,dXi3},[pSrc],setStep
 168         SUB      twStep,stepTwiddle,twStep         // twStep = -3*stepTwiddle
 169
 170         lsr      setCount,pointStep, #3
 171
 172         // set pSrc to data[0] of the next set
 173         ADD     pSrc,pSrc,#16
 174         // increment to data[1] of the next set
 175         ADD     pSrc,pSrc,pointStep
 176
 177
 178         // Loop on the sets
 179
 180 radix4SetLoop\name :
 181
 182
 183
 184         .ifeqs  "\inverse", "TRUE"
 185             fmul   dZr1,dXr1,dW1[0]
 186             fmul   dZi1,dXi1,dW1[0]
 187             fmul   dZr2,dXr2,dW2[0]
 188             fmul   dZi2,dXi2,dW2[0]
 189             fmul   dZr3,dXr3,dW3[0]
 190             fmul   dZi3,dXi3,dW3[0]
 191
 192             fmla   dZr1,dXi1,dW1[1]                // real part
 193             fmls   dZi1,dXr1,dW1[1]                // imag part
 194
 195             //  data[1] for next iteration
 196             ld2     {dXr1,dXi1},[pSrc],pointStep
 197
 198             fmla   dZr2,dXi2,dW2[1]                // real part
 199             fmls   dZi2,dXr2,dW2[1]                // imag part
 200
 201             //  data[2] for next iteration
 202             ld2     {dXr2,dXi2},[pSrc],pointStep
 203
 204             fmla   dZr3,dXi3,dW3[1]                // real part
 205             fmls   dZi3,dXr3,dW3[1]                // imag part
 206         .else
 207             fmul   dZr1,dXr1,dW1[0]
 208             fmul   dZi1,dXi1,dW1[0]
 209             fmul   dZr2,dXr2,dW2[0]
 210             fmul   dZi2,dXi2,dW2[0]
 211             fmul   dZr3,dXr3,dW3[0]
 212             fmul   dZi3,dXi3,dW3[0]
 213
 214             fmls   dZr1,dXi1,dW1[1]                // real part
 215             fmla   dZi1,dXr1,dW1[1]                // imag part
 216
 217             //  data[1] for next iteration
 218             ld2     {dXr1,dXi1},[pSrc],pointStep
 219
 220             fmls   dZr2,dXi2,dW2[1]                // real part
 221             fmla   dZi2,dXr2,dW2[1]                // imag part
 222
 223             //  data[2] for next iteration
 224             ld2     {dXr2,dXi2},[pSrc],pointStep
 225
 226             fmls   dZr3,dXi3,dW3[1]                // real part
 227             fmla   dZi3,dXr3,dW3[1]                // imag part
 228         .endif
 229
 230         //  data[3] & update pSrc to data[0]
 231         // But don't read on the very last iteration because that reads past
 232         // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
 233         cmp     grpCount, #4
 234
 235         b.ne    skipUpdate\name
 236         cmp     setCount, #2
 237         b.ne    skipUpdate\name
 238         add     pSrc, pSrc, setStep
 239         beq     radix4SkipRead\name
 240 skipUpdate\name:
 241         ld2     {dXr3,dXi3},[pSrc],setStep
 242 radix4SkipRead\name:
 243
 244         SUBS    setCount,setCount,#2
 245
 246         // finish first stage of 4 point FFT
 247         // fadd    qY0,qX0,qZ2
 248         // fsub    qY2,qX0,qZ2
 249         fadd    dYr0,dXr0,dZr2
 250         fsub    dYr2,dXr0,dZr2
 251         fadd    dYi0,dXi0,dZi2
 252         fsub    dYi2,dXi0,dZi2
 253
 254         //  data[0] for next iteration
 255         ld2     {dXr0,dXi0},[pSrc], #16
 256         // fadd    qY1,qZ1,qZ3
 257         // fsub    qY3,qZ1,qZ3
 258         fadd    dYr1,dZr1,dZr3
 259         fsub    dYr3,dZr1,dZr3
 260         fadd    dYi1,dZi1,dZi3
 261         fsub    dYi3,dZi1,dZi3
 262
 263         // finish second stage of 4 point FFT
 264
 265         // fsub    qZ0,qY2,qY1
 266         fsub    dZr0,dYr2,dYr1
 267         fsub    dZi0,dYi2,dYi1
 268
 269         .ifeqs  "\inverse", "TRUE"
 270
 271             fadd    dZr3,dYr0,dYi3
 272             st2     {dZr0,dZi0},[pDst],outPointStep
 273             fsub    dZi3,dYi0,dYr3
 274
 275             // fadd    qZ2,qY2,qY1
 276             fadd    dZr2,dYr2,dYr1
 277             fadd    dZi2,dYi2,dYi1
 278
 279             st2     {dZr3,dZi3},[pDst],outPointStep
 280
 281             fsub    dZr1,dYr0,dYi3
 282             st2     {dZr2,dZi2},[pDst],outPointStep
 283             fadd    dZi1,dYi0,dYr3
 284
 285             st2     {dZr1,dZi1},[pDst],dstStep
 286
 287
 288         .else
 289
 290             fsub    dZr1,dYr0,dYi3
 291             st2     {dZr0,dZi0},[pDst],outPointStep
 292             fadd    dZi1,dYi0,dYr3
 293
 294             // fadd    qZ2,qY2,qY1
 295             fadd    dZr2,dYr2,dYr1
 296             fadd    dZi2,dYi2,dYi1
 297
 298             st2     {dZr1,dZi1},[pDst],outPointStep
 299
 300             fadd    dZr3,dYr0,dYi3
 301             st2     {dZr2,dZi2},[pDst],outPointStep
 302             fsub    dZi3,dYi0,dYr3
 303
 304             st2     {dZr3,dZi3},[pDst],dstStep
 305
 306
 307         .endif
 308
 309         // increment to data[1] of the next set
 310         ADD     pSrc,pSrc,pointStep
 311         BGT     radix4SetLoop\name
 312
 313
 314         ld1      {dW1},[pTwiddle],stepTwiddle    //[wi | wr]
 315         // subtract 4 since grpCount multiplied by 4
 316         SUBS    grpCount,grpCount,#4
 317         ld1      {dW2},[pTwiddle],stepTwiddle    //[wi | wr]
 318         // increment pSrc for the next grp
 319         ADD     pSrc,pSrc,srcStep
 320         ld1      {dW3},[pTwiddle],twStep         //[wi | wr]
 321         BGT     radix4GrpLoop\name
 322
 323         str     subFFTNum, [pSubFFTNum]
 324         str     subFFTSize, [pSubFFTSize]
 325
 326         .endm
 327
 328
 329         M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
 330             FFTSTAGE "FALSE","FALSE",FWD
 331         M_END
 332
 333
 334         M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
 335             FFTSTAGE "FALSE","TRUE",INV
 336         M_END
 337
 338
 339         .end