src/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S

   1 @
   2 @  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
   3 @
   4 @  Use of this source code is governed by a BSD-style license
   5 @  that can be found in the LICENSE file in the root of the source
   6 @  tree. An additional intellectual property rights grant can be found
   7 @  in the file PATENTS.  All contributing project authors may
   8 @  be found in the AUTHORS file in the root of the source tree.
   9 @
  10 @ Some code in this file was originally from file
  11 @ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
  12 @ follows. It has been relicensed with permission from the copyright holders.
  13 @
  14
  15 @
  16 @ OpenMAX DL: v1.0.2
  17 @ Last Modified Revision:   7485
  18 @ Last Modified Date:       Fri, 21 Sep 2007
  19 @
  20 @ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
  21 @
  22
  23 @
  24 @ Description:
  25 @ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
  26 @ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
  27 @ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
  28 @ formula.
  29 @
  30
  31 #include "dl/api/arm/armCOMM_s.h"
  32 #include "dl/api/arm/omxtypes_s.h"
  33
  34 @//Input Registers
  35 #define pSrc            r0
  36 #define pDst            r1
  37 #define pFFTSpec        r2
  38 #define scale           r3
  39
  40 @ Output registers
  41 #define result          r0
  42
  43 @//Local Scratch Registers. Several names map to the same core register (for example r2, r4), so only one alias of a register can hold a live value at a time.
  44 #define argTwiddle      r1
  45 #define argDst          r2
  46 #define argScale        r4
  47 #define tmpOrder        r4
  48 #define pTwiddle        r4
  49 #define pOut            r5
  50 #define subFFTSize      r7
  51 #define subFFTNum       r6
  52 #define N               r6
  53 #define order           r14
  54 #define diff            r9
  55 @ Total num of radix stages to complete the FFT.
  56 #define count           r8
  57 #define x0r             r4
  58 #define x0i             r5
  59 #define diffMinusOne    r2
  60 #define round           r3
  61 #define pOut1           r2
  62 #define size            r7
  63 #define step            r8
  64 #define step1           r9
  65 #define step2           r10
  66 #define twStep          r10
  67 #define pTwiddleTmp     r11
  68 #define argTwiddle1     r12
  69 #define zero            r14
  70
  71 @ Neon registers. Names also alias here: the dY and dW names share D4-D7 and dX names share D0-D3; each Q register overlays two D registers (Q1 = D2:D3, Q3 = D6:D7).
  72 #define dX0             D0.S16
  73 #define dX0S32          D0.S32
  74 #define dShift          D1.S16
  75 #define dX1             D1.S16
  76 #define dX1S32          D1.S32
  77 #define dY0             D2.S16
  78 #define dY1             D3.S16
  79 #define dX0r            D0.S16
  80 #define dX0rS32         D0.S32
  81 #define dX0i            D1.S16
  82 #define dX1r            D2.S16
  83 #define dX1i            D3.S16
  84 #define qX1             Q1.S16
  85 #define dW0r            D4.S16
  86 #define dW0i            D5.S16
  87 #define dW1r            D6.S16
  88 #define dW1i            D7.S16
  89 #define dW0rS32         D4.S32
  90 #define dW0iS32         D5.S32
  91 #define dW1rS32         D6.S32
  92 #define dW1iS32         D7.S32
  93 #define dT0             D8.S16
  94 #define dT1             D9.S16
  95 #define dT2             D10.S16
  96 #define dT3             D11.S16
  97 #define qT0             Q6.S32
  98 #define qT1             Q7.S32
  99 #define qT2             Q8.S32
 100 #define qT3             Q9.S32
 101 #define dY0r            D4.S16
 102 #define dY0i            D5.S16
 103 #define dY1r            D6.S16
 104 #define dY1i            D7.S16
 105 #define qY1             Q3.S16
 106 #define dY2             D4.S16
 107 #define dY3             D5.S16
 108 #define dW0             D6.S16
 109 #define dW1             D7.S16
 110 #define dW0Tmp          D10.S16
 111 #define dW1Neg          D11.S16
 112
 113         @ Byte offsets of the FFTSpec fields. FFTSTAGE reads the N, pTwiddle and pBuf fields; pBitRev is not read by FFTSTAGE.
 114         .set    ARMsFFTSpec_N, 0
 115         .set    ARMsFFTSpec_pBitRev, 4
 116         .set    ARMsFFTSpec_pTwiddle, 8
 117         .set    ARMsFFTSpec_pBuf, 12
 118
 119         .MACRO FFTSTAGE scaled, inverse, name
 120         @ scaled = TRUE selects the halving arithmetic and the final scale decrement; name is a suffix that keeps labels unique per expansion; inverse is never referenced in the body.
 121         @ Read the size N from the FFTSpec structure
 122         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
 123
 124         @ Read other structure parameters
 125         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
 126         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
 127
 128         MOV     size,N,ASR #1        @ size = N/2; N itself is preserved
 129         MOV     step,N,LSL #1        @ step = N/2 * 4 bytes
 130
 131         @ size <= 4 takes the lane-by-lane path; larger sizes use the vector loop.
 132         CMP    size,#4
 133         BLE    smallFFTSize\name
 134
 135         @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
 136         @ Note: W^(k) is stored as negated value and also need to
 137         @ conjugate the values from the table.
 138
 139         @ Z(0) : no need of twiddle multiply
 140         @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
 141
 142         VLD1    dX0S32[0],[pSrc],step  @ load the first complex element (32 bits); pSrc += step
 143         ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes
 144         @ pOut1 is r2, the same register as pFFTSpec, so the spec pointer is overwritten from here on.
 145         VLD1    dX1S32[0],[pSrc]!    @ load the element N/2*4 bytes further on; pSrc += 4
 146         SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
 147
 148         MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
 149         SUB     step1,step1,#4       @ (N/4-1)*4 bytes
 150         @ Halving add and subtract of the two elements, then transpose the 16-bit lanes.
 151         VHADD    dY0,dX0,dX1         @ [b+d | a+c]
 152         VHSUB    dY1,dX0,dX1         @ [b-d | a-c]
 153         VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
 154         @ SUBS sets the flags tested by the BLT/BEQ pair below; no instruction in between changes them.
 155         .ifeqs  "\scaled", "TRUE"
 156             VHSUB   dX0,dY0,dY1
 157             SUBS    size,size,#2
 158             VHADD   dX1,dY0,dY1
 159         .else
 160             VSUB   dX0,dY0,dY1
 161             SUBS    size,size,#2
 162             VADD   dX1,dY0,dY1
 163         .endif
 164
 165         SUB     pSrc,pSrc,step         @ pSrc = entry pSrc + 4, i.e. the second element
 166         VST1    dX0[0],[pOut1]!        @ store 16-bit lane 0 of dX0
 167         ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
 168         VST1    dX1[1],[pOut1]!        @ store 16-bit lane 1 of dX1
 169         ADD     argTwiddle1,pTwiddle,twStep            @ W^1
 170
 171         BLT     decrementScale\name
 172         BEQ     lastElement\name
 173         @ On this path size was > 4 before the SUBS, so neither branch above is taken.
 174         SUB     step,step,#20          @ byte distance from element 1 to element N/2-4
 175         SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
 176         SUB     step2, step1, #4       @ (N/4-3)*4 bytes
 177
 178         @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
 179         @ Note: W^k is stored as negative values in the table and also need to
 180         @ conjugate the values from the table.
 181         @ Process 8 elements per pass (4 from each end). E.g: Z(1)..Z(4) and Z(N/2-4)..Z(N/2-1)
 182         @ since both groups require F(1)..F(4) and F(N/2-4)..F(N/2-1).
 183
 184 evenOddButterflyLoop\name:
 185         VLD2    {dX0r,dX0i},[pSrc],step  @ front group: 4 reals -> dX0r, 4 imags -> dX0i; pSrc += step
 186         VLD2    {dX1r,dX1i},[pSrc]!      @ back group, same split; pSrc += 16
 187         SUB     pSrc, pSrc, step         @ net effect: pSrc advanced by 16 bytes
 188         @ Twiddle loads are interleaved with the butterfly sums; each twiddle pointer ends up advanced by 8 bytes and its stride shrinks by 16.
 189         VLD1    dW0r,[argTwiddle1],step1
 190         VREV64  qX1,qX1                  @ reverse the 4 lanes of each half so back elements pair with front ones
 191         VLD1    dW1r,[argTwiddle1]!
 192         VHSUB   dT2,dX0r,dX1r                          @ a-c
 193         SUB     argTwiddle1, argTwiddle1, step1
 194         SUB     step1,step1,#16
 195         @ Same load pattern on the second twiddle pointer, using step2.
 196         VLD1    dW0i,[pTwiddleTmp],step2
 197         VHADD   dT3,dX0i,dX1i                          @ b+d
 198         VLD1    dW1i,[pTwiddleTmp]!
 199         VHADD   dT0,dX0r,dX1r                          @ a+c
 200         VHSUB   dT1,dX0i,dX1i                          @ b-d
 201         SUB     pTwiddleTmp, pTwiddleTmp, step2
 202         SUB     step2,step2,#16
 203
 204         SUBS    size,size,#8             @ flags are consumed by the BGT at the end of the loop
 205         @ NOTE(review): the three shuffles below presumably regroup the twiddle lanes to match the data order; confirm against the twiddle table layout.
 206         VZIP    dW1r,dW1i
 207         VTRN    dW0r,dW0i
 208         VZIP    dW1iS32, dW1rS32
 209         @ Complex multiply of (dT2,dT3) by the twiddles, accumulated in 32-bit lanes.
 210         VMULL   qT0,dW1i,dT2
 211         VMLSL   qT0,dW1r,dT3
 212         VMULL   qT1,dW1i,dT3
 213         VMLAL   qT1,dW1r,dT2
 214         VMULL   qT2,dW0r,dT2
 215         VMLAL   qT2,dW0i,dT3
 216         VMULL   qT3,dW0r,dT3
 217         VMLSL   qT3,dW0i,dT2
 218         @ Rounding right shift by 15 narrows the products back to 16-bit lanes.
 219         VRSHRN  dX1r,qT0,#15
 220         VRSHRN  dX1i,qT1,#15
 221         VRSHRN  dX0r,qT2,#15
 222         VRSHRN  dX0i,qT3,#15
 223         @ Combine the sum/difference terms with the products (halving forms when scaled). dY names reuse D4-D7, whose twiddle contents are no longer needed.
 224         .ifeqs  "\scaled", "TRUE"
 225             VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
 226             VHSUB    dY1i,dX1r,dT1
 227         .else
 228             VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
 229             VSUB    dY1i,dX1r,dT1
 230         .endif
 231
 232         .ifeqs  "\scaled", "TRUE"
 233             VHADD    dY0r,dT0,dX0i                     @ F(1)
 234             VHSUB    dY0i,dT1,dX0r
 235         .else
 236             VADD    dY0r,dT0,dX0i                      @ F(1)
 237             VSUB    dY0i,dT1,dX0r
 238         .endif
 239
 240         VREV64  qY1,qY1                  @ reverse the back-group lanes again before storing
 241
 242         VST2    {dY0r,dY0i},[pOut1],step   @ store front group interleaved; pOut1 += step
 243         VST2    {dY1r,dY1i},[pOut1]        @ store back group interleaved, no writeback
 244         ADD     pOut1,pOut1,#16
 245         SUB     pOut1, pOut1, step         @ net effect: pOut1 advanced by 16 bytes
 246         SUB     step,step,#32              @ front and back groups are now 32 bytes closer
 247
 248         BGT     evenOddButterflyLoop\name
 249
 250         SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
 251         SUB     pOut1,pOut1,#4
 252         B       lastElement\name
 253
 254 smallFFTSize\name:
 255         @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
 256         @ Note: W^(k) is stored as negated value and also need to
 257         @ conjugate the values from the table.
 258
 259         @ Z(0) : no need of twiddle multiply
 260         @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
 261         @ This Z(0) sequence is the same instruction sequence as on the large-size path.
 262         VLD1    dX0S32[0],[pSrc],step  @ load the first complex element (32 bits); pSrc += step
 263         ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes
 264
 265         VLD1    dX1S32[0],[pSrc]!    @ load the element N/2*4 bytes further on; pSrc += 4
 266         SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
 267
 268         MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
 269         SUB     step1,step1,#4       @ (N/4-1)*4 bytes
 270
 271         VHADD    dY0,dX0,dX1         @ [b+d | a+c]
 272         VHSUB    dY1,dX0,dX1         @ [b-d | a-c]
 273         VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
 274         @ SUBS sets the flags tested by the BLT/BEQ pair below.
 275         .ifeqs  "\scaled", "TRUE"
 276             VHSUB   dX0,dY0,dY1
 277             SUBS    size,size,#2
 278             VHADD   dX1,dY0,dY1
 279         .else
 280             VSUB   dX0,dY0,dY1
 281             SUBS    size,size,#2
 282             VADD   dX1,dY0,dY1
 283         .endif
 284
 285         SUB     pSrc,pSrc,step         @ pSrc = entry pSrc + 4, i.e. the second element
 286         VST1    dX0[0],[pOut1]!        @ store 16-bit lane 0 of dX0
 287         ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
 288         VST1    dX1[1],[pOut1]!        @ store 16-bit lane 1 of dX1
 289         ADD     argTwiddle1,pTwiddle,twStep            @ W^1
 290
 291         BLT     decrementScale\name    @ size was below 2: only Z(0) is produced
 292         BEQ     lastElement\name       @ size was 2: only the last element remains
 293
 294         @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
 295         @ Note: W^k is stored as negative values in the table and also need to
 296         @ conjugate the values from the table.
 297         @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
 298         @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
 299
 300         SUB     step,step,#12          @ step = N/2*4 - 12 bytes
 301         @ Straight-line code: nothing in this macro branches back to the label below.
 302 evenOddButterflyLoopSize4\name:
 303         VLD1    dW0rS32[0],[argTwiddle1],step1
 304         VLD1    dW1rS32[0],[argTwiddle1]!
 305
 306         VLD2    {dX0r[0],dX0i[0]},[pSrc]!       @ one complex element into lane 0; pSrc += 4
 307         VLD2    {dX0r[1],dX0i[1]},[pSrc],step   @ next element into lane 1; pSrc += step
 308         SUB     pSrc,pSrc,#4
 309         SUB     argTwiddle1,argTwiddle1,step1
 310         VLD2    {dX1r[0],dX1i[0]},[pSrc]!       @ back pair, lane 0; pSrc += 4
 311         VLD2    {dX1r[1],dX1i[1]},[pSrc]!       @ back pair, lane 1; pSrc += 4
 312
 313         SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
 314         VLD1    dW0iS32[0],[pTwiddleTmp],step1
 315         VLD1    dW1iS32[0],[pTwiddleTmp]!
 316         SUB     pSrc,pSrc,step
 317
 318         SUB     pTwiddleTmp,pTwiddleTmp,step1
 319         VREV32  dX1r,dX1r                @ swap the 16-bit lanes within each word so the back pair is mirrored
 320         VREV32  dX1i,dX1i
 321         SUBS    size,size,#4             @ no conditional branch in this macro tests these flags
 322
 323         VHSUB   dT2,dX0r,dX1r                          @ a-c
 324         VHADD   dT3,dX0i,dX1i                          @ b+d
 325         SUB     step1,step1,#4
 326         VHADD   dT0,dX0r,dX1r                          @ a+c
 327         VHSUB   dT1,dX0i,dX1i                          @ b-d
 328         @ NOTE(review): the transposes below presumably split the loaded twiddle words into the lane order the multiplies expect; confirm against the twiddle table layout.
 329         VTRN    dW1r,dW1i
 330         VTRN    dW0r,dW0i
 331         @ Complex multiply of (dT2,dT3) by the twiddles, accumulated in 32-bit lanes.
 332         VMULL   qT0,dW1r,dT2
 333         VMLSL   qT0,dW1i,dT3
 334         VMULL   qT1,dW1r,dT3
 335         VMLAL   qT1,dW1i,dT2
 336         VMULL   qT2,dW0r,dT2
 337         VMLAL   qT2,dW0i,dT3
 338         VMULL   qT3,dW0r,dT3
 339         VMLSL   qT3,dW0i,dT2
 340         @ Rounding right shift by 15 narrows the products back to 16-bit lanes.
 341         VRSHRN  dX1r,qT0,#15
 342         VRSHRN  dX1i,qT1,#15
 343
 344         .ifeqs  "\scaled", "TRUE"
 345             VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
 346             VHSUB    dY1i,dX1r,dT1
 347         .else
 348             VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
 349             VSUB    dY1i,dX1r,dT1
 350         .endif
 351
 352         VREV32  dY1r,dY1r                @ swap the lanes again before storing
 353         VREV32  dY1i,dY1i
 354
 355         VRSHRN  dX0r,qT2,#15
 356         VRSHRN  dX0i,qT3,#15
 357
 358         .ifeqs  "\scaled", "TRUE"
 359             VHADD    dY0r,dT0,dX0i                     @ F(1)
 360             VHSUB    dY0i,dT1,dX0r
 361         .else
 362             VADD    dY0r,dT0,dX0i                      @ F(1)
 363             VSUB    dY0i,dT1,dX0r
 364         .endif
 365         @ Stores mirror the lane-wise loads above: two front elements, then two back elements.
 366         VST2    {dY0r[0],dY0i[0]},[pOut1]!
 367         VST2    {dY0r[1],dY0i[1]},[pOut1],step
 368         SUB     pOut1, #4
 369         VST2    {dY1r[0],dY1i[0]},[pOut1]!
 370         VST2    {dY1r[1],dY1i[1]},[pOut1]!
 371         SUB     pOut1,pOut1,step
 372         SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
 373         SUB     pOut1,pOut1,#4
 374
 375         @ Last element can be expanded as follows
 376         @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
 377         @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
 378         @ 1/2[2a+j0] - j (c-jd) [0+j2b]
 379         @ (a+bc, -bd)
 380         @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
 381
 382 lastElement\name:
 383         VLD1    dX0rS32[0],[pSrc]      @ load one complex element (a,b) into lanes 0 and 1
 384
 385         .ifeqs  "\scaled", "TRUE"
 386             VSHR    dX0r,dX0r,#1
 387         .endif
 388
 389         VST1    dX0r[0],[pOut1]!       @ store a
 390         VNEG    dX0r,dX0r
 391         VST1    dX0r[1],[pOut1]        @ store -b
 392
 393 decrementScale\name:
 394         .ifeqs  "\scaled", "TRUE"
 395             SUB scale,scale,#1
 396         .endif
 397
 398         .endm
 399         @ Unscaled variant: expands FFTSTAGE with scaled = FALSE and label suffix Inv. M_START and M_END are provided by armCOMM_s.h (not shown here).
 400         M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
 401         FFTSTAGE "FALSE","TRUE",Inv
 402         M_END
 403         @ Scaled (Sfs) variant: expands FFTSTAGE with scaled = TRUE and label suffix InvSfs, so the halving forms are used and scale is decremented by one at the end.
 404         M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
 405         FFTSTAGE "TRUE","TRUE",InvSfs
 406         M_END
 407
 408
 409         .end