Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / arm64 / armSP_FFT_CToC_FC32_Radix4_ls_s.S
1 //
2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 //
4 //  Use of this source code is governed by a BSD-style license
5 //  that can be found in the LICENSE file in the root of the source
6 //  tree. An additional intellectual property rights grant can be found
7 //  in the file PATENTS.  All contributing project authors may
8 //  be found in the AUTHORS file in the root of the source tree.
9 //
10 //  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
11 //  to support float instead of SC32.
12 //
13
14 //
15 // Description:
16 // Compute a Radix 4 FFT stage for a N point complex signal
17 //
18 //
19
20
21 // Include standard headers
22
23 #include "dl/api/arm/arm64COMM_s.h"
24 #include "dl/api/arm/omxtypes_s.h"
25
26 // Import symbols required from other files
27 // (For example tables)
28
29
30
31
32 // Set debugging level
33 //DEBUG_ON    SETL {TRUE}
34
35
36 // Guarding implementation by the processor name
37
38
39 // Import symbols required from other files
40 // (For example tables)
41     //IMPORT  armAAC_constTable
42
43 //Input Registers
44
45 #define pSrc            x0
46 #define pDst            x1
47 #define pTwiddle        x2
48 #define pSubFFTNum      x3
49 #define pSubFFTSize     x4      
50
51
52
53 //Output Registers
54
55
56 //Local Scratch Registers
57
58 #define subFFTNum       x5
59 #define subFFTSize      x6
60 #define outPointStep    x8
61 #define grpCount        x9
62 #define dstStep         x10
63 #define grpTwStep       x13
64 #define stepTwiddle     x14
65 #define twStep          x15
66 #define step16          x11
67 #define step24          x12
68
69
70 // Neon Registers
71
72 #define dButterfly1Real02       v0.2s
73 #define dButterfly1Real028b     v0.8b
74 #define dButterfly1Imag02       v1.2s
75 #define dButterfly1Imag028b     v1.8b
76 #define dButterfly1Real13       v2.2s
77 #define dButterfly1Real138b     v2.8b
78 #define dButterfly1Imag13       v3.2s
79 #define dButterfly1Imag138b     v3.8b
80 #define dButterfly2Real02       v4.2s
81 #define dButterfly2Imag02       v5.2s
82 #define dButterfly2Real13       v6.2s
83 #define dButterfly2Imag13       v7.2s
84 #define dXr0                    v0.2s
85 #define dXi0                    v1.2s
86 #define dXr08b                  v0.8b
87 #define dXi08b                  v1.8b
88 #define dXr1                    v2.2s
89 #define dXi1                    v3.2s
90 #define dXr2                    v4.2s
91 #define dXi2                    v5.2s
92 #define dXr3                    v6.2s
93 #define dXi3                    v7.2s
94
95 #define dYr0                    v16.2s
96 #define dYi0                    v17.2s
97 #define dYr1                    v18.2s
98 #define dYi1                    v19.2s
99 #define dYr2                    v20.2s
100 #define dYi2                    v21.2s
101 #define dYr3                    v22.2s
102 #define dYi3                    v23.2s
103
104 #define dW1r                    v8.2s
105 #define dW1i                    v9.2s
106 #define dW2r                    v10.2s
107 #define dW2r8b                  v10.8b
108 #define dW2i                    v11.2s
109 #define dW3r                    v12.2s
110 #define dW3r8b                  v12.8b
111 #define dW3i                    v13.2s
112
113 #define dZr0                    v14.2s
114 #define dZi0                    v15.2s
115 #define dZr08b                  v14.8b
116 #define dZi08b                  v15.8b
117 #define dZr1                    v26.2s
118 #define dZi1                    v27.2s
119 #define dZr2                    v28.2s
120 #define dZi2                    v29.2s
121 #define dZr3                    v30.2s
122 #define dZi3                    v31.2s
123
124 #define dZip                    v24.2s
125 #define dZip8b                  v24.8b
126
127         .macro FFTSTAGE scaled, inverse , name
128
129         // Define stack arguments
130
131         // Move args values into our work registers
132         ldr     subFFTNum, [pSubFFTNum]
133         ldr     subFFTSize, [pSubFFTSize]
134
135         // pOut0+1 increments pOut0 by 8 bytes
136         // pOut0+outPointStep == increment of 8*outPointStep bytes
137         lsl     outPointStep,subFFTSize, #3
138
139         // Update grpCount and grpSize rightaway
140
141         ld2    {dW1r,dW1i},[pTwiddle]             // [wi|wr]
142         MOV     step16,#16
143         LSL     grpCount,subFFTSize,#2
144
145         ld1    {dW2r},[pTwiddle]                  // [wi|wr]
146         MOV     subFFTNum,#1                      //after the last stage
147
148         ld1    {dW3r},[pTwiddle],step16           // [wi|wr]
149         MOV     stepTwiddle,#0
150
151         ld1    {dW2i},[pTwiddle],#8               // [wi|wr]
152         SUB     grpTwStep,stepTwiddle,#8          // grpTwStep = -8 to start with
153
154         // update subFFTSize for the next stage
155         MOV     subFFTSize,grpCount
156         ld1    {dW3i},[pTwiddle],grpTwStep        // [wi|wr]
157         lsl     dstStep,outPointStep, #1
158
159         // AC.r AC.i BD.r BD.i
160         ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
161         ADD     dstStep,dstStep,outPointStep      // dstStep = 3*outPointStep
162
163         rsb     dstStep,dstStep,#16               // dstStep = - 3*outPointStep+16
164         MOV     step24,#24
165
166         // AC.r AC.i BD.r BD.i
167         ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
168
169
170         // Process two groups at a time
171
172 radix4lsGrpLoop\name :
173
174         // VZIP    dW2r,dW2i
175         zip1    dZip, dW2r, dW2i
176         zip2    dW2i, dW2r, dW2i
177         mov     dW2r8b, dZip8b
178
179         ADD     stepTwiddle,stepTwiddle,#16
180
181         // VZIP    dW3r,dW3i
182         zip1    dZip, dW3r,dW3i
183         zip2    dW3i, dW3r, dW3i
184         mov     dW3r8b, dZip8b
185         ADD     grpTwStep,stepTwiddle,#4
186
187         // VUZP     dButterfly1Real13, dButterfly2Real13      // B.r D.r
188         uzp1     dZip, dButterfly1Real13, dButterfly2Real13   // B.r D.r
189         uzp2     dButterfly2Real13, dButterfly1Real13, dButterfly2Real13   // B.r D.r
190         mov      dButterfly1Real138b, dZip8b
191
192         SUB     twStep,stepTwiddle,#16                        // -16+stepTwiddle
193
194         // VUZP     dButterfly1Imag13, dButterfly2Imag13      // B.i D.i
195         uzp1     dZip, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
196         uzp2     dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
197         mov      dButterfly1Imag138b, dZip8b
198         lsl     grpTwStep,grpTwStep,#1
199
200         // VUZP     dButterfly1Real02, dButterfly2Real02      // A.r C.r
201         uzp1     dZip, dButterfly1Real02, dButterfly2Real02   // A.r C.r
202         uzp2     dButterfly2Real02, dButterfly1Real02, dButterfly2Real02   // A.r C.r
203         mov      dButterfly1Real028b, dZip8b
204         rsb     grpTwStep,grpTwStep,#0                        // -8-2*stepTwiddle
205
206         // VUZP     dButterfly1Imag02, dButterfly2Imag02      // A.i C.i
207         uzp1     dZip, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
208         uzp2     dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
209         mov      dButterfly1Imag028b, dZip8b
210
211
212         // grpCount is multiplied by 4
213         SUBS    grpCount,grpCount,#8
214
215         .ifeqs  "\inverse", "TRUE"
216             fmul   dZr1,dW1r,dXr1
217             fmla   dZr1,dW1i,dXi1                       // real part
218             fmul   dZi1,dW1r,dXi1
219             fmls   dZi1,dW1i,dXr1                       // imag part
220
221         .else
222
223             fmul   dZr1,dW1r,dXr1
224             fmls   dZr1,dW1i,dXi1                       // real part
225             fmul   dZi1,dW1r,dXi1
226             fmla   dZi1,dW1i,dXr1                       // imag part
227
228         .endif
229
230         ld2    {dW1r,dW1i},[pTwiddle],stepTwiddle       // [wi|wr]
231
232         .ifeqs  "\inverse", "TRUE"
233             fmul   dZr2,dW2r,dXr2
234             fmla   dZr2,dW2i,dXi2                       // real part
235             fmul   dZi2,dW2r,dXi2
236             ld1   {dW2r},[pTwiddle],step16              // [wi|wr]
237             fmls   dZi2,dW2i,dXr2                       // imag part
238
239         .else
240
241             fmul   dZr2,dW2r,dXr2
242             fmls   dZr2,dW2i,dXi2                       // real part
243             fmul   dZi2,dW2r,dXi2
244             ld1    {dW2r},[pTwiddle],step16             // [wi|wr]
245             fmla   dZi2,dW2i,dXr2                       // imag part
246
247         .endif
248
249
250         ld1    {dW2i},[pTwiddle],twStep                 // [wi|wr]
251
252         // move qX0 so as to load for the next iteration
253         // MOV     qZ0,qX0
254         mov     dZr08b, dXr08b
255         mov     dZi08b, dXi08b
256
257         .ifeqs  "\inverse", "TRUE"
258             fmul   dZr3,dW3r,dXr3
259             fmla   dZr3,dW3i,dXi3                       // real part
260             fmul   dZi3,dW3r,dXi3
261             ld1    {dW3r},[pTwiddle],step24
262             fmls   dZi3,dW3i,dXr3                       // imag part
263
264         .else
265
266             fmul   dZr3,dW3r,dXr3
267             fmls   dZr3,dW3i,dXi3                       // real part
268             fmul   dZi3,dW3r,dXi3
269             ld1    {dW3r},[pTwiddle],step24
270             fmla   dZi3,dW3i,dXr3                       // imag part
271
272         .endif
273
274         ld1    {dW3i},[pTwiddle],grpTwStep              // [wi|wr]
275
276         // Don't do the load on the last iteration so we don't read past the end
277         // of pSrc.
278         bne     skipIncrement\name
279         add     pSrc, pSrc, #64
280 skipIncrement\name:     
281         beq     radix4lsSkipRead\name
282         // AC.r AC.i BD.r BD.i
283         ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
284
285         // AC.r AC.i BD.r BD.i
286         ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
287 radix4lsSkipRead\name:
288
289         // finish first stage of 4 point FFT
290
291         // fadd    qY0,qZ0,qZ2
292         fadd    dYr0,dZr0,dZr2
293         fadd    dYi0,dZi0,dZi2
294         // fsub    qY2,qZ0,qZ2
295         fsub    dYr2,dZr0,dZr2
296         fsub    dYi2,dZi0,dZi2
297         // fadd    qY1,qZ1,qZ3
298         fadd    dYr1,dZr1,dZr3
299         fadd    dYi1,dZi1,dZi3
300         // fsub    qY3,qZ1,qZ3
301         fsub    dYr3,dZr1,dZr3
302         fsub    dYi3,dZi1,dZi3
303
304
305         // finish second stage of 4 point FFT
306
307         .ifeqs  "\inverse", "TRUE"
308
309             // fsub    qZ0,qY2,qY1
310             fsub    dZr0,dYr2,dYr1
311             fsub    dZi0,dYi2,dYi1
312             fadd    dZr3,dYr0,dYi3
313             st2    {dZr0,dZi0},[pDst],outPointStep
314             fsub    dZi3,dYi0,dYr3
315
316             // fadd    qZ2,qY2,qY1
317             fadd    dZr2,dYr2,dYr1
318             fadd    dZi2,dYi2,dYi1
319
320             st2    {dZr3,dZi3},[pDst],outPointStep
321
322             fsub    dZr1,dYr0,dYi3
323             st2    {dZr2,dZi2},[pDst],outPointStep
324             fadd    dZi1,dYi0,dYr3
325
326             // dstStep = -outPointStep + 16
327             st2    {dZr1,dZi1},[pDst],dstStep
328
329
330         .else
331
332             // fsub    qZ0,qY2,qY1
333             fsub    dZr0,dYr2,dYr1
334             fsub    dZi0,dYi2,dYi1
335
336             fsub    dZr1,dYr0,dYi3
337             st2    {dZr0,dZi0},[pDst],outPointStep
338             fadd    dZi1,dYi0,dYr3
339
340             // fadd    qZ2,qY2,qY1
341             fadd    dZr2,dYr2,dYr1
342             fadd    dZi2,dYi2,dYi1
343
344             st2    {dZr1,dZi1},[pDst],outPointStep
345
346             fadd    dZr3,dYr0,dYi3
347             st2    {dZr2,dZi2},[pDst],outPointStep
348             fsub    dZi3,dYi0,dYr3
349
350             // dstStep = -outPointStep + 16
351             st2    {dZr3,dZi3},[pDst],dstStep
352
353
354         .endif
355
356         BGT     radix4lsGrpLoop\name
357
358         .endm
359
360
361         M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
362         FFTSTAGE "FALSE","FALSE",fwd
363         M_END
364
365
366         M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
367         FFTSTAGE "FALSE","TRUE",inv
368         M_END
369
370
371         .end