Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / neon / armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
1 @//
2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @//  Use of this source code is governed by a BSD-style license
5 @//  that can be found in the LICENSE file in the root of the source
6 @//  tree. An additional intellectual property rights grant can be found
7 @//  in the file PATENTS.  All contributing project authors may
8 @//  be found in the AUTHORS file in the root of the source tree.
9 @//
10 @//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
11 @//  to support float instead of SC32.
12 @//
13
14 @//
15 @// Description:
16 @// Compute a radix-4 FFT stage for an N-point complex signal
17 @//
18 @//
19
20
21 @// Include standard headers
22
23 #include "dl/api/arm/armCOMM_s.h"
24 #include "dl/api/arm/omxtypes_s.h"
25
26 @// Import symbols required from other files
27 @// (For example tables)
28
29
30
31
32 @// Set debugging level
33 @//DEBUG_ON    SETL {TRUE}
34
35
36 @// Guarding implementation by the processor name
37
38
39 @// Import symbols required from other files
40 @// (For example tables)
41     @//IMPORT  armAAC_constTable
42
43 @// Input registers: core-register names used by FFTSTAGE for its arguments.
44 @// FFTSTAGE reads pSrc, pDst, pTwiddle and subFFTSize; subFFTNum is only written (set to 1).
45 #define pSrc            r0
46 #define pDst            r2
47 #define pTwiddle        r1
48 #define subFFTNum       r6
49 #define subFFTSize      r7
50
51
52
53 @//Output Registers
54
55
56 @// Local scratch registers: core registers used as temporaries inside FFTSTAGE.
57 @// grpCount and pTmp both name r4: grpCount is the loop counter, pTmp is only used after the loop has finished.
58 #define outPointStep    r3
59 #define grpCount        r4
60 #define dstStep         r5
61 #define grpTwStep       r8
62 #define stepTwiddle     r9
63 #define twStep          r10
64 #define pTmp            r4
65 #define step16          r11
66 #define step24          r12
67
68
69 @// NEON register aliases. Every alias carries a 32-bit float type suffix.
70 @// D0-D7: the dButterfly* names (used by the loads and VUZP) and the dX* names (used by the multiplies) are the same registers.
71 #define dButterfly1Real02       D0.F32
72 #define dButterfly1Imag02       D1.F32
73 #define dButterfly1Real13       D2.F32
74 #define dButterfly1Imag13       D3.F32
75 #define dButterfly2Real02       D4.F32
76 #define dButterfly2Imag02       D5.F32
77 #define dButterfly2Real13       D6.F32
78 #define dButterfly2Imag13       D7.F32
79 #define dXr0                    D0.F32
80 #define dXi0                    D1.F32
81 #define dXr1                    D2.F32
82 #define dXi1                    D3.F32
83 #define dXr2                    D4.F32
84 #define dXi2                    D5.F32
85 #define dXr3                    D6.F32
86 #define dXi3                    D7.F32
87 @// D16-D23 (the same storage as Q8-Q11, i.e. qY0-qY3 below): results of the first add/sub stage.
88 #define dYr0                    D16.F32
89 #define dYi0                    D17.F32
90 #define dYr1                    D18.F32
91 #define dYi1                    D19.F32
92 #define dYr2                    D20.F32
93 #define dYi2                    D21.F32
94 #define dYr3                    D22.F32
95 #define dYi3                    D23.F32
96 @// D8-D13: twiddle factors W1, W2, W3 (real and imaginary halves). qT0-qT5 name D registers and are not referenced by FFTSTAGE.
97 #define dW1r                    D8.F32
98 #define dW1i                    D9.F32
99 #define dW2r                    D10.F32
100 #define dW2i                    D11.F32
101 #define dW3r                    D12.F32
102 #define dW3i                    D13.F32
103 #define qT0                     d14.f32
104 #define qT1                     d16.F32
105 #define qT2                     d18.F32
106 #define qT3                     d20.f32
107 #define qT4                     d22.f32
108 #define qT5                     d24.f32
109 @// D14/D15 and D26-D31 (the same storage as Q7 and Q13-Q15, i.e. qZ0-qZ3 below): copy of X0 plus the twiddle products, later reused for the stored outputs.
110 #define dZr0                    D14.F32
111 #define dZi0                    D15.F32
112 #define dZr1                    D26.F32
113 #define dZi1                    D27.F32
114 #define dZr2                    D28.F32
115 #define dZi2                    D29.F32
116 #define dZr3                    D30.F32
117 #define dZi3                    D31.F32
118 @// Quad-register views; each Q register overlays one real/imaginary D-register pair defined above.
119 #define qX0                     Q0.F32
120 #define qY0                     Q8.F32
121 #define qY1                     Q9.F32
122 #define qY2                     Q10.F32
123 #define qY3                     Q11.F32
124 #define qZ0                     Q7.F32
125 #define qZ1                     Q13.F32
126 #define qZ2                     Q14.F32
127 #define qZ3                     Q15.F32
128
129
130
131         .macro FFTSTAGE scaled, inverse , name
132         @// One radix-4 FFT stage over interleaved complex floats; each loop pass handles two groups.
133         @// Parameters: scaled is not referenced in this body; inverse equal to TRUE selects the conjugate-twiddle
134         @// branches, any other value the plain ones; name is pasted onto both labels so each expansion has its own.
135         @// In: pSrc, pDst, pTwiddle, subFFTSize. Out: subFFTNum = 1, subFFTSize = 4 * entry value; pSrc, pDst, pTwiddle are rewritten at the end.
136         @// One complex float is 8 bytes, so outPointStep = subFFTSize * 8 is a byte stride;
137         @// it is the post-increment applied to pDst between the stores of one loop pass.
138         MOV     outPointStep,subFFTSize,LSL #3
139
140         @// Set up grpCount right away, interleaved with the first twiddle loads
141
142         VLD2    {dW1r,dW1i},[pTwiddle :128]             @// [wi|wr]
143         MOV     step16,#16
144         LSL     grpCount,subFFTSize,#2                  @// grpCount = 4*subFFTSize
145         @// dW2r (next) and dW3r (after it) load from the same address as W1; only the dW3r load post-increments pTwiddle (by step16).
146         VLD1    dW2r,[pTwiddle :64]                     @// [wi|wr]
147         MOV     subFFTNum,#1                            @//after the last stage
148
149         VLD1    dW3r,[pTwiddle :64],step16              @// [wi|wr]
150         MOV     stepTwiddle,#0
151
152         VLD1    dW2i,[pTwiddle :64]!                    @// [wi|wr]
153         SUB     grpTwStep,stepTwiddle,#8                @// grpTwStep = -8 to start with
154
155         @// update subFFTSize for the next stage
156         MOV     subFFTSize,grpCount
157         VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
158         MOV     dstStep,outPointStep,LSL #1
159
160         @// AC.r AC.i BD.r BD.i
161         VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
162         ADD     dstStep,dstStep,outPointStep            @// dstStep = 3*outPointStep
163         RSB     dstStep,dstStep,#16                     @// dstStep = - 3*outPointStep+16
164         MOV     step24,#24
165
166         @// AC.r AC.i BD.r BD.i
167         VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
168
169         @// On entry to every pass D0-D7 hold two groups as loaded by VLD4, and dW2*/dW3* still need the VZIP below.
170         @// Process two groups at a time
171
172 radix4lsGrpLoop\name :
173         @// VZIP regroups the two separately loaded twiddles into one register of real parts and one of imaginary parts.
174         VZIP    dW2r,dW2i
175         ADD     stepTwiddle,stepTwiddle,#16
176         VZIP    dW3r,dW3i
177         ADD     grpTwStep,stepTwiddle,#4
178         VUZP     dButterfly1Real13, dButterfly2Real13   @// B.r D.r
179         SUB     twStep,stepTwiddle,#16                  @// -16+stepTwiddle
180         VUZP     dButterfly1Imag13, dButterfly2Imag13   @// B.i D.i
181         MOV     grpTwStep,grpTwStep,LSL #1
182         VUZP     dButterfly1Real02, dButterfly2Real02   @// A.r C.r
183         RSB     grpTwStep,grpTwStep,#0                  @// -8-2*stepTwiddle
184
185         @// twStep and grpTwStep are the pTwiddle post-increments used by the dW2i and dW3i reloads further down.
186         VUZP     dButterfly1Imag02, dButterfly2Imag02   @// A.i C.i
187
188
189         @// grpCount is multiplied by 4 and two groups are handled per pass, hence the step of 8
190         SUBS    grpCount,grpCount,#8
191         @// The flags from this SUBS stay live until the addeq/beq pair and the BGT at the bottom; nothing in between sets flags.
192         .ifeqs  "\inverse", "TRUE"
193             VMUL   dZr1,dW1r,dXr1
194             VMLA   dZr1,dW1i,dXi1                       @// real part
195             VMUL   dZi1,dW1r,dXi1
196             VMLS   dZi1,dW1i,dXr1                       @// imag part
197             @// (TRUE branch: Z1 = X1 * conj(W1), with W1 = dW1r + j*dW1i as loaded)
198         .else
199             @// (otherwise: Z1 = X1 * W1)
200             VMUL   dZr1,dW1r,dXr1
201             VMLS   dZr1,dW1i,dXi1                       @// real part
202             VMUL   dZi1,dW1r,dXi1
203             VMLA   dZi1,dW1i,dXr1                       @// imag part
204
205         .endif
206         @// W1 has been consumed; reload it for the next pass.
207         VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
208
209         .ifeqs  "\inverse", "TRUE"
210             VMUL   dZr2,dW2r,dXr2
211             VMLA   dZr2,dW2i,dXi2                       @// real part
212             VMUL   dZi2,dW2r,dXi2
213             VLD1   dW2r,[pTwiddle :64],step16           @// [wi|wr]
214             VMLS   dZi2,dW2i,dXr2                       @// imag part
215
216         .else
217
218             VMUL   dZr2,dW2r,dXr2
219             VMLS   dZr2,dW2i,dXi2                       @// real part
220             VMUL   dZi2,dW2r,dXi2
221             VLD1    dW2r,[pTwiddle :64],step16          @// [wi|wr]
222             VMLA   dZi2,dW2i,dXr2                       @// imag part
223
224         .endif
225
226
227         VLD1    dW2i,[pTwiddle :64],twStep              @// [wi|wr]
228
229         @// move qX0 so as to load for the next iteration
230         VMOV     qZ0,qX0
231
232         .ifeqs  "\inverse", "TRUE"
233             VMUL   dZr3,dW3r,dXr3
234             VMLA   dZr3,dW3i,dXi3                       @// real part
235             VMUL   dZi3,dW3r,dXi3
236             VLD1    dW3r,[pTwiddle :64],step24
237             VMLS   dZi3,dW3i,dXr3                       @// imag part
238
239         .else
240
241             VMUL   dZr3,dW3r,dXr3
242             VMLS   dZr3,dW3i,dXi3                       @// real part
243             VMUL   dZi3,dW3r,dXi3
244             VLD1    dW3r,[pTwiddle :64],step24
245             VMLA   dZi3,dW3i,dXr3                       @// imag part
246
247         .endif
248
249         VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
250
251         @// Don't do the load on the last iteration so we don't read past the end
252         @// of pSrc. pSrc is still advanced by 64 bytes, the amount the two skipped VLD4 loads would have added.
253         addeq   pSrc, pSrc, #64
254         beq     radix4lsSkipRead\name
255         @// AC.r AC.i BD.r BD.i
256         VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
257
258         @// AC.r AC.i BD.r BD.i
259         VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
260 radix4lsSkipRead\name:
261
262         @// finish first stage of 4 point FFT
263
264         VADD    qY0,qZ0,qZ2
265         VSUB    qY2,qZ0,qZ2
266         VADD    qY1,qZ1,qZ3
267         VSUB    qY3,qZ1,qZ3
268
269         @// finish second stage of 4 point FFT
270         @// NOTE(review): the first value stored is qY2 - qY1 = (Z0 - Z2) - (Z1 + Z3), not a plain sum. Presumably the
271         @// twiddle products Z1..Z3 are expected to arrive negated (table holding -W?) - confirm against the table setup.
272         .ifeqs  "\inverse", "TRUE"
273
274             VSUB    qZ0,qY2,qY1
275
276             VADD    dZr3,dYr0,dYi3
277             VST2    {dZr0,dZi0},[pDst :128],outPointStep
278             VSUB    dZi3,dYi0,dYr3
279
280             VADD    qZ2,qY2,qY1
281             VST2    {dZr3,dZi3},[pDst :128],outPointStep
282
283             VSUB    dZr1,dYr0,dYi3
284             VST2    {dZr2,dZi2},[pDst :128],outPointStep
285             VADD    dZi1,dYi0,dYr3
286
287             @// dstStep = -3*outPointStep + 16: undoes the three outPointStep post-increments and advances pDst by 16 bytes
288             VST2    {dZr1,dZi1},[pDst :128],dstStep
289
290
291         .else
292
293             VSUB    qZ0,qY2,qY1
294
295             VSUB    dZr1,dYr0,dYi3
296             VST2    {dZr0,dZi0},[pDst :128],outPointStep
297             VADD    dZi1,dYi0,dYr3
298
299             VADD    qZ2,qY2,qY1
300             VST2    {dZr1,dZi1},[pDst :128],outPointStep
301
302             VADD    dZr3,dYr0,dYi3
303             VST2    {dZr2,dZi2},[pDst :128],outPointStep
304             VSUB    dZi3,dYi0,dYr3
305
306             @// dstStep = -3*outPointStep + 16: undoes the three outPointStep post-increments and advances pDst by 16 bytes
307             VST2    {dZr3,dZi3},[pDst :128],dstStep
308
309
310         .endif
311
312         BGT     radix4lsGrpLoop\name
313
314         @// Falls through once the SUBS result near the top of the loop is no longer positive.
315         @// Reset and Swap pSrc and pDst for the next stage
316         MOV     pTmp,pDst                               @// pTmp shares r4 with grpCount, which is no longer needed
317         @// Extra increment done in final iteration of the loop
318         SUB     pSrc,pSrc,#64
319         @// new pDst = adjusted pSrc - 4*outPointStep; new pSrc = old pDst (saved in pTmp) - outPointStep
320         SUB     pDst,pSrc,outPointStep,LSL #2
321         SUB     pSrc,pTmp,outPointStep
322         SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1     @// subFFTSize already holds its updated (4x) value here
323         @// Extra increment done in final iteration of the loop
324         SUB     pTwiddle,pTwiddle,#16
325
326         .endm
327
328         @// Forward entry point: one FFTSTAGE expansion with inverse = FALSE and label suffix fwd, wrapped by M_START/M_END (defined outside this file; the r4 argument is interpreted there).
329         M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
330         FFTSTAGE "FALSE","FALSE",fwd
331         M_END
332
333         @// Inverse entry point: one FFTSTAGE expansion with inverse = TRUE and label suffix inv, wrapped by M_START/M_END (defined outside this file; the r4 argument is interpreted there).
334         M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
335         FFTSTAGE "FALSE","TRUE",inv
336         M_END
337
338
339         .end