Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / arm64 / armSP_FFT_CToC_FC32_Radix4_s.S
1 //
2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 //
4 //  Use of this source code is governed by a BSD-style license
5 //  that can be found in the LICENSE file in the root of the source
6 //  tree. An additional intellectual property rights grant can be found
7 //  in the file PATENTS.  All contributing project authors may
8 //  be found in the AUTHORS file in the root of the source tree.
9 //
10 //
11 //  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
12 //  to support float instead of SC32.
13 //
14
15 //
16 // Description:
17 // Compute a Radix 4 FFT stage for a N point complex signal
18 //
19 //
20
21
22 // Include standard headers
23
24 #include "dl/api/arm/arm64COMM_s.h"
25 #include "dl/api/arm/omxtypes_s.h"
26
27
28 // Import symbols required from other files
29 // (For example tables)
30
31
32
33
34 // Set debugging level
35 //DEBUG_ON    SETL {TRUE}
36
37
38
39 // Guarding implementation by the processor name
40
41
42
43
44 // Guarding implementation by the processor name
45
46
47 // Import symbols required from other files
48 // (For example tables)
49
50
51 //Input Registers
52
53 #define pSrc            x0
54 #define pDst            x1
55 #define pTwiddle        x2
56 #define pSubFFTNum      x3
57 #define pSubFFTSize     x4      
58
59
60
61 //Output Registers
62
63
64 //Local Scratch Registers
65
66 #define subFFTNum       x5
67 #define subFFTSize      x6
68 #define grpCount        x7
69 #define grpCount32      w7
70 #define pointStep       x8
71 #define pointStep32     w8
72 #define outPointStep    x9
73 #define stepTwiddle     x10
74 #define setCount        x11
75 #define srcStep         x12
76 #define setStep         x13
77 #define dstStep         x14
78 #define twStep          x15
79
80 // Neon Registers
81
82 #define dW1     v0.2s
83 #define dW2     v1.2s
84 #define dW3     v2.2s
85
86 #define dXr0    v4.2s
87 #define dXi0    v5.2s
88 #define dXr1    v6.2s
89 #define dXi1    v7.2s
90 #define dXr2    v8.2s
91 #define dXi2    v9.2s
92 #define dXr3    v10.2s
93 #define dXi3    v11.2s
94 #define dYr0    v12.2s
95 #define dYi0    v13.2s
96 #define dYr1    v14.2s
97 #define dYi1    v15.2s
98 #define dYr2    v16.2s
99 #define dYi2    v17.2s
100 #define dYr3    v18.2s
101 #define dYi3    v19.2s
102 #define dZr0    v20.2s
103 #define dZi0    v21.2s
104 #define dZr1    v22.2s
105 #define dZi1    v23.2s
106 #define dZr2    v24.2s
107 #define dZi2    v25.2s
108 #define dZr3    v26.2s
109 #define dZi3    v27.2s
110
111         .macro FFTSTAGE scaled, inverse , name
112
113         // Define stack arguments
114
115         // Move args values into our work registers
116         ldr     subFFTNum, [pSubFFTNum]
117         ldr     subFFTSize, [pSubFFTSize]
118
119         // Update grpCount and grpSize rightaway inorder to reuse
120         // pGrpCount and pGrpSize regs
121
122         LSL     grpCount,subFFTSize,#2
123         LSR     subFFTNum,subFFTNum,#2
124         MOV     subFFTSize,grpCount
125
126         ld1      {dW1},[pTwiddle]                    //[wi | wr]
127         // pT0+1 increments pT0 by 8 bytes
128         // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
129         lsl     pointStep,subFFTNum, #1
130
131         // pOut0+1 increments pOut0 by 8 bytes
132         // pOut0+outPointStep == increment of 8*outPointStep bytes
133         //   = 2*size bytes
134
135         MOV     stepTwiddle,#0
136         ld1      {dW2},[pTwiddle]                    //[wi | wr]
137         smull   outPointStep,grpCount32,pointStep32
138
139         LSL     pointStep,pointStep,#2             // 2*grpSize
140
141         ld1      {dW3},[pTwiddle]                  //[wi | wr]
142         lsl     srcStep,pointStep, #1              // srcStep = 2*pointStep
143
144         ADD     setStep,srcStep,pointStep          // setStep = 3*pointStep
145
146         rsb     setStep,setStep,#0                 // setStep = - 3*pointStep
147         SUB     srcStep,srcStep,#16                // srcStep = 2*pointStep-16
148
149         lsl     dstStep,outPointStep, #1
150
151         ADD     dstStep,dstStep,outPointStep       // dstStep = 3*outPointStep
152         // dstStep = - 3*outPointStep+16
153         rsb     dstStep,dstStep,#16
154
155
156 radix4GrpLoop\name :
157
158         ld2     {dXr0,dXi0},[pSrc],pointStep       //  data[0]
159         ADD      stepTwiddle,stepTwiddle,pointStep
160         ld2     {dXr1,dXi1},[pSrc],pointStep       //  data[1]
161         // set pTwiddle to the first point
162         ADD      pTwiddle,pTwiddle,stepTwiddle
163         ld2     {dXr2,dXi2},[pSrc],pointStep       //  data[2]
164         lsl      twStep,stepTwiddle, #2
165
166         //  data[3] & update pSrc for the next set
167         ld2     {dXr3,dXi3},[pSrc],setStep
168         SUB      twStep,stepTwiddle,twStep         // twStep = -3*stepTwiddle
169
170         lsr      setCount,pointStep, #3
171
172         // set pSrc to data[0] of the next set
173         ADD     pSrc,pSrc,#16
174         // increment to data[1] of the next set
175         ADD     pSrc,pSrc,pointStep
176
177
178         // Loop on the sets
179
180 radix4SetLoop\name :
181
182
183
184         .ifeqs  "\inverse", "TRUE"
185             fmul   dZr1,dXr1,dW1[0]
186             fmul   dZi1,dXi1,dW1[0]
187             fmul   dZr2,dXr2,dW2[0]
188             fmul   dZi2,dXi2,dW2[0]
189             fmul   dZr3,dXr3,dW3[0]
190             fmul   dZi3,dXi3,dW3[0]
191
192             fmla   dZr1,dXi1,dW1[1]                // real part
193             fmls   dZi1,dXr1,dW1[1]                // imag part
194
195             //  data[1] for next iteration
196             ld2     {dXr1,dXi1},[pSrc],pointStep
197
198             fmla   dZr2,dXi2,dW2[1]                // real part
199             fmls   dZi2,dXr2,dW2[1]                // imag part
200
201             //  data[2] for next iteration
202             ld2     {dXr2,dXi2},[pSrc],pointStep
203
204             fmla   dZr3,dXi3,dW3[1]                // real part
205             fmls   dZi3,dXr3,dW3[1]                // imag part
206         .else
207             fmul   dZr1,dXr1,dW1[0]
208             fmul   dZi1,dXi1,dW1[0]
209             fmul   dZr2,dXr2,dW2[0]
210             fmul   dZi2,dXi2,dW2[0]
211             fmul   dZr3,dXr3,dW3[0]
212             fmul   dZi3,dXi3,dW3[0]
213
214             fmls   dZr1,dXi1,dW1[1]                // real part
215             fmla   dZi1,dXr1,dW1[1]                // imag part
216
217             //  data[1] for next iteration
218             ld2     {dXr1,dXi1},[pSrc],pointStep
219
220             fmls   dZr2,dXi2,dW2[1]                // real part
221             fmla   dZi2,dXr2,dW2[1]                // imag part
222
223             //  data[2] for next iteration
224             ld2     {dXr2,dXi2},[pSrc],pointStep
225
226             fmls   dZr3,dXi3,dW3[1]                // real part
227             fmla   dZi3,dXr3,dW3[1]                // imag part
228         .endif
229
230         //  data[3] & update pSrc to data[0]
231         // But don't read on the very last iteration because that reads past 
232         // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
233         cmp     grpCount, #4
234         
235         b.ne    skipUpdate\name
236         cmp     setCount, #2
237         b.ne    skipUpdate\name
238         add     pSrc, pSrc, setStep
239         beq     radix4SkipRead\name
240 skipUpdate\name:
241         ld2     {dXr3,dXi3},[pSrc],setStep
242 radix4SkipRead\name:
243
244         SUBS    setCount,setCount,#2
245
246         // finish first stage of 4 point FFT
247         // fadd    qY0,qX0,qZ2
248         // fsub    qY2,qX0,qZ2
249         fadd    dYr0,dXr0,dZr2
250         fsub    dYr2,dXr0,dZr2
251         fadd    dYi0,dXi0,dZi2
252         fsub    dYi2,dXi0,dZi2
253
254         //  data[0] for next iteration
255         ld2     {dXr0,dXi0},[pSrc], #16
256         // fadd    qY1,qZ1,qZ3
257         // fsub    qY3,qZ1,qZ3
258         fadd    dYr1,dZr1,dZr3
259         fsub    dYr3,dZr1,dZr3
260         fadd    dYi1,dZi1,dZi3
261         fsub    dYi3,dZi1,dZi3
262
263         // finish second stage of 4 point FFT
264
265         // fsub    qZ0,qY2,qY1
266         fsub    dZr0,dYr2,dYr1
267         fsub    dZi0,dYi2,dYi1
268
269         .ifeqs  "\inverse", "TRUE"
270
271             fadd    dZr3,dYr0,dYi3
272             st2     {dZr0,dZi0},[pDst],outPointStep
273             fsub    dZi3,dYi0,dYr3
274
275             // fadd    qZ2,qY2,qY1
276             fadd    dZr2,dYr2,dYr1
277             fadd    dZi2,dYi2,dYi1
278
279             st2     {dZr3,dZi3},[pDst],outPointStep
280
281             fsub    dZr1,dYr0,dYi3
282             st2     {dZr2,dZi2},[pDst],outPointStep
283             fadd    dZi1,dYi0,dYr3
284
285             st2     {dZr1,dZi1},[pDst],dstStep
286
287
288         .else
289
290             fsub    dZr1,dYr0,dYi3
291             st2     {dZr0,dZi0},[pDst],outPointStep
292             fadd    dZi1,dYi0,dYr3
293
294             // fadd    qZ2,qY2,qY1
295             fadd    dZr2,dYr2,dYr1
296             fadd    dZi2,dYi2,dYi1
297
298             st2     {dZr1,dZi1},[pDst],outPointStep
299
300             fadd    dZr3,dYr0,dYi3
301             st2     {dZr2,dZi2},[pDst],outPointStep
302             fsub    dZi3,dYi0,dYr3
303
304             st2     {dZr3,dZi3},[pDst],dstStep
305
306
307         .endif
308
309         // increment to data[1] of the next set
310         ADD     pSrc,pSrc,pointStep
311         BGT     radix4SetLoop\name
312
313
314         ld1      {dW1},[pTwiddle],stepTwiddle    //[wi | wr]
315         // subtract 4 since grpCount multiplied by 4
316         SUBS    grpCount,grpCount,#4
317         ld1      {dW2},[pTwiddle],stepTwiddle    //[wi | wr]
318         // increment pSrc for the next grp
319         ADD     pSrc,pSrc,srcStep
320         ld1      {dW3},[pTwiddle],twStep         //[wi | wr]
321         BGT     radix4GrpLoop\name
322
323         str     subFFTNum, [pSubFFTNum]
324         str     subFFTSize, [pSubFFTSize]
325
326         .endm
327
328
329         M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
330             FFTSTAGE "FALSE","FALSE",FWD
331         M_END
332
333
334         M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
335             FFTSTAGE "FALSE","TRUE",INV
336         M_END
337
338
339         .end