Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / neon / armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
1 @//
2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @//  Use of this source code is governed by a BSD-style license
5 @//  that can be found in the LICENSE file in the root of the source
6 @//  tree. An additional intellectual property rights grant can be found
7 @//  in the file PATENTS.  All contributing project authors may
8 @//  be found in the AUTHORS file in the root of the source tree.
9 @//
10 @//  This file was originally licensed as follows. It has been
11 @//  relicensed with permission from the copyright holders.
12
13 @//
14 @//
15 @// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
16 @// OpenMAX DL: v1.0.2
17 @// Last Modified Revision:   7765
18 @// Last Modified Date:       Thu, 27 Sep 2007
19 @//
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @//
22 @//
23 @//
24 @// Description:
25 @// Compute a Radix 4 FFT stage for an N point complex signal
26 @//
27 @//
28
29
30 @// Include standard headers
31
32 #include "dl/api/arm/armCOMM_s.h"
33 #include "dl/api/arm/omxtypes_s.h"
34
35
36 @// Import symbols required from other files
37 @// (For example tables)
38
39
40
41
42 @// Set debugging level
43 @//DEBUG_ON    SETL {TRUE}
44
45
46 @// Guarding implementation by the processor name
47
48
49
50
51
52
53 @// Guarding implementation by the processor name
54
55
56 @// Import symbols required from other files
57 @// (For example tables)
58     @//IMPORT  armAAC_constTable
59
60 @// Input Registers
@// Core-register aliases that the FFTSTAGE macro reads on entry:
@//   pSrc       - source buffer, read sequentially with post-increment
@//   pDst       - destination buffer, written with register post-increment
@//   pTwiddle   - base of the twiddle table (copied to pw1, pw2 and pw3,
@//                never modified itself)
@//   subFFTNum  - overwritten with 1 by the macro; its incoming value is
@//                not read
@//   subFFTSize - incoming sub-FFT size; the macro leaves it multiplied by 4
61
62 #define pSrc                            r0
63 #define pDst                            r2
64 #define pTwiddle                        r1
65 #define subFFTNum                       r6
66 #define subFFTSize                      r7
67
68
69
70 @//Output Registers
71
72
73 @// Local Scratch Registers
@//   outPointStep - subFFTSize * 4: byte distance between the four outputs
@//                  of one butterfly
@//   grpCount     - loop counter; starts at subFFTSize * 4 and drops by 16
@//                  per loop pass
@//   dstStep      - 16 - 3*outPointStep: rewinds pDst after the fourth store
@//                  and moves it on to the next four butterflies
@//   pw1/pw2/pw3  - read pointers for the w^1, w^2 and w^3 twiddle streams
@//   pTmp         - shares r4 with grpCount; it is only used for the pointer
@//                  swap after the loop, when grpCount is no longer needed
74
75 #define outPointStep                    r3
76 #define grpCount                        r4
77 #define dstStep                         r5
78 #define pw1                             r8
79 #define pw2                             r9
80 #define pw3                             r10
81 #define pTmp                            r4
82
83
84 @// Neon Registers
@// Several aliases below deliberately name the same physical register at
@// different points of the loop body; the overlaps are listed per group.
@// NOTE(review): D8-D15 are written by FFTSTAGE and nothing in this file
@// saves or restores them - presumably the callers of these unsafe routines
@// do; confirm before calling from new code.
85
@// D0-D7 hold the raw VLD4 results (dButterfly*) and, after the VUZP
@// shuffles, the four butterfly inputs X0..X3 (dXr*/dXi*) in the same
@// registers.
86 #define dButterfly1Real02               D0.S16
87 #define dButterfly1Imag02               D1.S16
88 #define dButterfly1Real13               D2.S16
89 #define dButterfly1Imag13               D3.S16
90 #define dButterfly2Real02               D4.S16
91 #define dButterfly2Imag02               D5.S16
92 #define dButterfly2Real13               D6.S16
93 #define dButterfly2Imag13               D7.S16
94 #define dXr0                            D0.S16
95 #define dXi0                            D1.S16
96 #define dXr1                            D2.S16
97 #define dXi1                            D3.S16
98 #define dXr2                            D4.S16
99 #define dXi2                            D5.S16
100 #define dXr3                            D6.S16
101 #define dXi3                            D7.S16
102
@// 32-bit element views of the twiddle registers; only the w^3 views are
@// used (by the VLD3 loads that pick every third packed twiddle).
103 #define dW1rS32                         D8.S32
104 #define dW1iS32                         D9.S32
105 #define dW2rS32                         D10.S32
106 #define dW2iS32                         D11.S32
107 #define dW3rS32                         D12.S32
108 #define dW3iS32                         D13.S32
109
@// 16-bit views of the twiddles: real parts in the even registers, imaginary
@// parts in the odd ones.
110 #define dW1r                            D8.S16
111 #define dW1i                            D9.S16
112 #define dW2r                            D10.S16
113 #define dW2i                            D11.S16
114 #define dW3r                            D12.S16
115 #define dW3i                            D13.S16
116
@// Throw-away load targets. dTmp0/dTmp1 are the same registers as
@// dW3r/dW3i (D12/D13) and are written before w^3 is loaded;
@// dTmp2S32/dTmp3S32 are D14/D15, which later hold Y3.
117 #define dTmp0                           D12.S16
118 #define dTmp1                           D13.S16
119 #define dTmp1S32                        D13.S32
120 #define dTmp2S32                        D14.S32
121 #define dTmp3S32                        D15.S32
122
@// First-stage butterfly results Y0..Y3. The numbering does not follow the
@// register order: Y0 is Q9 (D18/D19) and Y1 is Q8 (D16/D17).
123 #define dYr0                            D18.S16
124 #define dYi0                            D19.S16
125 #define dYr1                            D16.S16
126 #define dYi1                            D17.S16
127 #define dYr2                            D20.S16
128 #define dYi2                            D21.S16
129 #define dYr3                            D14.S16
130 #define dYi3                            D15.S16
131 #define qY0                             Q9.S16
132 #define qY1                             Q8.S16
133 #define qY2                             Q10.S16
134 #define qY3                             Q7.S16
135
@// Quad views of the butterfly inputs (real in the low half, imaginary in
@// the high half).
136 #define qX0                             Q0.S16
137 #define qX1                             Q1.S16
138 #define qX2                             Q2.S16
139 #define qX3                             Q3.S16
140
@// 32-bit accumulators for the twiddle products. They occupy the same
@// registers as qY0, qY2, qY3 and qY1 respectively; every product is
@// narrowed into a dZ register before the Y values are computed.
141 #define qT0                             Q9.S32
142 #define qT1                             Q10.S32
143 #define qT2                             Q7.S32
144 #define qT3                             Q8.S32
145
@// Z1..Z3 first hold the narrowed twiddle products; Z0..Z3 then hold the
@// butterfly outputs that are stored to pDst.
146 #define dZr0                            D22.S16
147 #define dZi0                            D23.S16
148 #define dZr1                            D24.S16
149 #define dZi1                            D25.S16
150 #define dZr2                            D26.S16
151 #define dZi2                            D27.S16
152 #define dZr3                            D28.S16
153 #define dZi3                            D29.S16
154
155 #define qZ0                             Q11.S16
156 #define qZ1                             Q12.S16
157 #define qZ2                             Q13.S16
158 #define qZ3                             Q14.S16
159
160
161         .macro FFTSTAGE scaled, inverse , name
162
            @// FFTSTAGE: body of one radix-4 stage over SC16 data (interleaved
            @// 16-bit real/imaginary pairs). Each loop pass reads 16 consecutive
            @// complex points, i.e. four butterflies of four inputs, and writes
            @// the four outputs of each butterfly outPointStep bytes apart.
            @//
            @// Macro arguments:
            @//   scaled  - TRUE selects the halving forms VHADD/VHSUB in the
            @//             butterfly; any other value selects VADD/VSUB.
            @//   inverse - TRUE multiplies by the conjugated twiddle and swaps
            @//             the store positions of outputs 1 and 3.
            @//   name    - suffix appended to the loop label so that every
            @//             expansion gets its own label.
            @//
            @// Read on entry:  pSrc, pDst, pTwiddle, subFFTSize.
            @// Left on exit:   pSrc and pDst rewound and exchanged,
            @//                 subFFTSize = 4 * incoming value, subFFTNum = 1.
            @// Written:        r0, r2-r10, D0-D29 and the condition flags.
            @//
            @// The alignment qualifiers below require pSrc and pw2 to be 32-byte
            @// aligned, pw1 and pDst 16-byte aligned and pw3 8-byte aligned.
            @// The loop always handles four butterflies per pass, so it
            @// presumably relies on subFFTSize being a multiple of 4 - TODO
            @// confirm against the callers.
163         @// No stack arguments are read here; every operand arrives in a register
164
            @// All three twiddle streams start at the table base and advance at
            @// different rates inside the loop.
165         MOV     pw2,pTwiddle
166
167         MOV     pw3,pTwiddle
168         MOV     pw1,pTwiddle
169         @// outPointStep = subFFTSize * 4, i.e. subFFTSize complex points of
170         @// 4 bytes each; the stores below step pDst by this many bytes
171         MOV     outPointStep,subFFTSize,LSL #2
172
173         MOV     subFFTNum,#1                            @//after the last stage
            @// grpCount = 4 * subFFTSize; every pass handles four butterflies and
            @// subtracts 16, so the loop runs while butterflies remain.
174         LSL     grpCount,subFFTSize,#2
175
176
177         @// Update the stage bookkeeping right away
178
179         @// update subFFTSize for the next stage
180         MOV     subFFTSize,grpCount
181         MOV     dstStep,outPointStep,LSL #1
182
183         ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
184         RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
185
186         @// Process 4 groups at a time
187
188 grpLoop\name:
            @// Two 32-byte loads fetch the inputs A,B,C,D of four butterflies.
            @// VLD4 de-interleaves by four halfwords, so every D register
            @// receives one component (real or imaginary) of every second point.
189         VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
190         VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
191
192         @// Load the second twiddle for 4 groups : w^2
193         @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
            @// The odd-indexed twiddles land in dTmp0/dTmp1 and are discarded.
194         VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
195
196         VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
197
198         @// Load the third twiddle for 4 groups : w^3
199         @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
            @// Each VLD3 reads six packed 32-bit twiddles and keeps every third
            @// one; the values written to the dTmp registers are never read.
200         VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
201
202         VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
203         VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
204
205         VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
206
207         VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
            @// From here on X0=A, X1=B, X2=C, X3=D, one lane per butterfly.
208
            @// Load the first twiddle for 4 groups : w^1
            @// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
209         VLD2 {dW1r,dW1i}, [pw1 :128]!
210
211         @// Rearrange the third twiddle
212         VUZP    dW3r,dW3i
            @// The flags set here are consumed by the BGT at the bottom of the
            @// loop; no instruction in between writes the condition flags.
213         SUBS    grpCount,grpCount,#16                    @// grpCount is multiplied by 4
214
            @// Z1 = X1 * w^1 (conjugated twiddle when inverse); the 32-bit
            @// products are accumulated in qT0 (real) and qT1 (imaginary).
215         .ifeqs  "\inverse", "TRUE"
216             VMULL   qT0,dXr1,dW1r
217             VMLAL   qT0,dXi1,dW1i                       @// real part
218             VMULL   qT1,dXi1,dW1r
219             VMLSL   qT1,dXr1,dW1i                       @// imag part
220
221         .else
222             VMULL   qT0,dXr1,dW1r
223             VMLSL   qT0,dXi1,dW1i                       @// real part
224             VMULL   qT1,dXi1,dW1r
225             VMLAL   qT1,dXr1,dW1i                       @// imag part
226
227         .endif
228
229         @// Z2 = X2 * w^2 (conjugated twiddle when inverse); the 32-bit
230         @// products are accumulated in qT2 (real) and qT3 (imaginary)
231
232         .ifeqs  "\inverse", "TRUE"
233             VMULL   qT2,dXr2,dW2r
234             VMLAL   qT2,dXi2,dW2i                       @// real part
235             VMULL   qT3,dXi2,dW2r
236             VMLSL   qT3,dXr2,dW2i                       @// imag part
237
238         .else
239             VMULL   qT2,dXr2,dW2r
240             VMLSL   qT2,dXi2,dW2i                       @// real part
241             VMULL   qT3,dXi2,dW2r
242             VMLAL   qT3,dXr2,dW2i                       @// imag part
243
244         .endif
245
            @// Round and narrow Z1 back to 16 bits (shift by 15, i.e. the
            @// twiddles are treated as Q15). This frees qT0/qT1 for Z3.
246         VRSHRN  dZr1,qT0,#15
247         VRSHRN  dZi1,qT1,#15
248
249
250
            @// Z3 = X3 * w^3 (conjugated twiddle when inverse), again in
            @// qT0/qT1.
251         .ifeqs  "\inverse", "TRUE"
252             VMULL   qT0,dXr3,dW3r
253             VMLAL   qT0,dXi3,dW3i                       @// real part
254             VMULL   qT1,dXi3,dW3r
255             VMLSL   qT1,dXr3,dW3i                       @// imag part
256
257         .else
258             VMULL   qT0,dXr3,dW3r
259             VMLSL   qT0,dXi3,dW3i                       @// real part
260             VMULL   qT1,dXi3,dW3r
261             VMLAL   qT1,dXr3,dW3i                       @// imag part
262
263         .endif
264
            @// Narrow Z2 and Z3. All accumulators are dead after this point, so
            @// the Y registers that overlap them may be written.
265         VRSHRN  dZr2,qT2,#15
266         VRSHRN  dZi2,qT3,#15
267
268         VRSHRN  dZr3,qT0,#15
269         VRSHRN  dZi3,qT1,#15
270
            @// Butterfly. Both variants below compute
            @//   Y0 = X0 + Z2, Y2 = X0 - Z2, Y1 = Z1 + Z3, Y3 = Z1 - Z3
            @//   out0 = Y2 - Y1, out2 = Y2 + Y1, out1/out3 = Y0 +/- j*Y3
            @// and differ only in using halving (scaled) or plain arithmetic.
            @// NOTE(review): forming out0/out2 from Y2 = X0 - Z2 matches a
            @// radix-4 butterfly only if the table holds negated twiddles -
            @// presumably it does; confirm against the twiddle table generator.
271         .ifeqs "\scaled", "TRUE"
272
273             @// finish first stage of 4 point FFT
274
275             VHADD    qY0,qX0,qZ2
276             VHSUB    qY2,qX0,qZ2
277             VHADD    qY1,qZ1,qZ3
278
279             VHSUB    qY3,qZ1,qZ3
280
281             @// finish second stage of 4 point FFT
282
283             VHSUB    qZ0,qY2,qY1
284             VHADD    qZ2,qY2,qY1
285
286
                @// Inverse stores the outputs in the order Z0, Z3, Z2, Z1;
                @// forward stores Z0, Z1, Z2, Z3. The arithmetic for Z1 and Z3 is
                @// the same in both branches.
287             .ifeqs "\inverse", "TRUE"
288
289                 VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
290                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
291                 VHSUB    dZi3,dYi0,dYr3
292
293                 VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
294                 VHADD    dZi1,dYi0,dYr3
295                 VST2    {dZr3,dZi3},[pDst :128],outPointStep
296                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
297                 VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
298
299             .else
300
301                 VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
302                 VHADD    dZi1,dYi0,dYr3
303
304                 VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
305                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
306                 VHSUB    dZi3,dYi0,dYr3
307                 VST2    {dZr1,dZi1},[pDst :128],outPointStep
308                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
309                 VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
310
311             .endif
312
313         .else
314
315             @// finish first stage of 4 point FFT
316
317             VADD    qY0,qX0,qZ2
318             VSUB    qY2,qX0,qZ2
319             VADD    qY1,qZ1,qZ3
320
321             VSUB    qY3,qZ1,qZ3
322
323             @// finish second stage of 4 point FFT
324
325             VSUB    qZ0,qY2,qY1
326             VADD    qZ2,qY2,qY1
327
328
                @// Same store orders as in the scaled variant: Z0, Z3, Z2, Z1
                @// for inverse and Z0, Z1, Z2, Z3 for forward.
329             .ifeqs "\inverse", "TRUE"
330
331                 VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
332                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
333                 VSUB    dZi3,dYi0,dYr3
334
335                 VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
336                 VADD    dZi1,dYi0,dYr3
337                 VST2    {dZr3,dZi3},[pDst :128],outPointStep
338                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
339                 VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
340
341             .else
342
343                 VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
344                 VADD    dZi1,dYi0,dYr3
345
346                 VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
347                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
348                 VSUB    dZi3,dYi0,dYr3
349                 VST2    {dZr1,dZi1},[pDst :128],outPointStep
350                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
351                 VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
352
353             .endif
354
355
356
357
358         .endif
359
            @// Branch on the flags left by the SUBS on grpCount above.
360         BGT     grpLoop\name
361
362
363         @// Reset and Swap pSrc and pDst for the next stage
            @// Every pass advanced pSrc by 64 bytes and pDst by 16 bytes net, so
            @// the loop moved them by 4*outPointStep and outPointStep bytes in
            @// total. Both are rewound and then exchanged, which makes the
            @// buffer just written the source of whatever runs next.
364         MOV     pTmp,pDst
365         SUB     pDst,pSrc,outPointStep,LSL #2       @// pDst -= size; pSrc -= 4*size bytes
366         SUB     pSrc,pTmp,outPointStep
367
368         .endm
369
370
371         M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
            @// Forward stage without scaling: FFTSTAGE is expanded with
            @// scaled=FALSE and inverse=FALSE, so the butterfly uses VADD/VSUB
            @// and unconjugated twiddles. FWD only makes the loop label unique.
            @// NOTE(review): what the r4 argument makes M_START/M_END save is
            @// defined in armCOMM_s.h, not here; the macro body also writes
            @// r5-r10 and D8-D15 - confirm that callers expect this.
372         FFTSTAGE "FALSE","FALSE",FWD
373         M_END
374
375
376         M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
            @// Inverse stage without scaling: FFTSTAGE is expanded with
            @// scaled=FALSE and inverse=TRUE, so the butterfly uses VADD/VSUB,
            @// multiplies by conjugated twiddles and stores outputs 1 and 3 in
            @// swapped positions. INV only makes the loop label unique.
377         FFTSTAGE "FALSE","TRUE",INV
378         M_END
379
380
381         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
            @// Forward stage with scaling: FFTSTAGE is expanded with
            @// scaled=TRUE and inverse=FALSE, so both butterfly levels use the
            @// halving forms VHADD/VHSUB with unconjugated twiddles.
            @// FWDSFS only makes the loop label unique.
382         FFTSTAGE "TRUE","FALSE",FWDSFS
383         M_END
384
385
386         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
            @// Inverse stage with scaling: FFTSTAGE is expanded with
            @// scaled=TRUE and inverse=TRUE, so both butterfly levels use the
            @// halving forms VHADD/VHSUB, the twiddles are conjugated and
            @// outputs 1 and 3 are stored in swapped positions.
            @// INVSFS only makes the loop label unique.
387         FFTSTAGE "TRUE","TRUE",INVSFS
388         M_END
389
390
391
392
393
394
395     .end