Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / neon / armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
1 @//
2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @//  Use of this source code is governed by a BSD-style license
5 @//  that can be found in the LICENSE file in the root of the source
6 @//  tree. An additional intellectual property rights grant can be found
7 @//  in the file PATENTS.  All contributing project authors may
8 @//  be found in the AUTHORS file in the root of the source tree.
9 @//
10 @//  This file was originally licensed as follows. It has been
11 @//  relicensed with permission from the copyright holders.
12 @//
13
14 @// 
15 @// File Name:  armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
16 @// OpenMAX DL: v1.0.2
17 @// Last Modified Revision:   7767
18 @// Last Modified Date:       Thu, 27 Sep 2007
19 @// 
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @// 
22 @// 
23 @//
24 @// Description:
25 @// Compute a radix-4 FFT stage for an N-point complex signal
26 @// 
27
28
29
30         
31 @// Include standard headers
32
33 #include "dl/api/arm/armCOMM_s.h"
34 #include "dl/api/arm/omxtypes_s.h"
35         
36         
37 @// Import symbols required from other files
38 @// (For example tables)
39     
40         
41         
42         
43 @// Set debugging level        
44 @//DEBUG_ON    SETL {TRUE}
45
46
47
48 @// Guarding implementation by the processor name
49     
50     
51     
52     
53 @// Guarding implementation by the processor name
54     
55     
56 @// Import symbols required from other files
57 @// (For example tables)
58 @// Register roles on entry to FFTSTAGE. pSrc and pDst are advanced by the
59 @// post-indexed VLD2/VST2 accesses and are swapped at the end of the macro.
60 @//Input Registers
61 @// FFTSTAGE divides subFFTNum by 4 and multiplies subFFTSize by 4 before its loops.
62 #define pSrc            r0
63 #define pDst            r2
64 #define pTwiddle        r1
65 #define subFFTNum       r6
66 #define subFFTSize      r7
67
68
69
70 @//Output Registers
71 @// (none declared separately; FFTSTAGE leaves its updated values in the input registers)
72
73 @//Local Scratch Registers
74 @// All of these are overwritten by FFTSTAGE. Note that setCount is r14 (lr).
75 #define grpCount        r3
76 #define pointStep       r4
77 #define outPointStep    r5
78 #define stepTwiddle     r12
79 #define setCount        r14
80 #define srcStep         r8
81 #define setStep         r9
82 #define dstStep         r10
83 #define twStep          r11
84 #define t1              r3
85 @// t1 shares r3 with grpCount; FFTSTAGE only uses t1 after the group loop has finished.
86 @// Neon Registers (each D alias holds two 32-bit lanes; Qn overlays D(2n) and D(2n+1))
87 @// NOTE(review): D8-D15 are callee-saved under the AAPCS and no save/restore is visible in this file - confirm the callers handle it.
88 #define dW1     D0.S32
89 #define dW2     D1.S32
90 #define dW3     D2.S32   
91 @// dW*: twiddles. dX*: de-interleaved inputs (r = real lanes, i = imaginary lanes). dY*: first-stage results.
92 #define dXr0    D4.S32
93 #define dXi0    D5.S32
94 #define dXr1    D6.S32
95 #define dXi1    D7.S32
96 #define dXr2    D8.S32
97 #define dXi2    D9.S32
98 #define dXr3    D10.S32
99 #define dXi3    D11.S32
100 #define dYr0    D12.S32
101 #define dYi0    D13.S32
102 #define dYr1    D14.S32
103 #define dYi1    D15.S32
104 #define dYr2    D16.S32
105 #define dYi2    D17.S32
106 #define dYr3    D18.S32
107 #define dYi3    D19.S32
108 #define qT0     Q8.S64   
109 #define qT1     Q9.S64
110 #define qT2     Q6.S64
111 #define qT3     Q7.S64
112 @// qT0..qT3 (64-bit products) overlay the Y registers: qT0=qY2, qT1=qY3, qT2=qY0, qT3=qY1.
113 #define dZr0    D20.S32
114 #define dZi0    D21.S32
115 #define dZr1    D22.S32
116 #define dZi1    D23.S32
117 #define dZr2    D24.S32
118 #define dZi2    D25.S32
119 #define dZr3    D26.S32
120 #define dZi3    D27.S32
121 @// Q views of the pairs above: qX0 = {dXr0,dXi0}, qYn = {dYrn,dYin}, qZn = {dZrn,dZin}.
122 #define qY0     Q6.S32
123 #define qY1     Q7.S32
124 #define qY2     Q8.S32
125 #define qY3     Q9.S32   
126 #define qX0     Q2.S32
127 #define qZ0     Q10.S32
128 #define qZ1     Q11.S32
129 #define qZ2     Q12.S32
130 #define qZ3     Q13.S32
131
132         
133         .macro FFTSTAGE scaled, inverse , name
134         @// Radix-4 FFT stage: reads from pSrc, writes out of place to pDst, then swaps the two pointers.
135         @// Define stack arguments (none are referenced inside this macro)
136         @// scaled=TRUE selects the halving VHADD/VHSUB butterfly; inverse=TRUE selects the conjugate
137         @// twiddle multiply and swaps which +/-j result is stored second and fourth; name is the label suffix.
138         @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
139         
140         LSL     grpCount,subFFTSize,#2                      @// grpCount = 4*subFFTSize; the group loop counts it down by 4
141         LSR     subFFTNum,subFFTNum,#2                      @// subFFTNum /= 4
142         MOV     subFFTSize,grpCount                         @// subFFTSize *= 4 (not written again in this macro)
143         
144         VLD1     dW1,[pTwiddle]                             @//[wi | wr]
145         @// pT0+1 increments pT0 by 8 bytes
146         @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
147         MOV     pointStep,subFFTNum,LSL #1                  @// pointStep = 2*subFFTNum (becomes 8*subFFTNum bytes below)
148         
149         
150         @// pOut0+1 increments pOut0 by 8 bytes
151         @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size bytes
152         
153         MOV     stepTwiddle,#0                              @// twiddle byte stride; grows by pointStep per group
154         VLD1     dW2,[pTwiddle]                             @//[wi | wr]
155         SMULBB  outPointStep,grpCount,pointStep             @// signed 16x16 multiply: only the low halfwords of both operands are used
156         LSL     pointStep,pointStep,#2                      @// pointStep = 8*subFFTNum bytes (2*grpSize)
157         @// dW1, dW2 and dW3 all start as the first table entry (no writeback on pTwiddle yet)
158         VLD1     dW3,[pTwiddle]                             @//[wi | wr]
159         MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
160         ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
161         @//RSB     setStep,setStep,#16                         @// setStep = - 3*pointStep+16
162         RSB     setStep,setStep,#0                         @// setStep = - 3*pointStep
163         SUB     srcStep,srcStep,#16                         @// srcStep = 2*pointStep-16
164         
165         MOV     dstStep,outPointStep,LSL #1                 @// dstStep = 2*outPointStep
166         ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
167         RSB     dstStep,dstStep,#16                          @// dstStep = - 3*outPointStep+16
168         
169
170         @// Loop on the groups: every set in a group shares the same three twiddles dW1, dW2, dW3
171 grpLoop\name :  
172         @// Prime the pipeline with the four inputs of the first set (two complex values per load)
173         VLD2    {dXr0,dXi0},[pSrc],pointStep                @//  data[0]
174         ADD      stepTwiddle,stepTwiddle,pointStep          @// stride used by the twiddle reload at the end of this group
175         VLD2    {dXr1,dXi1},[pSrc],pointStep                @//  data[1]
176         ADD      pTwiddle,pTwiddle,stepTwiddle              @// set pTwiddle to the first point
177         VLD2    {dXr2,dXi2},[pSrc],pointStep                @//  data[2]
178         MOV      twStep,stepTwiddle,LSL #2                  @// twStep = 4*stepTwiddle (temporary)
179         
180         VLD2    {dXr3,dXi3},[pSrc],setStep                  @//  data[3] & update pSrc for the next set
181         SUB      twStep,stepTwiddle,twStep                  @// twStep = -3*stepTwiddle
182         
183         MOV      setCount,pointStep,LSR #3                  @// setCount = pointStep/8 (8 bytes per complex value)
184         ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
185         ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
186        
187         
188         @// Loop on the sets
189
190 setLoop\name :  
191         @// On entry dXr0..dXi3 hold the current set; each pass also preloads the inputs of the next set.
192         @// NOTE(review): on the last pass the data[3] preload reads the 16 bytes just past this group - confirm the source buffer allows it.
193         
194         SUBS    setCount,setCount,#2                    @// decrement the loop counter (2 points per pass); flags feed the BGT below
195         @// Z1 = X1 * W1 (inverse: X1 * conj(W1)), accumulated as 64-bit products in qT0/qT1
196         .ifeqs  "\inverse", "TRUE"
197             VMULL   qT0,dXr1,dW1[0]
198             VMLAL   qT0,dXi1,dW1[1]                       @// real part
199             VMULL   qT1,dXi1,dW1[0]
200             VMLSL   qT1,dXr1,dW1[1]                       @// imag part
201             
202         .else
203             VMULL   qT0,dXr1,dW1[0]
204             VMLSL   qT0,dXi1,dW1[1]                       @// real part
205             VMULL   qT1,dXi1,dW1[0]
206             VMLAL   qT1,dXr1,dW1[1]                       @// imag part
207         
208         .endif
209         
210         VLD2    {dXr1,dXi1},[pSrc],pointStep              @//  data[1] for next iteration
211         @// Z2 = X2 * W2 (inverse: X2 * conj(W2)) in qT2/qT3
212         .ifeqs  "\inverse", "TRUE"
213             VMULL   qT2,dXr2,dW2[0]
214             VMLAL   qT2,dXi2,dW2[1]                       @// real part
215             VMULL   qT3,dXi2,dW2[0]
216             VMLSL   qT3,dXr2,dW2[1]                       @// imag part
217             
218         .else
219             VMULL   qT2,dXr2,dW2[0]
220             VMLSL   qT2,dXi2,dW2[1]                       @// real part
221             VMULL   qT3,dXi2,dW2[0]
222             VMLAL   qT3,dXr2,dW2[1]                       @// imag part
223         
224         .endif
225         @// Narrow Z1 to 32 bits with a rounding shift right by 31; this frees qT0/qT1 for Z3
226         VRSHRN  dZr1,qT0,#31
227         VRSHRN  dZi1,qT1,#31
228         VLD2    {dXr2,dXi2},[pSrc],pointStep              @//  data[2] for next iteration
229         
230         @// Z3 = X3 * W3 (inverse: X3 * conj(W3)), reusing qT0/qT1
231         .ifeqs  "\inverse", "TRUE"
232             VMULL   qT0,dXr3,dW3[0]
233             VMLAL   qT0,dXi3,dW3[1]                       @// real part
234             VMULL   qT1,dXi3,dW3[0]
235             VMLSL   qT1,dXr3,dW3[1]                       @// imag part
236             
237         .else
238             VMULL   qT0,dXr3,dW3[0]
239             VMLSL   qT0,dXi3,dW3[1]                       @// real part
240             VMULL   qT1,dXi3,dW3[0]
241             VMLAL   qT1,dXr3,dW3[1]                       @// imag part
242         
243         .endif
244         @// Narrow Z2 and Z3; after this the qT registers may be overwritten by the Y results
245         VRSHRN  dZr2,qT2,#31
246         VRSHRN  dZi2,qT3,#31
247         
248         
249         VRSHRN  dZr3,qT0,#31
250         VRSHRN  dZi3,qT1,#31
251         VLD2    {dXr3,dXi3},[pSrc],setStep            @//  data[3] & update pSrc to data[0]
252         
253         .ifeqs "\scaled", "TRUE"
254             @// Scaled variant: every add/sub below is a halving one, so results carry a factor of 1/4
255             @// finish first stage of 4 point FFT 
256             VHADD    qY0,qX0,qZ2                        @// Y0 = (X0 + Z2)>>1
257             VHSUB    qY2,qX0,qZ2                        @// Y2 = (X0 - Z2)>>1
258                         
259             VLD2    {dXr0,dXi0},[pSrc]!          @//  data[0] for next iteration
260             VHADD    qY1,qZ1,qZ3                        @// Y1 = (Z1 + Z3)>>1
261             VHSUB    qY3,qZ1,qZ3                        @// Y3 = (Z1 - Z3)>>1
262             
263             @// finish second stage of 4 point FFT 
264             @// NOTE(review): outputs 0/2 are built from Y2 and outputs 1/3 from Y0; presumably this relies on the sign convention of the twiddle table, which is not visible here - confirm.
265             VHSUB    qZ0,qY2,qY1                        @// Z0 = (Y2 - Y1)>>1
266             
267             
268             .ifeqs  "\inverse", "TRUE"
269                 
270                 VHADD    dZr3,dYr0,dYi3                 @// Z3.re = (Y0.re + Y3.im)>>1
271                 VST2    {dZr0,dZi0},[pDst :128],outPointStep      @// store Z0 as output 0
272                 VHSUB    dZi3,dYi0,dYr3                 @// Z3.im = (Y0.im - Y3.re)>>1
273                 
274                 VHADD    qZ2,qY2,qY1                    @// Z2 = (Y2 + Y1)>>1
275                 VST2    {dZr3,dZi3},[pDst :128],outPointStep      @// store Z3 as output 1
276             
277                 VHSUB    dZr1,dYr0,dYi3                 @// Z1.re = (Y0.re - Y3.im)>>1
278                 VST2    {dZr2,dZi2},[pDst :128],outPointStep      @// store Z2 as output 2
279                 VHADD    dZi1,dYi0,dYr3                 @// Z1.im = (Y0.im + Y3.re)>>1
280             
281                 VST2    {dZr1,dZi1},[pDst :128],dstStep           @// store Z1 as output 3; pDst returns to output 0, 16 bytes further on
282                 
283                 
284             .else
285                 
286                 VHSUB    dZr1,dYr0,dYi3                 @// Z1.re = (Y0.re - Y3.im)>>1
287                 VST2    {dZr0,dZi0},[pDst :128],outPointStep      @// store Z0 as output 0
288                 VHADD    dZi1,dYi0,dYr3                 @// Z1.im = (Y0.im + Y3.re)>>1
289             
290                 VHADD    qZ2,qY2,qY1                    @// Z2 = (Y2 + Y1)>>1
291                 VST2    {dZr1,dZi1},[pDst :128],outPointStep      @// store Z1 as output 1
292             
293                 VHADD    dZr3,dYr0,dYi3                 @// Z3.re = (Y0.re + Y3.im)>>1
294                 VST2    {dZr2,dZi2},[pDst :128],outPointStep      @// store Z2 as output 2
295                 VHSUB    dZi3,dYi0,dYr3                 @// Z3.im = (Y0.im - Y3.re)>>1
296             
297                 VST2    {dZr3,dZi3},[pDst :128],dstStep           @// store Z3 as output 3; pDst returns to output 0, 16 bytes further on
298
299             
300             .endif
301         
302         
303         .else
304             @// Unscaled variant: same data flow as the scaled branch with plain VADD/VSUB
305             @// finish first stage of 4 point FFT 
306             VADD    qY0,qX0,qZ2                         @// Y0 = X0 + Z2
307             VSUB    qY2,qX0,qZ2                         @// Y2 = X0 - Z2
308                         
309             VLD2    {dXr0,dXi0},[pSrc :128]!          @//  data[0] for next iteration
310             VADD    qY1,qZ1,qZ3                         @// Y1 = Z1 + Z3
311             VSUB    qY3,qZ1,qZ3                         @// Y3 = Z1 - Z3
312             
313             @// finish second stage of 4 point FFT 
314             
315             VSUB    qZ0,qY2,qY1                         @// Z0 = Y2 - Y1
316             
317             
318             .ifeqs  "\inverse", "TRUE"
319                 
320                 VADD    dZr3,dYr0,dYi3                  @// Z3.re = Y0.re + Y3.im
321                 VST2    {dZr0,dZi0},[pDst :128],outPointStep      @// store Z0 as output 0
322                 VSUB    dZi3,dYi0,dYr3                  @// Z3.im = Y0.im - Y3.re
323                 
324                 VADD    qZ2,qY2,qY1                     @// Z2 = Y2 + Y1
325                 VST2    {dZr3,dZi3},[pDst :128],outPointStep      @// store Z3 as output 1
326             
327                 VSUB    dZr1,dYr0,dYi3                  @// Z1.re = Y0.re - Y3.im
328                 VST2    {dZr2,dZi2},[pDst :128],outPointStep      @// store Z2 as output 2
329                 VADD    dZi1,dYi0,dYr3                  @// Z1.im = Y0.im + Y3.re
330             
331                 VST2    {dZr1,dZi1},[pDst :128],dstStep           @// store Z1 as output 3; pDst returns to output 0, 16 bytes further on
332                 
333                 
334             .else
335                 
336                 VSUB    dZr1,dYr0,dYi3                  @// Z1.re = Y0.re - Y3.im
337                 VST2    {dZr0,dZi0},[pDst :128],outPointStep      @// store Z0 as output 0
338                 VADD    dZi1,dYi0,dYr3                  @// Z1.im = Y0.im + Y3.re
339             
340                 VADD    qZ2,qY2,qY1                     @// Z2 = Y2 + Y1
341                 VST2    {dZr1,dZi1},[pDst :128],outPointStep      @// store Z1 as output 1
342             
343                 VADD    dZr3,dYr0,dYi3                  @// Z3.re = Y0.re + Y3.im
344                 VST2    {dZr2,dZi2},[pDst :128],outPointStep      @// store Z2 as output 2
345                 VSUB    dZi3,dYi0,dYr3                  @// Z3.im = Y0.im - Y3.re
346             
347                 VST2    {dZr3,dZi3},[pDst :128],dstStep           @// store Z3 as output 3; pDst returns to output 0, 16 bytes further on
348
349             
350             .endif
351             
352         .endif
353         
354         ADD     pSrc,pSrc,pointStep                         @// increment to data[1] of the next set              
355         BGT     setLoop\name                                @// flags are still those of the SUBS at the top of the loop
356         
357         @// Reload W1, W2, W3 for the next group; after the three post-increments pTwiddle is back at its entry value
358         VLD1     dW1,[pTwiddle :64],stepTwiddle                  @//[wi | wr]
359         SUBS    grpCount,grpCount,#4                    @// subtract 4 since grpCount multiplied by 4               
360         VLD1     dW2,[pTwiddle :64],stepTwiddle                  @//[wi | wr]
361         ADD     pSrc,pSrc,srcStep                       @// increment pSrc for the next grp
362         VLD1     dW3,[pTwiddle :64],twStep                       @//[wi | wr]
363         BGT     grpLoop\name
364
365                 
366         @// Reset and Swap pSrc and pDst for the next stage
367         MOV     t1,pDst                                        @// t1 shares r3 with grpCount, which is no longer needed
368         SUB     pDst,pSrc,outPointStep,LSL #2                  @// new pDst = pSrc - 4*outPointStep (pSrc advanced that far over the loops)
369         SUB     pSrc,t1,outPointStep                           @// new pSrc = old pDst - outPointStep (pDst advanced that far over the loops)
370         
371         
372         .endm
373         @// Forward, unscaled stage: expands FFTSTAGE with scaled=FALSE, inverse=FALSE and label suffix FWD.
374         @// NOTE(review): M_START/M_END are defined outside this file; FFTSTAGE writes r3-r5, r8-r12 and r14 while only r4 is named here - confirm callers preserve the rest.
375         M_START armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
376             FFTSTAGE "FALSE","FALSE",FWD
377         M_END
378 @// Inverse, unscaled stage: expands FFTSTAGE with scaled=FALSE, inverse=TRUE and label suffix INV.
379         @// NOTE(review): M_START/M_END are defined outside this file; FFTSTAGE writes r3-r5, r8-r12 and r14 while only r4 is named here - confirm callers preserve the rest.
380         M_START armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
381             FFTSTAGE "FALSE","TRUE",INV
382         M_END
383  @// Forward, scaled stage: expands FFTSTAGE with scaled=TRUE (halving butterfly), inverse=FALSE and label suffix FWDSFS.
384         @// NOTE(review): M_START/M_END are defined outside this file; FFTSTAGE writes r3-r5, r8-r12 and r14 while only r4 is named here - confirm callers preserve the rest.
385         M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
386             FFTSTAGE "TRUE","FALSE",FWDSFS
387         M_END
388 @// Inverse, scaled stage: expands FFTSTAGE with scaled=TRUE (halving butterfly), inverse=TRUE and label suffix INVSFS.
389         @// NOTE(review): M_START/M_END are defined outside this file; FFTSTAGE writes r3-r5, r8-r12 and r14 while only r4 is named here - confirm callers preserve the rest.
390         M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
391             FFTSTAGE "TRUE","TRUE",INVSFS
392         M_END
393
394         
395         .end