- add third_party src.
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / neon / armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
1 @
2 @  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @
4 @  Use of this source code is governed by a BSD-style license
5 @  that can be found in the LICENSE file in the root of the source
6 @  tree. An additional intellectual property rights grant can be found
7 @  in the file PATENTS.  All contributing project authors may
8 @  be found in the AUTHORS file in the root of the source tree.
9 @
10 @ Some code in this file was originally from file
11 @ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
12 @ follows. It has been relicensed with permission from the copyright holders.
13 @
14
15 @
16 @ OpenMAX DL: v1.0.2
17 @ Last Modified Revision:   7485
18 @ Last Modified Date:       Fri, 21 Sep 2007
19
20 @ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @
22
23 @
24 @ Description:
25 @ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
26 @ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
27 @ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
28 @ formula.
29
30         
31 #include "dl/api/arm/armCOMM_s.h"
32 #include "dl/api/arm/omxtypes_s.h"
33         
34 @//Input Registers (pDst is defined but not referenced in this file)
35 #define pSrc            r0
36 #define pDst            r1
37 #define pFFTSpec        r2
38 #define scale           r3
39
40 @ Output registers (result is defined but not referenced in this file)
41 #define result          r0
42
43 @//Local Scratch Registers
@ These names are plain preprocessor aliases and several of them share one
@ core register (r2: pFFTSpec, argDst, diffMinusOne, pOut1; r4: argScale,
@ tmpOrder, pTwiddle, x0r; r10: step2, twStep; ...). Writing through one
@ alias therefore destroys the value held under the others.
@ Keep comments on their own lines here: text appended to a #define line
@ becomes part of the macro replacement and is pasted into every use.
@ Not every alias below is used by the FFTSTAGE macro in this file.
44 #define argTwiddle      r1
45 #define argDst          r2
46 #define argScale        r4
47 #define tmpOrder        r4
48 #define pTwiddle        r4
49 #define pOut            r5
50 #define subFFTSize      r7     
51 #define subFFTNum       r6
52 #define N               r6
53 #define order           r14
54 #define diff            r9
55 @ Total number of radix stages needed to complete the FFT.
56 #define count           r8
57 #define x0r             r4    
58 #define x0i             r5
59 #define diffMinusOne    r2
60 #define round           r3
61 #define pOut1           r2
62 #define size            r7
63 #define step            r8            
64 #define step1           r9
65 #define step2           r10
66 #define twStep          r10
67 #define pTwiddleTmp     r11
68 #define argTwiddle1     r12
69 #define zero            r14
70
71 @ Neon registers
@ Each alias carries its element type (.S16 or .S32), which is why the NEON
@ instructions in FFTSTAGE are written without an explicit size suffix.
@ The same D register appears under several names, one per phase of the
@ computation (e.g. D4-D7 are the twiddles dW0r..dW1i and later the outputs
@ dY0r..dY1i; D2/D3 are dY0/dY1 and also dX1r/dX1i).
@ A Q register overlays two D registers (Qn = D2n:D2n+1), so qX1 is
@ dX1r:dX1i, qY1 is dY1r:dY1i, and qT0-qT3 occupy D12-D19, clear of
@ dT0-dT3 (D8-D11).
@ Keep comments on their own lines: text appended to a #define line would
@ become part of the macro replacement.
72 #define dX0             D0.S16
73 #define dX0S32          D0.S32
74 #define dShift          D1.S16
75 #define dX1             D1.S16
76 #define dX1S32          D1.S32
77 #define dY0             D2.S16
78 #define dY1             D3.S16
79 #define dX0r            D0.S16            
80 #define dX0rS32         D0.S32
81 #define dX0i            D1.S16
82 #define dX1r            D2.S16
83 #define dX1i            D3.S16
84 #define qX1             Q1.S16
85 #define dW0r            D4.S16
86 #define dW0i            D5.S16
87 #define dW1r            D6.S16
88 #define dW1i            D7.S16
89 #define dW0rS32         D4.S32
90 #define dW0iS32         D5.S32
91 #define dW1rS32         D6.S32
92 #define dW1iS32         D7.S32
93 #define dT0             D8.S16
94 #define dT1             D9.S16
95 #define dT2             D10.S16
96 #define dT3             D11.S16
97 #define qT0             Q6.S32
98 #define qT1             Q7.S32
99 #define qT2             Q8.S32
100 #define qT3             Q9.S32
101 #define dY0r            D4.S16
102 #define dY0i            D5.S16
103 #define dY1r            D6.S16
104 #define dY1i            D7.S16
105 #define qY1             Q3.S16
106 #define dY2             D4.S16
107 #define dY3             D5.S16
108 #define dW0             D6.S16
109 #define dW1             D7.S16
110 #define dW0Tmp          D10.S16
111 #define dW1Neg          D11.S16
112
113         @ Byte offsets into the FFTSpec structure addressed through pFFTSpec
114         .set    ARMsFFTSpec_N, 0              @ loaded into N by FFTSTAGE
115         .set    ARMsFFTSpec_pBitRev, 4        @ not referenced elsewhere in this file
116         .set    ARMsFFTSpec_pTwiddle, 8       @ loaded into pTwiddle by FFTSTAGE
117         .set    ARMsFFTSpec_pBuf, 12          @ loaded into pOut by FFTSTAGE
118
@ FFTSTAGE scaled, inverse, name
@   Emits the pre-twiddle stage described in the header of this file.
@   scaled  : the string TRUE selects the halving forms (VHADD/VHSUB) for the
@             final combine, halves the last element and decrements scale on
@             exit; any other string selects plain VADD/VSUB and leaves scale
@             untouched.
@   inverse : accepted but not referenced anywhere in the macro body.
@   name    : suffix appended to every label so that the macro can be
@             expanded more than once in the same file.
@   Reads   : pSrc, scale, and the N, pTwiddle and pBuf fields of pFFTSpec.
@   Writes  : memory starting at pBuf + 2*N bytes (through pOut1).
@   Changes : r0 (pSrc), r2, r4-r12, flags, D0-D19 (Q6-Q9 are D12-D19),
@             and r3 (scale) in the scaled variant.
@   NOTE(review): D8-D15 are callee-saved under the AAPCS and are overwritten
@   here without being saved; presumably the caller or M_START/M_END deals
@   with that - confirm.
119         .MACRO FFTSTAGE scaled, inverse, name
120         
121         @ Read the size N from the spec structure (no log is computed here)
122         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
123         
124         @ Read other structure parameters
125         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
126         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
127         
128         MOV     size,N,ASR #1        @ size = N/2; preserve the contents of N
129         MOV     step,N,LSL #1        @ step = N/2 * 4 bytes
130         
131         @ Process different FFT sizes with different loops.
132         CMP    size,#4
133         BLE    smallFFTSize\name     @ N/2 <= 4: lane-by-lane path further down
134         
135         @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
136         @ Note: W^(k) is stored as negated value and also need to
137         @ conjugate the values from the table.
138         
139         @ Z(0) : no need of twiddle multiply
140         @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
141         
142         VLD1    dX0S32[0],[pSrc],step   @ 32-bit lane load; pSrc += step
143         ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes (r2: pFFTSpec is lost from here on)
144                 
145         VLD1    dX1S32[0],[pSrc]!       @ 32-bit lane load step bytes further on; pSrc += 4
146         SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
147         
148         MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
149         SUB     step1,step1,#4       @ (N/4-1)*4 bytes
150         
151         VHADD    dY0,dX0,dX1         @ [b+d | a+c]
152         VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
153         VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
154         
155         .ifeqs  "\scaled", "TRUE"
156             VHSUB   dX0,dY0,dY1
157             SUBS    size,size,#2     @ sets the flags tested by BLT/BEQ below
158             VHADD   dX1,dY0,dY1
159         .else
160             VSUB   dX0,dY0,dY1
161             SUBS    size,size,#2     @ sets the flags tested by BLT/BEQ below
162             VADD   dX1,dY0,dY1
163         .endif
164                     
165         SUB     pSrc,pSrc,step       @ undo the step post-increment: pSrc = start + 4
166         VST1    dX0[0],[pOut1]!      @ store 16-bit lane 0 of dX0; pOut1 += 2
167         ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
168         VST1    dX1[1],[pOut1]!      @ store 16-bit lane 1 of dX1; pOut1 += 2
169         ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
170         @ Neither branch below is taken on this path: it is only reached with size > 4
171         BLT     decrementScale\name
172         BEQ     lastElement\name
173                         
174         SUB     step,step,#20        @ step = 2N-20 bytes: stride between the two VLD2/VST2 groups in the loop
175         SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
176         SUB     step2, step1, #4     @ step2 reuses r10; twStep is no longer needed
177                         
178         @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
179         @ Note: W^k is stored as negative values in the table and also need to
180         @ conjugate the values from the table.
181         @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
182         @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
183         @ Each pass of this loop loads two groups of four S16 pairs and lowers size by 8.
184 evenOddButterflyLoop\name:     
185         VLD2    {dX0r,dX0i},[pSrc],step    @ de-interleave 16 bytes into dX0r/dX0i; pSrc += step
186         VLD2    {dX1r,dX1i},[pSrc]!        @ second group into dX1r/dX1i; pSrc += 16
187         SUB     pSrc, pSrc, step           @ net: pSrc advanced by 16 bytes
188
189         VLD1    dW0r,[argTwiddle1],step1   @ 8 bytes of twiddles; argTwiddle1 += step1
190         VREV64  qX1,qX1                    @ reverse the four S16 lanes of dX1r and of dX1i
191         VLD1    dW1r,[argTwiddle1]!        @ 8 more bytes; argTwiddle1 += 8
192         VHSUB   dT2,dX0r,dX1r                          @ a-c (halved)
193         SUB     argTwiddle1, argTwiddle1, step1        @ net: argTwiddle1 advanced by 8 bytes
194         SUB     step1,step1,#16            @ the two twiddle groups close in on each other by 8 bytes each
195
196         VLD1    dW0i,[pTwiddleTmp],step2   @ same access pattern on the second twiddle pointer
197         VHADD   dT3,dX0i,dX1i                          @ b+d (halved)
198         VLD1    dW1i,[pTwiddleTmp]!
199         VHADD   dT0,dX0r,dX1r                          @ a+c (halved)
200         VHSUB   dT1,dX0i,dX1i                          @ b-d (halved)
201         SUB     pTwiddleTmp, pTwiddleTmp, step2        @ net: pTwiddleTmp advanced by 8 bytes
202         SUB     step2,step2,#16
203
204         SUBS    size,size,#8               @ loop counter; flags are consumed by the BGT at the bottom
205         @ Reorder the loaded twiddle lanes for the multiplies. NOTE(review): exact lane mapping not re-derived here
206         VZIP    dW1r,dW1i
207         VTRN    dW0r,dW0i
208         VZIP    dW1iS32, dW1rS32
209         @ Widening S16 x S16 -> S32 multiply-accumulates
210         VMULL   qT0,dW1i,dT2               @ qT0  = dW1i*dT2
211         VMLSL   qT0,dW1r,dT3               @ qT0 -= dW1r*dT3
212         VMULL   qT1,dW1i,dT3               @ qT1  = dW1i*dT3
213         VMLAL   qT1,dW1r,dT2               @ qT1 += dW1r*dT2
214         VMULL   qT2,dW0r,dT2               @ qT2  = dW0r*dT2
215         VMLAL   qT2,dW0i,dT3               @ qT2 += dW0i*dT3
216         VMULL   qT3,dW0r,dT3               @ qT3  = dW0r*dT3
217         VMLSL   qT3,dW0i,dT2               @ qT3 -= dW0i*dT2
218         @ Round, shift right by 15 and narrow the S32 products back to S16
219         VRSHRN  dX1r,qT0,#15
220         VRSHRN  dX1i,qT1,#15
221         VRSHRN  dX0r,qT2,#15
222         VRSHRN  dX0i,qT3,#15
223         @ The dY* outputs reuse D4-D7; the twiddles held there are no longer needed
224         .ifeqs  "\scaled", "TRUE"
225             VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
226             VHSUB    dY1i,dX1r,dT1
227         .else
228             VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
229             VSUB    dY1i,dX1r,dT1
230         .endif
231         
232         .ifeqs  "\scaled", "TRUE"
233             VHADD    dY0r,dT0,dX0i                     @ F(1)
234             VHSUB    dY0i,dT1,dX0r
235         .else
236             VADD    dY0r,dT0,dX0i                      @ F(1)
237             VSUB    dY0i,dT1,dX0r
238         .endif
239         
240         VREV64  qY1,qY1                    @ reverse the S16 lanes again, mirroring the VREV64 on qX1 above
241
242         VST2    {dY0r,dY0i},[pOut1],step   @ interleave and store 16 bytes; pOut1 += step
243         VST2    {dY1r,dY1i},[pOut1]        @ second group, no writeback
244         ADD     pOut1,pOut1,#16
245         SUB     pOut1, pOut1, step         @ net: pOut1 advanced by 16 bytes
246         SUB     step,step,#32              @ first group moved up 16 bytes, second group moves down 16 bytes
247        
248         BGT     evenOddButterflyLoop\name  @ repeat while size (SUBS above) is still > 0
249
250         SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
251         SUB     pOut1,pOut1,#4
252         B       lastElement\name
253         
254 smallFFTSize\name:
255         @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
256         @ Note: W^(k) is stored as negated value and also need to
257         @ conjugate the values from the table.
258         
259         @ Z(0) : no need of twiddle multiply
260         @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
261         
262         VLD1    dX0S32[0],[pSrc],step   @ 32-bit lane load; pSrc += step
263         ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes (r2: pFFTSpec is lost from here on)
264                 
265         VLD1    dX1S32[0],[pSrc]!       @ 32-bit lane load step bytes further on; pSrc += 4
266         SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
267         
268         MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
269         SUB     step1,step1,#4       @ (N/4-1)*4 bytes
270         
271         VHADD    dY0,dX0,dX1         @ [b+d | a+c]
272         VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
273         VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
274         
275         .ifeqs  "\scaled", "TRUE"
276             VHSUB   dX0,dY0,dY1
277             SUBS    size,size,#2     @ sets the flags tested by BLT/BEQ below
278             VHADD   dX1,dY0,dY1
279         .else
280             VSUB   dX0,dY0,dY1
281             SUBS    size,size,#2     @ sets the flags tested by BLT/BEQ below
282             VADD   dX1,dY0,dY1
283         .endif
284                     
285         SUB     pSrc,pSrc,step       @ undo the step post-increment: pSrc = start + 4
286         VST1    dX0[0],[pOut1]!      @ store 16-bit lane 0 of dX0; pOut1 += 2
287         ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
288         VST1    dX1[1],[pOut1]!      @ store 16-bit lane 1 of dX1; pOut1 += 2
289         ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
290         @ size is now N/2 - 2: negative skips everything, zero leaves only the last element
291         BLT     decrementScale\name
292         BEQ     lastElement\name
293                         
294         @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
295         @ Note: W^k is stored as negative values in the table and also need to
296         @ conjugate the values from the table.
297         @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
298         @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
299
300         SUB     step,step,#12        @ step = 2N-12 bytes
301         @ Straight-line code despite the label name: nothing in this macro branches back to it.
302 evenOddButterflyLoopSize4\name:     
303         VLD1    dW0rS32[0],[argTwiddle1],step1   @ 32-bit lane load; argTwiddle1 += step1
304         VLD1    dW1rS32[0],[argTwiddle1]!        @ 32-bit lane load; argTwiddle1 += 4
305         @ Only lanes 0 and 1 of each D register are loaded and stored on this path
306         VLD2    {dX0r[0],dX0i[0]},[pSrc]!        @ two S16 split across dX0r/dX0i lane 0; pSrc += 4
307         VLD2    {dX0r[1],dX0i[1]},[pSrc],step    @ next two S16 into lane 1; pSrc += step
308         SUB     pSrc,pSrc,#4
309         SUB     argTwiddle1,argTwiddle1,step1
310         VLD2    {dX1r[0],dX1i[0]},[pSrc]!        @ second group, lane 0; pSrc += 4
311         VLD2    {dX1r[1],dX1i[1]},[pSrc]!        @ second group, lane 1; pSrc += 4
312         
313         SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
314         VLD1    dW0iS32[0],[pTwiddleTmp],step1   @ 32-bit lane load; pTwiddleTmp += step1
315         VLD1    dW1iS32[0],[pTwiddleTmp]!        @ 32-bit lane load; pTwiddleTmp += 4
316         SUB     pSrc,pSrc,step
317         
318         SUB     pTwiddleTmp,pTwiddleTmp,step1
319         VREV32  dX1r,dX1r                        @ swap the two S16 lanes inside each 32-bit word
320         VREV32  dX1i,dX1i
321         SUBS    size,size,#4                     @ result and flags are not tested again in this macro
322                         
323         VHSUB   dT2,dX0r,dX1r                          @ a-c (halved)
324         VHADD   dT3,dX0i,dX1i                          @ b+d (halved)
325         SUB     step1,step1,#4
326         VHADD   dT0,dX0r,dX1r                          @ a+c (halved)
327         VHSUB   dT1,dX0i,dX1i                          @ b-d (halved)
328         @ Reorder the loaded twiddle lanes for the multiplies. NOTE(review): exact lane mapping not re-derived here
329         VTRN    dW1r,dW1i
330         VTRN    dW0r,dW0i
331         @ Widening S16 x S16 -> S32 multiply-accumulates
332         VMULL   qT0,dW1r,dT2                     @ qT0  = dW1r*dT2
333         VMLSL   qT0,dW1i,dT3                     @ qT0 -= dW1i*dT3
334         VMULL   qT1,dW1r,dT3                     @ qT1  = dW1r*dT3
335         VMLAL   qT1,dW1i,dT2                     @ qT1 += dW1i*dT2
336         VMULL   qT2,dW0r,dT2                     @ qT2  = dW0r*dT2
337         VMLAL   qT2,dW0i,dT3                     @ qT2 += dW0i*dT3
338         VMULL   qT3,dW0r,dT3                     @ qT3  = dW0r*dT3
339         VMLSL   qT3,dW0i,dT2                     @ qT3 -= dW0i*dT2
340         @ Round, shift right by 15 and narrow back to S16
341         VRSHRN  dX1r,qT0,#15
342         VRSHRN  dX1i,qT1,#15
343         
344         .ifeqs  "\scaled", "TRUE"
345             VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
346             VHSUB    dY1i,dX1r,dT1
347         .else
348             VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
349             VSUB    dY1i,dX1r,dT1
350         .endif
351         
352         VREV32  dY1r,dY1r                        @ swap the lanes again, mirroring the VREV32 on dX1r/dX1i
353         VREV32  dY1i,dY1i
354                             
355         VRSHRN  dX0r,qT2,#15
356         VRSHRN  dX0i,qT3,#15
357         
358         .ifeqs  "\scaled", "TRUE"
359             VHADD    dY0r,dT0,dX0i                     @ F(1)
360             VHSUB    dY0i,dT1,dX0r
361         .else
362             VADD    dY0r,dT0,dX0i                      @ F(1)
363             VSUB    dY0i,dT1,dX0r
364         .endif
365         
366         VST2    {dY0r[0],dY0i[0]},[pOut1]!       @ interleaved lane store; pOut1 += 4
367         VST2    {dY0r[1],dY0i[1]},[pOut1],step   @ pOut1 += step
368         SUB     pOut1, #4
369         VST2    {dY1r[0],dY1i[0]},[pOut1]!       @ pOut1 += 4
370         VST2    {dY1r[1],dY1i[1]},[pOut1]!       @ pOut1 += 4
371         SUB     pOut1,pOut1,step
372         SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
373         SUB     pOut1,pOut1,#4
374         
375         @ Last element can be expanded as follows
376         @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
377         @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
378         @ 1/2[2a+j0] - j (c-jd) [0+j2b]
379         @ (a+bc, -bd)
380         @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
381         
382 lastElement\name:      
383         VLD1    dX0rS32[0],[pSrc]      @ load one 32-bit word (S16 lanes 0 and 1)
384         
385         .ifeqs  "\scaled", "TRUE"
386             VSHR    dX0r,dX0r,#1       @ signed shift: halve each S16 lane
387         .endif
388         
389         VST1    dX0r[0],[pOut1]!       @ lane 0 stored as is; pOut1 += 2
390         VNEG    dX0r,dX0r
391         VST1    dX0r[1],[pOut1]        @ lane 1 stored negated
392
393 decrementScale\name:          
394         .ifeqs  "\scaled", "TRUE"
395             SUB scale,scale,#1         @ NOTE(review): presumably accounts for the halving done by this stage - confirm
396         .endif
397         
398         .endm
399         
@ Unscaled variant: expands FFTSTAGE with scaled=FALSE, i.e. the plain
@ VADD/VSUB combine with the scale register left untouched. All labels get
@ the suffix Inv.
@ M_START/M_END are not defined in this file.
@ NOTE(review): the trailing r4 is presumably the register-save argument of
@ M_START; the macro body also writes r5-r12, so confirm against armCOMM_s.h
@ that callers of this _unsafe routine expect that.
400         M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
401         FFTSTAGE "FALSE","TRUE",Inv
402         M_END
403         
@ Scaled (Sfs) variant: expands FFTSTAGE with scaled=TRUE, i.e. the halving
@ VHADD/VHSUB combine, a halved last element, and the scale register
@ decremented by one on exit. All labels get the suffix InvSfs.
@ M_START/M_END are not defined in this file.
@ NOTE(review): the trailing r4 is presumably the register-save argument of
@ M_START; the macro body also writes r5-r12, so confirm against armCOMM_s.h
@ that callers of this _unsafe routine expect that.
404         M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
405         FFTSTAGE "TRUE","TRUE",InvSfs
406         M_END
407
408         
409         .end