- add third_party src.
[platform/framework/web/crosswalk.git] / src / third_party / openmax_dl / dl / sp / src / arm / neon / armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
1 @//
2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @//  Use of this source code is governed by a BSD-style license
5 @//  that can be found in the LICENSE file in the root of the source
6 @//  tree. An additional intellectual property rights grant can be found
7 @//  in the file PATENTS.  All contributing project authors may
8 @//  be found in the AUTHORS file in the root of the source tree.
9 @//
10 @//  This file was originally licensed as follows. It has been
11 @//  relicensed with permission from the copyright holders.
12
13 @//
14 @//
15 @// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
16 @// OpenMAX DL: v1.0.2
17 @// Last Modified Revision:   7766
18 @// Last Modified Date:       Thu, 27 Sep 2007
19 @//
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @//
22 @//
23 @//
24 @// Description:
25 @// Compute the first-stage radix-8 FFT pass for an N-point complex signal
26 @//
27 @//
28
29
30 @// Include standard headers
31
32 #include "dl/api/arm/armCOMM_s.h"
33 #include "dl/api/arm/omxtypes_s.h"
34
35
36 @// Import symbols required from other files
37 @// (For example tables)
38
39
40 @// Set debugging level
41 @//DEBUG_ON    SETL {TRUE}
42
43
44
45 @// Guarding implementation by the processor name
46
47
48
49
50 @// Guarding implementation by the processor name
51
52
53 @//Input Registers
@// Core-register aliases used by the FFTSTAGE macro in this file.
@// Comments sit on their own lines on purpose: these are C preprocessor
@// defines, so anything after the value would become part of the expansion.
54
@// pSrc: address the VLD2 loads read from (post-incremented by the macro).
55 #define pSrc                            r0
@// pDst: address the VST2 stores write to (post-incremented by the macro).
56 #define pDst                            r2
@// pTwiddle: defined for completeness; not referenced by FFTSTAGE in this file.
57 #define pTwiddle                        r1
@// subFFTNum: shifted right by 3 at the start of FFTSTAGE and written back.
58 #define subFFTNum                       r6
@// subFFTSize: overwritten with 8 by FFTSTAGE (its entry value is not read).
59 #define subFFTSize                      r7
60 @// dest buffer for the next stage (not pSrc for first stage)
@// FFTSTAGE copies this register into pDst as its last instruction.
61 #define pPingPongBuf                    r5
62
63
64 @//Output Registers
@// No separate aliases: FFTSTAGE leaves its results in subFFTSize,
@// subFFTNum, pSrc and pDst.
65
66
67 @//Local Scratch Registers
68
@// grpSize: subFFTNum >> 3.
69 #define grpSize                         r3
70 @// Reuse grpSize as setCount
@// setCount: loop counter, decremented by 4 per loop iteration.
71 #define setCount                        r3
@// pointStep: byte stride between data[k] and data[k+1] (grpSize << 2).
72 #define pointStep                       r4
@// outPointStep: same register as pointStep; not referenced in this file.
73 #define outPointStep                    r4
@// setStep: 16 - 7*pointStep, rewinds pSrc to the next group of four sets.
74 #define setStep                         r8
@// step1: grpSize << 3 (= 2*pointStep), store stride between y[k] and y[k+2].
75 #define step1                           r9
@// step2: 7*pointStep.
76 #define step2                           r10
@// t0: holds the ONEBYSQRT2 constant before it is moved into a NEON lane.
77 #define t0                              r11
78
79
80 @// Neon Registers
@// Alias families deliberately share physical registers: the X inputs use
@// Q0-Q7, the U, V and Y intermediates all map onto Q8-Q15, and dT0/dT1
@// overlay D0/D1. Comments sit on their own lines because these are C
@// preprocessor defines.
81
@// X: the eight input points, named r (real) / i (imaginary); qXn covers
@// both halves. Note the rotation: X0 lives in Q7 and X7 lives in Q0.
82 #define dXr0                            D14.S16
83 #define dXi0                            D15.S16
84 #define dXr1                            D2.S16
85 #define dXi1                            D3.S16
86 #define dXr2                            D4.S16
87 #define dXi2                            D5.S16
88 #define dXr3                            D6.S16
89 #define dXi3                            D7.S16
90 #define dXr4                            D8.S16
91 #define dXi4                            D9.S16
92 #define dXr5                            D10.S16
93 #define dXi5                            D11.S16
94 #define dXr6                            D12.S16
95 #define dXi6                            D13.S16
96 #define dXr7                            D0.S16
97 #define dXi7                            D1.S16
98 #define qX0                             Q7.S16
99 #define qX1                             Q1.S16
100 #define qX2                             Q2.S16
101 #define qX3                             Q3.S16
102 #define qX4                             Q4.S16
103 #define qX5                             Q5.S16
104 #define qX6                             Q6.S16
105 #define qX7                             Q0.S16
106
@// U: first butterfly stage results. Even-numbered terms occupy Q8-Q11,
@// odd-numbered terms occupy Q12-Q15.
107 #define dUr0                            D16.S16
108 #define dUi0                            D17.S16
109 #define dUr2                            D18.S16
110 #define dUi2                            D19.S16
111 #define dUr4                            D20.S16
112 #define dUi4                            D21.S16
113 #define dUr6                            D22.S16
114 #define dUi6                            D23.S16
115 #define dUr1                            D24.S16
116 #define dUi1                            D25.S16
117 #define dUr3                            D26.S16
118 #define dUi3                            D27.S16
119 #define dUr5                            D28.S16
120 #define dUi5                            D29.S16
121 @// dUr7/dUi7 live in D30/D31; they do not share D0/D1 with dXr7/dXi7
122 #define dUr7                            D30.S16
123 #define dUi7                            D31.S16
124 #define qU0                             Q8.S16
125 #define qU1                             Q12.S16
126 #define qU2                             Q9.S16
127 #define qU3                             Q13.S16
128 #define qU4                             Q10.S16
129 #define qU5                             Q14.S16
130 #define qU6                             Q11.S16
131 #define qU7                             Q15.S16
132
133
134
@// V: second butterfly stage results, placed in the opposite halves to U.
@// Even-numbered terms occupy Q12-Q15, odd-numbered terms occupy Q8-Q11.
135 #define dVr0                            D24.S16
136 #define dVi0                            D25.S16
137 #define dVr2                            D26.S16
138 #define dVi2                            D27.S16
139 #define dVr4                            D28.S16
140 #define dVi4                            D29.S16
141 #define dVr6                            D30.S16
142 #define dVi6                            D31.S16
143 #define dVr1                            D16.S16
144 #define dVi1                            D17.S16
145 #define dVr3                            D18.S16
146 #define dVi3                            D19.S16
147 #define dVr5                            D20.S16
148 #define dVi5                            D21.S16
149 @// dVr7 shares D22 with dUr6 and dYr6
150 #define dVr7                            D22.S16
151 @// dVi7 shares D23 with dUi6 and dYi6
152 #define dVi7                            D23.S16
153 #define qV0                             Q12.S16
154 #define qV1                             Q8.S16
155 #define qV2                             Q13.S16
156 #define qV3                             Q9.S16
157 #define qV4                             Q14.S16
158 #define qV5                             Q10.S16
159 #define qV6                             Q15.S16
160 #define qV7                             Q11.S16
161
162
163
@// Y: final outputs. Same register assignment as the U family.
164 #define dYr0                            D16.S16
165 #define dYi0                            D17.S16
166 #define dYr2                            D18.S16
167 #define dYi2                            D19.S16
168 #define dYr4                            D20.S16
169 #define dYi4                            D21.S16
170 #define dYr6                            D22.S16
171 #define dYi6                            D23.S16
172 #define dYr1                            D24.S16
173 #define dYi1                            D25.S16
174 #define dYr3                            D26.S16
175 #define dYi3                            D27.S16
176 #define dYr5                            D28.S16
177 #define dYi5                            D29.S16
178 @// dYr7/dYi7 are D30/D31, the same registers as dUr7/dUi7 and dVr6/dVi6
179 #define dYr7                            D30.S16
180 #define dYi7                            D31.S16
181 #define qY0                             Q8.S16
182 #define qY1                             Q12.S16
183 #define qY2                             Q9.S16
184 #define qY3                             Q13.S16
185 #define qY4                             Q10.S16
186 #define qY5                             Q14.S16
187 #define qY6                             Q11.S16
188 #define qY7                             Q15.S16
189
190
@// T: scratch pair overlaying D0/D1, i.e. the same registers as dXr7/dXi7.
@// dT0 lane 0 carries the 1/sqrt(2) multiplier; dT1 holds a partial product.
191 #define dT0                             D0.S16
192 #define dT1                             D1.S16
193
194
195 @// Define constants
@// 0x5A82 = 23170, which is 1/sqrt(2) multiplied by 2^15 and rounded.
@// FFTSTAGE loads it into t0 and uses it as the VQRDMULH scalar operand.
196         .set   ONEBYSQRT2, 0x00005A82        @// Q15 format
197
198
199         .MACRO FFTSTAGE scaled, inverse , name
            @// FFTSTAGE scaled, inverse, name
            @//
            @// First-stage radix-8 butterfly pass over 16-bit complex data.
            @// Each loop iteration handles four sets: every VLD2/VST2 moves
            @// 16 bytes, i.e. four complex points, per data[k] / y[k] slot.
            @//
            @// Macro arguments:
            @//   scaled   TRUE  selects halving VHADD/VHSUB butterflies,
            @//            any other string selects plain VADD/VSUB
            @//   inverse  TRUE  selects the inverse-transform code path,
            @//            any other string selects the forward path
            @//   name     suffix appended to the loop label so that every
            @//            expansion of the macro gets a unique label
            @//
            @// Registers read on entry:
            @//   pSrc          input pointer (accesses carry a :128 hint)
            @//   pDst          output pointer (accesses carry a :128 hint)
            @//   subFFTNum     shifted right by 3 to give the set count
            @//   pPingPongBuf  copied into pDst on exit
            @//
            @// Registers written on exit:
            @//   subFFTSize = 8
            @//   subFFTNum  = entry subFFTNum >> 3
            @//   pSrc       = final pDst - pointStep; when the set count is
            @//                a multiple of 4 this is the start of the block
            @//                that was just written
            @//   pDst       = pPingPongBuf
            @//
            @// Also clobbers grpSize/setCount, pointStep, setStep, step1,
            @// step2, t0, D0-D31 and the condition flags.
            @//
            @// Scheduling notes:
            @//   - data[0]..data[6] of the following group are loaded inside
            @//     the loop body, before the exit branch; on the last pass
            @//     those loads still execute but their results are unused.
            @//   - The flags set by SUBS at the top of the loop are consumed
            @//     by BGT at the bottom; nothing in between sets flags.
200
201         @// Define stack arguments
202
203         @// Update pSubFFTSize and pSubFFTNum regs
204         MOV     subFFTSize,#8                               @// subFFTSize = 8 after this radix-8 first stage
205         LDR     t0,=ONEBYSQRT2                              @// t0=(1/sqrt(2)) as Q15 format
206
207         @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
208         LSR     grpSize,subFFTNum,#3
209         MOV     subFFTNum,grpSize
210
211
212         @// One complex point (16-bit real + 16-bit imaginary) occupies 4 bytes
213         @// pointStep is a byte stride: 4*grpSize bytes = grpSize complex points
214         @// Note: outPointStep = pointStep for firststage
215
216         MOV     pointStep,grpSize,LSL #2
217
218
219         @// Calculate the step of input data for the next set
220         @//MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
221         VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
222         MOV     step1,grpSize,LSL #3                           @// step1 = 8*grpSize = 2*pointStep
223
224         MOV     step2,pointStep,LSL #3
225         VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
226         SUB     step2,step2,pointStep                          @// step2 = 7*pointStep
227         RSB     setStep,step2,#16                              @// setStep = - 7*pointStep+16
228
229
230
231         VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
232         VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
233         VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
234         VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
235         VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
236         @// grp = 0 a special case since all the twiddle factors are 1
237         @// Loop on the sets : 4 sets at a time
238
239 grpZeroSetLoop\name:
240         VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7] & update pSrc for the next set
241                                                            @//  setStep = -7*pointStep + 16
242
243         @// Decrement setcount
244         SUBS    setCount,setCount,#4                    @// decrement the set loop counter
245
246
247         .ifeqs "\scaled", "TRUE"
248             @// finish first stage of 8 point FFT
249
250             VHADD    qU0,qX0,qX4
251             VHADD    qU2,qX1,qX5
252             VHADD    qU4,qX2,qX6
253             VHADD    qU6,qX3,qX7
254
255             @// finish second stage of 8 point FFT
256
257             VHADD    qV0,qU0,qU4
258             VHSUB    qV2,qU0,qU4
259             VHADD    qV4,qU2,qU6
260             VHSUB    qV6,qU2,qU6
261
262             @// finish third stage of 8 point FFT
263
264             VHADD    qY0,qV0,qV4
265             VHSUB    qY4,qV0,qV4
266             VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
267
268             .ifeqs  "\inverse", "TRUE"
269
270                 VHSUB    dYr2,dVr2,dVi6
271                 VHADD    dYi2,dVi2,dVr6
272
273                 VHADD    dYr6,dVr2,dVi6
274                 VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
275                 VHSUB    dYi6,dVi2,dVr6
276
277                 VHSUB    qU1,qX0,qX4
278                 VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
279
280                 VHSUB    qU3,qX1,qX5
281                 VHSUB    qU5,qX2,qX6
282                 VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
283
284             .ELSE
285
286                 VHADD    dYr6,dVr2,dVi6
287                 VHSUB    dYi6,dVi2,dVr6
288
289                 VHSUB    dYr2,dVr2,dVi6
290                 VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2 (forward path: held in the Y6 registers)
291                 VHADD    dYi2,dVi2,dVr6
292
293
294                 VHSUB    qU1,qX0,qX4
295                 VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
296                 VHSUB    qU3,qX1,qX5
297                 VHSUB    qU5,qX2,qX6
298                 VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6 (forward path: held in the Y2 registers)
299
300
301             .ENDIF
302
303             @// finish first stage of 8 point FFT
304
305             VHSUB    qU7,qX3,qX7
306             VMOV    dT0[0],t0                                       @// D0 lane 0 = 1/sqrt(2); X7 (Q0) is not read again before its reload
307
308             @// finish second stage of 8 point FFT
309
310             VHSUB    dVr1,dUr1,dUi5
311             VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
312             VHADD    dVi1,dUi1,dUr5
313             VHADD    dVr3,dUr1,dUi5
314             VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
315             VHSUB    dVi3,dUi1,dUr5
316
317             VHSUB    dVr5,dUr3,dUi7
318             VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
319             VHADD    dVi5,dUi3,dUr7
320             VHADD    dVr7,dUr3,dUi7
321             VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
322             VHSUB    dVi7,dUi3,dUr7
323
324             @// finish third stage of 8 point FFT
325
326             .ifeqs  "\inverse", "TRUE"
327
328                 @// calculate a*v5
329                 VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1, free now that U7 has been formed
330                 VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
331                 VQRDMULH    dVi5,dVi5,dT0[0]
332
333                 VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
334                 VSUB    dVr5,dT1,dVi5                               @// a * V5
335                 VADD    dVi5,dT1,dVi5
336
337                 VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
338
339                 @// calculate  b*v7
340                 VQRDMULH    dT1,dVr7,dT0[0]
341                 VQRDMULH    dVi7,dVi7,dT0[0]
342
343                 VHADD    qY1,qV1,qV5
344                 VHSUB    qY5,qV1,qV5
345
346
347                 VADD    dVr7,dT1,dVi7                               @// b * V7
348                 VSUB    dVi7,dVi7,dT1
349                 SUB     pDst, pDst, step2                           @// set pDst to y1
350
351                 VHSUB    dYr3,dVr3,dVr7
352                 VHSUB    dYi3,dVi3,dVi7
353                 VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
354                 VHADD    dYr7,dVr3,dVr7
355                 VHADD    dYi7,dVi3,dVi7
356
357
358                 VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
359                 VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
360 #if 0
361                 VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
362 #else
363                 VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7; writeback advances pDst by the 16 bytes stored
364 #endif
365             .ELSE
366
367                 @// calculate  b*v7
368                 VQRDMULH    dT1,dVr7,dT0[0]
369                 VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
370                 VQRDMULH    dVi7,dVi7,dT0[0]
371
372                 VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
373                 VADD    dVr7,dT1,dVi7                               @// b * V7
374                 VSUB    dVi7,dVi7,dT1
375
376                 VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
377
378                 @// calculate a*v5
379                 VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1, free now that U7 has been formed
380                 VQRDMULH    dVi5,dVi5,dT0[0]
381
382                 VHADD    dYr7,dVr3,dVr7
383                 VHADD    dYi7,dVi3,dVi7
384                 SUB     pDst, pDst, step2                           @// set pDst to y1
385
386                 VSUB    dVr5,dT1,dVi5                               @// a * V5
387                 VADD    dVi5,dT1,dVi5
388
389                 VHSUB    qY5,qV1,qV5
390
391                 VHSUB    dYr3,dVr3,dVr7
392                 VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1 (forward path: held in the Y7 registers)
393                 VHSUB    dYi3,dVi3,dVi7
394                 VHADD    qY1,qV1,qV5
395
396
397                 VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3 (forward path: held in the Y5 registers)
398                 VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5 (forward path: held in the Y3 registers)
399 #if 0
400                 VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
401 #else
402                 VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7 (forward path: held in the Y1 registers); writeback adds 16
403 #endif
404
405             .ENDIF
406
407
408
409         .ELSE
410             @// finish first stage of 8 point FFT
411
412             VADD    qU0,qX0,qX4
413             VADD    qU2,qX1,qX5
414             VADD    qU4,qX2,qX6
415             VADD    qU6,qX3,qX7
416
417             @// finish second stage of 8 point FFT
418
419             VADD    qV0,qU0,qU4
420             VSUB    qV2,qU0,qU4
421             VADD    qV4,qU2,qU6
422             VSUB    qV6,qU2,qU6
423
424             @// finish third stage of 8 point FFT
425
426             VADD    qY0,qV0,qV4
427             VSUB    qY4,qV0,qV4
428             VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
429
430             .ifeqs  "\inverse", "TRUE"
431
432                 VSUB    dYr2,dVr2,dVi6
433                 VADD    dYi2,dVi2,dVr6
434
435                 VADD    dYr6,dVr2,dVi6
436                 VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
437                 VSUB    dYi6,dVi2,dVr6
438
439                 VSUB    qU1,qX0,qX4
440                 VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
441
442                 VSUB    qU3,qX1,qX5
443                 VSUB    qU5,qX2,qX6
444                 VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
445
446             .ELSE
447
448                 VADD    dYr6,dVr2,dVi6
449                 VSUB    dYi6,dVi2,dVr6
450
451                 VSUB    dYr2,dVr2,dVi6
452                 VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2 (forward path: held in the Y6 registers)
453                 VADD    dYi2,dVi2,dVr6
454
455
456                 VSUB    qU1,qX0,qX4
457                 VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
458                 VSUB    qU3,qX1,qX5
459                 VSUB    qU5,qX2,qX6
460                 VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6 (forward path: held in the Y2 registers)
461
462
463             .ENDIF
464
465             @// finish first stage of 8 point FFT
466
467             VSUB    qU7,qX3,qX7
468             VMOV    dT0[0],t0                                       @// D0 lane 0 = 1/sqrt(2); X7 (Q0) is not read again before its reload
469
470             @// finish second stage of 8 point FFT
471
472             VSUB    dVr1,dUr1,dUi5
473             VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
474             VADD    dVi1,dUi1,dUr5
475             VADD    dVr3,dUr1,dUi5
476             VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
477             VSUB    dVi3,dUi1,dUr5
478
479             VSUB    dVr5,dUr3,dUi7
480             VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
481             VADD    dVi5,dUi3,dUr7
482             VADD    dVr7,dUr3,dUi7
483             VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
484             VSUB    dVi7,dUi3,dUr7
485
486             @// finish third stage of 8 point FFT
487
488             .ifeqs  "\inverse", "TRUE"
489
490                 @// calculate a*v5
491                 VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1, free now that U7 has been formed
492                 VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
493                 VQRDMULH    dVi5,dVi5,dT0[0]
494
495                 VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
496                 VSUB    dVr5,dT1,dVi5                               @// a * V5
497                 VADD    dVi5,dT1,dVi5
498
499                 VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
500
501                 @// calculate  b*v7
502                 VQRDMULH    dT1,dVr7,dT0[0]
503                 VQRDMULH    dVi7,dVi7,dT0[0]
504
505                 VADD    qY1,qV1,qV5
506                 VSUB    qY5,qV1,qV5
507
508
509                 VADD    dVr7,dT1,dVi7                               @// b * V7
510                 VSUB    dVi7,dVi7,dT1
511                 SUB     pDst, pDst, step2                           @// set pDst to y1
512
513                 VSUB    dYr3,dVr3,dVr7
514                 VSUB    dYi3,dVi3,dVi7
515                 VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
516                 VADD    dYr7,dVr3,dVr7
517                 VADD    dYi7,dVi3,dVi7
518
519
520                 VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
521                 VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
522 #if 0
523                 VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
524 #else
525                 VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7; writeback advances pDst by the 16 bytes stored
526 #endif
527             .ELSE
528
529                 @// calculate  b*v7
530                 VQRDMULH    dT1,dVr7,dT0[0]
531                 VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
532                 VQRDMULH    dVi7,dVi7,dT0[0]
533
534                 VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
535                 VADD    dVr7,dT1,dVi7                               @// b * V7
536                 VSUB    dVi7,dVi7,dT1
537
538                 VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
539
540                 @// calculate a*v5
541                 VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1, free now that U7 has been formed
542                 VQRDMULH    dVi5,dVi5,dT0[0]
543
544                 VADD    dYr7,dVr3,dVr7
545                 VADD    dYi7,dVi3,dVi7
546                 SUB     pDst, pDst, step2                           @// set pDst to y1
547
548                 VSUB    dVr5,dT1,dVi5                               @// a * V5
549                 VADD    dVi5,dT1,dVi5
550
551                 VSUB    qY5,qV1,qV5
552
553                 VSUB    dYr3,dVr3,dVr7
554                 VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1 (forward path: held in the Y7 registers)
555                 VSUB    dYi3,dVi3,dVi7
556                 VADD    qY1,qV1,qV5
557
558
559                 VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3 (forward path: held in the Y5 registers)
560                 VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5 (forward path: held in the Y3 registers)
561 #if 0
562                 VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
563 #else
564                 VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7 (forward path: held in the Y1 registers); writeback adds 16
565 #endif
566
567             .ENDIF
568
569
570         .ENDIF
571
572         SUB     pDst, pDst, step2                               @// update pDst for the next set
573         BGT     grpZeroSetLoop\name
574
575
576         @// reset pSrc to pDst for the next stage
577         SUB     pSrc,pDst,pointStep                             @// pSrc = pDst - pointStep (pointStep = 4*grpSize bytes)
578         MOV     pDst,pPingPongBuf
579
580
581
582         .endm
583
584
585         @// Allocate stack memory required by the function
586
587
@// Forward, unscaled variant: expands FFTSTAGE with scaled=FALSE and
@// inverse=FALSE, using FWD as the loop-label suffix.
@// Arguments arrive in the registers aliased at the top of this file
@// (pSrc, pDst, subFFTNum, pPingPongBuf); there is no stack interface here.
@// NOTE(review): M_START / M_END come from armCOMM_s.h, which is not visible
@// here; the r4 argument presumably bounds the core registers the prologue
@// saves. FFTSTAGE also writes r3, r7-r11 and D0-D31 - confirm that callers
@// of this entry point preserve whatever M_START does not.
588         M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
589             FFTSTAGE "FALSE","FALSE",FWD
590         M_END
591
592
@// Inverse, unscaled variant: expands FFTSTAGE with scaled=FALSE and
@// inverse=TRUE, using INV as the loop-label suffix.
@// Arguments arrive in the registers aliased at the top of this file
@// (pSrc, pDst, subFFTNum, pPingPongBuf); there is no stack interface here.
@// NOTE(review): M_START / M_END come from armCOMM_s.h, which is not visible
@// here; the r4 argument presumably bounds the core registers the prologue
@// saves. FFTSTAGE also writes r3, r7-r11 and D0-D31 - confirm that callers
@// of this entry point preserve whatever M_START does not.
593         M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
594             FFTSTAGE "FALSE","TRUE",INV
595         M_END
596
597
@// Forward, scaled variant: expands FFTSTAGE with scaled=TRUE (halving
@// VHADD/VHSUB butterflies) and inverse=FALSE, using FWDSFS as the
@// loop-label suffix.
@// Arguments arrive in the registers aliased at the top of this file
@// (pSrc, pDst, subFFTNum, pPingPongBuf); there is no stack interface here.
@// NOTE(review): M_START / M_END come from armCOMM_s.h, which is not visible
@// here; the r4 argument presumably bounds the core registers the prologue
@// saves. FFTSTAGE also writes r3, r7-r11 and D0-D31 - confirm that callers
@// of this entry point preserve whatever M_START does not.
598         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
599             FFTSTAGE "TRUE","FALSE",FWDSFS
600         M_END
601
602
@// Inverse, scaled variant: expands FFTSTAGE with scaled=TRUE (halving
@// VHADD/VHSUB butterflies) and inverse=TRUE, using INVSFS as the
@// loop-label suffix.
@// Arguments arrive in the registers aliased at the top of this file
@// (pSrc, pDst, subFFTNum, pPingPongBuf); there is no stack interface here.
@// NOTE(review): M_START / M_END come from armCOMM_s.h, which is not visible
@// here; the r4 argument presumably bounds the core registers the prologue
@// saves. FFTSTAGE also writes r3, r7-r11 and D0-D31 - confirm that callers
@// of this entry point preserve whatever M_START does not.
603         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
604             FFTSTAGE "TRUE","TRUE",INVSFS
605         M_END
606
607
608
609
610
611     .END