Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / third_party / ffmpeg / libavcodec / aarch64 / opus_imdct_neon.S
1 /*
2  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/aarch64/asm.S"
22
23 #include "asm-offsets.h"
24
// shuffle a, b, c, d: emit a 16-byte constant named shuffle_<a><b><c><d>
// whose four 4-byte groups hold the byte indices of 32-bit lanes a, b, c
// and d (lane n covers bytes n*4 .. n*4+3).  fft5_neon loads these constants
// and uses them as TBL index vectors to permute the .4s lanes of a register.
// const/endconst are not defined in this file; presumably they are provided
// by the included asm.S.
25 .macro shuffle a, b, c, d
26 const shuffle_\a\b\c\d, align=4
27         .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
28         .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
29         .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
30         .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
31 endconst
32 .endm
33
// The four lane permutations whose addresses fft_b15_calc_neon places in
// x11, x12, x13 and x14 (in this order).
34 shuffle 0, 2, 1, 3
35 shuffle 1, 0, 3, 2
36 shuffle 2, 3, 0, 1
37 shuffle 3, 1, 2, 0
38
39
// fft5_neon: 5-point complex FFT kernel (internal leaf routine).
//
// In:    x1      = pointer to the first complex float input (re, im pairs)
//        x2      = distance between consecutive inputs, in complex elements
//                  (converted to bytes here by multiplying with 8)
//        v15.4s  = the fact5 constants (loaded by fft_b15_calc_neon)
//        x11-x14 = addresses of shuffle_0213, shuffle_1032, shuffle_2301 and
//                  shuffle_3120 (set up by fft_b15_calc_neon)
// Out:   v0.2s   = out[0] = sum of the five inputs (re, im)
//        v6.4s   = real parts of the remaining four outputs
//        v7.4s   = imaginary parts of the remaining four outputs
// Clobb: x1, x2 (both modified in place), v16-v31
// The shuffle table loads are interleaved with the arithmetic; keep the
// instruction order when editing.
40 function fft5_neon
41         lsl             x2,  x2,  #3            // stride: complex elements -> bytes
42         ld1             {v24.2s},         [x1],  x2 // in[0] (re, im)
43         ld2             {v25.s,v26.s}[0], [x1],  x2 // in[1..4]: re -> v25, im -> v26
44         ld2             {v25.s,v26.s}[1], [x1],  x2
45         ld2             {v25.s,v26.s}[2], [x1],  x2
46         ld2             {v25.s,v26.s}[3], [x1]
47         dup             v6.4s,  v24.s[0]        // in[0].re in every lane
48         dup             v7.4s,  v24.s[1]        // in[0].im in every lane
49
50         faddp           v0.4s,  v25.4s, v26.4s  // pairwise sums of the re and im lanes
51         // z[][0], z[][3]
52         fmul            v16.4s, v25.4s, v15.s[0] // rr
53         fmul            v17.4s, v25.4s, v15.s[1] // ri
54         fmul            v18.4s, v26.4s, v15.s[0] // ir
55         fmul            v19.4s, v26.4s, v15.s[1] // ii
56         faddp           v0.4s,  v0.4s,  v0.4s   // s[0] = sum re(in[1..4]), s[1] = sum im(in[1..4])
57         // z[][1], z[][2]
58         fmul            v20.4s, v25.4s, v15.s[2] // rr
59         fmul            v21.4s, v25.4s, v15.s[3] // ri
60         fmul            v22.4s, v26.4s, v15.s[2] // ir
61         fmul            v23.4s, v26.4s, v15.s[3] // ii
62         fadd            v0.2s,  v24.2s, v0.2s   // out[0]
63
64         // z[0123][0], z[0123][3]
65         fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
66         fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
67         ld1             {v16.16b},  [x11]       // shuffle_0213 (v16 is free again)
68         ld1             {v19.16b},  [x14]       // shuffle_3120
69         fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
70         fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
71         ld1             {v17.16b},  [x12]       // shuffle_1032
72         // z[0123][1], z[0123][2]
73         fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
74         fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
75         ld1             {v18.16b},  [x13]       // shuffle_2301
76         fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
77         fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;
78
79         //real
        // Permute each partial product so that lane k of every register
        // holds the term that contributes to the same output.
80         tbl             v20.16b, {v24.16b}, v16.16b
81         tbl             v21.16b, {v25.16b}, v17.16b
82         tbl             v22.16b, {v26.16b}, v18.16b
83         tbl             v23.16b, {v27.16b}, v19.16b
84         //imag
        // The index registers are consumed here: each tbl overwrites its own
        // table with the permuted imaginary terms.
85         tbl             v16.16b, {v28.16b}, v16.16b
86         tbl             v17.16b, {v29.16b}, v17.16b
87         tbl             v18.16b, {v30.16b}, v18.16b
88         tbl             v19.16b, {v31.16b}, v19.16b
89
        // Accumulate in[0] plus the four permuted terms, real into v6 and
        // imaginary into v7.
90         fadd            v6.4s,  v6.4s,  v20.4s
91         fadd            v22.4s, v22.4s, v23.4s
92         fadd            v7.4s,  v7.4s,  v16.4s
93         fadd            v18.4s, v18.4s, v19.4s
94
95         fadd            v21.4s, v21.4s, v22.4s
96         fadd            v17.4s, v17.4s, v18.4s
97         fadd            v6.4s,  v6.4s,  v21.4s
98         fadd            v7.4s,  v7.4s,  v17.4s
99
100         ret
101 endfunc
102
// fft15_neon: 15-point complex FFT built from three fft5_neon calls.
//
// In:    x0  = output pointer; 15 complex floats (120 bytes) are stored and
//              x0 is advanced past them
//        x1  = input pointer (complex floats)
//        x3  = input stride in complex elements
//        x30 = return address (kept in x9 across the fft5_neon calls)
//        v8-v14 = twiddle factors exp[1..14] of s->exptab[0] in the layout
//              prepared by fft_b15_calc_neon; fft5_neon additionally needs
//              v15 and x11-x14
// Clobb: x1, x2, x8, x9, v0-v7, v16-v31
// The three 5-point transforms read the inputs starting at in + 1 * stride,
// in + 2 * stride and in + 0 * stride, each stepping by 3 * stride.
103 function fft15_neon
104         mov             x8,  x1                 // keep the input base
105         mov             x9,  x30                // bl below overwrites x30
106         add             x2,  x3,  x3,  lsl #1   // 3 * stride
107
108         add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
109         bl              fft5_neon
110         mov             v1.8b,   v0.8b          // save out[0] of transform 1
111         mov             v2.16b,  v6.16b         // transform 1: real parts
112         mov             v3.16b,  v7.16b         // transform 1: imaginary parts
113
114         add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
115         add             x2,  x3,  x3,  lsl #1   // 3 * stride (fft5_neon shifted x2 in place)
116         bl              fft5_neon
117         zip1            v1.4s,   v1.4s,  v0.4s  // {t1.re, t2.re, t1.im, t2.im} of the two out[0]
118         mov             v4.16b,  v6.16b         // transform 2: real parts
119         mov             v5.16b,  v7.16b         // transform 2: imaginary parts
120
121         mov             x1,  x8                 // in + 0 * stride
122         add             x2,  x3,  x3,  lsl #1   // 3 * stride
123         bl              fft5_neon
124
        // Transform 0 stays in v0 / v6 / v7 from here on.
125         faddp           v20.4s, v1.4s,  v1.4s   // t1 + t2 (re, im)
126
127         ext             v18.16b, v8.16b,  v8.16b,  #4 // rotate exp[1-4].re by one lane
128         ext             v19.16b, v9.16b,  v9.16b,  #4 // rotate exp[1-4].im by one lane
129         mov             v16.16b, v6.16b
130         mov             v17.16b, v7.16b
131         fadd            v20.2s, v20.2s, v0.2s   // + t0
132
133         uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
134         uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im
135
136         st1             {v20.2s},  [x0], #8     // out[0]
137
        // out[1-4] = t0 + t1 * exp[1,2,3,4] + t2 * exp[2,4,6,8]
138         fmla            v16.4s, v2.4s,  v8.4s
139         fmls            v16.4s, v3.4s,  v9.4s
140
141         fmla            v17.4s, v2.4s,  v9.4s
142         fmla            v17.4s, v3.4s,  v8.4s
143
144         fmla            v16.4s, v4.4s,  v18.4s
145         fmls            v16.4s, v5.4s,  v19.4s
146
147         fmla            v17.4s, v4.4s,  v19.4s
148         fmla            v17.4s, v5.4s,  v18.4s
149
150         zip1            v18.4s, v16.4s, v17.4s  // interleave re/im for the store
151         zip2            v19.4s, v16.4s, v17.4s
152
        // out[5] and out[10]: combine the three out[0] values with exp[5]
        // and exp[10], which are packed in v14.
153         rev64           v31.4s, v14.4s
154         trn1            v28.2d, v1.2d,  v1.2d   // {t1.re, t2.re, t1.re, t2.re}
155         trn2            v29.2d, v1.2d,  v1.2d   // {t1.im, t2.im, t1.im, t2.im}
156         zip1            v30.2d, v14.2d, v31.2d
157         zip2            v31.2d, v14.2d, v31.2d
158
159         st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]
160
161         fmul            v16.4s, v28.4s, v30.4s
162         fmul            v17.4s, v29.4s, v30.4s
163         fmls            v16.4s, v29.4s, v31.4s
164         fmla            v17.4s, v28.4s, v31.4s
165         faddp           v16.4s, v16.4s, v16.4s
166         faddp           v17.4s, v17.4s, v17.4s
167         zip1            v18.2s, v16.2s, v17.2s
168         zip2            v19.2s, v16.2s, v17.2s
169
170         fadd            v18.2s, v18.2s, v0.2s
171         fadd            v0.2s,  v19.2s, v0.2s
172
173         ext             v30.16b, v12.16b, v12.16b, #4
174         ext             v31.16b, v13.16b, v13.16b, #4
175         mov             v16.16b, v6.16b
176         mov             v17.16b, v7.16b
177
178         uzp1            v30.4s, v30.4s, v8.4s   // exp[12,14,1,3].re
179         uzp1            v31.4s, v31.4s, v9.4s   // exp[12,14,1,3].im
180
181         st1             {v18.2s},  [x0], #8     // out[5]
182
        // out[6-9] = t0 + t1 * exp[6,7,8,9] + t2 * exp[12,14,1,3]
183         fmla            v16.4s, v2.4s,  v10.4s
184         fmls            v16.4s, v3.4s,  v11.4s
185
186         fmla            v17.4s, v2.4s,  v11.4s
187         fmla            v17.4s, v3.4s,  v10.4s
188
189         fmla            v16.4s, v4.4s,  v30.4s
190         fmls            v16.4s, v5.4s,  v31.4s
191
192         fmla            v17.4s, v4.4s,  v31.4s
193         fmla            v17.4s, v5.4s,  v30.4s
194
195         zip1            v18.4s, v16.4s, v17.4s
196         zip2            v19.4s, v16.4s, v17.4s
197
198         ext             v30.16b, v10.16b, v10.16b, #4
199         ext             v31.16b, v11.16b, v11.16b, #4
200
        // out[11-14] = t0 + t1 * exp[11,12,13,14] + t2 * exp[7,9,11,13],
        // accumulated directly in v6 / v7.
201         fmla            v6.4s,  v2.4s,  v12.4s
202         fmls            v6.4s,  v3.4s,  v13.4s
203
204         st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]
205
206         uzp1            v30.4s, v30.4s, v12.4s  // exp[7,9,11,13].re
207         uzp1            v31.4s, v31.4s, v13.4s  // exp[7,9,11,13].im
208
209         fmla            v7.4s,  v2.4s,  v13.4s
210         fmla            v7.4s,  v3.4s,  v12.4s
211
212         st1             {v0.2s},  [x0], #8     // out[10]
213
214         fmla            v6.4s,  v4.4s,  v30.4s
215         fmls            v6.4s,  v5.4s,  v31.4s
216
217         fmla            v7.4s,  v4.4s,  v31.4s
218         fmla            v7.4s,  v5.4s,  v30.4s
219
220         zip1            v18.4s, v6.4s,  v7.4s
221         zip2            v19.4s, v6.4s,  v7.4s
222
223         st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]
224
225         ret             x9                      // return through the saved link register
226 endfunc
227
// fft15_pass: in-place butterfly pass combining two half-size transforms.
// For every index k in [0, len2):
//     t       = x1[k] * exptab[k]        (complex multiply)
//     x1[k]   = x0[k] - t
//     x0[k]   = x0[k] + t
// Elements are complex floats (8 bytes each).
//
// Clobb: x0-x6, v0-v7, v16-v23, flags.  Leaf routine, returns with ret.
// x0/x1 are the read pointers, x4/x5 the matching write pointers.
// If len2 is not a multiple of 4, the first (len2 & 3) elements are handled
// one at a time.  The very first of them is a plain add/sub and its twiddle
// is skipped, i.e. exptab[0] is treated as 1.
// NOTE(review): the vector part at label 9 always loads and processes at
// least four elements, so it assumes (len2 & ~3) >= 4 - confirm for callers.
228 // x0: out, x1: out+len2, x2: exptab, x3: len2
229 function fft15_pass
230         ands            x6,  x3,  #3            // x6 = number of scalar head elements
231         mov             x4,  x0
232         mov             x5,  x1
233         b.eq            9f                      // len2 is a multiple of 4: vector path only
234         ld1             {v0.2s},  [x0], #8
235         ld1             {v1.2s},  [x1], #8
236         sub             x3,  x3,  x6            // x3 = elements left for the vector loop
237         subs            x6,  x6,  #1            // flags are consumed by the b.eq below
238         fadd            v2.2s,  v0.2s,  v1.2s
239         fsub            v3.2s,  v0.2s,  v1.2s
240         add             x2,  x2,  #8            // skip the twiddle of element 0
241         st1             {v2.2s},  [x4], #8
242         st1             {v3.2s},  [x5], #8
243         b.eq            9f
244 1:
        // Scalar butterfly: one complex element per iteration.
245         subs            x6,  x6,  #1
246         ldp             s4,  s5,  [x2], #8      // twiddle (re, im)
247         ldp             s2,  s3,  [x1], #8      // upper-half element
248         ldp             s0,  s1,  [x0], #8      // lower-half element
249
250         fmul            s6,  s2,  s4
251         fmul            s7,  s2,  s5
252         fmls            s6,  s3,  v5.s[0]       // t.re = re * tw.re - im * tw.im
253         fmla            s7,  s3,  v4.s[0]       // t.im = re * tw.im + im * tw.re
254
255         fsub            s2,  s0,  s6
256         fsub            s3,  s1,  s7
257         fadd            s0,  s0,  s6
258         fadd            s1,  s1,  s7
259
260         stp             s2,  s3,  [x5], #8
261         stp             s0,  s1,  [x4], #8
262         b.gt            1b
263 9:
        // Vector path: four complex elements per block, two blocks per loop
        // iteration.  The loads of the next block are issued while the
        // current block is being computed.
264         ld1             {v4.4s,v5.4s}, [x2],  #32
265         ld2             {v2.4s,v3.4s}, [x1],  #32
266         uzp1            v6.4s,  v4.4s,  v5.4s   // twiddle re
267         uzp2            v7.4s,  v4.4s,  v5.4s   // twiddle im
268         ld2             {v0.4s,v1.4s}, [x0],  #32
269 8:
270         subs            x3,  x3,  #8            // flags feed both b.lt and b.eq below
271
272         fmul            v4.4s,  v2.4s,  v6.4s
273         fmul            v5.4s,  v2.4s,  v7.4s
274         b.lt            4f                      // only the block already loaded remains
275
276         ld1             {v18.4s,v19.4s}, [x2],  #32
277
278         fmls            v4.4s,  v3.4s,  v7.4s
279         fmla            v5.4s,  v3.4s,  v6.4s
280
281         ld2             {v22.4s,v23.4s}, [x1],  #32
282
283         fsub            v2.4s,  v0.4s,  v4.4s
284         fadd            v0.4s,  v0.4s,  v4.4s
285         fsub            v3.4s,  v1.4s,  v5.4s
286         fadd            v1.4s,  v1.4s,  v5.4s
287
288         uzp1            v16.4s, v18.4s, v19.4s
289         uzp2            v17.4s, v18.4s, v19.4s
290
291         st2             {v2.4s,v3.4s}, [x5],  #32
292         st2             {v0.4s,v1.4s}, [x4],  #32
293         ld2             {v20.4s,v21.4s}, [x0],  #32
294
295         fmul            v18.4s, v22.4s, v16.4s
296         fmul            v19.4s, v22.4s, v17.4s
297         b.eq            0f                      // second block was the last one
298
299         ld1             {v4.4s,v5.4s}, [x2],  #32
300
301         fmls            v18.4s, v23.4s, v17.4s
302         fmla            v19.4s, v23.4s, v16.4s
303
304         ld2             {v2.4s,v3.4s}, [x1],  #32
305
306         fsub            v22.4s, v20.4s, v18.4s
307         fadd            v20.4s, v20.4s, v18.4s
308         fsub            v23.4s, v21.4s, v19.4s
309         fadd            v21.4s, v21.4s, v19.4s
310
311         uzp1            v6.4s,  v4.4s,  v5.4s
312         uzp2            v7.4s,  v4.4s,  v5.4s
313
314         st2             {v22.4s,v23.4s}, [x5],  #32
315         st2             {v20.4s,v21.4s}, [x4],  #32
316         ld2             {v0.4s,v1.4s}, [x0],  #32
317
318         b               8b
319 4:
        // Exit A: finish the block held in v0-v3 / v6-v7.
320         fmls            v4.4s,  v3.4s,  v7.4s
321         fmla            v5.4s,  v3.4s,  v6.4s
322
323         fsub            v2.4s,  v0.4s,  v4.4s
324         fadd            v0.4s,  v0.4s,  v4.4s
325         fsub            v3.4s,  v1.4s,  v5.4s
326         fadd            v1.4s,  v1.4s,  v5.4s
327
328         st2             {v2.4s,v3.4s}, [x5],  #32
329         st2             {v0.4s,v1.4s}, [x4],  #32
330
331         ret
332 0:
        // Exit B: finish the block held in v20-v23 / v16-v17.
333         fmls            v18.4s, v23.4s, v17.4s
334         fmla            v19.4s, v23.4s, v16.4s
335
336         fsub            v22.4s, v20.4s, v18.4s
337         fadd            v20.4s, v20.4s, v18.4s
338         fsub            v23.4s, v21.4s, v19.4s
339         fadd            v21.4s, v21.4s, v19.4s
340
341         st2             {v22.4s,v23.4s}, [x5],  #32
342         st2             {v20.4s,v21.4s}, [x4],  #32
343
344         ret
345 endfunc
346
// fft30_neon: 30-point complex FFT = two interleaved 15-point FFTs followed
// by one fft15_pass butterfly pass.
//
// In:    x1  = output pointer (complex floats)
//        x2  = input pointer (complex floats)
//        x4  = input stride in complex elements
//        x10 = context pointer; s->exptab[1] is read from
//              [x10, #(CELT_EXPTAB + 8)]
//        plus the vector/x11-x14 state required by fft15_neon
// The incoming x0 and x3 are not read; both are overwritten before use.
// x20-x22 and x30 are saved in a 0x20-byte stack frame and restored before
// the tail call, so fft15_pass returns directly to this function's caller.
347 function fft30_neon, align=6
348         sub             sp,  sp,  #0x20
349         stp             x20, x21, [sp]
350         stp             x22, x30, [sp, #0x10]
351         mov             x21, x1                 // x21 = out
352         mov             x22, x2                 // x22 = in
353         mov             x20, x4                 // x20 = stride
354         mov             x0,  x21
355         mov             x1,  x22
356         lsl             x3,  x20, #1            // every second input: stride * 2
357         bl              fft15_neon
358
359         add             x0,  x21, #15*8         // second half of the output
360         add             x1,  x22, x20,  lsl #3  // in + 1 * stride (8 bytes per element)
361         lsl             x3,  x20, #1
362         bl              fft15_neon
363
364         ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
365         add             x0,  x21, #0
366         add             x1,  x21, #15*8
367         mov             x3,  #15                // len2 for fft15_pass
368         ldp             x20, x21, [sp]
369         ldp             x22, x30, [sp, #0x10]
370         add             sp,  sp,  #0x20
371         b               fft15_pass              // tail call
372 endfunc
373
// def_fft n, n2: define fft<n>_neon, an n-point complex FFT made of two
// fft<n2>_neon calls (n2 = n / 2 in every instantiation below) followed by
// one fft15_pass butterfly pass over the two halves.
//
// Register contract of the generated functions:
// In:    x1  = output pointer (complex floats)
//        x2  = input pointer (complex floats)
//        x3  = N, the index into s->exptab used for this size; N - 1 is
//              handed to the half-size transforms
//        x4  = input stride in complex elements
//        x10 = context pointer (base for CELT_EXPTAB)
// The incoming x0 is not read.  x20-x24 and x30 are saved in a 0x30-byte
// frame; x24 is saved and restored but not otherwise used here.
// The frame is released before the tail call to fft15_pass.
374 .macro  def_fft n, n2
375 function fft\n\()_neon, align=6
376         sub             sp,  sp,  #0x30
377         stp             x20, x21, [sp]
378         stp             x22, x30, [sp, #0x10]
379         stp             x23, x24, [sp, #0x20]
380         mov             x21, x1                 // x21 = out
381         mov             x22, x2                 // x22 = in
382         mov             x23, x3                 // x23 = N
383         mov             x20, x4                 // x20 = stride
384         sub             x3,  x3,  #1            // half-size transform uses N - 1
385         lsl             x4,  x4,  #1            // and every second input
386         bl              fft\n2\()_neon
387
388         add             x1,  x21, #(\n2 * 8)    // second half of the output
389         add             x2,  x22, x20, lsl #3   // in + 1 * stride (8 bytes per element)
390         sub             x3,  x23, #1
391         lsl             x4,  x20, #1
392         bl              fft\n2\()_neon
393
394         add             x5,  x10, #CELT_EXPTAB
395         mov             x0,  x21
396         ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
397         add             x1,  x21, #(\n2 * 8)
398         mov             x3,  #\n2               // len2 for fft15_pass
399         ldp             x20, x21, [sp]
400         ldp             x22, x30, [sp, #0x10]
401         ldp             x23, x24, [sp, #0x20]
402         add             sp,  sp,  #0x30
403         b               fft15_pass              // tail call
404 endfunc
405 .endm
406
// Each size is built on the previous one; fft30_neon is defined separately.
407         def_fft    60,  30
408         def_fft   120,  60
409         def_fft   240, 120
410         def_fft   480, 240
411         def_fft   960, 480
412
// fft_b15_calc_neon: set up the shared register state for the FFT routines
// in this file and dispatch through fft_tab_neon.
//
// In:    x0 = context pointer (also copied to x10 for the callees)
//        x3 = index into fft_tab_neon (8-byte entries)
//        x1, x2, x4 are passed through unchanged to the selected routine
// State established for the callees:
//        x10     = context pointer
//        x11-x14 = shuffle_0213, shuffle_1032, shuffle_2301, shuffle_3120
//        v8-v13  = exp[1-4], exp[6-9], exp[11-14] of s->exptab[0],
//                  de-interleaved into separate re and im registers
//        v14     = exp[5] and exp[10]
//        v15     = fact5 constants
// A 0x50-byte frame holds x20, x30 and d8-d15.  Only the low 64 bits of
// v8-v15 are callee-saved under AAPCS64, which is what is stored here.
// NOTE(review): entry 0 of the table (fft15_neon) takes its pointers in
// x0/x1 and its stride in x3, unlike the other entries - confirm that index
// 0 is never requested.
413 function fft_b15_calc_neon
414         sub             sp,  sp,  #0x50
415         ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
416         movrel          x6,  fact5
417         movrel          x11, shuffle_0213
418         movrel          x12, shuffle_1032
419         movrel          x13, shuffle_2301
420         movrel          x14, shuffle_3120
421         add             x8,  x8,  #8            // skip exp[0]
422         movrel          x5,  fft_tab_neon
423         stp             x20, x30, [sp]
424         stp             d8,  d9,  [sp, #0x10]
425         stp             d10, d11, [sp, #0x20]
426         stp             d12, d13, [sp, #0x30]
427         stp             d14, d15, [sp, #0x40]
428         ld1             {v15.4s}, [x6]
429         ld1             {v0.4s,v1.4s},   [x8],  #32 // exp[1-4]
430         ld1             {v6.2s},  [x8],  #8     // exp[5]
431         ld1             {v2.4s,v3.4s},   [x8],  #32 // exp[6-9]
432         ld1             {v7.2s},  [x8],  #8     // exp[10]
433         ld1             {v4.4s,v5.4s},   [x8],  #32 // exp[11-14]
434         uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
435         uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
436         uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
437         uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
438         uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
439         uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
440         zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
441         add             x5,  x5,  x3,  lsl #3   // &fft_tab_neon[x3]
442         ldr             x5,  [x5]
443         mov             x10, x0
444         blr             x5
445         ldp             x20, x30, [sp]
446         ldp             d8,  d9,  [sp, #0x10]
447         ldp             d10, d11, [sp, #0x20]
448         ldp             d12, d13, [sp, #0x30]
449         ldp             d14, d15, [sp, #0x40]
450         add             sp,  sp,  #0x50
451         ret
452 endfunc
453
// Dispatch table used by fft_b15_calc_neon: entry i is the routine for an
// FFT of 15 * 2^i points and is selected with the index passed in x3.
// NOTE(review): fft15_neon reads out/in/stride from x0/x1/x3 while the other
// entries read them from x1/x2/x4, so entry 0 does not match the register
// setup done by fft_b15_calc_neon - confirm that index 0 is never used.
454 const   fft_tab_neon
455         .quad fft15_neon
456         .quad fft30_neon
457         .quad fft60_neon
458         .quad fft120_neon
459         .quad fft240_neon
460         .quad fft480_neon
461         .quad fft960_neon
462 endconst
463
// ff_celt_imdct_half_neon (exported): half IMDCT for CELT.
//
// In:    x0 = context pointer (fields read through the CELT_* offsets)
//        x1 = dst (float output, kept in x21 across the FFT call)
//        x2 = src (float input)
//        x3 = src stride in floats
//        s0 = scale, spilled to [sp, #0x10] and reloaded into s31
// Three stages:
//   1. pre-rotation: pairs one sample read backwards from the end of src
//      with one read forwards from the start, multiplies the pair as a
//      complex number by the twiddle table and stores it to the tmp buffer;
//   2. fft_b15_calc_neon(context, dst, tmp, fft_n, stride = 1);
//   3. post-rotation in place on dst, working outwards from the middle in
//      both directions, with the scale applied.
// Frame: 0x20 bytes holding x21, x30 and the scale.
// Both loops are software-pipelined (loads for the next step are issued
// between the arithmetic of the current one); keep the instruction order.
464 function ff_celt_imdct_half_neon, export=1
465         sub             sp,  sp,  #0x20
466         stp             x21, x30, [sp]
467         str             s0, [sp, #0x10]        // scale is needed again after the FFT call
468
469         ldp             w5,  w6,  [x0,  #CELT_LEN2] // CELT_LEN4
470         mov             x10, x0                 // NOTE(review): overwritten by the ldp below before any read
471         mov             x21, x1
472         sub             w5,  w5,  #1
473         lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
474         sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
475         mul             x5,  x5,  x3            // (len2 - 1) * stride
476         ldp             x9,  x10, [x0,  #CELT_TMP]  // CELT_TWIDDLE
477         ldr             w3,  [x0, #CELT_FFT_N] // becomes the table index for fft_b15_calc_neon
478         add             x5,  x2,  x5,  lsl #2   // x5 = src + (len2 - 1) * stride, read backwards
479         mov             x11, x9                 // keep the start of tmp for the FFT call
480
        // Prologue of the pre-rotation loop: first four sample pairs and
        // the first four twiddles (re in v2, im in v3).
481         sub             w6,  w6,  #4
482         ld1             {v0.s}[0],  [x5], x8
483         ld1             {v1.s}[0],  [x2], x7
484         ld1             {v4.4s,v5.4s}, [x10], #32
485         ld1             {v0.s}[1],  [x5], x8
486         ld1             {v1.s}[1],  [x2], x7
487         uzp1            v2.4s,  v4.4s,  v5.4s
488         ld1             {v0.s}[2],  [x5], x8
489         ld1             {v1.s}[2],  [x2], x7
490         uzp2            v3.4s,  v4.4s,  v5.4s
491         ld1             {v0.s}[3],  [x5], x8
492         ld1             {v1.s}[3],  [x2], x7
493 1:
        // Each iteration handles two groups of four pairs: v0/v1, then
        // v20/v21.
494         subs            w6,  w6,  #4            // flags are consumed by the b.eq further down
495
496         ld1             {v20.s}[0], [x5], x8
497         ld1             {v21.s}[0], [x2], x7
498         ld1             {v4.4s,v5.4s}, [x10], #32
499
500         fmul            v6.4s,  v0.4s,  v2.4s
501         fmul            v7.4s,  v0.4s,  v3.4s
502
503         ld1             {v20.s}[1], [x5], x8
504         ld1             {v21.s}[1], [x2], x7
505
506         fmls            v6.4s,  v1.4s,  v3.4s   // re = a * tw.re - b * tw.im
507         fmla            v7.4s,  v1.4s,  v2.4s   // im = a * tw.im + b * tw.re
508
509         ld1             {v20.s}[2], [x5], x8
510         ld1             {v21.s}[2], [x2], x7
511
512         uzp1            v2.4s,  v4.4s,  v5.4s
513         uzp2            v3.4s,  v4.4s,  v5.4s
514         ld1             {v20.s}[3], [x5], x8
515         ld1             {v21.s}[3], [x2], x7
516
517         zip1            v4.4s,  v6.4s,  v7.4s   // interleave re/im for the store
518         zip2            v5.4s,  v6.4s,  v7.4s
519
520         fmul            v6.4s,  v20.4s, v2.4s
521         fmul            v7.4s,  v20.4s, v3.4s
522
523         st1             {v4.4s,v5.4s}, [x9], #32
524
525         fmls            v6.4s,  v21.4s, v3.4s
526         fmla            v7.4s,  v21.4s, v2.4s
527
528         b.eq            3f                      // second group was the last one
529
530         subs            w6,  w6,  #4
531         ld1             {v4.4s,v5.4s}, [x10], #32
532         ld1             {v0.s}[0],  [x5], x8
533         ld1             {v1.s}[0],  [x2], x7
534         uzp1            v2.4s,  v4.4s,  v5.4s
535         ld1             {v0.s}[1],  [x5], x8
536         ld1             {v1.s}[1],  [x2], x7
537         uzp2            v3.4s,  v4.4s,  v5.4s
538         ld1             {v0.s}[2],  [x5], x8
539         ld1             {v1.s}[2],  [x2], x7
540         zip1            v4.4s,  v6.4s,  v7.4s
541         zip2            v5.4s,  v6.4s,  v7.4s
542         ld1             {v0.s}[3],  [x5], x8
543         ld1             {v1.s}[3],  [x2], x7
544
545         st1             {v4.4s,v5.4s}, [x9], #32
546
547         b.gt            1b
548
        // Loop tail: the group loaded last into v0/v1 is still pending.
549         fmul            v6.4s,  v0.4s,  v2.4s
550         fmul            v7.4s,  v0.4s,  v3.4s
551         fmls            v6.4s,  v1.4s,  v3.4s
552         fmla            v7.4s,  v1.4s,  v2.4s
553 3:
554         zip1            v4.4s,  v6.4s,  v7.4s
555         zip2            v5.4s,  v6.4s,  v7.4s
556         st1             {v4.4s,v5.4s}, [x9], #32
557
        // FFT: x0 = context and x1 = dst are still untouched, x3 = fft_n.
558         mov             x2,  x11                // in = start of tmp
559         mov             x4,  #1                 // stride
560
561         bl              fft_b15_calc_neon
562
        // fft_b15_calc_neon leaves the context pointer in x10.
563         ldr             w5,  [x10, #CELT_LEN4]
564         ldr             x6,  [x10, #CELT_TWIDDLE]
565         ldr             s31, [sp, #0x10]        // scale
566
        // Post-rotation: x0/x2 walk downwards and x1/x3 upwards from the
        // middle of dst and of the twiddle table, 16 bytes per step.
        // x10/x11 are the matching store pointers (x10 is reused here; the
        // context pointer is no longer needed).
567         add             x1,  x21, x5,  lsl #2
568         add             x3,  x6,  x5,  lsl #2
569         sub             x0,  x1,  #16
570         sub             x2,  x3,  #16
571         mov             x8,  #-16
572         mov             x7,  #16
573         mov             x10, x0
574         mov             x11, x1
575
576         sub             w5,  w5,  #4
577
578         ld1             {v0.4s},  [x0], x8
579         ld1             {v1.4s},  [x1], x7
580         ld1             {v2.4s},  [x2], x8
581         ld1             {v3.4s},  [x3], x7
582
583         uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
584         uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im
585
586         uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
587         uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
588
589         fmul            v1.4s,  v6.4s,  v5.4s
590         fmul            v0.4s,  v6.4s,  v7.4s
591 2:
592         subs            w5,  w5,  #4            // flags are consumed by the b.gt at the loop end
593
594         ld1             {v20.4s}, [x0], x8
595
596         fmla            v1.4s,  v4.4s,  v7.4s   // v1 = im * tw.re + re * tw.im
597         fmls            v0.4s,  v4.4s,  v5.4s   // v0 = im * tw.im - re * tw.re
598
599         ld1             {v21.4s}, [x1], x7
600
601         ext             v1.16b, v1.16b, v1.16b, #8 // ext + rev64 reverse the four lanes of v1
602         fmul            v0.4s,  v0.4s,  v31.s[0]
603
604         ld1             {v2.4s},  [x2], x8
605
606         rev64           v1.4s,  v1.4s
607         fmul            v1.4s,  v1.4s,  v31.s[0]
608
609         ld1             {v3.4s},  [x3], x7
610
611         zip1            v5.4s,  v0.4s,  v1.4s
612         zip2            v7.4s,  v0.4s,  v1.4s
613
614         uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
615         uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im
616
617         st1             {v5.4s},  [x10], x8
618         st1             {v7.4s},  [x11], x7
619
620         uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
621         uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
622
623         fmul            v1.4s,  v6.4s,  v5.4s
624         fmul            v0.4s,  v6.4s,  v7.4s
625         b.gt            2b
626
        // Loop tail: finish and store the last group.
627         fmla            v1.4s,  v4.4s,  v7.4s
628         fmls            v0.4s,  v4.4s,  v5.4s
629         ext             v1.16b, v1.16b, v1.16b, #8
630         fmul            v0.4s,  v0.4s,  v31.s[0]
631         rev64           v1.4s,  v1.4s
632         fmul            v1.4s,  v1.4s,  v31.s[0]
633         zip1            v5.4s,  v0.4s,  v1.4s
634         zip2            v7.4s,  v0.4s,  v1.4s
635         st1             {v5.4s},  [x10], x8
636         st1             {v7.4s},  [x11], x7
637
638         ldp             x21, x30, [sp]
639         add             sp,  sp,  #0x20
640         ret
641 endfunc
642
// Twiddle constants for the 5-point FFT, stored as (re, im) pairs.
// fft_b15_calc_neon loads all four floats into v15, where fft5_neon uses
// them by lane: s[0] = cos(2*pi/5), s[1] = sin(2*pi/5),
// s[2] = cos(4*pi/5), s[3] = sin(4*pi/5).
643 // [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
644 const   fact5,          align=4
645         .float           0.30901699437494745, 0.95105651629515353
646         .float          -0.80901699437494734, 0.58778525229247325
647 endconst