ff4e6e244a4a4ed3247e28aed5430ea82635abad
[platform/upstream/ffmpeg.git] / libavcodec / aarch64 / aacpsdsp_neon.S
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18
19 #include "libavutil/aarch64/asm.S"
20
// Accumulate the sum of squares of adjacent float pairs into a float array:
//     x0[i] += x1[2*i]^2 + x1[2*i + 1]^2
//
// In:  x0 = float accumulators, updated in place
//      x1 = source floats, consumed as consecutive pairs
//           (presumably interleaved re/im samples -- confirm with the C prototype)
//      w2 = number of accumulator elements to process
//
// Four accumulators are handled per pass and the exit test sits at the bottom
// of the loop, so the body always runs at least once; the routine therefore
// relies on w2 being a positive multiple of 4.
function ff_ps_add_squares_neon, export=1
1:      ld1         {v16.4s, v17.4s}, [x1], #32     // four source pairs
        ld1         {v0.4s}, [x0]                   // current accumulators
        subs        w2, w2, #4                      // flags are consumed by b.gt below
        fmul        v16.4s, v16.4s, v16.4s
        fmul        v17.4s, v17.4s, v17.4s
        faddp       v18.4s, v16.4s, v17.4s          // a^2 + b^2 for each pair
        fadd        v0.4s, v0.4s, v18.4s
        st1         {v0.4s}, [x0], #16
        b.gt        1b
        ret
endfunc
33
// Scale each pair of floats by one scalar taken from a second array:
//     x0[2*i]     = x1[2*i]     * x2[i]
//     x0[2*i + 1] = x1[2*i + 1] * x2[i]
//
// In:  x0 = destination pairs
//      x1 = source pairs
//      x2 = one float scale factor per pair
//      w3 = number of pairs
//
// Four pairs are produced per pass and the loop is bottom-tested, so the
// routine relies on w3 being a positive multiple of 4.
function ff_ps_mul_pair_single_neon, export=1
1:      ld1         {v16.4s}, [x2], #16             // s0 s1 s2 s3
        ld1         {v4.4s, v5.4s}, [x1], #32       // four source pairs
        subs        w3, w3, #4                      // flags are consumed by b.gt below
        zip1        v17.4s, v16.4s, v16.4s          // s0 s0 s1 s1
        zip2        v18.4s, v16.4s, v16.4s          // s2 s2 s3 s3
        fmul        v4.4s, v4.4s, v17.4s
        fmul        v5.4s, v5.4s, v18.4s
        st1         {v4.4s, v5.4s}, [x0], #32
        b.gt        1b
        ret
endfunc
46
// In-place 2x2 mix of two streams of float pairs with linearly stepped
// coefficients.
//
// In:  x0 = first stream  (pairs p = {p0, p1}), overwritten
//      x1 = second stream (pairs q = {q0, q1}), overwritten
//      x2 = four start coefficients c0..c3
//      x3 = four per-iteration increments d0..d3
//      w4 = number of pairs per stream
//
// For every pair, the coefficients are advanced first (c += d) and then
//      [x0] = c0 * p + c2 * q
//      [x1] = c1 * p + c3 * q
// The loop is bottom-tested, so it executes at least once.
function ff_ps_stereo_interpolate_neon, export=1
        ld1         {v16.4s}, [x2]                  // c0 c1 c2 c3
        ld1         {v17.4s}, [x3]                  // d0 d1 d2 d3
        zip1        v0.4s, v16.4s, v16.4s           // c0 c0 c1 c1
        zip2        v1.4s, v16.4s, v16.4s           // c2 c2 c3 c3
        zip1        v2.4s, v17.4s, v17.4s           // d0 d0 d1 d1
        zip2        v3.4s, v17.4s, v17.4s           // d2 d2 d3 d3
1:      ld1         {v4.2s}, [x0]                   // p0 p1
        ld1         {v5.2s}, [x1]                   // q0 q1
        subs        w4, w4, #1                      // flags are consumed by b.gt below
        fadd        v0.4s, v0.4s, v2.4s             // step the coefficients before use
        fadd        v1.4s, v1.4s, v3.4s
        mov         v4.d[1], v4.d[0]                // p0 p1 p0 p1
        mov         v5.d[1], v5.d[0]                // q0 q1 q0 q1
        fmul        v4.4s, v4.4s, v0.4s
        fmla        v4.4s, v5.4s, v1.4s
        st1         {v4.d}[0], [x0], #8             // low half  -> first stream
        st1         {v4.d}[1], [x1], #8             // high half -> second stream
        b.gt        1b
        ret
endfunc
68
// In-place 2x2 mix of two streams of float pairs, like the plain stereo
// interpolation but with a second coefficient set applied to the lane-swapped
// pair with alternating sign. This has the form of a complex multiply if each
// pair is read as (re, im).
//
// In:  x0 = first stream  (pairs p = {p0, p1}), overwritten
//      x1 = second stream (pairs q = {q0, q1}), overwritten
//      x2 = eight start coefficients c0..c7
//      x3 = eight per-iteration increments d0..d7
//      w4 = number of pairs per stream
//
// For every pair, all coefficients are advanced first (c += d) and then
//      [x0][0] = c0*p0 + c2*q0 - c4*p1 - c6*q1
//      [x0][1] = c0*p1 + c2*q1 + c4*p0 + c6*q0
//      [x1][0] = c1*p0 + c3*q0 - c5*p1 - c7*q1
//      [x1][1] = c1*p1 + c3*q1 + c5*p0 + c7*q0
// The loop is bottom-tested, so it executes at least once.
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1         {v4.4s, v5.4s}, [x2]            // v4 = c0..c3, v5 = c4..c7
        ld1         {v6.4s, v7.4s}, [x3]            // v6 = d0..d3, v7 = d4..d7
        fneg        v2.4s, v5.4s                    // -c4..-c7
        fneg        v3.4s, v7.4s                    // -d4..-d7
        zip1        v16.4s, v4.4s, v4.4s            //  c0  c0  c1  c1
        zip2        v17.4s, v4.4s, v4.4s            //  c2  c2  c3  c3
        zip1        v18.4s, v2.4s, v5.4s            // -c4  c4 -c5  c5
        zip2        v19.4s, v2.4s, v5.4s            // -c6  c6 -c7  c7
        zip1        v20.4s, v6.4s, v6.4s            // increments, same lane layout
        zip2        v21.4s, v6.4s, v6.4s
        zip1        v22.4s, v3.4s, v7.4s
        zip2        v23.4s, v3.4s, v7.4s
1:      ld1         {v0.2s}, [x0]                   // p0 p1
        ld1         {v1.2s}, [x1]                   // q0 q1
        fadd        v16.4s, v16.4s, v20.4s          // step all coefficients before use
        fadd        v17.4s, v17.4s, v21.4s
        fadd        v18.4s, v18.4s, v22.4s
        fadd        v19.4s, v19.4s, v23.4s
        mov         v0.d[1], v0.d[0]                // p0 p1 p0 p1
        mov         v1.d[1], v1.d[0]                // q0 q1 q0 q1
        fmul        v2.4s, v0.4s, v16.4s
        fmla        v2.4s, v1.4s, v17.4s
        ext         v0.16b, v0.16b, v0.16b, #4      // p1 p0 p1 p0
        ext         v1.16b, v1.16b, v1.16b, #4      // q1 q0 q1 q0
        fmla        v2.4s, v0.4s, v18.4s
        fmla        v2.4s, v1.4s, v19.4s
        st1         {v2.d}[0], [x0], #8             // low half  -> first stream
        st1         {v2.d}[1], [x1], #8             // high half -> second stream
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc
102
// Apply a bank of 13-tap filters to 13 interleaved {re, im} input samples.
// Only 7 coefficient pairs are read per filter: taps j and 12-j share one
// coefficient, so the input is folded into sums/differences once up front.
//
// In:  x0 = output, one {re, im} pair per filter
//      x1 = 13 input samples in[0..12], interleaved re/im
//      x2 = filters, 64 bytes each: coefficient pairs f[0..6] are used
//           (f[6] real part only), the last 8 bytes are skipped
//      x3 = distance between consecutive outputs, in units of 8 bytes
//      w4 = number of filters
//
// Per filter:
//      out.re = f6.re*re6 + sum_{j<6} f[j].re*(re[j] + re[12-j]) - f[j].im*(im[j] - im[12-j])
//      out.im = f6.re*im6 + sum_{j<6} f[j].re*(im[j] + im[12-j]) + f[j].im*(re[j] - re[12-j])
// The loop is bottom-tested, so it executes at least once.
function ff_ps_hybrid_analysis_neon, export=1
        ld2         {v0.4s, v1.4s}, [x1], #32       // v0 = re0..re3       v1 = im0..im3
        ld2         {v2.2s, v3.2s}, [x1], #16       // v2 = re4 re5        v3 = im4 im5
        ld1         {v24.2s},       [x1], #8        // v24 = re6 im6 (centre tap)
        ld2         {v4.2s, v5.2s}, [x1], #16       // v4 = re7 re8        v5 = im7 im8
        ld2         {v6.4s, v7.4s}, [x1]            // v6 = re9..re12      v7 = im9..im12
        lsl         x3, x3, #3                      // output step in bytes
        rev64       v6.4s, v6.4s
        ext         v6.16b, v6.16b, v6.16b, #8      // re12 re11 re10 re9
        rev64       v7.4s, v7.4s
        ext         v7.16b, v7.16b, v7.16b, #8      // im12 im11 im10 im9
        rev64       v4.2s, v4.2s                    // re8 re7
        rev64       v5.2s, v5.2s                    // im8 im7
        mov         v2.d[1], v3.d[0]                // re4 re5 im4 im5
        mov         v4.d[1], v5.d[0]                // re8 re7 im8 im7
        mov         v5.d[1], v2.d[0]                // im8 im7 re4 re5
        mov         v3.d[1], v4.d[0]                // im4 im5 re8 re7
        fadd        v16.4s, v0.4s, v6.4s            // re[j] + re[12-j], j = 0..3
        fadd        v17.4s, v1.4s, v7.4s            // im[j] + im[12-j]
        fsub        v18.4s, v1.4s, v7.4s            // im[j] - im[12-j]
        fsub        v19.4s, v0.4s, v6.4s            // re[j] - re[12-j]
        fadd        v22.4s, v2.4s, v4.4s            // re4+re8 re5+re7 im4+im8 im5+im7
        fsub        v23.4s, v5.4s, v3.4s            // im8-im4 im7-im5 re4-re8 re5-re7
        trn1        v20.2d, v22.2d, v23.2d          // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2        v21.2d, v22.2d, v23.2d          // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2         {v2.4s, v3.4s}, [x2], #32       // v2 = f0..f3 re      v3 = f0..f3 im
        ld2         {v4.2s, v5.2s}, [x2], #16       // v4 = f4 f5 re       v5 = f4 f5 im
        ld1         {v6.2s},       [x2], #8         // f6.re f6.im
        add         x2, x2, #8                      // skip the unused last 8 bytes of the filter
        subs        w4, w4, #1                      // flags are consumed by b.gt below
        mov         v4.d[1], v5.d[0]                // f4.re f5.re f4.im f5.im
        fmul        v0.4s, v2.4s, v16.4s            // partial real sums
        fmul        v1.4s, v2.4s, v17.4s            // partial imaginary sums
        fmls        v0.4s, v3.4s, v18.4s
        fmla        v1.4s, v3.4s, v19.4s
        fmla        v0.4s, v4.4s, v20.4s
        fmla        v1.4s, v4.4s, v21.4s
        mov         v6.s[1], v6.s[0]                // f6.re f6.re
        fmul        v6.2s, v6.2s, v24.2s            // f6.re*re6 f6.re*im6
        faddp       v0.4s, v0.4s, v1.4s
        faddp       v0.4s, v0.4s, v0.4s             // lanes 0/1 = summed re / im
        fadd        v0.2s, v0.2s, v6.2s             // add the centre-tap term
        st1         {v0.2s}, [x0], x3
        b.gt        1b
        ret
endfunc