Imported Upstream version 6.1
[platform/upstream/ffmpeg.git] / libavcodec / aarch64 / h264dsp_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "libavutil/aarch64/asm.S"
24 #include "neon.S"
25
// Common entry check for the non-intra (clipped) loop filters.
//
// In:    w2 = alpha, w3 = beta
//        x4 = pointer to four signed clipping bytes (presumably the tc0[4]
//             array of the C prototype -- confirm against the caller)
// Out:   w6 and v24.s[0] = the 32-bit word loaded from [x4]
// Clobb: w8, flags
//
// Executes `ret` for the *enclosing function* when alpha == 0, when
// beta == 0, or when all four bytes at [x4] are negative; otherwise
// execution continues after the macro (label 2).
//
// The ccmp fallback flags are #4 (Z set) so that alpha == 0 takes the same
// early return as beta == 0.  With alpha == 0 no lane can pass the
// "abs(p0 - q0) < alpha" test, so nothing would be filtered anyway; returning
// here merely skips the loads and mask computation.
26 .macro  h264_loop_filter_start
27         cmp             w2,  #0
28         ldr             w6,  [x4]
29         ccmp            w3,  #0, #4, ne                 // Z = (alpha == 0) || (beta == 0)
30         mov             v24.s[0], w6
31         and             w8,  w6,  w6,  lsl #16          // plain and: flags from ccmp stay live
32         b.eq            1f
33         ands            w8,  w8,  w8,  lsl #8           // bit 31 = AND of the four sign bits
34         b.ge            2f                              // N clear: at least one byte is >= 0
35 1:
36         ret
37 2:
38 .endm
39
// Clipped luma deblocking filter core for 16 pixel positions.
//
// In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2 (16 bytes each)
//        v24.s[0] = four signed clipping bytes (written by h264_loop_filter_start)
//        w2 = alpha, w3 = beta
// Out:   v17 = p1', v16 = p0', v0 = q0', v19 = q1'
// Clobb: x7, v4, v20-v24, v28, v30
// Exit:  branches to label 9 of the enclosing function, before any of the
//        output registers holding pixels (v16, v0) is modified, when no lane
//        passes the alpha / beta / clipping-sign tests.
40 .macro  h264_loop_filter_luma
41         dup             v22.16b, w2                     // alpha
42         uxtl            v24.8h,  v24.8b
43         uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
44         uxtl            v24.4s,  v24.4h                 // one clipping byte per 32-bit lane
45         uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
46         sli             v24.8h,  v24.8h,  #8
47         uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
48         sli             v24.4s,  v24.4s,  #16           // each clipping byte now fills 4 adjacent bytes
49         cmhi            v21.16b, v22.16b, v21.16b       // < alpha
50         dup             v22.16b, w3                     // beta
51         cmlt            v23.16b, v24.16b, #0            // lanes whose clipping byte is negative
52         cmhi            v28.16b, v22.16b, v28.16b       // < beta
53         cmhi            v30.16b, v22.16b, v30.16b       // < beta
54         bic             v21.16b, v21.16b, v23.16b       // negative clipping byte: lane is not filtered
55         uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
56         and             v21.16b, v21.16b, v28.16b
57         uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
58         and             v21.16b, v21.16b, v30.16b      // v21 = combined per-lane filter mask
59         shrn            v30.8b,  v21.8h,  #4            // squeeze the 128-bit mask into 64 bits
60         mov             x7, v30.d[0]
61         cmhi            v17.16b, v22.16b, v17.16b       // < beta
62         cmhi            v19.16b, v22.16b, v19.16b       // < beta
63         cbz             x7,  9f                         // no lane to filter
64         and             v17.16b, v17.16b, v21.16b       // p-side condition, filtered lanes only
65         and             v19.16b, v19.16b, v21.16b       // q-side condition, filtered lanes only
66         and             v24.16b, v24.16b, v21.16b       // clipping value, 0 in unfiltered lanes
67         urhadd          v28.16b, v16.16b,  v0.16b       // (p0 + q0 + 1) >> 1
68         sub             v21.16b, v24.16b, v17.16b       // tc = clip + 1 where p-side mask is set (mask == -1)
69         uqadd           v23.16b, v18.16b, v24.16b       // p1 + clip (saturating)
70         uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
71         sub             v21.16b, v21.16b, v19.16b       // tc += 1 where q-side mask is set
72         uhadd           v28.16b,  v4.16b, v28.16b       // (q2 + avg) >> 1
73         umin            v23.16b, v23.16b, v20.16b
74         uqsub           v22.16b, v18.16b, v24.16b       // p1 - clip (saturating)
75         uqadd           v4.16b,   v2.16b, v24.16b       // q1 + clip (saturating)
76         umax            v23.16b, v23.16b, v22.16b       // p1 candidate clamped to p1 +/- clip
77         uqsub           v22.16b,  v2.16b, v24.16b       // q1 - clip (saturating)
78         umin            v28.16b,  v4.16b, v28.16b
79         uxtl            v4.8h,    v0.8b
80         umax            v28.16b, v28.16b, v22.16b       // q1 candidate clamped to q1 +/- clip
81         uxtl2           v20.8h,   v0.16b
82         usubw           v4.8h,    v4.8h,  v16.8b        // q0 - p0 (16-bit)
83         usubw2          v20.8h,  v20.8h,  v16.16b
84         shl             v4.8h,    v4.8h,  #2
85         shl             v20.8h,  v20.8h,  #2
86         uaddw           v4.8h,    v4.8h,  v18.8b        // + p1
87         uaddw2          v20.8h,  v20.8h,  v18.16b
88         usubw           v4.8h,    v4.8h,   v2.8b        // - q1
89         usubw2          v20.8h,  v20.8h,   v2.16b
90         rshrn           v4.8b,    v4.8h,  #3            // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
91         rshrn2          v4.16b,  v20.8h,  #3
92         bsl             v17.16b, v23.16b, v18.16b       // p1' = candidate where p-side mask set, else p1
93         bsl             v19.16b, v28.16b,  v2.16b       // q1' = candidate where q-side mask set, else q1
94         neg             v23.16b, v21.16b                // -tc
95         uxtl            v28.8h,  v16.8b
96         smin            v4.16b,   v4.16b, v21.16b       // delta = min(delta, tc)
97         uxtl2           v21.8h,  v16.16b
98         smax            v4.16b,   v4.16b, v23.16b       // delta = max(delta, -tc)
99         uxtl            v22.8h,   v0.8b
100         uxtl2           v24.8h,   v0.16b
101         saddw           v28.8h,  v28.8h,  v4.8b        // p0 + delta
102         saddw2          v21.8h,  v21.8h,  v4.16b
103         ssubw           v22.8h,  v22.8h,  v4.8b        // q0 - delta
104         ssubw2          v24.8h,  v24.8h,  v4.16b
105         sqxtun          v16.8b,  v28.8h                // saturate back to unsigned bytes
106         sqxtun2         v16.16b, v21.8h
107         sqxtun          v0.8b,   v22.8h
108         sqxtun2         v0.16b,  v24.8h
109 .endm
110
// Clipped luma filter across an edge lying between two rows, 16 pixels wide.
//
// In:    x0 = address of the q0 row (first row after the edge)
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta, x4 = pointer to four clipping bytes
// Loads rows q0..q2 and p2..p0, runs h264_loop_filter_luma and stores the
// p1, p0, q0, q1 rows.  Returns without storing when the start macro or the
// filter macro (label 9) finds nothing to filter.
111 function ff_h264_v_loop_filter_luma_neon, export=1
112         h264_loop_filter_start
113
114         ld1             {v0.16b},  [x0], x1            // q0
115         ld1             {v2.16b},  [x0], x1            // q1
116         ld1             {v4.16b},  [x0], x1            // q2
117         sub             x0,  x0,  x1, lsl #2
118         sub             x0,  x0,  x1, lsl #1           // back 6 rows in total: x0 -> p2 row
119         ld1             {v20.16b},  [x0], x1           // p2
120         ld1             {v18.16b},  [x0], x1           // p1
121         ld1             {v16.16b},  [x0], x1           // p0; x0 is at the q0 row again
122
123         h264_loop_filter_luma
124
125         sub             x0,  x0,  x1, lsl #1           // x0 -> p1 row
126         st1             {v17.16b},  [x0], x1           // p1'
127         st1             {v16.16b}, [x0], x1            // p0'
128         st1             {v0.16b},  [x0], x1            // q0'
129         st1             {v19.16b}, [x0]                // q1'
130 9:
131         ret
132 endfunc
133
// Clipped luma filter across an edge lying between two columns, 16 rows tall.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta, x4 = pointer to four clipping bytes
// Reads 8 bytes starting 4 bytes left of x0 from each of 16 rows, transposes
// them into per-column vectors, runs h264_loop_filter_luma, transposes the
// four modified columns back and writes 4 bytes per row starting 2 bytes left
// of the original x0.
// NOTE(review): the register-to-column mapping (v6 = leftmost column ... v26 =
// rightmost) and the lane order used by the stores rely on the layout
// produced by transpose_8x16B / transpose_4x16B from neon.S -- not visible
// here, confirm there.
134 function ff_h264_h_loop_filter_luma_neon, export=1
135         h264_loop_filter_start
136
137         sub             x0,  x0,  #4
138         ld1             {v6.8b},  [x0], x1             // rows 0-7 into the low halves
139         ld1             {v20.8b}, [x0], x1
140         ld1             {v18.8b}, [x0], x1
141         ld1             {v16.8b}, [x0], x1
142         ld1             {v0.8b},  [x0], x1
143         ld1             {v2.8b},  [x0], x1
144         ld1             {v4.8b},  [x0], x1
145         ld1             {v26.8b}, [x0], x1
146         ld1             {v6.d}[1],  [x0], x1           // rows 8-15 into the high halves
147         ld1             {v20.d}[1], [x0], x1
148         ld1             {v18.d}[1], [x0], x1
149         ld1             {v16.d}[1], [x0], x1
150         ld1             {v0.d}[1],  [x0], x1
151         ld1             {v2.d}[1],  [x0], x1
152         ld1             {v4.d}[1],  [x0], x1
153         ld1             {v26.d}[1], [x0], x1
154
155         transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
156
157         h264_loop_filter_luma
158
159         transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
160
161         sub             x0,  x0,  x1, lsl #4           // rewind the 16 rows advanced by the loads
162         add             x0,  x0,  #2                   // 2 bytes left of the edge
163         st1             {v17.s}[0],  [x0], x1
164         st1             {v16.s}[0], [x0], x1
165         st1             {v0.s}[0],  [x0], x1
166         st1             {v19.s}[0], [x0], x1
167         st1             {v17.s}[1],  [x0], x1
168         st1             {v16.s}[1], [x0], x1
169         st1             {v0.s}[1],  [x0], x1
170         st1             {v19.s}[1], [x0], x1
171         st1             {v17.s}[2],  [x0], x1
172         st1             {v16.s}[2], [x0], x1
173         st1             {v0.s}[2],  [x0], x1
174         st1             {v19.s}[2], [x0], x1
175         st1             {v17.s}[3],  [x0], x1
176         st1             {v16.s}[3], [x0], x1
177         st1             {v0.s}[3],  [x0], x1
178         st1             {v19.s}[3], [x0], x1
179 9:
180         ret
181 endfunc
182
183
// Common entry check for the intra (unclipped) loop filters.
//
// In:    w2 = alpha, w3 = beta
// Out:   v30 = alpha in every byte, v31 = beta in every byte
// Clobb: w4
//
// Executes `ret` for the enclosing function only when alpha and beta are
// *both* zero (the test is on alpha | beta).  A single zero threshold is
// not caught here; it yields an all-zero mask in the filter macros instead.
184 .macro h264_loop_filter_start_intra
185         orr             w4,  w2,  w3
186         cbnz            w4,  1f
187         ret
188 1:
189         dup             v30.16b, w2                // alpha
190         dup             v31.16b, w3                // beta
191 .endm
192
// Intra (strong) luma deblocking filter core for 16 pixel positions.
//
// In:    v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//        v0 = q0, v1 = q1, v2 = q2, v3 = q3   (16 bytes each)
//        v30 = alpha, v31 = beta (broadcast by h264_loop_filter_start_intra)
// Out:   v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in the
//        lanes selected by the masks; v4 and v3 are only read.
// Clobb: x4, v16-v31
// Exit:  branches to label 9 of the enclosing function when no lane passes
//        the basic alpha/beta test (if_1).
//
// Masks, as named in the existing comments:
//   if_1 = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta
//   if_2 = abs(p0-q0) < (alpha >> 2) + 2
//   if_3 = abs(p2-p0) < beta,  if_4 = abs(q2-q0) < beta
// "_1" results are the short filter (only p0/q0 change), "_2" results the
// long filter (three pixels per side change).
193 .macro h264_loop_filter_luma_intra
194         uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
195         uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
196         uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
197         cmhi            v19.16b, v30.16b, v16.16b       // < alpha
198         cmhi            v17.16b, v31.16b, v17.16b       // < beta
199         cmhi            v18.16b, v31.16b, v18.16b       // < beta
200
201         movi            v29.16b, #2
202         ushr            v30.16b, v30.16b, #2            // alpha >> 2
203         add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
204         cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2
205
206         and             v19.16b, v19.16b, v17.16b
207         and             v19.16b, v19.16b, v18.16b       // v19 = if_1
208         shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
209         mov             x4, v20.d[0]
210         cbz             x4, 9f
211
212         ushll           v20.8h,  v6.8b,   #1            // 2*p1
213         ushll           v22.8h,  v1.8b,   #1            // 2*q1
214         ushll2          v21.8h,  v6.16b,  #1
215         ushll2          v23.8h,  v1.16b,  #1
216         uaddw           v20.8h,  v20.8h,  v7.8b         // + p0
217         uaddw           v22.8h,  v22.8h,  v0.8b         // + q0
218         uaddw2          v21.8h,  v21.8h,  v7.16b
219         uaddw2          v23.8h,  v23.8h,  v0.16b
220         uaddw           v20.8h,  v20.8h,  v1.8b         // v20/v21 = 2*p1 + p0 + q1
221         uaddw           v22.8h,  v22.8h,  v6.8b         // v22/v23 = 2*q1 + q0 + p1
222         uaddw2          v21.8h,  v21.8h,  v1.16b
223         uaddw2          v23.8h,  v23.8h,  v6.16b
224
225         rshrn           v24.8b,  v20.8h,  #2 // p0'_1
226         rshrn           v25.8b,  v22.8h,  #2 // q0'_1
227         rshrn2          v24.16b, v21.8h,  #2 // p0'_1
228         rshrn2          v25.16b, v23.8h,  #2 // q0'_1
229
230         uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
231         uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
232         cmhi            v17.16b, v31.16b, v17.16b       // < beta
233         cmhi            v18.16b, v31.16b, v18.16b       // < beta
234
235         and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
236         and             v18.16b, v16.16b, v18.16b  // if_2 && if_4
237
238         not             v30.16b, v17.16b
239         not             v31.16b, v18.16b
240
241         and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
242         and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)
243
244         and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
245         and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4
246
247         //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
248         uaddl           v26.8h,  v5.8b,   v7.8b         // p2 + p0
249         uaddl2          v27.8h,  v5.16b,  v7.16b
250         uaddw           v26.8h,  v26.8h,  v0.8b         // v26/v27 = p2 + p0 + q0
251         uaddw2          v27.8h,  v27.8h,  v0.16b
252         add             v20.8h,  v20.8h,  v26.8h
253         add             v21.8h,  v21.8h,  v27.8h
254         uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
255         uaddw2          v21.8h,  v21.8h,  v0.16b
256         rshrn           v20.8b,  v20.8h,  #3 // p0'_2
257         rshrn2          v20.16b, v21.8h,  #3 // p0'_2
258         uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
259         uaddw2          v27.8h,  v27.8h,  v6.16b
260         rshrn           v21.8b,  v26.8h,  #2 // p1'_2
261         rshrn2          v21.16b, v27.8h,  #2 // p1'_2
262         uaddl           v28.8h,  v4.8b,   v5.8b         // p3 + p2
263         uaddl2          v29.8h,  v4.16b,  v5.16b
264         shl             v28.8h,  v28.8h,  #1
265         shl             v29.8h,  v29.8h,  #1
266         add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
267         add             v29.8h,  v29.8h,  v27.8h
268         rshrn           v19.8b,  v28.8h,  #3 // p2'_2
269         rshrn2          v19.16b, v29.8h,  #3 // p2'_2
270
271         //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
272         uaddl           v26.8h,  v2.8b,   v0.8b         // q2 + q0
273         uaddl2          v27.8h,  v2.16b,  v0.16b
274         uaddw           v26.8h,  v26.8h,  v7.8b         // v26/v27 = q2 + q0 + p0
275         uaddw2          v27.8h,  v27.8h,  v7.16b
276         add             v22.8h,  v22.8h,  v26.8h
277         add             v23.8h,  v23.8h,  v27.8h
278         uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
279         uaddw2          v23.8h,  v23.8h,  v7.16b
280         rshrn           v22.8b,  v22.8h,  #3 // q0'_2
281         rshrn2          v22.16b, v23.8h,  #3 // q0'_2
282         uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
283         uaddw2          v27.8h,  v27.8h,  v1.16b
284         rshrn           v23.8b,  v26.8h,  #2 // q1'_2
285         rshrn2          v23.16b, v27.8h,  #2 // q1'_2
286         uaddl           v28.8h,  v2.8b,   v3.8b         // q2 + q3
287         uaddl2          v29.8h,  v2.16b,  v3.16b
288         shl             v28.8h,  v28.8h,  #1
289         shl             v29.8h,  v29.8h,  #1
290         add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
291         add             v29.8h,  v29.8h,  v27.8h
292         rshrn           v26.8b,  v28.8h,  #3 // q2'_2
293         rshrn2          v26.16b, v29.8h,  #3 // q2'_2
294
295         bit             v7.16b,  v24.16b, v30.16b  // p0'_1
296         bit             v0.16b,  v25.16b, v31.16b  // q0'_1
297         bit             v7.16b,  v20.16b, v17.16b  // p0'_2
298         bit             v6.16b,  v21.16b, v17.16b  // p1'_2
299         bit             v5.16b,  v19.16b, v17.16b  // p2'_2
300         bit             v0.16b,  v22.16b, v18.16b  // q0'_2
301         bit             v1.16b,  v23.16b, v18.16b  // q1'_2
302         bit             v2.16b,  v26.16b, v18.16b  // q2'_2
303 .endm
304
// Intra luma filter across an edge lying between two rows, 16 pixels wide.
//
// In:    x0 = address of the q0 row (first row after the edge)
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta
// Loads rows q0..q3 and p3..p0, runs h264_loop_filter_luma_intra and stores
// rows p2..q2.  Returns without storing when alpha and beta are both zero or
// when the filter macro branches to label 9.
305 function ff_h264_v_loop_filter_luma_intra_neon, export=1
306         h264_loop_filter_start_intra
307
308         ld1             {v0.16b},  [x0], x1 // q0
309         ld1             {v1.16b},  [x0], x1 // q1
310         ld1             {v2.16b},  [x0], x1 // q2
311         ld1             {v3.16b},  [x0], x1 // q3
312         sub             x0,  x0,  x1, lsl #3           // back 8 rows: x0 -> p3 row
313         ld1             {v4.16b},  [x0], x1 // p3
314         ld1             {v5.16b},  [x0], x1 // p2
315         ld1             {v6.16b},  [x0], x1 // p1
316         ld1             {v7.16b},  [x0]     // p0
317
318         h264_loop_filter_luma_intra
319
320         sub             x0,  x0,  x1, lsl #1           // from the p0 row back to the p2 row
321         st1             {v5.16b}, [x0], x1  // p2
322         st1             {v6.16b}, [x0], x1  // p1
323         st1             {v7.16b}, [x0], x1  // p0
324         st1             {v0.16b}, [x0], x1  // q0
325         st1             {v1.16b}, [x0], x1  // q1
326         st1             {v2.16b}, [x0]      // q2
327 9:
328         ret
329 endfunc
330
// Intra luma filter across an edge lying between two columns, 16 rows tall.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta
// Reads 8 bytes starting 4 bytes left of x0 from each of 16 rows, transposes
// them into the v4..v7 / v0..v3 column vectors expected by
// h264_loop_filter_luma_intra, filters, transposes back and rewrites all
// 8 bytes of every row.
// NOTE(review): relies on transpose_8x16B (neon.S, not visible here) mapping
// byte column k to its k-th register argument and being its own inverse --
// confirm there.
331 function ff_h264_h_loop_filter_luma_intra_neon, export=1
332         h264_loop_filter_start_intra
333
334         sub             x0,  x0,  #4
335         ld1             {v4.8b},  [x0], x1             // rows 0-7 into the low halves
336         ld1             {v5.8b},  [x0], x1
337         ld1             {v6.8b},  [x0], x1
338         ld1             {v7.8b},  [x0], x1
339         ld1             {v0.8b},  [x0], x1
340         ld1             {v1.8b},  [x0], x1
341         ld1             {v2.8b},  [x0], x1
342         ld1             {v3.8b},  [x0], x1
343         ld1             {v4.d}[1],  [x0], x1           // rows 8-15 into the high halves
344         ld1             {v5.d}[1],  [x0], x1
345         ld1             {v6.d}[1],  [x0], x1
346         ld1             {v7.d}[1],  [x0], x1
347         ld1             {v0.d}[1],  [x0], x1
348         ld1             {v1.d}[1],  [x0], x1
349         ld1             {v2.d}[1],  [x0], x1
350         ld1             {v3.d}[1],  [x0], x1
351
352         transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
353
354         h264_loop_filter_luma_intra
355
356         transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
357
358         sub             x0,  x0,  x1, lsl #4           // rewind the 16 rows advanced by the loads
359         st1             {v4.8b},  [x0], x1
360         st1             {v5.8b},  [x0], x1
361         st1             {v6.8b},  [x0], x1
362         st1             {v7.8b},  [x0], x1
363         st1             {v0.8b},  [x0], x1
364         st1             {v1.8b},  [x0], x1
365         st1             {v2.8b},  [x0], x1
366         st1             {v3.8b},  [x0], x1
367         st1             {v4.d}[1],  [x0], x1
368         st1             {v5.d}[1],  [x0], x1
369         st1             {v6.d}[1],  [x0], x1
370         st1             {v7.d}[1],  [x0], x1
371         st1             {v0.d}[1],  [x0], x1
372         st1             {v1.d}[1],  [x0], x1
373         st1             {v2.d}[1],  [x0], x1
374         st1             {v3.d}[1],  [x0], x1
375 9:
376         ret
377 endfunc
378
// Clipped chroma deblocking filter core for 8 pixel positions.
//
// In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1 (8 bytes each)
//        v24.s[0] = four signed clipping bytes (written by h264_loop_filter_start)
//        w2 = alpha, w3 = beta
// Out:   v16 = p0', v0 = q0'
// Clobb: x8, v4, v22-v26, v28, v30
// Exit:  branches to label 9 of the enclosing function, before v16/v0 are
//        written, when no lane passes the alpha/beta tests.
// NOTE(review): unlike h264_loop_filter_luma there is no per-lane test for a
// negative clipping byte here; the value is used directly as the clamp bound.
// Presumably callers only pass non-negative bytes (the all-negative case is
// rejected by h264_loop_filter_start) -- confirm against the callers.
379 .macro  h264_loop_filter_chroma
380         dup             v22.8b, w2              // alpha
381         dup             v23.8b, w3              // beta
382         uxtl            v24.8h, v24.8b
383         uabd            v26.8b, v16.8b, v0.8b   // abs(p0 - q0)
384         uabd            v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
385         uabd            v30.8b, v2.8b,  v0.8b   // abs(q1 - q0)
386         cmhi            v26.8b, v22.8b, v26.8b  // < alpha
387         cmhi            v28.8b, v23.8b, v28.8b  // < beta
388         cmhi            v30.8b, v23.8b, v30.8b  // < beta
389         uxtl            v4.8h,  v0.8b
390         and             v26.8b, v26.8b, v28.8b
391         usubw           v4.8h,  v4.8h,  v16.8b  // q0 - p0 (16-bit)
392         and             v26.8b, v26.8b, v30.8b  // v26 = combined per-lane filter mask
393         shl             v4.8h,  v4.8h,  #2
394         mov             x8,  v26.d[0]
395         sli             v24.8h, v24.8h, #8      // each clipping byte now fills 2 adjacent bytes
396         uaddw           v4.8h,  v4.8h,  v18.8b  // + p1
397         cbz             x8,  9f                 // no lane to filter
398         usubw           v4.8h,  v4.8h,  v2.8b   // - q1
399         rshrn           v4.8b,  v4.8h,  #3      // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
400         smin            v4.8b,  v4.8b,  v24.8b  // delta = min(delta, clip)
401         neg             v25.8b, v24.8b
402         smax            v4.8b,  v4.8b,  v25.8b  // delta = max(delta, -clip)
403         uxtl            v22.8h, v0.8b
404         and             v4.8b,  v4.8b,  v26.8b  // delta = 0 in unfiltered lanes
405         uxtl            v28.8h, v16.8b
406         saddw           v28.8h, v28.8h, v4.8b   // p0 + delta
407         ssubw           v22.8h, v22.8h, v4.8b   // q0 - delta
408         sqxtun          v16.8b, v28.8h          // saturate back to unsigned bytes
409         sqxtun          v0.8b,  v22.8h
410 .endm
411
// Clipped chroma filter across an edge lying between two rows, 8 pixels wide.
//
// In:    x0 = address of the q0 row (first row after the edge)
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta, x4 = pointer to four clipping bytes
// Loads rows p1, p0, q0, q1, runs h264_loop_filter_chroma and stores the
// p0 and q0 rows.  Returns without storing when the start macro or the
// filter macro (label 9) finds nothing to filter.
412 function ff_h264_v_loop_filter_chroma_neon, export=1
413         h264_loop_filter_start
414
415         sub             x0,  x0,  x1, lsl #1           // x0 -> p1 row
416         ld1             {v18.8b}, [x0], x1             // p1
417         ld1             {v16.8b}, [x0], x1             // p0
418         ld1             {v0.8b},  [x0], x1             // q0
419         ld1             {v2.8b},  [x0]                 // q1 (no post-increment)
420
421         h264_loop_filter_chroma
422
423         sub             x0,  x0,  x1, lsl #1           // from the q1 row back to the p0 row
424         st1             {v16.8b}, [x0], x1             // p0'
425         st1             {v0.8b},  [x0], x1             // q0'
426 9:
427         ret
428 endfunc
429
// Clipped chroma filter across an edge lying between two columns, 8 rows tall.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta, x4 = pointer to four clipping bytes
// Reads 4 bytes starting 2 bytes left of x0 from each of 8 rows, transposes,
// runs h264_loop_filter_chroma, transposes back and rewrites the same
// 4 bytes of every row.
//
// h_loop_filter_chroma420 is a secondary entry point used by
// ff_h264_h_loop_filter_chroma422_neon.  On entry there, x0 must already
// point 2 bytes left of the edge and v24.s[0] must hold the clipping bytes;
// the code at that label ends in `ret` (label 9), so it returns through x30.
// NOTE(review): the column layout after transpose_4x8B (neon.S) is not
// visible here -- confirm it matches v18 = p1, v16 = p0, v0 = q0, v2 = q1.
430 function ff_h264_h_loop_filter_chroma_neon, export=1
431         h264_loop_filter_start
432
433         sub             x0,  x0,  #2
434 h_loop_filter_chroma420:
435         ld1             {v18.s}[0], [x0], x1           // rows 0-3 into lane 0
436         ld1             {v16.s}[0], [x0], x1
437         ld1             {v0.s}[0],  [x0], x1
438         ld1             {v2.s}[0],  [x0], x1
439         ld1             {v18.s}[1], [x0], x1           // rows 4-7 into lane 1
440         ld1             {v16.s}[1], [x0], x1
441         ld1             {v0.s}[1],  [x0], x1
442         ld1             {v2.s}[1],  [x0], x1
443
444         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
445
446         h264_loop_filter_chroma
447
448         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
449
450         sub             x0,  x0,  x1, lsl #3           // rewind the 8 rows advanced by the loads
451         st1             {v18.s}[0], [x0], x1
452         st1             {v16.s}[0], [x0], x1
453         st1             {v0.s}[0],  [x0], x1
454         st1             {v2.s}[0],  [x0], x1
455         st1             {v18.s}[1], [x0], x1
456         st1             {v16.s}[1], [x0], x1
457         st1             {v0.s}[1],  [x0], x1
458         st1             {v2.s}[1],  [x0], x1
459 9:
460         ret
461 endfunc
462
// Clipped chroma filter across an edge lying between two columns, 16 rows
// tall, built from two passes over h_loop_filter_chroma420.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta, x4 = pointer to four clipping bytes
// The row stride is doubled, so the first pass covers rows 0, 2, ..., 14 and
// the second pass (starting one original row further down, from x5) covers
// rows 1, 3, ..., 15.  x7 keeps the caller's return address while `bl`
// overwrites x30; the second pass is a tail call and returns to the caller.
// w6 still holds the clipping word loaded by h264_loop_filter_start and is
// used to refill v24.s[0], which the filter macro modifies in place.
463 function ff_h264_h_loop_filter_chroma422_neon, export=1
464         h264_loop_filter_start
465         add             x5,  x0,  x1                   // start of the odd rows
466         sub             x0,  x0,  #2
467         add             x1,  x1,  x1                   // step two rows at a time
468         mov             x7,  x30
469         bl              h_loop_filter_chroma420
470         mov             x30, x7
471         sub             x0,  x5,  #2
472         mov             v24.s[0], w6
473         b               h_loop_filter_chroma420
474 endfunc
475
// Intra chroma deblocking filter core for 8 pixel positions.
//
// In:    v18 = p1, v16 = p0, v17 = q0, v19 = q1 (8 bytes each)
//        v30 = alpha, v31 = beta (broadcast by h264_loop_filter_start_intra)
// Out:   v16 = p0', v17 = q0' in the lanes that pass the alpha/beta tests
// Clobb: x2, v4, v6, v20, v22, v24-v28
// Exit:  branches to label 9 of the enclosing function, before v16/v17 are
//        written, when no lane passes the tests.
476 .macro h264_loop_filter_chroma_intra
477         uabd            v26.8b,  v16.8b,  v17.8b  // abs(p0 - q0)
478         uabd            v27.8b,  v18.8b,  v16.8b  // abs(p1 - p0)
479         uabd            v28.8b,  v19.8b,  v17.8b  // abs(q1 - q0)
480         cmhi            v26.8b,  v30.8b,  v26.8b  // < alpha
481         cmhi            v27.8b,  v31.8b,  v27.8b  // < beta
482         cmhi            v28.8b,  v31.8b,  v28.8b  // < beta
483         and             v26.8b,  v26.8b,  v27.8b
484         and             v26.8b,  v26.8b,  v28.8b  // v26 = combined per-lane filter mask
485         mov             x2, v26.d[0]
486
487         ushll           v4.8h,   v18.8b,  #1      // 2*p1
488         ushll           v6.8h,   v19.8b,  #1      // 2*q1
489         cbz             x2, 9f                    // no lane to filter
490         uaddl           v20.8h,  v16.8b,  v19.8b  // p0 + q1
491         uaddl           v22.8h,  v17.8b,  v18.8b  // q0 + p1
492         add             v20.8h,  v20.8h,  v4.8h   // 2*p1 + p0 + q1
493         add             v22.8h,  v22.8h,  v6.8h   // 2*q1 + q0 + p1
494         uqrshrn         v24.8b,  v20.8h,  #2      // p0' = (2*p1 + p0 + q1 + 2) >> 2
495         uqrshrn         v25.8b,  v22.8h,  #2      // q0' = (2*q1 + q0 + p1 + 2) >> 2
496         bit             v16.8b,  v24.8b,  v26.8b  // insert only where the mask is set
497         bit             v17.8b,  v25.8b,  v26.8b
498 .endm
499
// Intra chroma filter across an edge lying between two rows, 8 pixels wide.
//
// In:    x0 = address of the q0 row (first row after the edge)
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta
// Loads rows p1, p0, q0, q1, runs h264_loop_filter_chroma_intra and stores
// the p0 and q0 rows.  Returns without storing when alpha and beta are both
// zero or when the filter macro branches to label 9.
500 function ff_h264_v_loop_filter_chroma_intra_neon, export=1
501         h264_loop_filter_start_intra
502
503         sub             x0,  x0,  x1, lsl #1           // x0 -> p1 row
504         ld1             {v18.8b}, [x0], x1             // p1
505         ld1             {v16.8b}, [x0], x1             // p0
506         ld1             {v17.8b}, [x0], x1             // q0
507         ld1             {v19.8b}, [x0]                 // q1 (no post-increment)
508
509         h264_loop_filter_chroma_intra
510
511         sub             x0,  x0,  x1, lsl #1           // from the q1 row back to the p0 row
512         st1             {v16.8b}, [x0], x1             // p0'
513         st1             {v17.8b}, [x0], x1             // q0'
514
515 9:
516         ret
517 endfunc
518
// Intra chroma filter across an edge lying between two columns, 4 rows tall.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta
// x4 is the load pointer (2 bytes left of the edge) and x0 becomes the store
// pointer (1 byte left of the edge).  Each of the 4 rows is loaded as
// 8 bytes; after filtering, only the two bytes p0', q0' are written back per
// row with st2.
// NOTE(review): the column layout after transpose_4x8B (neon.S) is not
// visible here -- confirm it yields v18 = p1, v16 = p0, v17 = q0, v19 = q1.
519 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
520         h264_loop_filter_start_intra
521
522         sub             x4,  x0,  #2
523         sub             x0,  x0,  #1
524         ld1             {v18.8b}, [x4], x1
525         ld1             {v16.8b}, [x4], x1
526         ld1             {v17.8b}, [x4], x1
527         ld1             {v19.8b}, [x4], x1
528
529         transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
530
531         h264_loop_filter_chroma_intra
532
533         st2             {v16.b,v17.b}[0], [x0], x1     // row n: byte n of p0' then byte n of q0'
534         st2             {v16.b,v17.b}[1], [x0], x1
535         st2             {v16.b,v17.b}[2], [x0], x1
536         st2             {v16.b,v17.b}[3], [x0], x1
537
538 9:
539         ret
540 endfunc
541
// Intra chroma filter across an edge lying between two columns, 8 rows tall.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta
// x4 is the load pointer (2 bytes left of the edge) and x0 becomes the store
// pointer (1 byte left of the edge).  After filtering, the two bytes
// p0', q0' are written back per row with st2.
//
// h_loop_filter_chroma420_intra is a secondary entry point used by
// ff_h264_h_loop_filter_chroma422_intra_neon.  On entry there, x4 and x0 must
// already hold the load/store pointers and v30/v31 alpha/beta; the code ends
// in `ret` (label 9) and leaves x4 advanced by 8 rows.
// NOTE(review): the column layout after transpose_4x8B (neon.S) is not
// visible here -- confirm it yields v18 = p1, v16 = p0, v17 = q0, v19 = q1.
542 function ff_h264_h_loop_filter_chroma_intra_neon, export=1
543         h264_loop_filter_start_intra
544
545         sub             x4,  x0,  #2
546         sub             x0,  x0,  #1
547 h_loop_filter_chroma420_intra:
548         ld1             {v18.8b}, [x4], x1             // rows 0-3: 8 bytes each
549         ld1             {v16.8b}, [x4], x1
550         ld1             {v17.8b}, [x4], x1
551         ld1             {v19.8b}, [x4], x1
552         ld1             {v18.s}[1], [x4], x1           // rows 4-7 replace the upper 4 bytes
553         ld1             {v16.s}[1], [x4], x1
554         ld1             {v17.s}[1], [x4], x1
555         ld1             {v19.s}[1], [x4], x1
556
557         transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
558
559         h264_loop_filter_chroma_intra
560
561         st2             {v16.b,v17.b}[0], [x0], x1     // row n: byte n of p0' then byte n of q0'
562         st2             {v16.b,v17.b}[1], [x0], x1
563         st2             {v16.b,v17.b}[2], [x0], x1
564         st2             {v16.b,v17.b}[3], [x0], x1
565         st2             {v16.b,v17.b}[4], [x0], x1
566         st2             {v16.b,v17.b}[5], [x0], x1
567         st2             {v16.b,v17.b}[6], [x0], x1
568         st2             {v16.b,v17.b}[7], [x0], x1
569
570 9:
571         ret
572 endfunc
573
// Intra chroma filter across an edge lying between two columns, 16 rows
// tall, built from two passes over h_loop_filter_chroma420_intra.
//
// In:    x0 = address of the first pixel right of the edge in the first row
//        x1 = distance in bytes between rows
//        w2 = alpha, w3 = beta
// The first pass handles rows 0-7.  The load pointer x4 is advanced by those
// 8 rows inside the callee, so only the store pointer x0 has to be reset (to
// row 8, kept in x5) before the second pass handles rows 8-15.  x7 keeps the
// caller's return address while `bl` overwrites x30; the second pass is a
// tail call and returns to the caller.
574 function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
575         h264_loop_filter_start_intra
576         sub             x4,  x0,  #2
577         add             x5,  x0,  x1, lsl #3           // row 8
578         sub             x0,  x0,  #1
579         mov             x7,  x30
580         bl              h_loop_filter_chroma420_intra
581         sub             x0,  x5,  #1
582         mov             x30, x7
583         b               h_loop_filter_chroma420_intra
584 endfunc
585
// Bi-weighted combination of two 16-pixel-wide blocks, two rows per iteration.
//
// Macro arguments:
//   macs = widening multiply-accumulate mnemonic applied to the [x1] pixels
//   macd = widening multiply-accumulate mnemonic applied to the [x0] pixels
//   (instantiated with umlal / umlsl; "\mac\()2" selects the upper-half form)
// In:    x0, x1 = the two pixel pointers, x2 = row stride for both reads and
//        the store, x7 = store pointer, w3 = row count (the loop only ends
//        when it reaches exactly 0, so it must be a positive multiple of 2)
//        w5 = multiplier for [x0] pixels, w6 = multiplier for [x1] pixels
//        (low byte of each is broadcast and used as an unsigned factor)
//        v16 = start value of every 16-bit accumulator
//        v18 = per-halfword shift count for sshl
// Clobb: w3, x0, x1, x7, v0, v1, v4, v6, v20, v22, v24, v26, v28, v30, flags
// Ends with `ret`, i.e. returns from the enclosing function.
586 .macro  biweight_16     macs, macd
587         dup             v0.16b,  w5
588         dup             v1.16b,  w6
589         mov             v4.16b,  v16.16b
590         mov             v6.16b,  v16.16b
591 1:      subs            w3,  w3,  #2
592         ld1             {v20.16b}, [x0], x2
593         \macd           v4.8h,   v0.8b,  v20.8b
594         \macd\()2       v6.8H,   v0.16B, v20.16B
595         ld1             {v22.16b}, [x1], x2
596         \macs           v4.8h,   v1.8b,  v22.8b
597         \macs\()2       v6.8H,   v1.16B, v22.16B
598         mov             v24.16b, v16.16b               // fresh accumulators for the second row
599         ld1             {v28.16b}, [x0], x2
600         mov             v26.16b, v16.16b
601         \macd           v24.8h,  v0.8b,  v28.8b
602         \macd\()2       v26.8H,  v0.16B, v28.16B
603         ld1             {v30.16b}, [x1], x2
604         \macs           v24.8h,  v1.8b,  v30.8b
605         \macs\()2       v26.8H,  v1.16B, v30.16B
606         sshl            v4.8h,   v4.8h,  v18.8h
607         sshl            v6.8h,   v6.8h,  v18.8h
608         sqxtun          v4.8b,   v4.8h                 // saturate to unsigned bytes
609         sqxtun2         v4.16b,  v6.8h
610         sshl            v24.8h,  v24.8h, v18.8h
611         sshl            v26.8h,  v26.8h, v18.8h
612         sqxtun          v24.8b,  v24.8h
613         sqxtun2         v24.16b, v26.8h
614         mov             v6.16b,  v16.16b               // re-arm accumulators for the next iteration
615         st1             {v4.16b},  [x7], x2
616         mov             v4.16b,  v16.16b
617         st1             {v24.16b}, [x7], x2
618         b.ne            1b                             // flags still from the subs at label 1
619         ret
620 .endm
621
// Bi-weighted combination of two 8-pixel-wide blocks, two rows per iteration.
//
// Macro arguments:
//   macs = widening multiply-accumulate mnemonic applied to the [x1] pixels
//   macd = widening multiply-accumulate mnemonic applied to the [x0] pixels
// In:    x0, x1 = the two pixel pointers, x2 = row stride, x7 = store pointer,
//        w3 = row count (the loop only ends when it reaches exactly 0, so it
//        must be a positive multiple of 2)
//        w5 / w6 = multipliers for the [x0] / [x1] pixels (low byte used)
//        v16 = start value of every 16-bit accumulator
//        v18 = per-halfword shift count for sshl
// Clobb: w3, x0, x1, x7, v0-v2, v4-v7, v20, flags
// Ends with `ret`, i.e. returns from the enclosing function.
622 .macro  biweight_8      macs, macd
623         dup             v0.8b,  w5
624         dup             v1.8b,  w6
625         mov             v2.16b,  v16.16b
626         mov             v20.16b, v16.16b
627 1:      subs            w3,  w3,  #2
628         ld1             {v4.8b}, [x0], x2
629         \macd           v2.8h,  v0.8b,  v4.8b
630         ld1             {v5.8b}, [x1], x2
631         \macs           v2.8h,  v1.8b,  v5.8b
632         ld1             {v6.8b}, [x0], x2
633         \macd           v20.8h, v0.8b,  v6.8b
634         ld1             {v7.8b}, [x1], x2
635         \macs           v20.8h, v1.8b,  v7.8b
636         sshl            v2.8h,  v2.8h,  v18.8h
637         sqxtun          v2.8b,  v2.8h                  // saturate to unsigned bytes
638         sshl            v20.8h, v20.8h, v18.8h
639         sqxtun          v4.8b,  v20.8h
640         mov             v20.16b, v16.16b               // re-arm accumulators for the next iteration
641         st1             {v2.8b}, [x7], x2
642         mov             v2.16b,  v16.16b
643         st1             {v4.8b}, [x7], x2
644         b.ne            1b                             // flags still from the subs at label 1
645         ret
646 .endm
647
// Bi-weighted combination of two 4-pixel-wide blocks.  Two rows are packed
// into one 8-byte vector; the main loop handles four rows per iteration and
// the tail at label 2 handles a final pair of rows.
//
// Macro arguments:
//   macs = widening multiply-accumulate mnemonic applied to the [x1] pixels
//   macd = widening multiply-accumulate mnemonic applied to the [x0] pixels
// In:    x0, x1 = the two pixel pointers, x2 = row stride, x7 = store pointer,
//        w3 = row count (the code handles counts of the form 4k or 4k + 2)
//        w5 / w6 = multipliers for the [x0] / [x1] pixels (low byte used)
//        v16 = start value of every 16-bit accumulator
//        v18 = per-halfword shift count for sshl
// Clobb: w3, x0, x1, x7, v0-v2, v4-v7, v20, flags
// Both exits end with `ret`, i.e. return from the enclosing function.
648 .macro  biweight_4      macs, macd
649         dup             v0.8b,  w5
650         dup             v1.8b,  w6
651         mov             v2.16b, v16.16b
652         mov             v20.16b,v16.16b
653 1:      subs            w3,  w3,  #4
654         ld1             {v4.s}[0], [x0], x2
655         ld1             {v4.s}[1], [x0], x2
656         \macd           v2.8h,  v0.8b,  v4.8b
657         ld1             {v5.s}[0], [x1], x2
658         ld1             {v5.s}[1], [x1], x2
659         \macs           v2.8h,  v1.8b,  v5.8b
660         b.lt            2f                             // fewer than 4 rows were left: finish 2 rows
661         ld1             {v6.s}[0], [x0], x2
662         ld1             {v6.s}[1], [x0], x2
663         \macd           v20.8h, v0.8b,  v6.8b
664         ld1             {v7.s}[0], [x1], x2
665         ld1             {v7.s}[1], [x1], x2
666         \macs           v20.8h, v1.8b,  v7.8b
667         sshl            v2.8h,  v2.8h,  v18.8h
668         sqxtun          v2.8b,  v2.8h                  // saturate to unsigned bytes
669         sshl            v20.8h, v20.8h, v18.8h
670         sqxtun          v4.8b,  v20.8h
671         mov             v20.16b, v16.16b               // re-arm accumulators for the next iteration
672         st1             {v2.s}[0], [x7], x2
673         st1             {v2.s}[1], [x7], x2
674         mov             v2.16b,  v16.16b
675         st1             {v4.s}[0], [x7], x2
676         st1             {v4.s}[1], [x7], x2
677         b.ne            1b                             // flags still from the subs at label 1
678         ret
679 2:      sshl            v2.8h,  v2.8h,  v18.8h
680         sqxtun          v2.8b,  v2.8h
681         st1             {v2.s}[0], [x7], x2
682         st1             {v2.s}[1], [x7], x2
683         ret
684 .endm
685
// Defines the exported function ff_biweight_h264_pixels_<w>_neon.
//
// Register use visible in this macro and in biweight_<w>:
//   x0 = first pixel pointer; copied to x7, so results are stored over it
//   x1 = second pixel pointer, x2 = row stride, w3 = row count
//   w4 = shift amount, w5 / w6 = signed multipliers for the [x0] / [x1]
//   pixels, w7 = offset
//   (presumably dst, src, stride, height, log2_denom, weightd, weights and
//   offset of the C prototype -- confirm against the declaration)
// Setup:
//   v16 = low 16 bits of ((w7 + 1) | 1) << w4, in every halfword
//   v18 = ~w4 in every halfword, i.e. -(w4 + 1): sshl then shifts right by
//         w4 + 1
//   w8  = (w5 >> 31) ^ (w6 >> 30), logical shifts.  For multipliers of small
//         magnitude this encodes the sign combination:
//           0: w5 >= 0, w6 >= 0 -> label 10      1: w5 < 0, w6 >= 0 -> label 20
//           2: w5 <  0, w6 <  0 -> label 30      3: w5 >= 0, w6 < 0 -> label 40
// A negative multiplier is negated and its multiply-accumulate is replaced
// by the subtracting form (umlsl), because the per-pixel multiply is
// unsigned.  Every biweight_<w> expansion ends in `ret`, so the four variants
// never fall through into each other.
686 .macro  biweight_func   w
687 function ff_biweight_h264_pixels_\w\()_neon, export=1
688         lsr             w8,  w5,  #31
689         add             w7,  w7,  #1
690         eor             w8,  w8,  w6,  lsr #30
691         orr             w7,  w7,  #1
692         dup             v18.8h,   w4
693         lsl             w7,  w7,  w4
694         not             v18.16b,  v18.16b
695         dup             v16.8h,   w7
696         mov             x7,  x0                        // w7 is consumed; x7 becomes the store pointer
697         cbz             w8,  10f
698         subs            w8,  w8,  #1
699         b.eq            20f
700         subs            w8,  w8,  #1
701         b.eq            30f
702         b               40f
703 10:     biweight_\w     umlal, umlal
704 20:     neg             w5, w5
705         biweight_\w     umlal, umlsl
706 30:     neg             w5, w5
707         neg             w6, w6
708         biweight_\w     umlsl, umlsl
709 40:     neg             w6, w6
710         biweight_\w     umlsl, umlal
711 endfunc
712 .endm
713
// Emit ff_biweight_h264_pixels_{16,8,4}_neon; the argument selects the
// matching biweight_<w> row macro.
714         biweight_func   16
715         biweight_func   8
716         biweight_func   4
717
// Weighted scaling of one 16-pixel-wide block, two rows per iteration.
//
// Macro argument:
//   add = three-operand vector mnemonic used as "\add dst, v16, product" to
//         combine the per-halfword constant in v16 with each product
// In:    x0 = read pointer, x5 = store pointer, x1 = row stride for both,
//        w2 = row count (the loop only ends when it reaches exactly 0, so it
//        must be a positive multiple of 2)
//        w4 = multiplier (low byte is broadcast and used as an unsigned factor)
//        v16 = per-halfword constant, v18 = per-halfword shift count for
//        srshl (rounding shift); both are prepared by the user of the macro
// Clobb: w2, x0, x5, v0, v4, v6, v20, v24, v26, v28, flags
// Ends with `ret`, i.e. returns from the enclosing function.
718 .macro  weight_16       add
719         dup             v0.16b,  w4
720 1:      subs            w2,  w2,  #2
721         ld1             {v20.16b}, [x0], x1
722         umull           v4.8h,   v0.8b,  v20.8b
723         umull2          v6.8h,   v0.16b, v20.16b
724         ld1             {v28.16b}, [x0], x1
725         umull           v24.8h,  v0.8b,  v28.8b
726         umull2          v26.8h,  v0.16b, v28.16b
727         \add            v4.8h,   v16.8h, v4.8h
728         srshl           v4.8h,   v4.8h,  v18.8h
729         \add            v6.8h,   v16.8h, v6.8h
730         srshl           v6.8h,   v6.8h,  v18.8h
731         sqxtun          v4.8b,   v4.8h                 // saturate to unsigned bytes
732         sqxtun2         v4.16b,  v6.8h
733         \add            v24.8h,  v16.8h, v24.8h
734         srshl           v24.8h,  v24.8h, v18.8h
735         \add            v26.8h,  v16.8h, v26.8h
736         srshl           v26.8h,  v26.8h, v18.8h
737         sqxtun          v24.8b,  v24.8h
738         sqxtun2         v24.16b, v26.8h
739         st1             {v4.16b},  [x5], x1
740         st1             {v24.16b}, [x5], x1
741         b.ne            1b                             // flags still from the subs at label 1
742         ret
743 .endm
744
// Weighted scaling of one 8-pixel-wide block, two rows per iteration.
//
// Macro argument:
//   add = three-operand vector mnemonic used as "\add dst, v16, product"
// In:    x0 = read pointer, x5 = store pointer, x1 = row stride for both,
//        w2 = row count (the loop only ends when it reaches exactly 0, so it
//        must be a positive multiple of 2)
//        w4 = multiplier (low byte is broadcast and used as an unsigned factor)
//        v16 = per-halfword constant, v18 = per-halfword shift count for
//        srshl (rounding shift); both are prepared by the user of the macro
// Clobb: w2, x0, x5, v0, v2, v4, v6, v20, flags
// Ends with `ret`, i.e. returns from the enclosing function.
745 .macro  weight_8        add
746         dup             v0.8b,  w4
747 1:      subs            w2,  w2,  #2
748         ld1             {v4.8b}, [x0], x1
749         umull           v2.8h,  v0.8b,  v4.8b
750         ld1             {v6.8b}, [x0], x1
751         umull           v20.8h, v0.8b,  v6.8b
752         \add            v2.8h,  v16.8h,  v2.8h
753         srshl           v2.8h,  v2.8h,  v18.8h
754         sqxtun          v2.8b,  v2.8h                  // saturate to unsigned bytes
755         \add            v20.8h, v16.8h,  v20.8h
756         srshl           v20.8h, v20.8h, v18.8h
757         sqxtun          v4.8b,  v20.8h
758         st1             {v2.8b}, [x5], x1
759         st1             {v4.8b}, [x5], x1
760         b.ne            1b                             // flags still from the subs at label 1
761         ret
762 .endm
763
// weight_4: loop body expanded by weight_func for \w = 4. Rows are 4 bytes
// wide and two rows are packed into one 8-byte vector. The main loop handles
// four rows per pass; the tail at label 2 handles the case where the row
// counter goes negative after subtracting 4 and finishes the two rows that
// were already loaded.
//   \add : vector instruction combining v16 with each 16-bit product.
//   Reads: x0 = load pointer, x5 = store pointer, x1 = post-increment for
//          both, w2 = remaining row count, w4 = multiplier (low byte),
//          v16 = term combined through \add, v18 = shift operand for srshl.
//   Overwrites v0, v2, v4, v6, v20, x0, x5, w2; both exits end with ret.
764 .macro  weight_4        add
765         dup             v0.8b,  w4                      // multiplier in every byte lane
766 1:      subs            w2,  w2,  #4                    // flags feed both b.lt below and b.ne at the end of the loop
767         ld1             {v4.s}[0], [x0], x1             // rows 0 and 1 share v4
768         ld1             {v4.s}[1], [x0], x1
769         umull           v2.8h,  v0.8b,  v4.8b
770         b.lt            2f                              // fewer than four rows were left: finish rows 0-1 in the tail
771         ld1             {v6.s}[0], [x0], x1             // rows 2 and 3 share v6
772         ld1             {v6.s}[1], [x0], x1
773         umull           v20.8h, v0.8b,  v6.8b
774         \add            v2.8h,  v16.8h,  v2.8h          // combine v16 with the product
775         srshl           v2.8h,  v2.8h,  v18.8h          // rounding shift by the signed per-lane amount in v18
776         sqxtun          v2.8b,  v2.8h                   // saturate to unsigned 8-bit
777         \add            v20.8h, v16.8h,  v20.8h
778         srshl           v20.8h, v20.8h, v18.8h
779         sqxtun          v4.8b,  v20.8h                  // v4 is reused for the output of rows 2-3
780         st1             {v2.s}[0], [x5], x1
781         st1             {v2.s}[1], [x5], x1
782         st1             {v4.s}[0], [x5], x1
783         st1             {v4.s}[1], [x5], x1
784         b.ne            1b                              // still the flags from subs above
785         ret
786 2:      \add            v2.8h,  v16.8h,  v2.8h          // tail: only the two rows held in v2
787         srshl           v2.8h,  v2.8h,  v18.8h
788         sqxtun          v2.8b,  v2.8h
789         st1             {v2.s}[0], [x5], x1
790         st1             {v2.s}[1], [x5], x1
791         ret
792 .endm
793
// weight_func: emits ff_weight_h264_pixels_\w\()_neon, which sets up the
// constants and then expands one of four variants of the weight_\w loop.
//   Registers as used below: x0 = pixel pointer (read and, through the copy
//   in x5, written in place), x1 = pointer increment per row, w2 = row
//   count, w3 = shift amount, w4 = signed multiplier, w5 = value that is
//   shifted left by w3 and broadcast to v16.
//   NOTE(review): presumably (block, stride, height, log2_denom, weight,
//   offset) of the C weight function - confirm against the C prototype.
//   Variant selection:
//     w3 >  1 : v18 = 1 - w3, halving shadd/shsub (the halving supplies one
//               bit of the total shift)
//     w3 <= 1 : v18 = -w3, plain add/sub
//     w4 <  0 : w4 is negated and the subtracting variant is used
//   Every weight_\w expansion ends in ret, so control never falls through
//   from one variant into the next label. Label 10 is defined twice; each
//   b.lt 10f resolves to the nearest following definition.
794 .macro  weight_func     w
795 function ff_weight_h264_pixels_\w\()_neon, export=1
796         cmp             w3,  #1                         // flags are consumed by b.le 20f below
797         mov             w6,  #1
798         lsl             w5,  w5,  w3
799         dup             v16.8h,  w5                     // v16 = w5 << w3 in every 16-bit lane
800         mov             x5,  x0                         // x5 becomes the store pointer, x0 stays the load pointer
801         b.le            20f                             // w3 <= 1: non-halving variants
802         sub             w6,  w6,  w3
803         dup             v18.8h,  w6                     // v18 = 1 - w3
804         cmp             w4, #0
805         b.lt            10f
806         weight_\w       shadd
807 10:     neg             w4,  w4                         // negative multiplier: use its magnitude and subtract
808         weight_\w       shsub
809 20:     neg             w6,  w3
810         dup             v18.8h,  w6                     // v18 = -w3
811         cmp             w4,  #0
812         b.lt            10f
813         weight_\w       add
814 10:     neg             w4,  w4                         // negative multiplier: use its magnitude and subtract
815         weight_\w       sub
816 endfunc
817 .endm
818
// Instantiate ff_weight_h264_pixels_{16,8,4}_neon.
819         weight_func     16
820         weight_func     8
821         weight_func     4
822
// h264_loop_filter_start_10: common prologue of the non-intra 10-bit loop
// filters. Either returns from the enclosing function right away or falls
// through at label 2 with the parameters prepared.
//   In : w2 and w3 (named alpha and beta where the filter macros use them),
//        x4 = pointer to a 32-bit word (four bytes, used as tc0 later).
//   Out: w2 and w3 multiplied by 4, w6 = the word loaded from [x4],
//        v24.s[0] = w6. w8 is overwritten.
//   Early return happens when
//     - w2 != 0 and w3 == 0 (see the ccmp note below), or
//     - bit 7 of all four bytes of w6 is set.
823 .macro  h264_loop_filter_start_10
824         cmp             w2,  #0
825         ldr             w6,  [x4]                       // four tc0 bytes
826         ccmp            w3,  #0,  #0,  ne               // w2 != 0: compare w3 with 0; w2 == 0: NZCV = 0 (reads as not-equal)
827         lsl             w2,  w2,  #2                    // scale by 4; lsl, mov and and do not touch the flags
828         mov             v24.s[0], w6
829         lsl             w3,  w3,  #2
830         and             w8,  w6,  w6,  lsl #16          // bits 31 and 23 now hold the AND of the sign bits of bytes 3,1 and 2,0
831         b.eq            1f                              // still the ccmp flags: taken only for w2 != 0 and w3 == 0
832         ands            w8,  w8,  w8,  lsl #8           // bit 31 (N flag) = AND of bit 7 of all four bytes
833         b.ge            2f                              // N clear (ands leaves V = 0): at least one byte has bit 7 clear, go filter
834 1:
835         ret
836 2:
837 .endm
838
// h264_loop_filter_start_intra_10: common prologue of the intra 10-bit loop
// filters. Returns from the enclosing function when w2 and w3 are both zero;
// otherwise multiplies both by 4 and broadcasts them to v30 (alpha) and
// v31 (beta) as 16-bit lanes. Overwrites w4.
839 .macro h264_loop_filter_start_intra_10
840         orr             w4,  w2,  w3
841         cbnz            w4,  1f                         // continue unless both values are zero
842         ret
843 1:
844         lsl             w2,  w2,  #2
845         lsl             w3,  w3,  #2
846         dup             v30.8h,   w2              // alpha
847         dup             v31.8h,   w3              // beta
848 .endm
849
// h264_loop_filter_chroma_10: non-intra chroma edge filter on eight 16-bit
// lanes.
//   In : v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta (both as
//        left by h264_loop_filter_start_10), low four bytes of v24 = tc0.
//   Out: v16 (p0) and v0 (q0) updated in the lanes that pass the
//        alpha/beta tests and clamped to 0..1023.
//   Branches to label 9, which the expanding function has to provide, when
//   no lane passes the tests; in that case v16/v0 are unchanged.
//   Overwrites v4, v5, v22-v26, v28, v30, v31, x8, x9 and the flags.
850 .macro  h264_loop_filter_chroma_10
851         dup             v22.8h,  w2               // alpha
852         dup             v23.8h,  w3               // beta
853         uxtl            v24.8h,  v24.8b           // tc0
854
855         uabd            v26.8h,  v16.8h,  v0.8h   // abs(p0 - q0)
856         uabd            v28.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
857         uabd            v30.8h,  v2.8h,   v0.8h   // abs(q1 - q0)
858         cmhi            v26.8h,  v22.8h,  v26.8h  // < alpha
859         cmhi            v28.8h,  v23.8h,  v28.8h  // < beta
860         cmhi            v30.8h,  v23.8h,  v30.8h  // < beta
861
862         and             v26.16b, v26.16b, v28.16b
863         mov             v4.16b,  v0.16b
864         sub             v4.8h,   v4.8h,   v16.8h  // q0 - p0
865         and             v26.16b, v26.16b, v30.16b // v26 = per-lane filter mask
866         shl             v4.8h,   v4.8h,   #2      // (q0 - p0) * 4
867         mov             x8, v26.d[0]
868         mov             x9, v26.d[1]
869         sli             v24.8h,  v24.8h,  #8      // copy each tc0 byte into the high byte of its lane
870         uxtl            v24.8h,  v24.8b           // every tc0 value now fills two adjacent 16-bit lanes
871         add             v4.8h,   v4.8h,   v18.8h  // + p1
872         adds            x8,  x8,  x9              // zero only if the mask is clear in all lanes
873         shl             v24.8h,  v24.8h,  #2      // tc0 * 4
874
875         b.eq            9f                        // nothing to filter
876
877         movi            v31.8h, #3                // (tc0 - 1) << (BIT_DEPTH - 8)) + 1
878         uqsub           v24.8h,  v24.8h,  v31.8h  // tc = tc0 * 4 - 3, saturating at 0
879         sub             v4.8h,   v4.8h,   v2.8h   // - q1
880         srshr           v4.8h,   v4.8h,   #3      // rounding shift right by 3
881         smin            v4.8h,   v4.8h,   v24.8h  // clamp delta to [-tc, tc]
882         neg             v25.8h,  v24.8h
883         smax            v4.8h,   v4.8h,   v25.8h
884         and             v4.16b,  v4.16b,  v26.16b // delta = 0 in lanes that failed the tests
885         add             v16.8h,  v16.8h,  v4.8h   // p0 += delta
886         sub             v0.8h,   v0.8h,   v4.8h   // q0 -= delta
887
888         mvni            v4.8h,   #0xFC, lsl #8    // 1023 for clipping
889         movi            v5.8h,   #0
890         smin            v0.8h,   v0.8h,   v4.8h
891         smin            v16.8h,  v16.8h,  v4.8h
892         smax            v0.8h,   v0.8h,   v5.8h
893         smax            v16.8h,  v16.8h,  v5.8h
894 .endm
895
// ff_h264_v_loop_filter_chroma_neon_10: applies h264_loop_filter_chroma_10
// across a horizontal edge, eight 16-bit samples wide.
//   x0 = address of the q0 row, x1 = byte distance between rows,
//   w2/w3/x4 are consumed by h264_loop_filter_start_10.
//   Rows read: x0 - 2*x1 (p1), x0 - x1 (p0), x0 (q0), x0 + x1 (q1).
//   Rows written: p0 and q0.
896 function ff_h264_v_loop_filter_chroma_neon_10, export=1
897         h264_loop_filter_start_10
898
899         mov             x10,  x0
900         sub             x0,  x0,  x1, lsl #1      // two rows above the edge
901         ld1             {v18.8h}, [x0 ], x1       // p1
902         ld1             {v0.8h},  [x10], x1       // q0
903         ld1             {v16.8h}, [x0 ], x1       // p0
904         ld1             {v2.8h},  [x10]           // q1; x10 stays at the q1 row
905
906         h264_loop_filter_chroma_10
907
908         sub             x0,  x10,  x1, lsl #1     // q1 row - 2 rows = p0 row
909         st1             {v16.8h}, [x0], x1
910         st1             {v0.8h},  [x0], x1
911 9:
912         ret
913 endfunc
914
// ff_h264_h_loop_filter_chroma_neon_10: applies h264_loop_filter_chroma_10
// across a vertical edge, eight rows high.
//   x0 = address of q0 in the first row, x1 = byte distance between rows,
//   w2/w3/x4 are consumed by h264_loop_filter_start_10.
//   Each row contributes four 16-bit samples starting 4 bytes left of x0;
//   they are transposed with transpose_4x8H (defined outside this chunk),
//   filtered, transposed back, and all four samples of every row are
//   stored again.
// The local label h_loop_filter_chroma420_10 is a second entry point used by
// ff_h264_h_loop_filter_chroma422_neon_10. It expects x0 already moved left,
// x1 = row distance, w2/w3 scaled and v24.s[0] = tc0, and it overwrites
// x0, x8, x9, x10 plus the vector registers touched by the macros.
915 function ff_h264_h_loop_filter_chroma_neon_10, export=1
916         h264_loop_filter_start_10
917
918         sub             x0,  x0,  #4 // access the 2nd left pixel
919 h_loop_filter_chroma420_10:
920         add             x10,  x0,  x1,  lsl #2    // x10 walks rows 4-7 while x0 walks rows 0-3
921         ld1             {v18.d}[0], [x0 ], x1
922         ld1             {v18.d}[1], [x10], x1
923         ld1             {v16.d}[0], [x0 ], x1
924         ld1             {v16.d}[1], [x10], x1
925         ld1             {v0.d}[0],  [x0 ], x1
926         ld1             {v0.d}[1],  [x10], x1
927         ld1             {v2.d}[0],  [x0 ], x1
928         ld1             {v2.d}[1],  [x10], x1     // x10 is now 8 rows past the first row
929
930         transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31
931
932         h264_loop_filter_chroma_10
933
934         transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31
935
936         sub             x0,  x10,  x1, lsl #3     // back to the first row
937         st1             {v18.d}[0], [x0], x1      // rows 0-3 come from the low halves
938         st1             {v16.d}[0], [x0], x1
939         st1             {v0.d}[0],  [x0], x1
940         st1             {v2.d}[0],  [x0], x1
941         st1             {v18.d}[1], [x0], x1      // rows 4-7 come from the high halves
942         st1             {v16.d}[1], [x0], x1
943         st1             {v0.d}[1],  [x0], x1
944         st1             {v2.d}[1],  [x0], x1
945 9:
946         ret
947 endfunc
948
// ff_h264_h_loop_filter_chroma422_neon_10: vertical-edge chroma filter over
// 16 rows, built from two passes through h_loop_filter_chroma420_10 with
// the row distance doubled: the first pass covers rows 0, 2, ..., 14 and the
// second pass rows 1, 3, ..., 15.
//   x0, x1, w2, w3, x4 as for ff_h264_h_loop_filter_chroma_neon_10.
//   x5 and x7 are used as scratch because the shared code does not touch
//   them; w6 still holds the tc0 word loaded by h264_loop_filter_start_10.
949 function ff_h264_h_loop_filter_chroma422_neon_10, export=1
950         h264_loop_filter_start_10
951         add             x5,  x0,  x1              // start of row 1, kept for the second pass
952         sub             x0,  x0,  #4              // same left offset as the 420 entry expects
953         add             x1,  x1,  x1              // step over every other row
954         mov             x7,  x30                  // bl overwrites the link register
955         bl              h_loop_filter_chroma420_10
956         mov             x30, x7
957         sub             x0,  x5,  #4
958         mov             v24.s[0], w6              // the first pass overwrote v24, reload tc0
959         b               h_loop_filter_chroma420_10 // tail call: its ret returns to our caller
960 endfunc
961
// h264_loop_filter_chroma_intra_10: intra chroma edge filter on eight 16-bit
// lanes.
//   In : v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha, v31 = beta
//        (as set up by h264_loop_filter_start_intra_10).
//   Out: in lanes that pass the alpha/beta tests
//          v16 = (2*p1 + p0 + q1 + 2) >> 2
//          v17 = (2*q1 + q0 + p1 + 2) >> 2
//        other lanes keep their value.
//   Branches to label 9, which the expanding function has to provide, when
//   no lane passes the tests.
//   Overwrites x2, x3, v4, v6, v20, v22, v24-v28 and the flags.
962 .macro h264_loop_filter_chroma_intra_10
963         uabd            v26.8h,  v16.8h,  v17.8h  // abs(p0 - q0)
964         uabd            v27.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
965         uabd            v28.8h,  v19.8h,  v17.8h  // abs(q1 - q0)
966         cmhi            v26.8h,  v30.8h,  v26.8h  // < alpha
967         cmhi            v27.8h,  v31.8h,  v27.8h  // < beta
968         cmhi            v28.8h,  v31.8h,  v28.8h  // < beta
969         and             v26.16b, v26.16b, v27.16b
970         and             v26.16b, v26.16b, v28.16b // v26 = per-lane filter mask
971         mov             x2, v26.d[0]
972         mov             x3, v26.d[1]
973
974         shl             v4.8h,  v18.8h,  #1       // 2 * p1
975         shl             v6.8h,  v19.8h,  #1       // 2 * q1
976
977         adds            x2,  x2,  x3              // zero only if the mask is clear in all lanes
978         b.eq            9f
979
980         add             v20.8h,  v16.8h,  v19.8h  // p0 + q1
981         add             v22.8h,  v17.8h,  v18.8h  // q0 + p1
982         add             v20.8h,  v20.8h,  v4.8h
983         add             v22.8h,  v22.8h,  v6.8h
984         urshr           v24.8h,  v20.8h,  #2      // rounding shift right by 2
985         urshr           v25.8h,  v22.8h,  #2
986         bit             v16.16b, v24.16b, v26.16b // take the new value only where the mask is set
987         bit             v17.16b, v25.16b, v26.16b
988 .endm
989
// ff_h264_v_loop_filter_chroma_intra_neon_10: applies
// h264_loop_filter_chroma_intra_10 across a horizontal edge, eight 16-bit
// samples wide.
//   x0 = address of the q0 row, x1 = byte distance between rows,
//   w2/w3 are consumed by h264_loop_filter_start_intra_10.
//   Rows read: x0 - 2*x1 (p1), x0 - x1 (p0), x0 (q0), x0 + x1 (q1).
//   Rows written: p0 and q0.
990 function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
991         h264_loop_filter_start_intra_10
992         mov             x9,  x0
993         sub             x0,  x0,  x1, lsl #1      // two rows above the edge
994         ld1             {v18.8h}, [x0], x1        // p1
995         ld1             {v17.8h}, [x9], x1        // q0
996         ld1             {v16.8h}, [x0], x1        // p0
997         ld1             {v19.8h}, [x9]            // q1; x9 stays at the q1 row
998
999         h264_loop_filter_chroma_intra_10
1000
1001         sub             x0,  x9,  x1, lsl #1      // q1 row - 2 rows = p0 row
1002         st1             {v16.8h}, [x0], x1
1003         st1             {v17.8h}, [x0], x1
1004
1005 9:
1006         ret
1007 endfunc
1008
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10: applies
// h264_loop_filter_chroma_intra_10 across a vertical edge, four rows high.
//   x0 = address of q0 in the first row, x1 = byte distance between rows,
//   w2/w3 are consumed by h264_loop_filter_start_intra_10.
//   x4 (x0 - 4 bytes) is the load pointer, x0 - 2 bytes the store pointer,
//   so only the p0/q0 pair of each row is written back.
//   NOTE(review): every row is loaded as eight halfwords although only
//   lanes 0-3 are stored afterwards and the eight-row variant loads four
//   halfwords per row; presumably the upper lanes are don't-care after
//   transpose_4x8H (defined outside this chunk) - confirm, including that
//   reading the extra 8 bytes per row is always in bounds.
1009 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
1010         h264_loop_filter_start_intra_10
1011
1012         sub             x4,  x0,  #4              // load from two samples left of the edge
1013         sub             x0,  x0,  #2              // store from one sample left of the edge
1014         add             x9,  x4,  x1, lsl #1      // x9 walks rows 2-3 while x4 walks rows 0-1
1015         ld1             {v18.8h}, [x4], x1        // row 0
1016         ld1             {v17.8h}, [x9], x1        // row 2
1017         ld1             {v16.8h}, [x4], x1        // row 1
1018         ld1             {v19.8h}, [x9], x1        // row 3
1019
1020         transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29
1021
1022         h264_loop_filter_chroma_intra_10
1023
1024         st2             {v16.h,v17.h}[0], [x0], x1 // one p0/q0 pair per row
1025         st2             {v16.h,v17.h}[1], [x0], x1
1026         st2             {v16.h,v17.h}[2], [x0], x1
1027         st2             {v16.h,v17.h}[3], [x0], x1
1028
1029 9:
1030         ret
1031 endfunc
1032
// ff_h264_h_loop_filter_chroma_intra_neon_10: applies
// h264_loop_filter_chroma_intra_10 across a vertical edge, eight rows high.
//   x0 = address of q0 in the first row, x1 = byte distance between rows,
//   w2/w3 are consumed by h264_loop_filter_start_intra_10.
//   Four 16-bit samples per row are loaded starting 4 bytes left of the
//   edge, transposed with transpose_4x8H (defined outside this chunk) and
//   filtered; only the p0/q0 pair of each row is written back.
// The local label h_loop_filter_chroma420_intra_10 is a second entry point
// used by ff_h264_h_loop_filter_chroma422_intra_neon_10. It expects
// x4 = load pointer, x0 = store pointer, x1 = row distance and v30/v31 =
// alpha/beta. On return x4 has advanced by four rows, x9 points eight rows
// past the initial x4 (also when the early exit to label 9 is taken) and
// x0 has advanced by eight rows if the stores were executed.
1033 function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
1034         h264_loop_filter_start_intra_10
1035         sub             x4,  x0,  #4              // load from two samples left of the edge
1036         sub             x0,  x0,  #2              // store from one sample left of the edge
1037 h_loop_filter_chroma420_intra_10:
1038         add             x9,  x4,  x1, lsl #2      // x9 walks rows 4-7 while x4 walks rows 0-3
1039         ld1             {v18.4h},   [x4], x1      // rows 0-3 fill the low halves
1040         ld1             {v18.d}[1], [x9], x1      // rows 4-7 fill the high halves
1041         ld1             {v16.4h},   [x4], x1
1042         ld1             {v16.d}[1], [x9], x1
1043         ld1             {v17.4h},   [x4], x1
1044         ld1             {v17.d}[1], [x9], x1
1045         ld1             {v19.4h},   [x4], x1
1046         ld1             {v19.d}[1], [x9], x1
1047
1048         transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29
1049
1050         h264_loop_filter_chroma_intra_10
1051
1052         st2             {v16.h,v17.h}[0], [x0], x1 // one p0/q0 pair per row
1053         st2             {v16.h,v17.h}[1], [x0], x1
1054         st2             {v16.h,v17.h}[2], [x0], x1
1055         st2             {v16.h,v17.h}[3], [x0], x1
1056         st2             {v16.h,v17.h}[4], [x0], x1
1057         st2             {v16.h,v17.h}[5], [x0], x1
1058         st2             {v16.h,v17.h}[6], [x0], x1
1059         st2             {v16.h,v17.h}[7], [x0], x1
1060
1061 9:
1062         ret
1063 endfunc
1064
// ff_h264_h_loop_filter_chroma422_intra_neon_10: vertical-edge intra chroma
// filter over 16 rows, built from two passes through
// h_loop_filter_chroma420_intra_10: rows 0-7 first, then rows 8-15 (the row
// distance in x1 is left unchanged).
//   x0, x1, w2, w3 as for ff_h264_h_loop_filter_chroma_intra_neon_10.
//   x5 and x7 are used as scratch because the shared code does not touch
//   them.
1065 function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
1066         h264_loop_filter_start_intra_10
1067         sub             x4,  x0,  #4              // load pointer for rows 0-7
1068         add             x5,  x0,  x1, lsl #3      // edge position in row 8, kept for the second pass
1069         sub             x0,  x0,  #2              // store pointer for rows 0-7
1070         mov             x7,  x30                  // bl overwrites the link register
1071         bl              h_loop_filter_chroma420_intra_10
1072         mov             x4,  x9                   // x9 was advanced to row 8 by the loads of the first pass
1073         sub             x0,  x5,  #2              // store pointer for rows 8-15
1074         mov             x30, x7
1075         b               h_loop_filter_chroma420_intra_10 // tail call: its ret returns to our caller
1076 endfunc