2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/aarch64/asm.S"
// Macro h264_loop_filter_start: start-up code shared by the non-intra filters.
// NOTE(review): this listing has gaps (the embedded line numbers jump and no
// .endm is visible); only the two instructions shown here are described.
26 .macro h264_loop_filter_start
31 and w8, w6, w6, lsl #16 // w8 = w6 & (w6 << 16)
33 ands w8, w8, w8, lsl #8 // bits 31..24 of w8 = AND of all four bytes of w6; sets flags
// Macro h264_loop_filter_luma: per-byte luma edge filter on 16 lanes.
// Register roles, as given by the inline comments below:
//   v16 = p0, v18 = p1, v20 = p2, v0 = q0, v2 = q1, v4 = q2,
//   w2 = alpha, w3 = beta; v24 is used as the clamp distance around p1/q1.
// NOTE(review): this listing has gaps (the embedded line numbers jump and no
// .endm is visible); some registers used below are set up on lines that are
// not shown, so comments cover only what each visible instruction does.
40 .macro h264_loop_filter_luma
41 dup v22.16b, w2 // alpha
43 uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
45 uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
46 sli v24.8h, v24.8h, #8 // copy the low byte of each halfword into its high byte
47 uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
48 sli v24.4s, v24.4s, #16 // copy the low halfword of each word into its high halfword
49 cmhi v21.16b, v22.16b, v21.16b // < alpha
50 dup v22.16b, w3 // beta
51 cmlt v23.16b, v24.16b, #0 // lanes where v24 is negative
52 cmhi v28.16b, v22.16b, v28.16b // < beta
53 cmhi v30.16b, v22.16b, v30.16b // < beta
54 bic v21.16b, v21.16b, v23.16b // exclude lanes where v24 is negative
55 uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
56 and v21.16b, v21.16b, v28.16b // && abs(p1 - p0) < beta
57 uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
58 and v21.16b, v21.16b, v30.16b // < beta
59 shrn v30.8b, v21.8h, #4 // pack the 16-byte mask into 64 bits (4 bits per lane)
61 cmhi v17.16b, v22.16b, v17.16b // < beta
62 cmhi v19.16b, v22.16b, v19.16b // < beta
64 and v17.16b, v17.16b, v21.16b // abs(p2 - p0) < beta, filtered lanes only
65 and v19.16b, v19.16b, v21.16b // abs(q2 - q0) < beta, filtered lanes only
66 and v24.16b, v24.16b, v21.16b // zero v24 in lanes that are not filtered
67 urhadd v28.16b, v16.16b, v0.16b // (p0 + q0 + 1) >> 1
68 sub v21.16b, v24.16b, v17.16b // v24 + 1 where v17 is set (mask lanes are 0 or -1)
69 uqadd v23.16b, v18.16b, v24.16b // p1 + v24, saturating
70 uhadd v20.16b, v20.16b, v28.16b // (p2 + v28) >> 1
71 sub v21.16b, v21.16b, v19.16b // one more where v19 is set
72 uhadd v28.16b, v4.16b, v28.16b // (q2 + v28) >> 1
73 umin v23.16b, v23.16b, v20.16b
74 uqsub v22.16b, v18.16b, v24.16b // p1 - v24, saturating
75 uqadd v4.16b, v2.16b, v24.16b // q1 + v24, saturating
76 umax v23.16b, v23.16b, v22.16b // v23 = v20 clamped to [p1 - v24, p1 + v24]
77 uqsub v22.16b, v2.16b, v24.16b // q1 - v24, saturating
78 umin v28.16b, v4.16b, v28.16b
80 umax v28.16b, v28.16b, v22.16b // v28 clamped to [q1 - v24, q1 + v24]
// NOTE(review): the initialisation of v4.8h / v20.8h as 16-bit accumulators
// is not visible in this listing.
82 usubw v4.8h, v4.8h, v16.8b // -= p0 (low half)
83 usubw2 v20.8h, v20.8h, v16.16b // -= p0 (high half)
85 shl v20.8h, v20.8h, #2
86 uaddw v4.8h, v4.8h, v18.8b // += p1 (low half)
87 uaddw2 v20.8h, v20.8h, v18.16b // += p1 (high half)
88 usubw v4.8h, v4.8h, v2.8b // -= q1 (low half)
89 usubw2 v20.8h, v20.8h, v2.16b // -= q1 (high half)
90 rshrn v4.8b, v4.8h, #3 // rounding >> 3, narrowed to bytes (low half)
91 rshrn2 v4.16b, v20.8h, #3 // rounding >> 3, narrowed to bytes (high half)
92 bsl v17.16b, v23.16b, v18.16b // v17 = v17 mask ? v23 : p1
93 bsl v19.16b, v28.16b, v2.16b // v19 = v19 mask ? v28 : q1
96 smin v4.16b, v4.16b, v21.16b // signed min of v4 against v21
98 smax v4.16b, v4.16b, v23.16b // signed max against v23 (its setup here is not visible)
101 saddw v28.8h, v28.8h, v4.8b // add sign-extended v4 (low half)
102 saddw2 v21.8h, v21.8h, v4.16b // add sign-extended v4 (high half)
103 ssubw v22.8h, v22.8h, v4.8b // subtract sign-extended v4 (low half)
104 ssubw2 v24.8h, v24.8h, v4.16b // subtract sign-extended v4 (high half)
105 sqxtun v16.8b, v28.8h // saturate to unsigned bytes -> v16 (low half)
106 sqxtun2 v16.16b, v21.8h // saturate to unsigned bytes -> v16 (high half)
108 sqxtun2 v0.16b, v24.8h // saturate to unsigned bytes -> v0 (high half)
// ff_h264_v_loop_filter_luma_neon: loads 16-byte rows on both sides of x0
// (x1 is the post-index row increment), runs h264_loop_filter_luma and writes
// filtered rows back. "Row 0" below means x0 as it was at the first load.
// NOTE(review): this listing has gaps (no ret/endfunc visible); only the
// visible instructions are described.
111 function ff_h264_v_loop_filter_luma_neon, export=1
112 h264_loop_filter_start
114 ld1 {v0.16b}, [x0], x1 // row 0
115 ld1 {v2.16b}, [x0], x1 // row 1
116 ld1 {v4.16b}, [x0], x1 // row 2
117 sub x0, x0, x1, lsl #2
118 sub x0, x0, x1, lsl #1 // x0 -= 6 rows in total -> row -3
119 ld1 {v20.16b}, [x0], x1 // row -3
120 ld1 {v18.16b}, [x0], x1 // row -2
121 ld1 {v16.16b}, [x0], x1 // row -1; x0 is back at row 0
123 h264_loop_filter_luma
125 sub x0, x0, x1, lsl #1 // back up two rows
126 st1 {v17.16b}, [x0], x1
127 st1 {v16.16b}, [x0], x1
128 st1 {v0.16b}, [x0], x1
// ff_h264_h_loop_filter_luma_neon: loads 8 bytes from each of 16 consecutive
// rows (x0 pointer, x1 row increment), transposes, runs h264_loop_filter_luma,
// transposes four result registers back and stores 4 bytes to each of 16 rows.
// transpose_8x16B / transpose_4x16B are defined outside this listing
// (presumably in the included asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); pointer
// adjustments on lines not shown are not described.
134 function ff_h264_h_loop_filter_luma_neon, export=1
135 h264_loop_filter_start
// Rows 0..7 go into the low 64 bits of each register.
138 ld1 {v6.8b}, [x0], x1
139 ld1 {v20.8b}, [x0], x1
140 ld1 {v18.8b}, [x0], x1
141 ld1 {v16.8b}, [x0], x1
142 ld1 {v0.8b}, [x0], x1
143 ld1 {v2.8b}, [x0], x1
144 ld1 {v4.8b}, [x0], x1
145 ld1 {v26.8b}, [x0], x1
// Rows 8..15 go into the high 64 bits of the same registers.
146 ld1 {v6.d}[1], [x0], x1
147 ld1 {v20.d}[1], [x0], x1
148 ld1 {v18.d}[1], [x0], x1
149 ld1 {v16.d}[1], [x0], x1
150 ld1 {v0.d}[1], [x0], x1
151 ld1 {v2.d}[1], [x0], x1
152 ld1 {v4.d}[1], [x0], x1
153 ld1 {v26.d}[1], [x0], x1
155 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
157 h264_loop_filter_luma
159 transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
161 sub x0, x0, x1, lsl #4 // rewind 16 rows
// One 32-bit lane per row, cycling through v17, v16, v0, v19.
163 st1 {v17.s}[0], [x0], x1
164 st1 {v16.s}[0], [x0], x1
165 st1 {v0.s}[0], [x0], x1
166 st1 {v19.s}[0], [x0], x1
167 st1 {v17.s}[1], [x0], x1
168 st1 {v16.s}[1], [x0], x1
169 st1 {v0.s}[1], [x0], x1
170 st1 {v19.s}[1], [x0], x1
171 st1 {v17.s}[2], [x0], x1
172 st1 {v16.s}[2], [x0], x1
173 st1 {v0.s}[2], [x0], x1
174 st1 {v19.s}[2], [x0], x1
175 st1 {v17.s}[3], [x0], x1
176 st1 {v16.s}[3], [x0], x1
177 st1 {v0.s}[3], [x0], x1
178 st1 {v19.s}[3], [x0], x1
// Macro h264_loop_filter_start_intra: broadcasts the thresholds for the intra
// filters: v30 = alpha (from w2), v31 = beta (from w3), one copy per byte lane.
// NOTE(review): this listing has gaps (no .endm visible); lines not shown are
// not described.
184 .macro h264_loop_filter_start_intra
189 dup v30.16b, w2 // alpha
190 dup v31.16b, w3 // beta
// Macro h264_loop_filter_luma_intra: intra luma edge filter on 16 byte lanes.
// Register roles, as given by the inline comments below:
//   v7 = p0, v6 = p1, v5 = p2, v0 = q0, v1 = q1, v2 = q2,
//   v30 = alpha, v31 = beta (both per byte lane).
// v4 and v3 enter the p2'/q2' sums; the callers in this file load them as
// p3 and q3.
// Results are merged into v5..v7 and v0..v2 with BIT under the computed masks.
// NOTE(review): this listing has gaps (the embedded line numbers jump and no
// .endm is visible); some mask setup is on lines that are not shown.
193 .macro h264_loop_filter_luma_intra
194 uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
195 uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
196 uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
197 cmhi v19.16b, v30.16b, v16.16b // < alpha
198 cmhi v17.16b, v31.16b, v17.16b // < beta
199 cmhi v18.16b, v31.16b, v18.16b // < beta
// NOTE(review): v29 is not initialised in the visible lines; the comment on
// the add below implies it holds 2 in every lane.
202 ushr v30.16b, v30.16b, #2 // alpha >> 2
203 add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
204 cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
206 and v19.16b, v19.16b, v17.16b
207 and v19.16b, v19.16b, v18.16b // v19 = all three threshold tests pass
208 shrn v20.8b, v19.8h, #4 // pack the 16-byte mask into 64 bits (4 bits per lane)
// v20/v21 = 2*p1 + p0 + q1 and v22/v23 = 2*q1 + q0 + p1 (16-bit, low/high halves).
212 ushll v20.8h, v6.8b, #1
213 ushll v22.8h, v1.8b, #1
214 ushll2 v21.8h, v6.16b, #1
215 ushll2 v23.8h, v1.16b, #1
216 uaddw v20.8h, v20.8h, v7.8b
217 uaddw v22.8h, v22.8h, v0.8b
218 uaddw2 v21.8h, v21.8h, v7.16b
219 uaddw2 v23.8h, v23.8h, v0.16b
220 uaddw v20.8h, v20.8h, v1.8b
221 uaddw v22.8h, v22.8h, v6.8b
222 uaddw2 v21.8h, v21.8h, v1.16b
223 uaddw2 v23.8h, v23.8h, v6.16b
225 rshrn v24.8b, v20.8h, #2 // p0'_1
226 rshrn v25.8b, v22.8h, #2 // q0'_1
227 rshrn2 v24.16b, v21.8h, #2 // p0'_1
228 rshrn2 v25.16b, v23.8h, #2 // q0'_1
230 uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
231 uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
232 cmhi v17.16b, v31.16b, v17.16b // < beta
233 cmhi v18.16b, v31.16b, v18.16b // < beta
235 and v17.16b, v16.16b, v17.16b // if_2 && if_3
236 and v18.16b, v16.16b, v18.16b // if_2 && if_4
// NOTE(review): the step that produces the negated masks in v30/v31 is not
// visible in this listing.
241 and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
242 and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
244 and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
245 and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
247 //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
248 uaddl v26.8h, v5.8b, v7.8b
249 uaddl2 v27.8h, v5.16b, v7.16b
250 uaddw v26.8h, v26.8h, v0.8b // v26/v27 = p2 + p0 + q0
251 uaddw2 v27.8h, v27.8h, v0.16b
252 add v20.8h, v20.8h, v26.8h
253 add v21.8h, v21.8h, v27.8h
254 uaddw v20.8h, v20.8h, v0.8b // v20/v21 = p2 + 2*p1 + 2*p0 + 2*q0 + q1
255 uaddw2 v21.8h, v21.8h, v0.16b
256 rshrn v20.8b, v20.8h, #3 // p0'_2
257 rshrn2 v20.16b, v21.8h, #3 // p0'_2
258 uaddw v26.8h, v26.8h, v6.8b // v26/v27 = p2 + p1 + p0 + q0
259 uaddw2 v27.8h, v27.8h, v6.16b
260 rshrn v21.8b, v26.8h, #2 // p1'_2
261 rshrn2 v21.16b, v27.8h, #2 // p1'_2
262 uaddl v28.8h, v4.8b, v5.8b
263 uaddl2 v29.8h, v4.16b, v5.16b
264 shl v28.8h, v28.8h, #1 // 2 * (v4 + p2)
265 shl v29.8h, v29.8h, #1
266 add v28.8h, v28.8h, v26.8h
267 add v29.8h, v29.8h, v27.8h
268 rshrn v19.8b, v28.8h, #3 // p2'_2
269 rshrn2 v19.16b, v29.8h, #3 // p2'_2
271 //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
272 uaddl v26.8h, v2.8b, v0.8b
273 uaddl2 v27.8h, v2.16b, v0.16b
274 uaddw v26.8h, v26.8h, v7.8b // v26/v27 = q2 + q0 + p0
275 uaddw2 v27.8h, v27.8h, v7.16b
276 add v22.8h, v22.8h, v26.8h
277 add v23.8h, v23.8h, v27.8h
278 uaddw v22.8h, v22.8h, v7.8b // v22/v23 = q2 + 2*q1 + 2*q0 + 2*p0 + p1
279 uaddw2 v23.8h, v23.8h, v7.16b
280 rshrn v22.8b, v22.8h, #3 // q0'_2
281 rshrn2 v22.16b, v23.8h, #3 // q0'_2
282 uaddw v26.8h, v26.8h, v1.8b // v26/v27 = q2 + q1 + q0 + p0
283 uaddw2 v27.8h, v27.8h, v1.16b
284 rshrn v23.8b, v26.8h, #2 // q1'_2
285 rshrn2 v23.16b, v27.8h, #2 // q1'_2
286 uaddl v28.8h, v2.8b, v3.8b
287 uaddl2 v29.8h, v2.16b, v3.16b
288 shl v28.8h, v28.8h, #1 // 2 * (q2 + v3)
289 shl v29.8h, v29.8h, #1
290 add v28.8h, v28.8h, v26.8h
291 add v29.8h, v29.8h, v27.8h
292 rshrn v26.8b, v28.8h, #3 // q2'_2
293 rshrn2 v26.16b, v29.8h, #3 // q2'_2
// BIT copies bits from the second operand into the first where the mask
// (third operand) is set; the other bits are left unchanged.
295 bit v7.16b, v24.16b, v30.16b // p0'_1
296 bit v0.16b, v25.16b, v31.16b // q0'_1
297 bit v7.16b, v20.16b, v17.16b // p0'_2
298 bit v6.16b, v21.16b, v17.16b // p1'_2
299 bit v5.16b, v19.16b, v17.16b // p2'_2
300 bit v0.16b, v22.16b, v18.16b // q0'_2
301 bit v1.16b, v23.16b, v18.16b // q1'_2
302 bit v2.16b, v26.16b, v18.16b // q2'_2
// ff_h264_v_loop_filter_luma_intra_neon: loads four 16-byte rows starting at
// x0 (q0..q3) and the four rows before it (p3..p0), with x1 as the row
// increment, runs h264_loop_filter_luma_intra, then stores p2..q2.
// NOTE(review): this listing has gaps (no ret/endfunc visible); only the
// visible instructions are described.
305 function ff_h264_v_loop_filter_luma_intra_neon, export=1
306 h264_loop_filter_start_intra
308 ld1 {v0.16b}, [x0], x1 // q0
309 ld1 {v1.16b}, [x0], x1 // q1
310 ld1 {v2.16b}, [x0], x1 // q2
311 ld1 {v3.16b}, [x0], x1 // q3
312 sub x0, x0, x1, lsl #3 // from 4 rows past q0 back to 4 rows before it
313 ld1 {v4.16b}, [x0], x1 // p3
314 ld1 {v5.16b}, [x0], x1 // p2
315 ld1 {v6.16b}, [x0], x1 // p1
316 ld1 {v7.16b}, [x0] // p0
318 h264_loop_filter_luma_intra
320 sub x0, x0, x1, lsl #1 // from the p0 row back to the p2 row
321 st1 {v5.16b}, [x0], x1 // p2
322 st1 {v6.16b}, [x0], x1 // p1
323 st1 {v7.16b}, [x0], x1 // p0
324 st1 {v0.16b}, [x0], x1 // q0
325 st1 {v1.16b}, [x0], x1 // q1
326 st1 {v2.16b}, [x0] // q2
// ff_h264_h_loop_filter_luma_intra_neon: loads 8 bytes from each of 16 rows
// (x0 pointer, x1 row increment), transposes, runs
// h264_loop_filter_luma_intra, transposes back and stores 8 bytes to each of
// the same 16 rows. transpose_8x16B is defined outside this listing
// (presumably in the included asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); pointer
// adjustments on lines not shown are not described.
331 function ff_h264_h_loop_filter_luma_intra_neon, export=1
332 h264_loop_filter_start_intra
// Rows 0..7 go into the low 64 bits of v4..v7, v0..v3.
335 ld1 {v4.8b}, [x0], x1
336 ld1 {v5.8b}, [x0], x1
337 ld1 {v6.8b}, [x0], x1
338 ld1 {v7.8b}, [x0], x1
339 ld1 {v0.8b}, [x0], x1
340 ld1 {v1.8b}, [x0], x1
341 ld1 {v2.8b}, [x0], x1
342 ld1 {v3.8b}, [x0], x1
// Rows 8..15 go into the high 64 bits of the same registers.
343 ld1 {v4.d}[1], [x0], x1
344 ld1 {v5.d}[1], [x0], x1
345 ld1 {v6.d}[1], [x0], x1
346 ld1 {v7.d}[1], [x0], x1
347 ld1 {v0.d}[1], [x0], x1
348 ld1 {v1.d}[1], [x0], x1
349 ld1 {v2.d}[1], [x0], x1
350 ld1 {v3.d}[1], [x0], x1
352 transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
354 h264_loop_filter_luma_intra
356 transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
358 sub x0, x0, x1, lsl #4 // rewind 16 rows
// Store in the same order the rows were loaded.
359 st1 {v4.8b}, [x0], x1
360 st1 {v5.8b}, [x0], x1
361 st1 {v6.8b}, [x0], x1
362 st1 {v7.8b}, [x0], x1
363 st1 {v0.8b}, [x0], x1
364 st1 {v1.8b}, [x0], x1
365 st1 {v2.8b}, [x0], x1
366 st1 {v3.8b}, [x0], x1
367 st1 {v4.d}[1], [x0], x1
368 st1 {v5.d}[1], [x0], x1
369 st1 {v6.d}[1], [x0], x1
370 st1 {v7.d}[1], [x0], x1
371 st1 {v0.d}[1], [x0], x1
372 st1 {v1.d}[1], [x0], x1
373 st1 {v2.d}[1], [x0], x1
374 st1 {v3.d}[1], [x0], x1
// Macro h264_loop_filter_chroma: chroma edge filter on 8 byte lanes.
// Register roles, as given by the inline comments below:
//   v16 = p0, v18 = p1, v0 = q0, v2 = q1, w2 = alpha, w3 = beta.
// NOTE(review): this listing has gaps (the embedded line numbers jump and no
// .endm is visible); the setup of v4.8h, v25, v28.8h and v22.8h as used in
// the second half is on lines that are not shown.
379 .macro h264_loop_filter_chroma
380 dup v22.8b, w2 // alpha
381 dup v23.8b, w3 // beta
383 uabd v26.8b, v16.8b, v0.8b // abs(p0 - q0)
384 uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
385 uabd v30.8b, v2.8b, v0.8b // abs(q1 - q0)
386 cmhi v26.8b, v22.8b, v26.8b // < alpha
387 cmhi v28.8b, v23.8b, v28.8b // < beta
388 cmhi v30.8b, v23.8b, v30.8b // < beta
390 and v26.8b, v26.8b, v28.8b
391 usubw v4.8h, v4.8h, v16.8b // -= p0
392 and v26.8b, v26.8b, v30.8b // v26 = all three threshold tests pass
395 sli v24.8h, v24.8h, #8 // copy the low byte of each halfword into its high byte
396 uaddw v4.8h, v4.8h, v18.8b // += p1
398 usubw v4.8h, v4.8h, v2.8b // -= q1
399 rshrn v4.8b, v4.8h, #3 // rounding >> 3, narrowed to bytes
400 smin v4.8b, v4.8b, v24.8b // signed min against v24
402 smax v4.8b, v4.8b, v25.8b // signed max against v25
404 and v4.8b, v4.8b, v26.8b // zero v4 in lanes that failed the tests
406 saddw v28.8h, v28.8h, v4.8b // add sign-extended v4
407 ssubw v22.8h, v22.8h, v4.8b // subtract sign-extended v4
408 sqxtun v16.8b, v28.8h // saturate to unsigned bytes -> v16
// ff_h264_v_loop_filter_chroma_neon: loads 8-byte rows starting two rows
// before x0 (x1 is the row increment), runs h264_loop_filter_chroma and
// stores v16 and v0 back. Per that macro's comments v18 = p1, v16 = p0, v0 = q0.
// NOTE(review): this listing has gaps (no ret/endfunc visible); only the
// visible instructions are described.
412 function ff_h264_v_loop_filter_chroma_neon, export=1
413 h264_loop_filter_start
415 sub x0, x0, x1, lsl #1 // step back two rows
416 ld1 {v18.8b}, [x0], x1
417 ld1 {v16.8b}, [x0], x1
418 ld1 {v0.8b}, [x0], x1
421 h264_loop_filter_chroma
423 sub x0, x0, x1, lsl #1 // step back two rows before storing
424 st1 {v16.8b}, [x0], x1
425 st1 {v0.8b}, [x0], x1
// ff_h264_h_loop_filter_chroma_neon: loads 4 bytes from each of 8 rows
// (x0 pointer, x1 row increment), transposes, runs h264_loop_filter_chroma,
// transposes back and stores 4 bytes to each of the same 8 rows.
// The label h_loop_filter_chroma420 is also a branch target of
// ff_h264_h_loop_filter_chroma422_neon. transpose_4x8B is defined outside
// this listing (presumably in the included asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); pointer
// adjustments on lines not shown are not described.
430 function ff_h264_h_loop_filter_chroma_neon, export=1
431 h264_loop_filter_start
434 h_loop_filter_chroma420:
// Rows 0..3 fill 32-bit lane 0, rows 4..7 fill lane 1.
435 ld1 {v18.s}[0], [x0], x1
436 ld1 {v16.s}[0], [x0], x1
437 ld1 {v0.s}[0], [x0], x1
438 ld1 {v2.s}[0], [x0], x1
439 ld1 {v18.s}[1], [x0], x1
440 ld1 {v16.s}[1], [x0], x1
441 ld1 {v0.s}[1], [x0], x1
442 ld1 {v2.s}[1], [x0], x1
444 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
446 h264_loop_filter_chroma
448 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
450 sub x0, x0, x1, lsl #3 // rewind 8 rows
451 st1 {v18.s}[0], [x0], x1
452 st1 {v16.s}[0], [x0], x1
453 st1 {v0.s}[0], [x0], x1
454 st1 {v2.s}[0], [x0], x1
455 st1 {v18.s}[1], [x0], x1
456 st1 {v16.s}[1], [x0], x1
457 st1 {v0.s}[1], [x0], x1
458 st1 {v2.s}[1], [x0], x1
// ff_h264_h_loop_filter_chroma422_neon: runs the shared
// h_loop_filter_chroma420 body twice - first as a call (bl), then as a tail
// branch (b).
// NOTE(review): this listing has gaps; bl overwrites x30, and the code that
// preserves the return address and sets up the second pass is not visible.
463 function ff_h264_h_loop_filter_chroma422_neon, export=1
464 h264_loop_filter_start
469 bl h_loop_filter_chroma420 // first pass, returns here
473 b h_loop_filter_chroma420 // second pass, tail branch
// Macro h264_loop_filter_chroma_intra: intra chroma edge filter on 8 byte lanes.
// Register roles, as given by the inline comments below:
//   v16 = p0, v17 = q0, v18 = p1, v19 = q1, v30 = alpha, v31 = beta.
// Writes the filtered p0/q0 into v16/v17 in lanes where all three threshold
// tests pass; other lanes are left unchanged.
// NOTE(review): this listing has gaps (no .endm visible); lines not shown are
// not described.
476 .macro h264_loop_filter_chroma_intra
477 uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
478 uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
479 uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
480 cmhi v26.8b, v30.8b, v26.8b // < alpha
481 cmhi v27.8b, v31.8b, v27.8b // < beta
482 cmhi v28.8b, v31.8b, v28.8b // < beta
483 and v26.8b, v26.8b, v27.8b
484 and v26.8b, v26.8b, v28.8b // v26 = all three threshold tests pass
487 ushll v4.8h, v18.8b, #1 // 2 * p1
488 ushll v6.8h, v19.8b, #1 // 2 * q1
490 uaddl v20.8h, v16.8b, v19.8b // p0 + q1
491 uaddl v22.8h, v17.8b, v18.8b // q0 + p1
492 add v20.8h, v20.8h, v4.8h // 2*p1 + p0 + q1
493 add v22.8h, v22.8h, v6.8h // 2*q1 + q0 + p1
494 uqrshrn v24.8b, v20.8h, #2 // rounding >> 2, saturating narrow
495 uqrshrn v25.8b, v22.8h, #2 // rounding >> 2, saturating narrow
496 bit v16.8b, v24.8b, v26.8b // p0 = v24 where the mask is set
497 bit v17.8b, v25.8b, v26.8b // q0 = v25 where the mask is set
// ff_h264_v_loop_filter_chroma_intra_neon: loads 8-byte rows starting two
// rows before x0 (x1 is the row increment), runs
// h264_loop_filter_chroma_intra and stores v16 (p0) and v17 (q0) back.
// NOTE(review): this listing has gaps (no ret/endfunc visible; the load of
// v19, which the macro reads as q1, is not shown).
500 function ff_h264_v_loop_filter_chroma_intra_neon, export=1
501 h264_loop_filter_start_intra
503 sub x0, x0, x1, lsl #1 // step back two rows
504 ld1 {v18.8b}, [x0], x1 // p1
505 ld1 {v16.8b}, [x0], x1 // p0
506 ld1 {v17.8b}, [x0], x1 // q0
509 h264_loop_filter_chroma_intra
511 sub x0, x0, x1, lsl #1 // step back two rows before storing
512 st1 {v16.8b}, [x0], x1
513 st1 {v17.8b}, [x0], x1
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon: loads 8 bytes from each of
// 4 rows via x4, transposes, runs h264_loop_filter_chroma_intra, then writes
// one (v16, v17) byte pair per row via x0 for 4 rows. x1 is the row increment.
// transpose_4x8B is defined outside this listing (presumably in asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); the code that
// derives x4 and adjusts x0 is not shown.
519 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
520 h264_loop_filter_start_intra
524 ld1 {v18.8b}, [x4], x1
525 ld1 {v16.8b}, [x4], x1
526 ld1 {v17.8b}, [x4], x1
527 ld1 {v19.8b}, [x4], x1
529 transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
531 h264_loop_filter_chroma_intra
// st2 writes byte n of v16 followed by byte n of v17 (2 bytes per row).
533 st2 {v16.b,v17.b}[0], [x0], x1
534 st2 {v16.b,v17.b}[1], [x0], x1
535 st2 {v16.b,v17.b}[2], [x0], x1
536 st2 {v16.b,v17.b}[3], [x0], x1
// ff_h264_h_loop_filter_chroma_intra_neon: loads data from 8 rows via x4,
// transposes, runs h264_loop_filter_chroma_intra, then writes one (v16, v17)
// byte pair per row via x0 for 8 rows. x1 is the row increment.
// The label h_loop_filter_chroma420_intra is also a branch target of
// ff_h264_h_loop_filter_chroma422_intra_neon. transpose_4x8B is defined
// outside this listing (presumably in asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); the code that
// derives x4 and adjusts x0 is not shown.
542 function ff_h264_h_loop_filter_chroma_intra_neon, export=1
543 h264_loop_filter_start_intra
547 h_loop_filter_chroma420_intra:
// Rows 0..3: 8-byte loads into the low half of each register.
548 ld1 {v18.8b}, [x4], x1
549 ld1 {v16.8b}, [x4], x1
550 ld1 {v17.8b}, [x4], x1
551 ld1 {v19.8b}, [x4], x1
// Rows 4..7: 4-byte loads that replace 32-bit lane 1.
552 ld1 {v18.s}[1], [x4], x1
553 ld1 {v16.s}[1], [x4], x1
554 ld1 {v17.s}[1], [x4], x1
555 ld1 {v19.s}[1], [x4], x1
557 transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
559 h264_loop_filter_chroma_intra
// st2 writes byte n of v16 followed by byte n of v17 (2 bytes per row).
561 st2 {v16.b,v17.b}[0], [x0], x1
562 st2 {v16.b,v17.b}[1], [x0], x1
563 st2 {v16.b,v17.b}[2], [x0], x1
564 st2 {v16.b,v17.b}[3], [x0], x1
565 st2 {v16.b,v17.b}[4], [x0], x1
566 st2 {v16.b,v17.b}[5], [x0], x1
567 st2 {v16.b,v17.b}[6], [x0], x1
568 st2 {v16.b,v17.b}[7], [x0], x1
// ff_h264_h_loop_filter_chroma422_intra_neon: runs the shared
// h_loop_filter_chroma420_intra body twice - first as a call (bl), then as a
// tail branch (b).
// NOTE(review): this listing has gaps; bl overwrites x30, and the code that
// preserves the return address and consumes x5 for the second pass is not
// visible.
574 function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
575 h264_loop_filter_start_intra
577 add x5, x0, x1, lsl #3 // x5 = x0 + 8 * x1
580 bl h_loop_filter_chroma420_intra // first pass, returns here
583 b h_loop_filter_chroma420_intra // second pass, tail branch
// Macro biweight_16 macs, macd: 16-byte-wide bi-weighted row pair.
// \macd and \macs are instruction mnemonics supplied by the caller; \macd is
// applied to rows loaded via x0 with the byte vector v0, \macs to rows loaded
// via x1 with the byte vector v1. x2 is the post-index increment for the
// loads and the stores; results are shifted by v18 (sshl), narrowed with
// unsigned saturation and stored via x7.
// NOTE(review): this listing has gaps (no loop control or .endm visible); the
// accumulator initialisation and the setup of v0, v1, v18 and x7 are not shown.
586 .macro biweight_16 macs, macd
592 ld1 {v20.16b}, [x0], x2 // first row, source via x0
593 \macd v4.8h, v0.8b, v20.8b
594 \macd\()2 v6.8H, v0.16B, v20.16B
595 ld1 {v22.16b}, [x1], x2 // first row, source via x1
596 \macs v4.8h, v1.8b, v22.8b
597 \macs\()2 v6.8H, v1.16B, v22.16B
599 ld1 {v28.16b}, [x0], x2 // second row, source via x0
601 \macd v24.8h, v0.8b, v28.8b
602 \macd\()2 v26.8H, v0.16B, v28.16B
603 ld1 {v30.16b}, [x1], x2 // second row, source via x1
604 \macs v24.8h, v1.8b, v30.8b
605 \macs\()2 v26.8H, v1.16B, v30.16B
606 sshl v4.8h, v4.8h, v18.8h // per-lane signed shift by v18
607 sshl v6.8h, v6.8h, v18.8h
609 sqxtun2 v4.16b, v6.8h // saturate to unsigned bytes (high half)
610 sshl v24.8h, v24.8h, v18.8h
611 sshl v26.8h, v26.8h, v18.8h
612 sqxtun v24.8b, v24.8h // saturate to unsigned bytes (low half)
613 sqxtun2 v24.16b, v26.8h // saturate to unsigned bytes (high half)
615 st1 {v4.16b}, [x7], x2 // first output row
617 st1 {v24.16b}, [x7], x2 // second output row
// Macro biweight_8 macs, macd: 8-byte-wide bi-weighted row pair.
// \macd and \macs are instruction mnemonics supplied by the caller; \macd is
// applied to rows loaded via x0 with v0, \macs to rows loaded via x1 with v1.
// x2 is the post-index increment; results are shifted by v18 and stored via x7.
// NOTE(review): this listing has gaps (no loop control or .endm visible); the
// narrowing step that refills v2.8b / v4.8b before the stores and the
// accumulator initialisation are not shown.
622 .macro biweight_8 macs, macd
628 ld1 {v4.8b}, [x0], x2 // first row, source via x0
629 \macd v2.8h, v0.8b, v4.8b
630 ld1 {v5.8b}, [x1], x2 // first row, source via x1
631 \macs v2.8h, v1.8b, v5.8b
632 ld1 {v6.8b}, [x0], x2 // second row, source via x0
633 \macd v20.8h, v0.8b, v6.8b
634 ld1 {v7.8b}, [x1], x2 // second row, source via x1
635 \macs v20.8h, v1.8b, v7.8b
636 sshl v2.8h, v2.8h, v18.8h // per-lane signed shift by v18
638 sshl v20.8h, v20.8h, v18.8h
641 st1 {v2.8b}, [x7], x2 // first output row
643 st1 {v4.8b}, [x7], x2 // second output row
// Macro biweight_4 macs, macd: 4-byte-wide bi-weighted rows, two rows packed
// per 64-bit register (32-bit lanes 0 and 1).
// \macd and \macs are instruction mnemonics supplied by the caller; \macd is
// applied to rows loaded via x0 with v0, \macs to rows loaded via x1 with v1.
// x2 is the post-index increment; results are shifted by v18 and stored via x7.
// NOTE(review): this listing has gaps (no loop control or .endm visible); the
// narrowing steps, the accumulator initialisation and the branch that reaches
// local label 2 are not shown.
648 .macro biweight_4 macs, macd
654 ld1 {v4.s}[0], [x0], x2 // rows 0 and 1, source via x0
655 ld1 {v4.s}[1], [x0], x2
656 \macd v2.8h, v0.8b, v4.8b
657 ld1 {v5.s}[0], [x1], x2 // rows 0 and 1, source via x1
658 ld1 {v5.s}[1], [x1], x2
659 \macs v2.8h, v1.8b, v5.8b
661 ld1 {v6.s}[0], [x0], x2 // rows 2 and 3, source via x0
662 ld1 {v6.s}[1], [x0], x2
663 \macd v20.8h, v0.8b, v6.8b
664 ld1 {v7.s}[0], [x1], x2 // rows 2 and 3, source via x1
665 ld1 {v7.s}[1], [x1], x2
666 \macs v20.8h, v1.8b, v7.8b
667 sshl v2.8h, v2.8h, v18.8h // per-lane signed shift by v18
669 sshl v20.8h, v20.8h, v18.8h
672 st1 {v2.s}[0], [x7], x2 // output rows 0 and 1
673 st1 {v2.s}[1], [x7], x2
675 st1 {v4.s}[0], [x7], x2 // output rows 2 and 3
676 st1 {v4.s}[1], [x7], x2
// Local label 2: shift and store a single pair of rows from v2.
679 2: sshl v2.8h, v2.8h, v18.8h
681 st1 {v2.s}[0], [x7], x2
682 st1 {v2.s}[1], [x7], x2
// Macro biweight_func w: emits the exported function
// ff_biweight_h264_pixels_\w\()_neon. The body contains four expansions of
// biweight_\w, one for each combination of umlal/umlsl passed as the
// (macs, macd) mnemonics.
// NOTE(review): this listing has gaps (no .endm/endfunc visible); the argument
// setup and the branches that select among the four expansions are not shown.
686 .macro biweight_func w
687 function ff_biweight_h264_pixels_\w\()_neon, export=1
690 eor w8, w8, w6, lsr #30 // w8 ^= top two bits of w6, moved down to bits 1..0
703 10: biweight_\w umlal, umlal
705 biweight_\w umlal, umlsl
708 biweight_\w umlsl, umlsl
710 biweight_\w umlsl, umlal
// 16-byte-wide weighting of two rows loaded via x0 (x1 is the post-index
// increment): each row is multiplied by the byte vector v0 (umull), combined
// with v16 using the caller-supplied \add mnemonic, shifted by v18 with
// rounding (srshl), narrowed with unsigned saturation and stored via x5.
// NOTE(review): the directive that opens this definition is not visible in
// this listing; `\add` shows these lines sit inside a macro with an argument
// named `add`. The setup of v0, v16, v18 and x5 is not shown either.
721 ld1 {v20.16b}, [x0], x1 // first row
722 umull v4.8h, v0.8b, v20.8b
723 umull2 v6.8h, v0.16b, v20.16b
724 ld1 {v28.16b}, [x0], x1 // second row
725 umull v24.8h, v0.8b, v28.8b
726 umull2 v26.8h, v0.16b, v28.16b
727 \add v4.8h, v16.8h, v4.8h
728 srshl v4.8h, v4.8h, v18.8h // per-lane signed rounding shift by v18
729 \add v6.8h, v16.8h, v6.8h
730 srshl v6.8h, v6.8h, v18.8h
732 sqxtun2 v4.16b, v6.8h // saturate to unsigned bytes (high half)
733 \add v24.8h, v16.8h, v24.8h
734 srshl v24.8h, v24.8h, v18.8h
735 \add v26.8h, v16.8h, v26.8h
736 srshl v26.8h, v26.8h, v18.8h
737 sqxtun v24.8b, v24.8h // saturate to unsigned bytes (low half)
738 sqxtun2 v24.16b, v26.8h // saturate to unsigned bytes (high half)
739 st1 {v4.16b}, [x5], x1 // first output row
740 st1 {v24.16b}, [x5], x1 // second output row
// 8-byte-wide weighting of two rows loaded via x0 (x1 is the post-index
// increment): multiply by v0 (umull), combine with v16 using the
// caller-supplied \add mnemonic, shift by v18 with rounding, store via x5.
// NOTE(review): the directive that opens this definition is not visible in
// this listing; `\add` shows these lines sit inside a macro with an argument
// named `add`. The narrowing step that refills v2.8b / v4.8b before the
// stores is not shown either.
748 ld1 {v4.8b}, [x0], x1 // first row
749 umull v2.8h, v0.8b, v4.8b
750 ld1 {v6.8b}, [x0], x1 // second row
751 umull v20.8h, v0.8b, v6.8b
752 \add v2.8h, v16.8h, v2.8h
753 srshl v2.8h, v2.8h, v18.8h // per-lane signed rounding shift by v18
755 \add v20.8h, v16.8h, v20.8h
756 srshl v20.8h, v20.8h, v18.8h
758 st1 {v2.8b}, [x5], x1 // first output row
759 st1 {v4.8b}, [x5], x1 // second output row
// 4-byte-wide weighting, two rows packed per 64-bit register (32-bit lanes
// 0 and 1), loaded via x0 with x1 as the post-index increment: multiply by v0
// (umull), combine with v16 using the caller-supplied \add mnemonic, shift by
// v18 with rounding, store via x5.
// NOTE(review): the directive that opens this definition is not visible in
// this listing; `\add` shows these lines sit inside a macro with an argument
// named `add`. The narrowing steps and the branch that reaches local label 2
// are not shown either.
767 ld1 {v4.s}[0], [x0], x1 // rows 0 and 1
768 ld1 {v4.s}[1], [x0], x1
769 umull v2.8h, v0.8b, v4.8b
771 ld1 {v6.s}[0], [x0], x1 // rows 2 and 3
772 ld1 {v6.s}[1], [x0], x1
773 umull v20.8h, v0.8b, v6.8b
774 \add v2.8h, v16.8h, v2.8h
775 srshl v2.8h, v2.8h, v18.8h // per-lane signed rounding shift by v18
777 \add v20.8h, v16.8h, v20.8h
778 srshl v20.8h, v20.8h, v18.8h
780 st1 {v2.s}[0], [x5], x1 // output rows 0 and 1
781 st1 {v2.s}[1], [x5], x1
782 st1 {v4.s}[0], [x5], x1 // output rows 2 and 3
783 st1 {v4.s}[1], [x5], x1
// Local label 2: combine, shift and store a single pair of rows from v2.
786 2: \add v2.8h, v16.8h, v2.8h
787 srshl v2.8h, v2.8h, v18.8h
789 st1 {v2.s}[0], [x5], x1
790 st1 {v2.s}[1], [x5], x1
// Exported entry point ff_weight_h264_pixels_\w\()_neon; `\w` is a macro
// argument substituted into the symbol name.
// NOTE(review): neither the enclosing .macro line nor the function body is
// visible in this listing.
795 function ff_weight_h264_pixels_\w\()_neon, export=1
// Macro h264_loop_filter_start_10: start-up code shared by the *_neon_10
// non-intra filters.
// NOTE(review): this listing has gaps (the embedded line numbers jump and no
// .endm is visible); only the two instructions shown here are described.
823 .macro h264_loop_filter_start_10
830 and w8, w6, w6, lsl #16 // w8 = w6 & (w6 << 16)
832 ands w8, w8, w8, lsl #8 // bits 31..24 of w8 = AND of all four bytes of w6; sets flags
// Macro h264_loop_filter_start_intra_10: broadcasts the thresholds for the
// *_neon_10 intra filters: v30 = alpha (from w2), v31 = beta (from w3), one
// copy per 16-bit lane.
// NOTE(review): this listing has gaps (no .endm visible); lines not shown are
// not described.
839 .macro h264_loop_filter_start_intra_10
846 dup v30.8h, w2 // alpha
847 dup v31.8h, w3 // beta
// Macro h264_loop_filter_chroma_10: chroma edge filter on 8 lanes of 16-bit
// samples. Register roles, as given by the inline comments below:
//   v16 = p0, v18 = p1, v0 = q0, v2 = q1, w2 = alpha, w3 = beta, v24 = tc0.
// The filtered p0/q0 end up in v16/v0, clamped to the bounds in v4 and v5.
// NOTE(review): this listing has gaps (the embedded line numbers jump and no
// .endm is visible); the initial value of v4.8h before the first sub, and the
// setup of v25 and v5, are on lines that are not shown.
850 .macro h264_loop_filter_chroma_10
851 dup v22.8h, w2 // alpha
852 dup v23.8h, w3 // beta
853 uxtl v24.8h, v24.8b // tc0
855 uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0)
856 uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0)
857 uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0)
858 cmhi v26.8h, v22.8h, v26.8h // < alpha
859 cmhi v28.8h, v23.8h, v28.8h // < beta
860 cmhi v30.8h, v23.8h, v30.8h // < beta
862 and v26.16b, v26.16b, v28.16b
864 sub v4.8h, v4.8h, v16.8h // -= p0
865 and v26.16b, v26.16b, v30.16b // v26 = all three threshold tests pass
869 sli v24.8h, v24.8h, #8 // insert (v24 << 8) above the low byte of each halfword
871 add v4.8h, v4.8h, v18.8h // += p1
873 shl v24.8h, v24.8h, #2
877 movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1
878 uqsub v24.8h, v24.8h, v31.8h // v24 -= 3, saturating at 0
879 sub v4.8h, v4.8h, v2.8h // -= q1
880 srshr v4.8h, v4.8h, #3 // signed rounding >> 3
881 smin v4.8h, v4.8h, v24.8h // signed min against v24
883 smax v4.8h, v4.8h, v25.8h // signed max against v25
884 and v4.16b, v4.16b, v26.16b // zero v4 in lanes that failed the tests
885 add v16.8h, v16.8h, v4.8h // p0 += v4
886 sub v0.8h, v0.8h, v4.8h // q0 -= v4
888 mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
890 smin v0.8h, v0.8h, v4.8h // q0 = min(q0, 1023)
891 smin v16.8h, v16.8h, v4.8h // p0 = min(p0, 1023)
892 smax v0.8h, v0.8h, v5.8h // q0 = max(q0, v5)
893 smax v16.8h, v16.8h, v5.8h // p0 = max(p0, v5)
// ff_h264_v_loop_filter_chroma_neon_10: loads rows of eight 16-bit samples
// through two pointers (x0 and x10, both post-incremented by x1), runs
// h264_loop_filter_chroma_10 and stores v16 and v0 back.
// NOTE(review): this listing has gaps (no ret/endfunc visible); the
// instruction that initialises x10 is not shown.
896 function ff_h264_v_loop_filter_chroma_neon_10, export=1
897 h264_loop_filter_start_10
900 sub x0, x0, x1, lsl #1 // step back two rows
901 ld1 {v18.8h}, [x0 ], x1
902 ld1 {v0.8h}, [x10], x1
903 ld1 {v16.8h}, [x0 ], x1
906 h264_loop_filter_chroma_10
908 sub x0, x10, x1, lsl #1 // x0 = x10 - 2 rows
909 st1 {v16.8h}, [x0], x1
910 st1 {v0.8h}, [x0], x1
// ff_h264_h_loop_filter_chroma_neon_10: loads four 16-bit samples (8 bytes)
// from each of 8 rows, transposes, runs h264_loop_filter_chroma_10,
// transposes back and stores 8 bytes to each of the same 8 rows. x1 is the
// row increment. The label h_loop_filter_chroma420_10 is also a branch target
// of ff_h264_h_loop_filter_chroma422_neon_10. transpose_4x8H is defined
// outside this listing (presumably in the included asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); lines not
// shown are not described.
915 function ff_h264_h_loop_filter_chroma_neon_10, export=1
916 h264_loop_filter_start_10
918 sub x0, x0, #4 // access the 2nd left pixel
919 h_loop_filter_chroma420_10:
920 add x10, x0, x1, lsl #2 // x10 = x0 + 4 rows
// x0 walks rows 0..3 into the low halves, x10 walks rows 4..7 into the high halves.
921 ld1 {v18.d}[0], [x0 ], x1
922 ld1 {v18.d}[1], [x10], x1
923 ld1 {v16.d}[0], [x0 ], x1
924 ld1 {v16.d}[1], [x10], x1
925 ld1 {v0.d}[0], [x0 ], x1
926 ld1 {v0.d}[1], [x10], x1
927 ld1 {v2.d}[0], [x0 ], x1
928 ld1 {v2.d}[1], [x10], x1
930 transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
932 h264_loop_filter_chroma_10
934 transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
936 sub x0, x10, x1, lsl #3 // x10 is 8 rows past the first row; rewind to it
937 st1 {v18.d}[0], [x0], x1
938 st1 {v16.d}[0], [x0], x1
939 st1 {v0.d}[0], [x0], x1
940 st1 {v2.d}[0], [x0], x1
941 st1 {v18.d}[1], [x0], x1
942 st1 {v16.d}[1], [x0], x1
943 st1 {v0.d}[1], [x0], x1
944 st1 {v2.d}[1], [x0], x1
// ff_h264_h_loop_filter_chroma422_neon_10: runs the shared
// h_loop_filter_chroma420_10 body twice - first as a call (bl), then as a
// tail branch (b).
// NOTE(review): this listing has gaps; bl overwrites x30, and the code that
// preserves the return address and sets up the second pass is not visible.
949 function ff_h264_h_loop_filter_chroma422_neon_10, export=1
950 h264_loop_filter_start_10
955 bl h_loop_filter_chroma420_10 // first pass, returns here
959 b h_loop_filter_chroma420_10 // second pass, tail branch
// Macro h264_loop_filter_chroma_intra_10: intra chroma edge filter on 8 lanes
// of 16-bit samples. Register roles, as given by the inline comments below:
//   v16 = p0, v17 = q0, v18 = p1, v19 = q1, v30 = alpha, v31 = beta.
// Writes the filtered p0/q0 into v16/v17 in lanes where all three threshold
// tests pass; other lanes are left unchanged.
// NOTE(review): this listing has gaps (no .endm visible); lines not shown are
// not described.
962 .macro h264_loop_filter_chroma_intra_10
963 uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0)
964 uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0)
965 uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0)
966 cmhi v26.8h, v30.8h, v26.8h // < alpha
967 cmhi v27.8h, v31.8h, v27.8h // < beta
968 cmhi v28.8h, v31.8h, v28.8h // < beta
969 and v26.16b, v26.16b, v27.16b
970 and v26.16b, v26.16b, v28.16b // v26 = all three threshold tests pass
974 shl v4.8h, v18.8h, #1 // 2 * p1
975 shl v6.8h, v19.8h, #1 // 2 * q1
980 add v20.8h, v16.8h, v19.8h // p0 + q1
981 add v22.8h, v17.8h, v18.8h // q0 + p1
982 add v20.8h, v20.8h, v4.8h // 2*p1 + p0 + q1
983 add v22.8h, v22.8h, v6.8h // 2*q1 + q0 + p1
984 urshr v24.8h, v20.8h, #2 // unsigned rounding >> 2
985 urshr v25.8h, v22.8h, #2 // unsigned rounding >> 2
986 bit v16.16b, v24.16b, v26.16b // p0 = v24 where the mask is set
987 bit v17.16b, v25.16b, v26.16b // q0 = v25 where the mask is set
// ff_h264_v_loop_filter_chroma_intra_neon_10: loads rows of eight 16-bit
// samples through two pointers (x0 and x9, both post-incremented by x1), runs
// h264_loop_filter_chroma_intra_10 and stores v16 (p0) and v17 (q0) back.
// NOTE(review): this listing has gaps (no ret/endfunc visible); the
// instruction that initialises x9 and the load of v19 (read by the macro as
// q1) are not shown.
990 function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
991 h264_loop_filter_start_intra_10
993 sub x0, x0, x1, lsl #1 // step back two rows
994 ld1 {v18.8h}, [x0], x1 // p1
995 ld1 {v17.8h}, [x9], x1 // q0
996 ld1 {v16.8h}, [x0], x1 // p0
999 h264_loop_filter_chroma_intra_10
1001 sub x0, x9, x1, lsl #1 // x0 = x9 - 2 rows
1002 st1 {v16.8h}, [x0], x1
1003 st1 {v17.8h}, [x0], x1
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10: loads eight 16-bit
// samples from each of 4 rows (x4 for rows 0-1, x9 for rows 2-3), transposes,
// runs h264_loop_filter_chroma_intra_10, then writes one (v16, v17) halfword
// pair per row via x0 for 4 rows. x1 is the row increment. transpose_4x8H is
// defined outside this listing (presumably in asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); the code that
// derives x4 and adjusts x0 is not shown.
1009 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
1010 h264_loop_filter_start_intra_10
1014 add x9, x4, x1, lsl #1 // x9 = x4 + 2 rows
1015 ld1 {v18.8h}, [x4], x1 // row 0
1016 ld1 {v17.8h}, [x9], x1 // row 2
1017 ld1 {v16.8h}, [x4], x1 // row 1
1018 ld1 {v19.8h}, [x9], x1 // row 3
1020 transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
1022 h264_loop_filter_chroma_intra_10
// st2 writes halfword n of v16 followed by halfword n of v17 (4 bytes per row).
1024 st2 {v16.h,v17.h}[0], [x0], x1
1025 st2 {v16.h,v17.h}[1], [x0], x1
1026 st2 {v16.h,v17.h}[2], [x0], x1
1027 st2 {v16.h,v17.h}[3], [x0], x1
// ff_h264_h_loop_filter_chroma_intra_neon_10: loads four 16-bit samples from
// each of 8 rows (x4 for rows 0-3, x9 for rows 4-7), transposes, runs
// h264_loop_filter_chroma_intra_10, then writes one (v16, v17) halfword pair
// per row via x0 for 8 rows. x1 is the row increment.
// The label h_loop_filter_chroma420_intra_10 is also a branch target of
// ff_h264_h_loop_filter_chroma422_intra_neon_10. transpose_4x8H is defined
// outside this listing (presumably in asm.S - confirm).
// NOTE(review): this listing has gaps (no ret/endfunc visible); the code that
// derives x4 and adjusts x0 is not shown.
1033 function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
1034 h264_loop_filter_start_intra_10
1037 h_loop_filter_chroma420_intra_10:
1038 add x9, x4, x1, lsl #2 // x9 = x4 + 4 rows
// x4 rows fill the low 64 bits, x9 rows fill the high 64 bits.
1039 ld1 {v18.4h}, [x4], x1
1040 ld1 {v18.d}[1], [x9], x1
1041 ld1 {v16.4h}, [x4], x1
1042 ld1 {v16.d}[1], [x9], x1
1043 ld1 {v17.4h}, [x4], x1
1044 ld1 {v17.d}[1], [x9], x1
1045 ld1 {v19.4h}, [x4], x1
1046 ld1 {v19.d}[1], [x9], x1
1048 transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
1050 h264_loop_filter_chroma_intra_10
// st2 writes halfword n of v16 followed by halfword n of v17 (4 bytes per row).
1052 st2 {v16.h,v17.h}[0], [x0], x1
1053 st2 {v16.h,v17.h}[1], [x0], x1
1054 st2 {v16.h,v17.h}[2], [x0], x1
1055 st2 {v16.h,v17.h}[3], [x0], x1
1056 st2 {v16.h,v17.h}[4], [x0], x1
1057 st2 {v16.h,v17.h}[5], [x0], x1
1058 st2 {v16.h,v17.h}[6], [x0], x1
1059 st2 {v16.h,v17.h}[7], [x0], x1
// ff_h264_h_loop_filter_chroma422_intra_neon_10: runs the shared
// h_loop_filter_chroma420_intra_10 body twice - first as a call (bl), then as
// a tail branch (b).
// NOTE(review): this listing has gaps; bl overwrites x30, and the code that
// preserves the return address and consumes x5 for the second pass is not
// visible.
1065 function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
1066 h264_loop_filter_start_intra_10
1068 add x5, x0, x1, lsl #3 // x5 = x0 + 8 * x1
1071 bl h_loop_filter_chroma420_intra_10 // first pass, returns here
1075 b h_loop_filter_chroma420_intra_10 // second pass, tail branch