--- /dev/null
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro h264_loop_filter_start
+ cmp w2, #0
+ ldr w6, [x4]
+ ccmp w3, #0, #0, ne
+ mov v24.S[0], w6
+ and w6, w6, w6, lsl #16
+ b.eq 1f
+ ands w6, w6, w6, lsl #8
+ b.ge 2f
+1:
+ ret
+2:
+.endm
+
+.macro h264_loop_filter_luma
+ dup v22.16B, w2 // alpha
+ uxtl v24.8H, v24.8B
+ uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
+ uxtl v24.4S, v24.4H
+ uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
+ sli v24.8H, v24.8H, #8
+ uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
+ sli v24.4S, v24.4S, #16
+ cmhi v21.16B, v22.16B, v21.16B // < alpha
+ dup v22.16B, w3 // beta
+ cmlt v23.16B, v24.16B, #0
+ cmhi v28.16B, v22.16B, v28.16B // < beta
+ cmhi v30.16B, v22.16B, v30.16B // < beta
+ bic v21.16B, v21.16B, v23.16B
+ uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
+ and v21.16B, v21.16B, v28.16B
+ uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
+ cmhi v17.16B, v22.16B, v17.16B // < beta
+ and v21.16B, v21.16B, v30.16B
+ cmhi v19.16B, v22.16B, v19.16B // < beta
+ and v17.16B, v17.16B, v21.16B
+ and v19.16B, v19.16B, v21.16B
+ and v24.16B, v24.16B, v21.16B
+ urhadd v28.16B, v16.16B, v0.16B
+ sub v21.16B, v24.16B, v17.16B
+ uqadd v23.16B, v18.16B, v24.16B
+ uhadd v20.16B, v20.16B, v28.16B
+ sub v21.16B, v21.16B, v19.16B
+ uhadd v28.16B, v4.16B, v28.16B
+ umin v23.16B, v23.16B, v20.16B
+ uqsub v22.16B, v18.16B, v24.16B
+ uqadd v4.16B, v2.16B, v24.16B
+ umax v23.16B, v23.16B, v22.16B
+ uqsub v22.16B, v2.16B, v24.16B
+ umin v28.16B, v4.16B, v28.16B
+ uxtl v4.8H, v0.8B
+ umax v28.16B, v28.16B, v22.16B
+ uxtl2 v20.8H, v0.16B
+ usubw v4.8H, v4.8H, v16.8B
+ usubw2 v20.8H, v20.8H, v16.16B
+ shl v4.8H, v4.8H, #2
+ shl v20.8H, v20.8H, #2
+ uaddw v4.8H, v4.8H, v18.8B
+ uaddw2 v20.8H, v20.8H, v18.16B
+ usubw v4.8H, v4.8H, v2.8B
+ usubw2 v20.8H, v20.8H, v2.16B
+ rshrn v4.8B, v4.8H, #3
+ rshrn2 v4.16B, v20.8H, #3
+ bsl v17.16B, v23.16B, v18.16B
+ bsl v19.16B, v28.16B, v2.16B
+ neg v23.16B, v21.16B
+ uxtl v28.8H, v16.8B
+ smin v4.16B, v4.16B, v21.16B
+ uxtl2 v21.8H, v16.16B
+ smax v4.16B, v4.16B, v23.16B
+ uxtl v22.8H, v0.8B
+ uxtl2 v24.8H, v0.16B
+ saddw v28.8H, v28.8H, v4.8B
+ saddw2 v21.8H, v21.8H, v4.16B
+ ssubw v22.8H, v22.8H, v4.8B
+ ssubw2 v24.8H, v24.8H, v4.16B
+ sqxtun v16.8B, v28.8H
+ sqxtun2 v16.16B, v21.8H
+ sqxtun v0.8B, v22.8H
+ sqxtun2 v0.16B, v24.8H
+.endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+ sxtw x1, w1
+
+ ld1 {v0.16B}, [x0], x1
+ ld1 {v2.16B}, [x0], x1
+ ld1 {v4.16B}, [x0], x1
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1
+ ld1 {v20.16B}, [x0], x1
+ ld1 {v18.16B}, [x0], x1
+ ld1 {v16.16B}, [x0], x1
+
+ h264_loop_filter_luma
+
+ sub x0, x0, x1, lsl #1
+ st1 {v17.16B}, [x0], x1
+ st1 {v16.16B}, [x0], x1
+ st1 {v0.16B}, [x0], x1
+ st1 {v19.16B}, [x0]
+
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, #4
+ ld1 {v6.8B}, [x0], x1
+ ld1 {v20.8B}, [x0], x1
+ ld1 {v18.8B}, [x0], x1
+ ld1 {v16.8B}, [x0], x1
+ ld1 {v0.8B}, [x0], x1
+ ld1 {v2.8B}, [x0], x1
+ ld1 {v4.8B}, [x0], x1
+ ld1 {v26.8B}, [x0], x1
+ ld1 {v6.D}[1], [x0], x1
+ ld1 {v20.D}[1], [x0], x1
+ ld1 {v18.D}[1], [x0], x1
+ ld1 {v16.D}[1], [x0], x1
+ ld1 {v0.D}[1], [x0], x1
+ ld1 {v2.D}[1], [x0], x1
+ ld1 {v4.D}[1], [x0], x1
+ ld1 {v26.D}[1], [x0], x1
+
+ transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+ h264_loop_filter_luma
+
+ transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
+
+ sub x0, x0, x1, lsl #4
+ add x0, x0, #2
+ st1 {v17.S}[0], [x0], x1
+ st1 {v16.S}[0], [x0], x1
+ st1 {v0.S}[0], [x0], x1
+ st1 {v19.S}[0], [x0], x1
+ st1 {v17.S}[1], [x0], x1
+ st1 {v16.S}[1], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ st1 {v19.S}[1], [x0], x1
+ st1 {v17.S}[2], [x0], x1
+ st1 {v16.S}[2], [x0], x1
+ st1 {v0.S}[2], [x0], x1
+ st1 {v19.S}[2], [x0], x1
+ st1 {v17.S}[3], [x0], x1
+ st1 {v16.S}[3], [x0], x1
+ st1 {v0.S}[3], [x0], x1
+ st1 {v19.S}[3], [x0], x1
+
+ ret
+endfunc
+
+.macro h264_loop_filter_chroma
+ dup v22.8B, w2 // alpha
+ uxtl v24.8H, v24.8B
+ uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
+ uxtl v4.8H, v0.8B
+ uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
+ usubw v4.8H, v4.8H, v16.8B
+ sli v24.8H, v24.8H, #8
+ shl v4.8H, v4.8H, #2
+ uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
+ uaddw v4.8H, v4.8H, v18.8B
+ cmhi v26.8B, v22.8B, v26.8B // < alpha
+ usubw v4.8H, v4.8H, v2.8B
+ dup v22.8B, w3 // beta
+ rshrn v4.8B, v4.8H, #3
+ cmhi v28.8B, v22.8B, v28.8B // < beta
+ cmhi v30.8B, v22.8B, v30.8B // < beta
+ smin v4.8B, v4.8B, v24.8B
+ neg v25.8B, v24.8B
+ and v26.8B, v26.8B, v28.8B
+ smax v4.8B, v4.8B, v25.8B
+ and v26.8B, v26.8B, v30.8B
+ uxtl v22.8H, v0.8B
+ and v4.8B, v4.8B, v26.8B
+ uxtl v28.8H, v16.8B
+ saddw v28.8H, v28.8H, v4.8B
+ ssubw v22.8H, v22.8H, v4.8B
+ sqxtun v16.8B, v28.8H
+ sqxtun v0.8B, v22.8H
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, x1, lsl #1
+ ld1 {v18.8B}, [x0], x1
+ ld1 {v16.8B}, [x0], x1
+ ld1 {v0.8B}, [x0], x1
+ ld1 {v2.8B}, [x0]
+
+ h264_loop_filter_chroma
+
+ sub x0, x0, x1, lsl #1
+ st1 {v16.8B}, [x0], x1
+ st1 {v0.8B}, [x0], x1
+
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, #2
+ ld1 {v18.S}[0], [x0], x1
+ ld1 {v16.S}[0], [x0], x1
+ ld1 {v0.S}[0], [x0], x1
+ ld1 {v2.S}[0], [x0], x1
+ ld1 {v18.S}[1], [x0], x1
+ ld1 {v16.S}[1], [x0], x1
+ ld1 {v0.S}[1], [x0], x1
+ ld1 {v2.S}[1], [x0], x1
+
+ transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
+
+ h264_loop_filter_chroma
+
+ transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
+
+ sub x0, x0, x1, lsl #3
+ st1 {v18.S}[0], [x0], x1
+ st1 {v16.S}[0], [x0], x1
+ st1 {v0.S}[0], [x0], x1
+ st1 {v2.S}[0], [x0], x1
+ st1 {v18.S}[1], [x0], x1
+ st1 {v16.S}[1], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ st1 {v2.S}[1], [x0], x1
+
+ ret
+endfunc