#include "vp8/common/loopfilter.h"
#include "vpx_util/loongson_intrinsics.h"
/* Apply the VP8 normal 4-tap loop filter to the two pixels on each side of
 * an edge (p1, p0 | q0, q1), updating all four in place.
 *   mask - per-lane select: lanes whose edge activity is within the limits.
 *   hev  - per-lane "high edge variance" flag: for these lanes only p0/q0
 *          move; for low-variance lanes p1/q1 are also adjusted.
 * Pixels are biased by 0x80 so that saturating *signed* byte arithmetic
 * implements the spec's clamped filter math.
 */
#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev)          \
  {                                                            \
    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;   \
    const __m128i cnst4b = __lsx_vldi(4);                      \
    const __m128i cnst3b = __lsx_vldi(3);                      \
                                                               \
    /* Flip the sign bit: move to the signed 8-bit domain. */  \
    p1_m = __lsx_vxori_b(p1, 0x80);                            \
    p0_m = __lsx_vxori_b(p0, 0x80);                            \
    q0_m = __lsx_vxori_b(q0, 0x80);                            \
    q1_m = __lsx_vxori_b(q1, 0x80);                            \
                                                               \
    /* filt = clamp(p1 - q1), kept only in high-variance lanes. */ \
    filt = __lsx_vssub_b(p1_m, q1_m);                          \
    filt = __lsx_vand_v(filt, hev);                            \
    /* filt = clamp(filt + 3 * (q0 - p0)), masked to active lanes. */ \
    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                     \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                     \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                     \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                     \
    filt = __lsx_vand_v(filt, mask);                           \
    /* t1 = (filt + 4) >> 3, t2 = (filt + 3) >> 3.             \
     * BUGFIX: shift the rounded sums t1/t2 themselves; the    \
     * previous code shifted the un-rounded `filt`, discarding \
     * both rounding offsets and making t1 == t2. */           \
    t1 = __lsx_vsadd_b(filt, cnst4b);                          \
    t1 = __lsx_vsra_b(t1, cnst3b);                             \
    t2 = __lsx_vsadd_b(filt, cnst3b);                          \
    t2 = __lsx_vsra_b(t2, cnst3b);                             \
    /* Inner taps: q0 -= t1, p0 += t2; un-bias back to unsigned. */ \
    q0_m = __lsx_vssub_b(q0_m, t1);                            \
    q0 = __lsx_vxori_b(q0_m, 0x80);                            \
    p0_m = __lsx_vsadd_b(p0_m, t2);                            \
    p0 = __lsx_vxori_b(p0_m, 0x80);                            \
    /* Outer taps get (t1 + 1) >> 1, only where hev is clear. */ \
    filt = __lsx_vsrari_b(t1, 1);                              \
    hev = __lsx_vxori_b(hev, 0xff);                            \
    filt = __lsx_vand_v(filt, hev);                            \
    q1_m = __lsx_vssub_b(q1_m, filt);                          \
    q1 = __lsx_vxori_b(q1_m, 0x80);                            \
    p1_m = __lsx_vsadd_b(p1_m, filt);                          \
    p1 = __lsx_vxori_b(p1_m, 0x80);                            \
  }
+
/* NOTE(review): this macro body looks truncated/garbled — it declares the
 * macroblock-filter temporaries but its only statement stores `in1` through
 * `in1_idx`, none of which are macro parameters (they appear to belong to a
 * different store helper).  The full VP8_MBFILTER implementation seems to
 * have been lost; restore it before any caller expands this macro. */
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
  { \
    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \
  }
+static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ uint8_t *temp_src;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i mask, hev, flat;
+ __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+
+ temp_src = src - pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, p3, p2, p1, p0);
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, q0, q1, q2, q3);
+
+ thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr);
+ thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+ b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr);
+ b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+ limit0 = __lsx_vreplgr2vr_b(*limit0_ptr);
+ limit1 = __lsx_vreplgr2vr_b(*limit1_ptr);
+ limit0 = __lsx_vilvl_d(limit1, limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ __lsx_vstx(p1, src, -pitch_x2);
+ __lsx_vstx(p0, src, -pitch);
+ __lsx_vst(q0, src, 0);
+ __lsx_vstx(q1, src, pitch);
+}
+
/* Vertical (column-edge) VP8 4-tap loop filter for two vertically adjacent
 * 8-row blocks, processed together in one 16-lane pass.  Sixteen rows are
 * loaded starting 4 pixels left of the edge, transposed so each image column
 * becomes a vector register, filtered, and the four modified columns
 * (p1 p0 q0 q1) are transposed back and written 4 bytes per row.
 * Block 0's limits fill the low 8 lanes; block 1's fill the high 8 lanes. */
static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
                                            const uint8_t *b_limit0_ptr,
                                            const uint8_t *limit0_ptr,
                                            const uint8_t *thresh0_ptr,
                                            const uint8_t *b_limit1_ptr,
                                            const uint8_t *limit1_ptr,
                                            const uint8_t *thresh1_ptr) {
  uint8_t *src_tmp0 = src - 4; /* 4 columns left of the filtered edge */
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i mask, hev, flat;
  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  /* Load 16 consecutive rows (block 0 rows 0-7, block 1 rows 8-15). */
  row0 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2);
  row3 = __lsx_vldx(src_tmp0, pitch_x3);
  src_tmp0 += pitch_x4;
  row4 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6);
  row7 = __lsx_vldx(src_tmp0, pitch_x3);
  src_tmp0 += pitch_x4;

  row8 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10);
  row11 = __lsx_vldx(src_tmp0, pitch_x3);
  src_tmp0 += pitch_x4;
  row12 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14);
  row15 = __lsx_vldx(src_tmp0, pitch_x3);

  /* Transpose: after this, byte lane i of p3..q3 holds column data for
     row i (lanes 0-7 = block 0, lanes 8-15 = block 1). */
  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  /* Broadcast per-block scalar limits and pack block0 | block1. */
  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
  thresh0 = __lsx_vilvl_d(thresh1, thresh0);

  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);

  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
  limit0 = __lsx_vilvl_d(limit1, limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  /* Re-interleave the four filtered columns back into per-row 4-byte
     groups: tmp2/tmp3 cover rows 0-7, tmp4/tmp5 rows 8-15. */
  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
  tmp5 = __lsx_vilvh_h(tmp1, tmp0);

  /* Store 4 bytes (p1 p0 q0 q1) per row at columns -2..+1 of the edge. */
  src -= 2;
  __lsx_vstelm_w(tmp2, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp2, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp2, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp2, src, 0, 3);
  src += pitch;

  __lsx_vstelm_w(tmp3, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 3);
  src += pitch;

  __lsx_vstelm_w(tmp4, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 3);
  src += pitch;

  __lsx_vstelm_w(tmp5, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 3);
}
+
+static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) {
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+ thresh = __lsx_vreplgr2vr_b(thresh_in);
+ limit = __lsx_vreplgr2vr_b(limit_in);
+ b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+ DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2,
+ src_u, -pitch, p3_u, p2_u, p1_u, p0_u);
+ q0_u = __lsx_vld(src_u, 0);
+ DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u);
+ q3_u = __lsx_vldx(src_u, pitch_x3);
+
+ DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2,
+ src_v, -pitch, p3_v, p2_v, p1_v, p0_v);
+ q0_v = __lsx_vld(src_v, 0);
+ DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v);
+ q3_v = __lsx_vldx(src_v, pitch_x3);
+
+ /* right 8 element of p3 are u pixel and
+ left 8 element of p3 are v pixel */
+ DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+ p2, p1, p0);
+ DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+ q1, q2, q3);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ __lsx_vstelm_d(p1, src_u, 0, 0);
+ __lsx_vstelm_d(p0, src_u + pitch, 0, 0);
+ __lsx_vstelm_d(q0, src_u + pitch_x2, 0, 0);
+ __lsx_vstelm_d(q1, src_u + pitch_x3, 0, 0);
+
+ __lsx_vstelm_d(p1, src_v, 0, 1);
+ __lsx_vstelm_d(p0, src_v + pitch, 0, 1);
+ __lsx_vstelm_d(q0, src_v + pitch_x2, 0, 1);
+ __lsx_vstelm_d(q1, src_v + pitch_x3, 0, 1);
+}
+
+static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) {
+ uint8_t *src_u_tmp, *src_v_tmp;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ __m128i row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ thresh = __lsx_vreplgr2vr_b(thresh_in);
+ limit = __lsx_vreplgr2vr_b(limit_in);
+ b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+ src_u_tmp = src_u - 4;
+ row0 = __lsx_vld(src_u_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2);
+ row3 = __lsx_vldx(src_u_tmp, pitch_x3);
+ src_u_tmp += pitch_x4;
+ row4 = __lsx_vld(src_u_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6);
+ row7 = __lsx_vldx(src_u_tmp, pitch_x3);
+
+ src_v_tmp = src_v - 4;
+ row8 = __lsx_vld(src_v_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10);
+ row11 = __lsx_vldx(src_v_tmp, pitch_x3);
+ src_v_tmp += pitch_x4;
+ row12 = __lsx_vld(src_v_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14);
+ row15 = __lsx_vldx(src_v_tmp, pitch_x3);
+
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+ tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+ tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+
+ tmp0 = __lsx_vilvl_b(p0, q1);
+ tmp1 = __lsx_vilvl_b(q1, q0);
+ tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+ tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+ src_u_tmp += 2;
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0);
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1);
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3);
+
+ __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0);
+ __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1);
+ __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3);
+
+ src_v_tmp += 2;
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0);
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1);
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3);
+
+ __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0);
+ __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1);
+ __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3);
+}
+
static inline void mbloop_filter_horizontal_edge_y_lsx(
uint8_t *src, int32_t pitch, const uint8_t b_limit_in,
const uint8_t limit_in, const uint8_t thresh_in) {
*lpf_info_ptr->hev_thr);
}
}
+
+void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) {
+ loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ if (src_u) {
+ loop_filter_horizontal_edge_uv_lsx(
+ src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
+ *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+ }
+}
+
+void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) {
+ loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ if (src_u) {
+ loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v,
+ *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ }
+}