From 63378a94f996304e2784ecd6584e70cf487991e9 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Mon, 9 May 2022 14:39:05 +0800
Subject: [PATCH] loongarch: Reduce the number of instructions

Replace redundant instruction pairs with single equivalents to improve
performance: each __lsx_vsrari_h/__lsx_vpickev_b round-and-pack pair
becomes one saturating __lsx_vssrarni_bu_h, DOTP_CONST_PAIR uses
even/odd widening multiplies instead of zero-padded dot products, and
dead code is removed. Affected files:

1. txfm_macros_lsx.h
2. vpx_convolve8_avg_lsx.c
3. vpx_convolve8_horiz_lsx.c
4. vpx_convolve8_lsx.c
5. vpx_convolve8_vert_lsx.c
6. vpx_convolve_copy_lsx.c
7. vpx_convolve_lsx.h

Also make the ref pointers in sad_lsx.c const-correct.

Bug: webm:1755
Change-Id: I9b7fdf6900338a26f9b1775609ad387648684f3d
---
Note: a standalone sketch of the two rewrite patterns is appended after
the patch.

 vpx_dsp/loongarch/sad_lsx.c                 |   2 +-
 vpx_dsp/loongarch/txfm_macros_lsx.h         |  53 +++---
 vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c   |  49 +++---
 vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 161 ++++++++----------
 vpx_dsp/loongarch/vpx_convolve8_lsx.c       | 110 ++++++-------
 vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c  | 243 ++++++++++++----------------
 vpx_dsp/loongarch/vpx_convolve_copy_lsx.c   |   1 -
 vpx_dsp/loongarch/vpx_convolve_lsx.h        |  25 ++-
 8 files changed, 271 insertions(+), 373 deletions(-)

diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c
index 46ee557..5eaebfb 100644
--- a/vpx_dsp/loongarch/sad_lsx.c
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -198,7 +198,7 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
   int32_t ht_cnt = (height >> 2);
-  uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
   __m128i src0, src1, src2, src3, sad_tmp;
   __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
   __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h
index 977f1c2..bd51483 100644
--- a/vpx_dsp/loongarch/txfm_macros_lsx.h
+++ b/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -13,36 +13,29 @@
 
 #include "vpx_util/loongson_intrinsics.h"
 
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)         \
-  {                                                                   \
-    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m;                       \
-    __m128i k0_m, k1_m, k2_m, k3_m;                                   \
-    __m128i zero = __lsx_vldi(0);                                     \
-                                                                      \
-    k0_m = __lsx_vreplgr2vr_h(cnst0);                                 \
-    k1_m = __lsx_vreplgr2vr_h(cnst1);                                 \
-    k2_m = __lsx_vpackev_h(k1_m, k0_m);                               \
-    k0_m = __lsx_vpackev_h(zero, k0_m);                               \
-    k1_m = __lsx_vpackev_h(k1_m, zero);                               \
-                                                                      \
-    s5_m = __lsx_vilvl_h(reg1, reg0);                                 \
-    s4_m = __lsx_vilvh_h(reg1, reg0);                                 \
-    s3_m = __lsx_vilvl_h(reg0, reg1);                                 \
-    s2_m = __lsx_vilvh_h(reg0, reg1);                                 \
-                                                                      \
-    s1_m = __lsx_vdp2_w_h(s5_m, k0_m);                                \
-    s0_m = __lsx_vdp2_w_h(s4_m, k0_m);                                \
-    k3_m = __lsx_vdp2_w_h(s5_m, k1_m);                                \
-    s1_m = __lsx_vsub_w(s1_m, k3_m);                                  \
-    k3_m = __lsx_vdp2_w_h(s4_m, k1_m);                                \
-    s0_m = __lsx_vsub_w(s0_m, k3_m);                                  \
-                                                                      \
-    out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
-                                                                      \
-    s1_m = __lsx_vdp2_w_h(s3_m, k2_m);                                \
-    s0_m = __lsx_vdp2_w_h(s2_m, k2_m);                                \
-    out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
-  }
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)         \
+  do {                                                                \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m;                       \
+    __m128i k0_m, k1_m, k2_m, k3_m;                                   \
+                                                                      \
+    k0_m = __lsx_vreplgr2vr_h(cnst0);                                 \
+    k1_m = __lsx_vreplgr2vr_h(cnst1);                                 \
+    k2_m = __lsx_vpackev_h(k1_m, k0_m);                               \
+                                                                      \
+    DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m);     \
+    DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m);     \
+                                                                      \
+    DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+    k3_m = __lsx_vmulwod_w_h(s5_m, k1_m);                             \
+    s1_m = __lsx_vsub_w(s1_m, k3_m);                                  \
+    k3_m = __lsx_vmulwod_w_h(s4_m, k1_m);                             \
+    s0_m = __lsx_vsub_w(s0_m, k3_m);                                  \
+                                                                      \
+    out0 = 
__lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } while (0) #define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ do { \ diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 2b98355..54fcd6c 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -278,7 +278,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0, vec1; __m128i dst0, dst1, dst2, dst3; - __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, out; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; int32_t src_stride2 = src_stride << 1; @@ -311,13 +311,12 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( dst1 = __lsx_vilvl_w(dst3, dst2); dst0 = __lsx_vilvl_d(dst1, dst0); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - out = __lsx_vpickev_b(tmp1, tmp0); - out = __lsx_vavgr_bu(out, dst0); - __lsx_vstelm_w(out, dst, 0, 0); - __lsx_vstelm_w(out, dst + dst_stride, 0, 1); - __lsx_vstelm_w(out, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(out, dst + dst_stride3, 0, 3); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); } static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( @@ -386,9 +385,8 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, filt_vt, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, res0, res1); DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); __lsx_vstelm_w(res0, dst, 0, 0); @@ -467,10 +465,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); - - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( @@ -513,8 +510,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -522,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, 
filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -534,7 +529,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( dst3 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); - PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); dst += dst_stride; } } @@ -597,8 +592,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); @@ -606,8 +600,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst1); __lsx_vstx(tmp3, dst, dst_stride); @@ -615,8 +608,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst2); __lsx_vstx(tmp3, dst, dst_stride2); @@ -624,8 +616,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst3); __lsx_vstx(tmp3, dst, dst_stride3); dst += dst_stride4; @@ -642,8 +633,6 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 16; - dst += 16; } static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c index 5d67d65..2c6459a 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -338,8 +338,7 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, mask; - __m128i filt0, vec0, vec1, res0, res1; - __m128i vec2, 
vec3; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride + src_stride2; @@ -355,8 +354,8 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, src3 = __lsx_vldx(src, src_stride3); DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); - DUP2_ARG2(__lsx_vsrari_h, vec2, FILTER_BITS, vec3, FILTER_BITS, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec2, vec2, vec3, vec3, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3, + FILTER_BITS, res0, res1); __lsx_vstelm_w(res0, dst, 0, 0); __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); @@ -367,10 +366,9 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - __m128i vec0, vec1, vec2, vec3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; - __m128i res0, res1, res2, res3; - __m128i vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, filt0; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride + src_stride2; int32_t src_stride4 = src_stride2 << 1; @@ -396,10 +394,10 @@ static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, src7, src6, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, - FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, - res0, res1, res2, res3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + __lsx_vstelm_w(res0, dst, 0, 0); dst += dst_stride; __lsx_vstelm_w(res0, dst, 0, 1); @@ -451,14 +449,13 @@ static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, src0, src1); - - __lsx_vstelm_d(src0, dst, 0, 0); - __lsx_vstelm_d(src0, dst + dst_stride, 0, 1); - __lsx_vstelm_d(src1, dst + dst_stride2, 0, 0); - __lsx_vstelm_d(src1, dst + dst_stride3, 0, 1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1); } static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, @@ -490,15 +487,9 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - - src0 = __lsx_vld(src, 0); - DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); - src3 = 
__lsx_vldx(src, src_stride3); - src += src_stride4; + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); dst += dst_stride; __lsx_vstelm_d(out0, dst, 0, 1); @@ -508,13 +499,17 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, __lsx_vstelm_d(out1, dst, 0, 1); dst += dst_stride; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); dst += dst_stride; @@ -537,27 +532,25 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - - src0 = __lsx_vld(src, 0); - DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); - src3 = __lsx_vldx(src, src_stride3); - src += src_stride4; + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst_tmp1, 0, 0); __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); @@ -582,7 +575,7 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 2) - 1; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; @@ -609,22 +602,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - 
DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0, + out1, out2, out3); - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out0, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out1, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out2, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out3, dst, 0); dst += dst_stride; for (; loop_cnt--;) { @@ -648,22 +636,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out0, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out1, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out2, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out3, dst, 0); dst += dst_stride; } } @@ -674,7 +657,7 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 1); __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; mask = __lsx_vld(mc_filt_mask_arr, 0); @@ -699,21 +682,16 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); - - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 16); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 16); + __lsx_vst(out2, dst, 0); + __lsx_vst(out3, dst, 16); dst += dst_stride; } } @@ -724,7 +702,7 @@ static void 
common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = height; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; mask = __lsx_vld(mc_filt_mask_arr, 0); @@ -749,19 +727,14 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); - - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 16); - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 32); - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 48); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + __lsx_vst(out2, dst, 32); + __lsx_vst(out3, dst, 48); dst += dst_stride; } } diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 894c137..73583ab 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -248,7 +248,7 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, int8_t *filter_horiz, int8_t *filter_vert) { __m128i src0, src1, src2, src3, src4, mask; - __m128i filt_vt, filt_hz, vec0, vec1, res0, res1; + __m128i filt_vt, filt_hz, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; @@ -276,13 +276,13 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - DUP2_ARG2(__lsx_vpickev_b, tmp0, tmp0, tmp1, tmp1, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1, + FILTER_BITS, tmp0, tmp1); - __lsx_vstelm_w(res0, dst, 0, 0); - __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, @@ -290,7 +290,6 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, int8_t *filter_horiz, int8_t *filter_vert) { __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - __m128i res0, res1, res2, res3; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; @@ -331,20 +330,19 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, hz_out4, 
hz_out7, hz_out6, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, filt_vt, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, - FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, - res0, res1, res2, res3); - - __lsx_vstelm_w(res0, dst, 0, 0); - __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4, + vec5, vec6, vec7); + + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1); dst += dst_stride4; - __lsx_vstelm_w(res2, dst, 0, 0); - __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, @@ -364,7 +362,7 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; @@ -401,14 +399,13 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); - __lsx_vstelm_d(out0, dst, 0, 0); - __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); - __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); - __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, @@ -417,9 +414,9 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt = (height >> 3); - __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0; - __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; @@ -449,8 +446,6 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = 
__lsx_vdp2_h_bu(vec0, filt_vt); @@ -463,43 +458,44 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp3, FILTER_BITS, tmp4, FILTER_BITS, tmp3, tmp4); - DUP2_ARG2(__lsx_vpickev_b, tmp2, tmp1, tmp4, tmp3, out0, out1); - __lsx_vstelm_d(out0, dst, 0, 0); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out0, dst, 0, 1); + __lsx_vstelm_d(tmp1, dst, 0, 1); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 0); + __lsx_vstelm_d(tmp2, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 1); + __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp5 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp6 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp7 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp8 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP4_ARG2(__lsx_vsrari_h, tmp5, FILTER_BITS, tmp6, FILTER_BITS, tmp7, - FILTER_BITS, tmp8, FILTER_BITS, tmp5, tmp6, tmp7, tmp8); - DUP2_ARG2(__lsx_vpickev_b, tmp6, tmp5, tmp8, tmp7, out0, out1); - __lsx_vstelm_d(out0, dst, 0, 0); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out0, dst, 0, 1); + __lsx_vstelm_d(tmp1, dst, 0, 1); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 0); + __lsx_vstelm_d(tmp2, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 1); + __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; } } @@ -554,8 +550,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -563,8 +558,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -572,8 +566,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -581,8 +574,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; } @@ -599,8 +591,6 @@ static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 16; - dst += 16; } static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c index c0bb10f..7e3a95b 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -361,13 +361,12 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4; - __m128i src10_l, src32_l, src21_l, src43_l, src2110, src4332; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; __m128i filt0, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; @@ -378,37 +377,33 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src1, src2, src3, src4); src += (src_stride4 + src_stride); - DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, - src10_l, src21_l, src32_l, src43_l); - DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src2110, - src4332); - DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - src2110 = __lsx_vpickev_b(tmp1, tmp0); - - __lsx_vstelm_w(src2110, dst, 0, 0); - __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); - __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); } static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; - __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; - __m128i src65_l, src87_l, src2110, src4332, src6554, src8776; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i vec6, vec7, vec8, vec9, vec10, vec11; __m128i tmp0, tmp1, tmp2, tmp3; 
__m128i filt0; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; int32_t dst_stride4 = dst_stride2 << 1; - uint8_t *dst_tmp1 = dst + dst_stride4; filt0 = __lsx_vldrepl_h(filter, 0); @@ -420,27 +415,27 @@ static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src5, src6, src7, src8); src += (src_stride4 + src_stride); - DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, - src10_l, src21_l, src32_l, src43_l); - DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, - src54_l, src65_l, src76_l, src87_l); - DUP4_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, - src87_l, src76_l, src2110, src4332, src6554, src8776); - DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, - src8776, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, src2110, src4332); - - __lsx_vstelm_w(src2110, dst, 0, 0); - __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); - __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); - - __lsx_vstelm_w(src4332, dst_tmp1, 0, 0); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride, 0, 1); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride2, 0, 2); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride3, 0, 3); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4, + vec5, vec6, vec7); + DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8, + vec9, vec10, vec11); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3); } static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, @@ -457,17 +452,14 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; - __m128i out0, out1; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - /* rearranging filter_y */ filt0 = __lsx_vldrepl_h(filter, 0); src0 = __lsx_vld(src, 0); @@ -478,9 +470,8 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, 
tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); @@ -494,13 +485,11 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 3); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i out0, out1; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; int32_t dst_stride4 = dst_stride2 << 1; @@ -525,9 +514,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, vec4, vec5, vec6, vec7); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); @@ -536,9 +525,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); @@ -559,29 +548,17 @@ static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, } } -static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int y0_q4, - int y_step_q4, int w, int height) { +static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { uint32_t loop_cnt = (height >> 2); - __m128i src0, src1, src2, src3, src4; + __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - const int16_t *const filter_y = filter[y0_q4]; - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 8; cnt--;) { - filt_ver[cnt] = filter_y[cnt]; - } - - filt0 = __lsx_vldrepl_h(&filt_ver[3], 0); + filt0 = __lsx_vldrepl_h(filter, 0); src0 = __lsx_vld(src, 0); src += src_stride; @@ -595,29 +572,25 @@ static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, 
vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; src0 = src4; @@ -630,20 +603,18 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 2); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + __m128i tmp, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - uint8_t *src_tmp; + filt0 = __lsx_vldrepl_h(filter, 0); - src0 = __lsx_vld(src, 0); - src5 = __lsx_vld(src, 16); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); src += src_stride; src_tmp = src + 16; @@ -658,53 +629,45 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, src_tmp += src_stride4; DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vstx(tmp4, dst, dst_stride); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride); DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vstx(tmp4, dst, dst_stride2); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride2); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - 
DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vstx(tmp4, dst, dst_stride3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride3); DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 16); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); dst += dst_stride; @@ -719,7 +682,7 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 1); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t dst_stride2 = dst_stride << 1; @@ -743,49 +706,41 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp = __lsx_vpickev_b(tmp1, tmp0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 0); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp = __lsx_vpickev_b(tmp3, tmp2); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 0); DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); - DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); - tmp = __lsx_vpickev_b(tmp5, tmp4); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, 
filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); - DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); - tmp = __lsx_vpickev_b(tmp7, tmp6); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 16); DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp = __lsx_vpickev_b(tmp1, tmp0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 32); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp = __lsx_vpickev_b(tmp3, tmp2); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 32); DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); - DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); - tmp = __lsx_vpickev_b(tmp5, tmp4); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 48); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); - DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); - tmp = __lsx_vpickev_b(tmp7, tmp6); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 48); dst += dst_stride2; dst_tmp1 += dst_stride2; @@ -823,8 +778,8 @@ void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; case 16: - common_vt_2t_16w_lsx(src, src_stride, dst, dst_stride, filter, y0_q4, - y_step_q4, w, h); + common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); break; case 32: common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c index 398788a..53dc709 100644 --- a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -15,7 +15,6 @@ static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t cnt; - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; __m128i src0, src1, src2, src3, src4, src5, src6, src7; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index d319bc4..2428407 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -125,19 +125,18 @@ tmp1_m; \ }) -#define PCKEV_AVG_ST4_D(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ - { \ - __m128i tmp0_m, tmp1_m; \ - \ - DUP2_ARG2(__lsx_vpickev_b, in1, in0, in3, in2, tmp0_m, tmp1_m); \ - DUP2_ARG2(__lsx_vavgr_bu, tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ - __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ - pdst += stride; \ - __lsx_vstelm_d(tmp0_m, 
pdst, 0, 1); \
+    pdst += stride;                                                    \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 0);                                \
+    pdst += stride;                                                    \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 1);                                \
   }
 
 #endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
-- 
2.7.4
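
Appendix: illustrative sketches (not part of the commit).

The dominant rewrite in this patch folds a two-instruction
round-shift-and-pack into one saturating narrowing instruction. Below is
a minimal standalone sketch of the before/after pattern; the helper
names are illustrative only, and FILTER_BITS == 7 is assumed to match
vpx_dsp/vpx_filter.h.

/* Build for LoongArch with LSX, e.g. gcc -mlsx -c sketch.c */
#include <lsxintrin.h>

#define FILTER_BITS 7 /* assumed to match vpx_dsp/vpx_filter.h */

/* Before: round-shift each vector of eight halfwords, then pick the
 * even bytes to narrow the two vectors into sixteen bytes. */
__m128i round_pack_old(__m128i tmp0, __m128i tmp1) {
  tmp0 = __lsx_vsrari_h(tmp0, FILTER_BITS);
  tmp1 = __lsx_vsrari_h(tmp1, FILTER_BITS);
  return __lsx_vpickev_b(tmp1, tmp0);
}

/* After: vssrarni.bu.h rounds, shifts, saturates to [0, 255] and
 * narrows in one instruction. The 2-tap filter sums shifted here
 * already fit in a byte, so the added saturation is a no-op. */
__m128i round_pack_new(__m128i tmp0, __m128i tmp1) {
  return __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
}

This is why nearly every __lsx_vsrari_h plus __lsx_vpickev_b pair in
the convolve files above collapses into a single __lsx_vssrarni_bu_h
call.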
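
A second pattern appears in DOTP_CONST_PAIR: the old macro zero-padded
each replicated constant with __lsx_vpackev_h so that a dot product
summed c * x + 0 * y per lane pair; the even/odd widening multiplies
make that setup unnecessary. Continuing the sketch above (pair_mul_old
and pair_mul_new are again illustrative names):

/* s holds interleaved halfwords, even lanes from reg0 and odd lanes
 * from reg1, as produced by __lsx_vilvl_h(reg1, reg0) in the macro;
 * k0 and k1 hold the replicated constants cnst0 and cnst1. */

/* Before: a zero register and two packs feed two dot products. */
__m128i pair_mul_old(__m128i s, __m128i k0, __m128i k1) {
  __m128i zero = __lsx_vldi(0);
  __m128i k0_ev = __lsx_vpackev_h(zero, k0); /* cnst0 in even lanes */
  __m128i k1_od = __lsx_vpackev_h(k1, zero); /* cnst1 in odd lanes */
  __m128i a = __lsx_vdp2_w_h(s, k0_ev); /* reg0 * cnst0 + 0 */
  __m128i b = __lsx_vdp2_w_h(s, k1_od); /* 0 + reg1 * cnst1 */
  return __lsx_vsub_w(a, b);
}

/* After: multiply even/odd lanes directly; the padded constants and
 * the zero register disappear. */
__m128i pair_mul_new(__m128i s, __m128i k0, __m128i k1) {
  __m128i a = __lsx_vmulwev_w_h(s, k0); /* reg0 * cnst0, widened */
  __m128i b = __lsx_vmulwod_w_h(s, k1); /* reg1 * cnst1, widened */
  return __lsx_vsub_w(a, b);
}

Both variants yield reg0 * cnst0 - reg1 * cnst1 per 32-bit lane, which
is what the macro computes before its final narrowing step.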