From 65d9ac5b5a3dd1c72c15a1fc5bcc004a43ad4c90 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Apr 2022 21:01:53 +0800 Subject: [PATCH] vp9[loongarch]: Optimize fdct4x4/8x8_lsx 1. vpx_fdct4x4_lsx 2. vpx_fdct8x8_lsx Bug: webm:1755 Change-Id: If283fc08f9bedcbecd2c4052adb210f8fe00d4f0 --- test/dct_test.cc | 6 ++- test/fdct8x8_test.cc | 7 +++ vpx_dsp/loongarch/fwd_txfm_lsx.c | 92 +++++++++++++++++++++++++++++++++++++ vpx_dsp/loongarch/fwd_txfm_lsx.h | 99 ++++++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 5 files changed, 204 insertions(+), 4 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 6178f8e..2182f87 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -587,7 +587,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT, #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH -static const FuncInfo dct_lsx_func_info[2] = { +static const FuncInfo dct_lsx_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, { &fdct_wrapper, &idct_wrapper, 16, 1 }, { &fdct_wrapper, &idct_wrapper, @@ -596,7 +598,7 @@ static const FuncInfo dct_lsx_func_info[2] = { INSTANTIATE_TEST_SUITE_P( LSX, TransDCT, - ::testing::Combine(::testing::Range(0, 2), + ::testing::Combine(::testing::Range(0, 4), ::testing::Values(dct_lsx_func_info), ::testing::Values(0), ::testing::Values(VPX_BITS_8))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 0822666..83d1ff1 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -768,4 +768,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT, &vpx_idct8x8_64_add_vsx, 0, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_lsx, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c index 03f194b..6f2d4d6 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.c +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -11,6 +11,20 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + #if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { @@ -240,6 +254,84 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) { __lsx_vst(in7, output, 240); } +void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2); + in3 = __lsx_vldx(input, src_stride6); + + /* fdct4 
pre-process */ + { + __m128i vec, mask; + __m128i zero = __lsx_vldi(0); + + mask = __lsx_vinsgr2vr_b(zero, 1, 0); + DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, + in3); + vec = __lsx_vseqi_h(in0, 0); + vec = __lsx_vxori_b(vec, 255); + vec = __lsx_vand_v(mask, vec); + in0 = __lsx_vadd_h(in0, vec); + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, + in2); + in3 = __lsx_vldx(input_tmp, src_stride6); + input_tmp += src_stride4; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, + in6); + in7 = __lsx_vldx(input_tmp, src_stride6); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 16); + __lsx_vst(in2, output, 32); + __lsx_vst(in3, output, 48); + __lsx_vst(in4, output, 64); + __lsx_vst(in5, output, 80); + __lsx_vst(in6, output, 96); + __lsx_vst(in7, output, 112); +} + void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index 9ed8102..d04427a 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -14,6 +14,105 @@ #include "vpx_dsp/loongarch/txfm_macros_lsx.h" #include "vpx_dsp/txfm_common.h" +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \ + \ + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \ + cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \ + vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + vec4_m = 
__lsx_vdp2_w_h(vec0_m, cnst0_m); \ + cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \ + cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \ + vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ + vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ + vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ + } + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ + vec1_m, vec2_m, vec3_m); \ + DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \ + vec5_m, vec6_m, vec7_m); \ + DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \ + in3, in0, in1, in2, in3); \ + DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ + in7, in4, in5, in6, in7); \ + } + #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ { \ __m128i tp0_m, tp1_m; \ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1c88dcd..f17fc3b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -573,13 +573,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { 
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 neon sse2 msa/; + specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; + specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64"; add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8_1 sse2 neon msa/; -- 2.7.4
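For reference, here is a minimal scalar sketch of the scaling and rounding that the new vpx_fdct4x4_lsx applies around its two VP9_FDCT4 passes. It is illustrative only and not part of the patch (the helper names below are made up); it restates what the LSX intrinsics in the diff do, which follows the usual vpx_fdct4x4 behavior: scale inputs by 16 with a conditional +1 on the first sample, then round the final coefficients with (x + 1) >> 2.

#include <stdint.h>

/* Pre-process before the first 1-D pass: scale by 16 and, when the first
 * input sample is nonzero, add 1 to it. This is the vslli_h-by-4 /
 * vseqi_h / vxori_b / vand_v / vadd_h sequence in vpx_fdct4x4_lsx. */
static void fdct4x4_preprocess_sketch(const int16_t *input, int stride,
                                      int16_t buf[16]) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      buf[r * 4 + c] = (int16_t)(input[r * stride + c] * 16);
    }
  }
  if (buf[0]) ++buf[0];
}

/* Post-process after the second 1-D pass: round with (x + 1) >> 2, which the
 * LSX code expresses as __lsx_vaddi_hu by 1 followed by __lsx_vsrai_h by 2. */
static void fdct4x4_postprocess_sketch(int16_t coeffs[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    coeffs[i] = (int16_t)((coeffs[i] + 1) >> 2);
  }
}

The 8x8 path is analogous: vpx_fdct8x8_lsx scales its inputs by 4 (vslli_h by 2) and, after the second pass, halves each coefficient with sign-aware rounding via SRLI_AVE_S_4V_H (each value averaged with its own sign bit), i.e. the divide-by-2 used by the C reference.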