From b1ed8e08a21b33c0f5039559113004bee7943dc4 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Thu, 7 Apr 2022 17:51:51 +0800
Subject: [PATCH] vp9[loongarch]: Optimize sad64x64/32x32_avg,comp_avg_pred

1. vpx_sad64x64_avg_lsx
2. vpx_sad32x32_avg_lsx
3. comp_avg_pred_lsx

Bug: webm:1755
Change-Id: I58dabdcdd4265bd6ebd5670db8a132d2e838683f
---
 test/comp_avg_pred_test.cc       |   5 ++
 test/sad_test.cc                 |   6 ++
 vpx_dsp/loongarch/avg_pred_lsx.c |  83 ++++++++++++++++++
 vpx_dsp/loongarch/sad_lsx.c      | 180 ++++++++++++++++++++++++++++++++++++++-
 vpx_dsp/vpx_dsp.mk               |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl     |   6 +-
 6 files changed, 274 insertions(+), 7 deletions(-)
 create mode 100644 vpx_dsp/loongarch/avg_pred_lsx.c

diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index b9201a2..3977a2d 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -183,4 +183,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest,
 INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest,
                          ::testing::Values(&vpx_comp_avg_pred_vsx));
 #endif  // HAVE_VSX
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest,
+                         ::testing::Values(&vpx_comp_avg_pred_lsx));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
index e4952ba..12a6206 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1139,6 +1139,12 @@ const SadMxNParam lsx_tests[] = {
 };
 INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests));
 
+const SadMxNAvgParam avg_lsx_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests));
+
 const SadMxNx4Param x4d_lsx_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx),
   SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx),
diff --git a/vpx_dsp/loongarch/avg_pred_lsx.c b/vpx_dsp/loongarch/avg_pred_lsx.c
new file mode 100644
index 0000000..4826260
--- /dev/null
+++ b/vpx_dsp/loongarch/avg_pred_lsx.c
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  // width > 8 || width == 8 || width == 4
+  if (width > 8) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        __m128i p, r, avg;
+
+        p = __lsx_vld(pred + j, 0);
+        r = __lsx_vld(ref + j, 0);
+        avg = __lsx_vavgr_bu(p, r);
+        __lsx_vst(avg, comp_pred + j, 0);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    int i = height * width;
+    do {
+      __m128i p, r, r_0, r_1;
+
+      p = __lsx_vld(pred, 0);
+      r_0 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r_1 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r = __lsx_vilvl_d(r_1, r_0);
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+
+      pred += 16;
+      comp_pred += 16;
+      i -= 16;
+    } while (i);
+  } else {  // width = 4
+    int i = height * width;
+    assert(width == 4);
+    do {
+      __m128i p, r, r_0, r_1, r_2, r_3;
+      p = __lsx_vld(pred, 0);
+
+      if (width == ref_stride) {
+        r = __lsx_vld(ref, 0);
+        ref += 16;
+      } else {
+        r_0 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_1 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_2 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_3 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+        r = __lsx_vilvl_d(r_2, r_0);
+      }
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+      comp_pred += 16;
+      pred += 16;
+      i -= 16;
+    } while (i);
+  }
+}
diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c
index cd3f2d4..30464b3 100644
--- a/vpx_dsp/loongarch/sad_lsx.c
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -46,6 +46,17 @@
     sum_m;                                       \
   })
 
+#define HADD_SW_S32(in)                          \
+  ({                                             \
+    __m128i res0_m;                              \
+    int32_t sum_m;                               \
+                                                 \
+    res0_m = __lsx_vhaddw_d_w(in, in);           \
+    res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);   \
+    sum_m = __lsx_vpickve2gr_w(res0_m, 0);       \
+    sum_m;                                       \
+  })
+
 static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
                                 const uint8_t *ref, int32_t ref_stride,
                                 int32_t height) {
@@ -355,7 +366,150 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
   sad_array[3] = HADD_UW_U32(sad);
 }
 
-#define VPX_SAD_16xHEIGHT_LSX(height)                                        \
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i comp0, comp1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  uint8_t *src_tmp, *ref_tmp;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    src_tmp = (uint8_t *)src + 16;
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src1 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp, src_stride3);
+    src += src_stride4;
+
+    ref_tmp = (uint8_t *)ref + 16;
+    ref0 = __lsx_vld(ref, 0);
+    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
+    ref6 = __lsx_vldx(ref, ref_stride3);
+    ref1 = __lsx_vld(ref_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
+              ref5);
+    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
+    ref += ref_stride4;
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
+              pred0, pred2, pred4, pred6);
+    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
+              112, pred1, pred3, pred5, pred7);
+    sec_pred += 128;
+
+    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
+    sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
+    sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
+    sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
+    sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
+  __m128i sad, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+
+  return HADD_SW_S32(sad);
+}
+
+#define VPX_SAD_16xHT_LSX(height)                                            \
   uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,  \
                                     const uint8_t *ref, int32_t ref_stride) { \
     return sad_16width_lsx(src, src_stride, ref, ref_stride, height);        \
@@ -394,15 +548,33 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
     sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);    \
   }
 
-#define SAD64 VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64)
+#define VPX_AVGSAD_32xHT_LSX(height)                                         \
+  uint32_t vpx_sad32x##height##_avg_lsx(                                     \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,            \
+      int32_t ref_stride, const uint8_t *second_pred) {                      \
+    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height,      \
+                              second_pred);                                  \
+  }
+
+#define VPX_AVGSAD_64xHT_LSX(height)                                         \
+  uint32_t vpx_sad64x##height##_avg_lsx(                                     \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,            \
+      int32_t ref_stride, const uint8_t *second_pred) {                      \
+    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height,      \
+                              second_pred);                                  \
+  }
+
+#define SAD64                                                                \
+  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_AVGSAD_64xHT_LSX(64)
 
 SAD64
 
-#define SAD32 VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32)
+#define SAD32                                                                \
+  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_AVGSAD_32xHT_LSX(32)
 
 SAD32
 
-#define SAD16 VPX_SAD_16xHEIGHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
+#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
 
 SAD16
 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index efb253c..ddccfc1 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -401,6 +401,7 @@ DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
 DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.c
 DSP_SRCS-$(HAVE_LSX)    += loongarch/sub_pixel_variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/avg_pred_lsx.c
 
 DSP_SRCS-$(HAVE_MMI)    += mips/variance_mmi.c
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 4ad698c..68d4f86 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -836,7 +836,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 }  # CONFIG_VP9_ENCODER
 
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/;
@@ -845,7 +845,7 @@ add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_st
 specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/;
@@ -1147,7 +1147,7 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
 specialize qw/vpx_get4x4sse_cs neon msa vsx/;
 
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  specialize qw/vpx_comp_avg_pred neon sse2 vsx/;
+  specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/;
 
 #
 # Subpixel Variance
-- 
2.7.4
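
For review context: the new vpx_comp_avg_pred_lsx and the avgsad_32width/64width kernels are all built around the rounded byte average that __lsx_vavgr_bu computes per lane, (a + b + 1) >> 1, with the packed pred/second_pred buffers advancing by width per row as the loops above do. A minimal scalar sketch of that behavior follows; it is illustrative only, not code from this patch or from libvpx, and the helper names comp_avg_pred_c_ref and avg_sad_c_ref are hypothetical.

/*
 * Scalar sketch (illustrative, not part of the patch): the per-pixel math
 * the LSX kernels vectorize.
 */
#include <stdint.h>
#include <stdlib.h>

/* Rounded average of pred and ref written to a packed buffer
 * (stride == width); each byte matches __lsx_vavgr_bu. */
static void comp_avg_pred_c_ref(uint8_t *comp_pred, const uint8_t *pred,
                                int width, int height, const uint8_t *ref,
                                int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

/* SAD between src and the rounded average of ref and a packed
 * second_pred (stride == width), i.e. the value the sadMxN_avg
 * wrappers above return. */
static uint32_t avg_sad_c_ref(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride, int width,
                              int height, const uint8_t *second_pred) {
  uint32_t sad = 0;
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int avg = (ref[j] + second_pred[j] + 1) >> 1;
      sad += (uint32_t)abs(src[j] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;
  }
  return sad;
}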