From e0c0e65378a8510726b19a49ede8086544f02c78 Mon Sep 17 00:00:00 2001
From: Kaustubh Raste
Date: Thu, 19 Jan 2017 14:44:03 +0530
Subject: [PATCH] Add mips msa vpx hadamard functions

average improvement ~4x-5x

Change-Id: I167132d894c04fa85dda8dde7906ff9c61b3a65d
---
 test/hadamard_test.cc        |  10 +++
 vpx_dsp/mips/avg_msa.c       | 196 +++++++++++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   4 +-
 3 files changed, 208 insertions(+), 2 deletions(-)

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index e771595..317feba 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -162,6 +162,11 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_neon));
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_msa));
+#endif  // HAVE_MSA
+
 class Hadamard16x16Test : public HadamardTestBase {};
 
 TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
@@ -217,4 +222,9 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
 INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_neon));
 #endif  // HAVE_NEON
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_msa));
+#endif  // HAVE_MSA
 }  // namespace
diff --git a/vpx_dsp/mips/avg_msa.c b/vpx_dsp/mips/avg_msa.c
index 52a24ed..48e3cf0 100644
--- a/vpx_dsp/mips/avg_msa.c
+++ b/vpx_dsp/mips/avg_msa.c
@@ -54,3 +54,199 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
 
   return sum_out;
 }
+
+void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) {
+  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
+}
+
+void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) {
+  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+
+  LD_SH2(src, 8, src0, src8);
+  src += src_stride;
+  LD_SH2(src, 8, src1, src9);
+  src += src_stride;
+  LD_SH2(src, 8, src2, src10);
+  src += src_stride;
+  LD_SH2(src, 8, src3, src11);
+  src += src_stride;
+  LD_SH2(src, 8, src4, src12);
+  src += src_stride;
+  LD_SH2(src, 8, src5, src13);
+  src += src_stride;
+  LD_SH2(src, 8, src6, src14);
+  src += src_stride;
+  LD_SH2(src, 8, src7, src15);
+  src += src_stride;
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src11, src4, src5, src6, src7);
+  ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);
+
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+                     src9, src10, src11, src12, src13, src14, src15);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+                     res1, res2, res3, res4, res5, res6, res7);
+
+  LD_SH2(src, 8, src0, src8);
+  src += src_stride;
+  LD_SH2(src, 8, src1, src9);
+  src += src_stride;
+  LD_SH2(src, 8, src2, src10);
+  src += src_stride;
+  LD_SH2(src, 8, src3, src11);
+  src += src_stride;
+
+  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);
+
+  LD_SH2(src, 8, src4, src12);
+  src += src_stride;
+  LD_SH2(src, 8, src5, src13);
+  src += src_stride;
+  LD_SH2(src, 8, src6, src14);
+  src += src_stride;
+  LD_SH2(src, 8, src7, src15);
+  src += src_stride;
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);
+
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+                     src9, src10, src11, src12, src13, src14, src15);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+                     res1, res2, res3, res4, res5, res6, res7);
+  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+  dst += 16;
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+  dst += 16;
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+  dst += 16;
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ee1b292..4c6984f 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -886,10 +886,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   specialize qw/vpx_minmax_8x8 sse2 neon/;
 
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
+  specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_16x16 sse2 neon/;
+  specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
 
   add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
   specialize qw/vpx_satd sse2 neon/;
-- 
2.7.4