From 479443a57082fe7998fda6dbfebb5965dd6998ca Mon Sep 17 00:00:00 2001
From: Luca Barbato
Date: Fri, 7 Apr 2017 14:49:00 +0000
Subject: [PATCH] ppc: tm predictor 16x16

About 10x faster.

Change-Id: I1f5a3752d346459df3b45f92963208bf3e520f06
---
 test/test_intra_pred_speed.cc |  2 +-
 vpx_dsp/ppc/intrapred_vsx.c   | 69 +++++++++++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |  2 +-
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 88b21cf..d011bea 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -318,7 +318,7 @@ INTRA_PRED_TEST(VSX, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 INTRA_PRED_TEST(VSX, TestIntraPred16, NULL, NULL, NULL, NULL,
                 vpx_v_predictor_16x16_vsx, vpx_h_predictor_16x16_vsx, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL)
+                NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_vsx)
 
 INTRA_PRED_TEST(VSX, TestIntraPred32, NULL, NULL, NULL, NULL,
                 vpx_v_predictor_32x32_vsx, vpx_h_predictor_32x32_vsx, NULL,
diff --git a/vpx_dsp/ppc/intrapred_vsx.c b/vpx_dsp/ppc/intrapred_vsx.c
index f0e8d27..cff6c7c 100644
--- a/vpx_dsp/ppc/intrapred_vsx.c
+++ b/vpx_dsp/ppc/intrapred_vsx.c
@@ -226,3 +226,72 @@ void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
   val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 }
+
+static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
+                              int16x8_t ah, int16x8_t al, int16x8_t tl) {  // Emits 8 rows of 16 bytes: row i = packsu(l[i] + {ah, al} - tl).
+  int16x8_t vh, vl, ls;  // vh/vl: the two 8-lane halves of one row; ls: l[i] splatted to every lane.
+
+  ls = vec_splat(l, 0);  // row 0
+  vh = vec_sub(vec_add(ls, ah), tl);  // l[0] + ah - tl
+  vl = vec_sub(vec_add(ls, al), tl);  // l[0] + al - tl
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);  // narrow both halves with unsigned saturation, store 16 bytes
+  dst += stride;  // advance to the next output row
+
+  ls = vec_splat(l, 1);  // row 1
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 2);  // row 2
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 3);  // row 3
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 4);  // row 4
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 5);  // row 5
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 6);  // row 6
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 7);  // row 7 (last row; dst is not advanced afterwards)
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+}
+
+void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // Fills a 16x16 block in two 8-row passes of tm_predictor_16x8.
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));  // above[-1] splatted: the 16-byte load starts one byte before above
+  const uint8x16_t l = vec_vsx_ld(0, left);  // 16 bytes read from left
+  const int16x8_t lh = unpack_to_s16_h(l);  // NOTE(review): unpack_to_s16_{h,l} are defined outside this patch; presumably each widens one 8-byte half to s16 - confirm
+  const int16x8_t ll = unpack_to_s16_l(l);
+  const uint8x16_t a = vec_vsx_ld(0, above);  // 16 bytes read from above
+  const int16x8_t ah = unpack_to_s16_h(a);
+  const int16x8_t al = unpack_to_s16_l(a);
+
+  tm_predictor_16x8(dst, stride, lh, ah, al, tl);  // first 8 rows, driven by lh
+
+  dst += stride * 8;  // skip past the 8 rows just written
+
+  tm_predictor_16x8(dst, stride, ll, ah, al, tl);  // remaining 8 rows, driven by ll
+}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 6237762..5b009cf 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -133,7 +133,7 @@ add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, con
 specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2/;
-- 
2.7.4