From a2a13cbe5f330d32ea702c034819cb213c8ca1ee Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 May 2015 20:17:27 -0700 Subject: [PATCH] vp9_reconintra_neon: add DC 16x16 predictors 85-89% faster over 20M pixels Change-Id: I9b320ed6b9e67f27df738b84c8b43b65a93c50c2 --- test/test_intra_pred_speed.cc | 9 ++-- vp9/common/arm/neon/vp9_reconintra_neon.c | 76 +++++++++++++++++++++++++++++++ vp9/common/vp9_rtcd_defs.pl | 8 ++-- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index d10c8ec..a684ea4 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -291,9 +291,12 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred16, vp9_dc_predictor_16x16_dspr2, NULL, #endif // HAVE_DSPR2 #if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred16, NULL, NULL, NULL, NULL, - vp9_v_predictor_16x16_neon, vp9_h_predictor_16x16_neon, NULL, - NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_16x16_neon) +INTRA_PRED_TEST(NEON, TestIntraPred16, vp9_dc_predictor_16x16_neon, + vp9_dc_left_predictor_16x16_neon, + vp9_dc_top_predictor_16x16_neon, + vp9_dc_128_predictor_16x16_neon, vp9_v_predictor_16x16_neon, + vp9_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL, + vp9_tm_predictor_16x16_neon) #endif // HAVE_NEON // ----------------------------------------------------------------------------- diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c index 82d11d6..387439f 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.c +++ b/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -85,6 +85,82 @@ void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, dc_8x8(dst, stride, NULL, NULL, 0, 0); } +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * stride, dc); + } + } +} + +void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} + +void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); +} + +void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + #if !HAVE_NEON_ASM void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index d4c9070..5cb1cd5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -162,16 +162,16 @@ add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; +specialize qw/vp9_dc_predictor_16x16 dspr2 neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc"; +specialize qw/vp9_dc_top_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_16x16/, "$sse2_x86inc"; +specialize qw/vp9_dc_left_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc"; +specialize qw/vp9_dc_128_predictor_16x16 neon/, "$sse2_x86inc"; add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc"; -- 2.7.4