From 337b221e00116d1e0557f2c8a245aa91a3d7ff8a Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 17 Jun 2015 18:23:19 -0700
Subject: [PATCH] vp9_reconintra_neon: add d135 4x4

based on webp's RD4()
~50% faster over 20M pixels

Change-Id: Ifcb7bf7f7fc8eabf79d9e3b219ce1be67abc524a
---
Review notes (free-form text placed between the '---' separator and the
diffstat; not part of the commit message):

* NOTE(review): line breaks and indentation in this copy were restored from a
  whitespace-collapsed version of the patch. Confirm the whitespace of the
  context lines against the tree before applying.

* Scope of the change, as visible in the hunks below:
  - test/test_intra_pred_speed.cc: vp9_d135_predictor_4x4_neon replaces one
    NULL entry in the NEON TestIntraPred4 list.
  - vp9/common/arm/neon/vp9_reconintra_neon.c: adds the function itself. In
    this hunk it is inserted immediately before the '#if !HAVE_NEON_ASM' line.
  - vp9/common/vp9_rtcd_defs.pl: adds 'neon' to the specialize line of
    vp9_d135_predictor_4x4.

* How vp9_d135_predictor_4x4_neon(dst, stride, above, left) works:
  1. vld1_u8(above - 1) loads eight bytes starting one byte before 'above'.
     Only lanes 0-4 (named X, A, B, C, D) are used afterwards.
     NOTE(review): the remaining three bytes are still read, so the caller's
     buffer has to make them readable - confirm against the callers.
  2. Four bytes at 'left' (named I, J, K, L) are loaded into lane 0 of a
     zeroed uint32x2_t, byte-reversed inside the 32-bit word (vrev32_u8) and
     OR-ed with the top bytes shifted left by 32 bits. Going by the variable
     names, the result is the 8-byte edge L K J I X A B C from lane 0 upward
     (this picture presumes little-endian lane order - TODO confirm).
  3. 64-bit right shifts by 8 and 16 give the same edge advanced by one and
     by two bytes. D is written back into lane 6 of the two-byte-shifted copy
     because the 32-bit left shift in step 2 dropped it.
  4. avg2 = vrhadd_u8(vhadd_u8(edge + 2, edge), edge + 1): a truncating
     halving add of the two outer taps followed by a rounding halving add
     with the centre tap, which per byte equals (a + 2 * b + c + 2) >> 2.
  5. Row 3 is the low four bytes of avg2; rows 2, 1 and 0 are avg2 shifted
     right by 8, 16 and 24 bits. Each row is written with a single 32-bit
     lane store to dst + row * stride.

* 'left' and every destination row are cast to uint32_t * for the lane
  load/stores. NOTE(review): confirm the alignment expectations for these
  pointers on the targeted toolchains.

 test/test_intra_pred_speed.cc             |  4 ++--
 vp9/common/arm/neon/vp9_reconintra_neon.c | 30 ++++++++++++++++++++++++++++++
 vp9/common/vp9_rtcd_defs.pl               |  2 +-
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 9c3e527..096526a 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -211,8 +211,8 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred4, vp9_dc_predictor_4x4_dspr2, NULL, NULL,
 INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
                 vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
                 vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
-                vp9_h_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                vp9_tm_predictor_4x4_neon)
+                vp9_h_predictor_4x4_neon, NULL, vp9_d135_predictor_4x4_neon,
+                NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
 #endif // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 65a2936..48bce78 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -313,6 +313,36 @@ void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
   dc_32x32(dst, stride, NULL, NULL, 0, 0);
 }
 
+// -----------------------------------------------------------------------------
+
+void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32x2_t zero = vdup_n_u32(0);
+  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
 #if !HAVE_NEON_ASM
 
 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index da7d5fc..d191062 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -72,7 +72,7 @@ add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d117_predictor_4x4/;
 
 add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_4x4/;
+specialize qw/vp9_d135_predictor_4x4 neon/;
 
 add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
-- 
2.7.4