From d217c87139a3218d9dc4154782de53b9d0cc1119 Mon Sep 17 00:00:00 2001
From: Johann
Date: Mon, 15 May 2017 16:30:00 -0700
Subject: [PATCH] neon variance: special case 4x

The sub-pixel variance uses a temp buffer, which guarantees
width == stride. Take advantage of this for the 4x blocks and avoid
the very costly lane loads.

Change-Id: Ia0c97eb8c29dc8dfa6e51a29dff9b75b3c6726f1
---
 vpx_dsp/arm/mem_neon.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 23d2b4e..37b89b2 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -83,6 +83,7 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
 static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
   uint32_t a;
   uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
   a_u32 = vld1q_lane_u32(&a, a_u32, 0);
@@ -102,6 +103,10 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
 static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                        const uint8x16_t a) {
   const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+  if (stride == 4) {
+    vst1q_u8(buf, a);
+    return;
+  }
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
   buf += stride;
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
--
2.7.4
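
A minimal standalone check of the equivalence the fast path relies on
(not part of the patch; it assumes <arm_neon.h> on a NEON-capable
target, and the buffer names are illustrative): when
width == stride == 4, the four 4-byte rows of a 4x4 block are
contiguous in memory, so a single 16-byte vld1q_u8 produces the same
vector as the four lane loads it replaces.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  uint8_t src[16], out_fast[16], out_slow[16];
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  const uint8_t *buf = src;
  int i;

  for (i = 0; i < 16; i++) src[i] = (uint8_t)(i * 7 + 3);

  /* Fast path: rows are contiguous when stride == 4. */
  vst1q_u8(out_fast, vld1q_u8(src));

  /* Generic path: gather each 4-byte row with a lane load. */
  memcpy(&a, buf, 4);
  buf += 4;
  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += 4;
  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += 4;
  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
  vst1q_u8(out_slow, vreinterpretq_u8_u32(a_u32));

  printf("paths %s\n", memcmp(out_fast, out_slow, 16) == 0 ? "match" : "differ");
  return 0;
}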