From d217c87139a3218d9dc4154782de53b9d0cc1119 Mon Sep 17 00:00:00 2001
From: Johann
Date: Mon, 15 May 2017 16:30:00 -0700
Subject: [PATCH] neon variance: special case 4x

The sub-pixel variance uses a temp buffer, which guarantees
width == stride. Take advantage of this for the 4x blocks and avoid
the very costly lane loads.

Change-Id: Ia0c97eb8c29dc8dfa6e51a29dff9b75b3c6726f1
---
 vpx_dsp/arm/mem_neon.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 23d2b4e..37b89b2 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -83,6 +83,7 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
 static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
   uint32_t a;
   uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
   a_u32 = vld1q_lane_u32(&a, a_u32, 0);
@@ -102,6 +103,10 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
 static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                        const uint8x16_t a) {
   const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+  if (stride == 4) {
+    vst1q_u8(buf, a);
+    return;
+  }
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
   buf += stride;
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
--
2.7.4
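
A minimal standalone check of the equivalence the fast path relies on
(not part of the patch; it assumes <arm_neon.h> on a NEON-capable
target, and the buffer names are illustrative): when
width == stride == 4, the four 4-byte rows of a 4x4 block are
contiguous in memory, so a single 16-byte vld1q_u8 produces the same
vector as the four lane loads it replaces.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  uint8_t src[16], out_fast[16], out_slow[16];
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  const uint8_t *buf = src;
  int i;

  for (i = 0; i < 16; i++) src[i] = (uint8_t)(i * 7 + 3);

  /* Fast path: rows are contiguous when stride == 4. */
  vst1q_u8(out_fast, vld1q_u8(src));

  /* Generic path: gather each 4-byte row with a lane load. */
  memcpy(&a, buf, 4);
  buf += 4;
  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += 4;
  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += 4;
  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
  vst1q_u8(out_slow, vreinterpretq_u8_u32(a_u32));

  printf("paths %s\n", memcmp(out_fast, out_slow, 16) == 0 ? "match" : "differ");
  return 0;
}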