Optimize 4D Neon reduction for 4xh and 8xh SAD4D blocks

author Jonathan Wright <jonathan.wright@arm.com>

Tue, 4 Apr 2023 13:52:52 +0000 (14:52 +0100)

committer Jonathan Wright <jonathan.wright@arm.com>

Tue, 4 Apr 2023 13:52:52 +0000 (14:52 +0100)
author Jonathan Wright <jonathan.wright@arm.com>
Tue, 4 Apr 2023 13:52:52 +0000 (14:52 +0100)
committer Jonathan Wright <jonathan.wright@arm.com>
Tue, 4 Apr 2023 13:52:52 +0000 (14:52 +0100)
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c

index 9509573..ab00e0e 100644 (file)
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -270,10 +270,7 @@ static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
      i++;
    } while (i < h);
  
-  res[0] = horizontal_add_uint16x8(sum[0]);
-  res[1] = horizontal_add_uint16x8(sum[1]);
-  res[2] = horizontal_add_uint16x8(sum[2]);
-  res[3] = horizontal_add_uint16x8(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
  }
  
  static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
@@ -298,10 +295,7 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
      i += 2;
    } while (i < h);
  
-  res[0] = horizontal_add_uint16x8(sum[0]);
-  res[1] = horizontal_add_uint16x8(sum[1]);
-  res[2] = horizontal_add_uint16x8(sum[2]);
-  res[3] = horizontal_add_uint16x8(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
  }
  
  #define SAD_WXH_4D_NEON(w, h)                                                  \
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h

index 1eb3484..6259add 100644 (file)
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -83,6 +83,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
  #endif
  }
  
+static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {  // Reduces four uint16x8_t accumulators together; 32-bit lane i of the result is the total of sum[i].
+#if defined(__aarch64__)  // AArch64 path: full-width (128-bit) pairwise adds.
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);  // Adjacent-pair sums of sum[0] followed by those of sum[1].
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);  // Same for sum[2] and sum[3].
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);  // Two partial sums per input, in input order. NOTE(review): still 16-bit lanes; assumes partials fit in uint16_t - confirm for callers.
+  return vpaddlq_u16(b0);  // Final pairwise add widens to 32 bits: one total per input vector.
+#else  // Fallback path: work on 64-bit halves instead of 128-bit pairwise adds.
+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));  // Fold high half onto low half: 8 lanes -> 4 (16-bit add).
+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x4_t b0 = vpadd_u16(a0, a1);  // Two partial sums each for sum[0] and sum[1].
+  const uint16x4_t b1 = vpadd_u16(a2, a3);  // Two partial sums each for sum[2] and sum[3].
+  return vpaddlq_u16(vcombine_u16(b0, b1));  // Join halves, then widening pairwise add yields the four 32-bit totals.
+#endif
+}
+
  static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
                                                      const uint16x8_t vec_hi) {
  #if defined(__aarch64__)
author	Jonathan Wright <jonathan.wright@arm.com>
	Tue, 4 Apr 2023 13:52:52 +0000 (14:52 +0100)
committer	Jonathan Wright <jonathan.wright@arm.com>
	Tue, 4 Apr 2023 13:52:52 +0000 (14:52 +0100)
vpx_dsp/arm/sad4d_neon.c		patch \| blob \| history
vpx_dsp/arm/sum_neon.h		patch \| blob \| history