From 19f3a754d62dcd21e400a3c715f2ed4235d1c4ec Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Mon, 26 Jun 2023 14:57:53 -0700
Subject: [PATCH] Fix a bug in vpx_hadamard_32x32_neon()

A right shift by 2 is equivalent to two halving operations if there is
no addition or subtraction between the two halving operations.

Note: Since vhaddq_s16() and vhsubq_s16() have 17-bit intermediate
precision, the Neon code doesn't need to go to int32_t as was done in
https://chromium-review.googlesource.com/c/webm/libvpx/+/4604169.

Change-Id: Ibe0691cde0fd3b94ee7c497845ba459d30d503b0
---
 vpx_dsp/arm/hadamard_neon.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index f6b6d7e..f5a044b 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -138,15 +138,15 @@ void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
     const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
     const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
 
-    const int16x8_t b0 = vhaddq_s16(a0, a1);
-    const int16x8_t b1 = vhsubq_s16(a0, a1);
-    const int16x8_t b2 = vhaddq_s16(a2, a3);
-    const int16x8_t b3 = vhsubq_s16(a2, a3);
+    const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1);
+    const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1);
+    const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1);
+    const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1);
 
-    const int16x8_t c0 = vhaddq_s16(b0, b2);
-    const int16x8_t c1 = vhaddq_s16(b1, b3);
-    const int16x8_t c2 = vhsubq_s16(b0, b2);
-    const int16x8_t c3 = vhsubq_s16(b1, b3);
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
 
     store_s16q_to_tran_low(coeff + 0, c0);
     store_s16q_to_tran_low(coeff + 256, c1);
-- 
2.7.4