From 06b09ebd351deb35b5bdcf387904dcbecc3da02f Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Fri, 7 Oct 2022 05:53:50 -0700
Subject: [PATCH] Add vpx_highbd_sad64x{64,32}_avx2.

~2.8x faster than the sse2 version.

Bug: b/245917257

Change-Id: Ibc8e5d030ec145c9a9b742fff98fbd9131c9ede4
---
 test/sad_test.cc              | 20 +++++++++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |  4 +--
 vpx_dsp/x86/highbd_sad_avx2.c | 65 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/test/sad_test.cc b/test/sad_test.cc
index 4712c51..a8f04e6 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1065,21 +1065,29 @@ const SadMxNParam avx2_tests[] = {
   SadMxNParam(32, 32, &vpx_sad32x32_avx2),
   SadMxNParam(32, 16, &vpx_sad32x16_avx2),
 #if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8),
   SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8),
   SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8),
   SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8),
-  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10),
-  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10),
-  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10),
-  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12),
-  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12),
-  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12),
   SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8),
   SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8),
   SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8),
+
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10),
   SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10),
   SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10),
   SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10),
+
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12),
   SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12),
   SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12),
   SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12),
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d669b99..34ee981 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -941,10 +941,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Single block SAD
   #
   add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64 sse2 neon/;
+  specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32 sse2 neon/;
+  specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
   specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c
index eb0e3ee..12ef2eb 100644
--- a/vpx_dsp/x86/highbd_sad_avx2.c
+++ b/vpx_dsp/x86/highbd_sad_avx2.c
@@ -19,6 +19,71 @@ static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
   return (unsigned int)_mm_cvtsi128_si32(sum);
 }
 
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // One row of 64 samples: aligned loads for src, unaligned loads for ref.
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+    // Per-lane |ref - src| on 16-bit lanes (signed subtract, then abs).
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+    // Accumulate in 16-bit lanes; caller must limit height to avoid overflow.
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+#define HIGHBD_SAD64XN(n)                                                    \
+  unsigned int vpx_highbd_sad64x##n##_avx2(                                  \
+      const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr,      \
+      int ref_stride) {                                                      \
+    const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr);                           \
+    __m256i sums_32 = _mm256_setzero_si256();                                \
+    int i;                                                                   \
+    /* n is the block height; each iteration consumes two rows. */           \
+    for (i = 0; i < (n / 2); ++i) {                                          \
+      __m256i sums_16 = _mm256_setzero_si256();                              \
+                                                                             \
+      highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);         \
+                                                                             \
+      /* Flush every 2 rows: widen the 16-bit lane sums to 32 bits before    \
+       * they can overflow, and add both 128-bit halves into sums_32. */     \
+      sums_32 = _mm256_add_epi32(                                            \
+          sums_32,                                                           \
+          _mm256_add_epi32(                                                  \
+              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
+              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+                                                                             \
+      src += src_stride << 1;                                                \
+      ref += ref_stride << 1;                                                \
+    }                                                                        \
+    return calc_final(sums_32);                                              \
+  }
+
+// Defines vpx_highbd_sad64x64_avx2().
+HIGHBD_SAD64XN(64)
+
+// Defines vpx_highbd_sad64x32_avx2().
+HIGHBD_SAD64XN(32)
+
 static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                             const uint16_t *src, int src_stride,
                                             uint16_t *ref, int ref_stride,
-- 
2.7.4