Add AVX2 intrinsic for vpx_comp_avg_pred() function

author Anupam Pandey <anupam.pandey@ittiam.com>

Mon, 8 May 2023 06:40:09 +0000 (12:10 +0530)

committer Anupam Pandey <anupam.pandey@ittiam.com>

Tue, 9 May 2023 11:03:59 +0000 (16:33 +0530)
author Anupam Pandey <anupam.pandey@ittiam.com>
Mon, 8 May 2023 06:40:09 +0000 (12:10 +0530)
committer Anupam Pandey <anupam.pandey@ittiam.com>
Tue, 9 May 2023 11:03:59 +0000 (16:33 +0530)
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc

index f747c35..d8fabd5 100644 (file)
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -81,11 +81,11 @@ void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
          // Only the reference buffer may have a stride not equal to width.
          Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
          ASSERT_TRUE(ref.Init());
-        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
          ASSERT_TRUE(pred.Init());
-        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
          ASSERT_TRUE(avg_ref.Init());
-        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
          ASSERT_TRUE(avg_chk.Init());
          const int bitdepth_mask = (1 << bitdepth) - 1;
          for (int h = 0; h < height; ++h) {
@@ -121,11 +121,11 @@ void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
    const int height = 32;
    Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
    ASSERT_TRUE(ref.Init());
-  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
    ASSERT_TRUE(pred.Init());
-  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
    ASSERT_TRUE(avg_ref.Init());
-  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
    ASSERT_TRUE(avg_chk.Init());
  
    for (int i = 0; i < 500; ++i) {
@@ -167,9 +167,9 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
          const int height = 1 << height_pow;
          Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
          ASSERT_TRUE(ref.Init());
-        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
          ASSERT_TRUE(pred.Init());
-        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
          ASSERT_TRUE(avg.Init());
          const int bitdepth_mask = (1 << bitdepth) - 1;
          for (int h = 0; h < height; ++h) {
@@ -217,6 +217,11 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
                           ::testing::Values(&vpx_comp_avg_pred_sse2));
  #endif  // HAVE_SSE2
  
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif  // HAVE_AVX2
+
  #if HAVE_NEON
  INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
                           ::testing::Values(&vpx_comp_avg_pred_neon));
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c

index 64e9ef0..0ea0f85 100644 (file)
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -297,7 +297,7 @@ static unsigned int setup_center_error(
        besterr =
            vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
      } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
        vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
        besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
      }
@@ -312,7 +312,7 @@ static unsigned int setup_center_error(
    uint32_t besterr;
    (void)xd;
    if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
      vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
    } else {
@@ -635,7 +635,7 @@ static int accurate_sub_pel_search(
      vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
                                0, kernel, MV_PRECISION_Q3, 0, 0);
      if (second_pred != NULL) {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
        vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
        besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
      } else {
@@ -654,7 +654,7 @@ static int accurate_sub_pel_search(
    vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
                              0, kernel, MV_PRECISION_Q3, 0, 0);
    if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
      vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
      besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
    } else {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c

index f051c62..464705a 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1937,10 +1937,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
  
  // Prediction buffer from second frame.
  #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]);
    uint8_t *second_pred;
  #else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+  DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  
    // Check number of iterations do not exceed the max
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c

index 619d7aa..2a4c81d 100644 (file)
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -40,7 +40,7 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
    unsigned int vpx_sad##m##x##n##_avg_c(                                      \
        const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
        int ref_stride, const uint8_t *second_pred) {                           \
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                           \
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]);                           \
      vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \
      return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
    }                                                                           \
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c

index ce1e838..a6793ef 100644 (file)
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -156,7 +156,7 @@ static void var_filter_block2d_bil_second_pass(
        const uint8_t *second_pred) {                                          \
      uint16_t fdata3[(H + 1) * W];                                            \
      uint8_t temp2[H * W];                                                    \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                              \
+    DECLARE_ALIGNED(32, uint8_t, temp3[H * W]);                              \
                                                                               \
      var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
                                        W, bilinear_filters[x_offset]);        \
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk

index 67d3fb0..04969f3 100644 (file)
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -424,6 +424,7 @@ DSP_SRCS-$(HAVE_LSX)    += loongarch/avg_pred_lsx.c
  DSP_SRCS-$(HAVE_MMI)    += mips/variance_mmi.c
  
  DSP_SRCS-$(HAVE_SSE2)   += x86/avg_pred_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/avg_pred_avx2.c
  DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
  DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
  DSP_SRCS-$(HAVE_VSX)    += ppc/variance_vsx.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index cae4ca8..f20f4e0 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1321,7 +1321,7 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
    specialize qw/vpx_get4x4sse_cs neon msa vsx/;
  
  add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/;
+  specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/;
  
  #
  # Subpixel Variance
diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c

new file mode 100644 (file)

index 0000000..f435799
--- /dev/null
+++ b/vpx_dsp/x86/avg_pred_avx2.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int row = 0;
+  // comp_pred and pred must be 32 byte aligned.
+  assert(((intptr_t)comp_pred % 32) == 0);
+  assert(((intptr_t)pred % 32) == 0);
+
+  if (width == 8) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i p = _mm256_load_si256((const __m256i *)pred);
+      const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+      const __m128i r_1 =
+          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+
+      const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
+      const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));
+
+      const __m256i ref_0123 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
+      const __m256i avg = _mm256_avg_epu8(p, ref_0123);
+
+      _mm256_store_si256((__m256i *)comp_pred, avg);
+
+      row += 4;
+      pred += 32;
+      comp_pred += 32;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 16) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i tmp0 =
+          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
+      const __m256i ref_0 = _mm256_inserti128_si256(
+          tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+      const __m256i tmp1 = _mm256_castsi128_si256(
+          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+      const __m256i ref_1 = _mm256_inserti128_si256(
+          tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 4;
+      pred += 64;
+      comp_pred += 64;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 32) {
+    assert(height % 2 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i ref_1 =
+          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 2;
+      pred += 64;
+      comp_pred += 64;
+      ref += 2 * ref_stride;
+    } while (row < height);
+  } else if (width % 64 == 0) {
+    do {
+      int x;
+      for (x = 0; x < width; x += 64) {
+        const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
+        const __m256i pred_1 =
+            _mm256_load_si256((const __m256i *)(pred + x + 32));
+        const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+        const __m256i ref_1 =
+            _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+        const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+        const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+        _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
+        _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
+      }
+      row++;
+      pred += width;
+      comp_pred += width;
+      ref += ref_stride;
+    } while (row < height);
+  } else {
+    vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+  }
+}
author	Anupam Pandey <anupam.pandey@ittiam.com>
	Mon, 8 May 2023 06:40:09 +0000 (12:10 +0530)
committer	Anupam Pandey <anupam.pandey@ittiam.com>
	Tue, 9 May 2023 11:03:59 +0000 (16:33 +0530)
test/comp_avg_pred_test.cc		patch \| blob \| history
vp9/encoder/vp9_mcomp.c		patch \| blob \| history
vp9/encoder/vp9_rdopt.c		patch \| blob \| history
vpx_dsp/sad.c		patch \| blob \| history
vpx_dsp/variance.c		patch \| blob \| history
vpx_dsp/vpx_dsp.mk		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history
vpx_dsp/x86/avg_pred_avx2.c	[new file with mode: 0644]	patch \| blob