Use run-time feature detection for Neon DotProd HBD MSE

author Jonathan Wright <jonathan.wright@arm.com>

Sat, 19 Aug 2023 22:41:09 +0000 (23:41 +0100)

committer Jonathan Wright <jonathan.wright@arm.com>

Sun, 3 Sep 2023 22:04:50 +0000 (23:04 +0100)
author Jonathan Wright <jonathan.wright@arm.com>
Sat, 19 Aug 2023 22:41:09 +0000 (23:41 +0100)
committer Jonathan Wright <jonathan.wright@arm.com>
Sun, 3 Sep 2023 22:04:50 +0000 (23:04 +0100)
diff --git a/test/variance_test.cc b/test/variance_test.cc

index c32c919..e231df0 100644 (file)
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1555,6 +1555,16 @@ INSTANTIATE_TEST_SUITE_P(
          MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
          MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
  
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxHBDMseTest,
+    ::testing::Values(  // One entry per dotprod MSE kernel; the first two MseParams args appear to be log2(width), log2(height) (4 <-> 16, 3 <-> 8) - confirm against MseParams.
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8),
+        MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
+#endif  // HAVE_NEON_DOTPROD
+
  INSTANTIATE_TEST_SUITE_P(
      NEON, VpxHBDVarianceTest,
      ::testing::Values(
diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c

index e361f6f..309ae7f 100644 (file)
--- a/vpx_dsp/arm/highbd_variance_neon.c
+++ b/vpx_dsp/arm/highbd_variance_neon.c
@@ -384,69 +384,6 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
    return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
  }
  
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
-                                            int src_stride,
-                                            const uint16_t *ref_ptr,
-                                            int ref_stride, int h) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h / 2;
-  do {
-    uint16x8_t s0, s1, r0, r1;
-    uint8x16_t s, r, diff;
-
-    s0 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    s1 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    r0 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-    r1 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-
-    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-  } while (--i != 0);
-
-  return horizontal_add_uint32x4(sse_u32);
-}
-
-static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
-                                             int src_stride,
-                                             const uint16_t *ref_ptr,
-                                             int ref_stride, int h) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint16x8_t s0, s1, r0, r1;
-    uint8x16_t s, r, diff;
-
-    s0 = vld1q_u16(src_ptr);
-    s1 = vld1q_u16(src_ptr + 8);
-    r0 = vld1q_u16(ref_ptr);
-    r1 = vld1q_u16(ref_ptr + 8);
-
-    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_uint32x4(sse_u32);
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
  static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
                                              int src_stride,
                                              const uint16_t *ref_ptr,
@@ -461,8 +398,6 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
    return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h);
  }
  
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
  #define HIGHBD_MSE_WXH_NEON(w, h)                                         \
    uint32_t vpx_highbd_8_mse##w##x##h##_neon(                              \
        const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
@@ -497,3 +432,5 @@ HIGHBD_MSE_WXH_NEON(16, 16)
  HIGHBD_MSE_WXH_NEON(16, 8)
  HIGHBD_MSE_WXH_NEON(8, 16)
  HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c

new file mode 100644 (file)

index 0000000..1a88720
--- /dev/null
+++ b/vpx_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h) {  // Sum of squared src/ref differences over an 8-wide, h-row block.
+  uint32x4_t sse_u32 = vdupq_n_u32(0);  // Four 32-bit accumulator lanes, zeroed.
+
+  int i = h / 2;  // Two 8-sample rows per iteration; the do-while needs h >= 2 (an odd h would drop the last row).
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;  // Strides are counted in uint16_t elements.
+    s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));  // vmovn keeps only the low byte of each sample, so both rows fit in one vector; exact only if samples fit in 8 bits (presumably true for the highbd_8 callers - confirm).
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);  // Per-byte absolute difference.
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);  // Each 32-bit lane accumulates the squares of four adjacent bytes.
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);  // Presumably sums the four lanes into one scalar (helper from sum_neon.h).
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h) {  // Sum of squared src/ref differences over a 16-wide, h-row block.
+  uint32x4_t sse_u32 = vdupq_n_u32(0);  // Four 32-bit accumulator lanes, zeroed.
+
+  int i = h;  // One 16-sample row per iteration; the do-while needs h >= 1.
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    s1 = vld1q_u16(src_ptr + 8);  // Second half of the 16-sample row.
+    r0 = vld1q_u16(ref_ptr);
+    r1 = vld1q_u16(ref_ptr + 8);
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));  // vmovn keeps only the low byte of each sample, so the whole row fits in one vector; exact only if samples fit in 8 bits (presumably true for the highbd_8 callers - confirm).
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);  // Per-byte absolute difference.
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);  // Each 32-bit lane accumulates the squares of four adjacent bytes.
+
+    src_ptr += src_stride;  // Strides are counted in uint16_t elements.
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);  // Presumably sums the four lanes into one scalar (helper from sum_neon.h).
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) /* Defines one w x h entry point. */ \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, uint32_t *sse) {                                         \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                              \
+    *sse = /* w picks the 8- or 16-wide kernel. */                             \
+        highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \
+    return *sse; /* Result is both stored in *sse and returned. */             \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk

index 84fd969..5343088 100644 (file)
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -448,6 +448,7 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
  DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
  DSP_SRCS-$(HAVE_NEON)   += arm/highbd_avg_pred_neon.c
  DSP_SRCS-$(HAVE_NEON)   += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD)   += arm/highbd_variance_neon_dotprod.c
  DSP_SRCS-$(HAVE_NEON)   += arm/highbd_subpel_variance_neon.c
  endif  # CONFIG_VP9_HIGHBITDEPTH
  endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index c9cdc28..1012df1 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1534,14 +1534,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
  
    add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
  
    add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x8 neon/;
+  specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
    add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x16 neon/;
+  specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
    add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
  
    add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
    specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
author	Jonathan Wright <jonathan.wright@arm.com>
	Sat, 19 Aug 2023 22:41:09 +0000 (23:41 +0100)
committer	Jonathan Wright <jonathan.wright@arm.com>
	Sun, 3 Sep 2023 22:04:50 +0000 (23:04 +0100)
test/variance_test.cc		patch \| blob \| history
vpx_dsp/arm/highbd_variance_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_variance_neon_dotprod.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/vpx_dsp.mk		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history