Use run-time feature detection for Neon DotProd HBD MSE
authorJonathan Wright <jonathan.wright@arm.com>
Sat, 19 Aug 2023 22:41:09 +0000 (23:41 +0100)
committerJonathan Wright <jonathan.wright@arm.com>
Sun, 3 Sep 2023 22:04:50 +0000 (23:04 +0100)
Arm Neon DotProd implementations of vpx_highbd_8_mse<w>x<h> currently
need to be enabled at compile time since they're guarded by #ifdef
feature macros. Now that run-time feature detection has been enabled
for Arm platforms, expose these implementations with distinct
*neon_dotprod names in a separate file and wire them up to the build
system and rtcd.pl. Also add new test cases for the new functions.

Change-Id: I26be6fb587258c8fa9fbf03509b7602358a001a8

test/variance_test.cc
vpx_dsp/arm/highbd_variance_neon.c
vpx_dsp/arm/highbd_variance_neon_dotprod.c [new file with mode: 0644]
vpx_dsp/vpx_dsp.mk
vpx_dsp/vpx_dsp_rtcd_defs.pl

index c32c919..e231df0 100644 (file)
@@ -1555,6 +1555,16 @@ INSTANTIATE_TEST_SUITE_P(
         MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
         MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxHBDMseTest,
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8),
+        MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
+#endif  // HAVE_NEON_DOTPROD
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, VpxHBDVarianceTest,
     ::testing::Values(
index e361f6f..309ae7f 100644 (file)
@@ -384,69 +384,6 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
   return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
 }
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
-                                            int src_stride,
-                                            const uint16_t *ref_ptr,
-                                            int ref_stride, int h) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h / 2;
-  do {
-    uint16x8_t s0, s1, r0, r1;
-    uint8x16_t s, r, diff;
-
-    s0 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    s1 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    r0 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-    r1 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-
-    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-  } while (--i != 0);
-
-  return horizontal_add_uint32x4(sse_u32);
-}
-
-static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
-                                             int src_stride,
-                                             const uint16_t *ref_ptr,
-                                             int ref_stride, int h) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint16x8_t s0, s1, r0, r1;
-    uint8x16_t s, r, diff;
-
-    s0 = vld1q_u16(src_ptr);
-    s1 = vld1q_u16(src_ptr + 8);
-    r0 = vld1q_u16(ref_ptr);
-    r1 = vld1q_u16(ref_ptr + 8);
-
-    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_uint32x4(sse_u32);
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
                                             int src_stride,
                                             const uint16_t *ref_ptr,
@@ -461,8 +398,6 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
   return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 #define HIGHBD_MSE_WXH_NEON(w, h)                                         \
   uint32_t vpx_highbd_8_mse##w##x##h##_neon(                              \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
@@ -497,3 +432,5 @@ HIGHBD_MSE_WXH_NEON(16, 16)
 HIGHBD_MSE_WXH_NEON(16, 8)
 HIGHBD_MSE_WXH_NEON(8, 16)
 HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644 (file)
index 0000000..1a88720
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    s1 = vld1q_u16(src_ptr + 8);
+    r0 = vld1q_u16(ref_ptr);
+    r1 = vld1q_u16(ref_ptr + 8);
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                      \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, uint32_t *sse) {                                         \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                              \
+    *sse =                                                                     \
+        highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \
+    return *sse;                                                               \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
index 84fd969..5343088 100644 (file)
@@ -448,6 +448,7 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD)   += arm/highbd_variance_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
index c9cdc28..1012df1 100644 (file)
@@ -1534,14 +1534,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x8 neon/;
+  specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
   add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x16 neon/;
+  specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
   add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;