neon variance: add small missing sizes

author Johann <johannkoenig@google.com>

Mon, 1 May 2017 16:10:06 +0000 (09:10 -0700)

committer Johann <johannkoenig@google.com>

Thu, 4 May 2017 15:59:42 +0000 (08:59 -0700)
author Johann <johannkoenig@google.com>
Mon, 1 May 2017 16:10:06 +0000 (09:10 -0700)
committer Johann <johannkoenig@google.com>
Thu, 4 May 2017 15:59:42 +0000 (08:59 -0700)
diff --git a/test/variance_test.cc b/test/variance_test.cc

index 6e31165..ff69143 100644 (file)
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -22,6 +22,7 @@
  #include "vpx/vpx_integer.h"
  #include "vpx_mem/vpx_mem.h"
  #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
  
  namespace {
  
@@ -345,6 +346,7 @@ class MainTestClass
    void RefTest();
    void RefStrideTest();
    void OneQuarterTest();
+  void SpeedTest();
  
    // MSE/SSE tests
    void RefTestMse();
@@ -363,6 +365,7 @@ class MainTestClass
    int byte_shift() const { return params_.bit_depth - 8; }
    int block_size() const { return params_.block_size; }
    int width() const { return params_.width; }
+  int height() const { return params_.height; }
    uint32_t mask() const { return params_.mask; }
  };
  
@@ -471,6 +474,35 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
    EXPECT_EQ(expected, var);
  }
  
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+  const int half = block_size() / 2;
+  if (!use_high_bit_depth()) {
+    memset(src_, 255, block_size());
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  unsigned int sse;
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < 100000000 / block_size(); ++i) {
+    const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse);
+    // Ignore return value.
+    (void)variance;
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("Variance %dx%d time: %5d ms\n", width(), height(),
+         elapsed_time / 1000);
+}
+
  ////////////////////////////////////////////////////////////////////////////////
  // Tests related to MSE / SSE.
  
@@ -727,6 +759,7 @@ TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
  TEST_P(VpxVarianceTest, Ref) { RefTest(); }
  TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
  TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
  TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
  TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
  TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -809,6 +842,7 @@ TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
  TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
  TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
  TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
  TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
  TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
  TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -1219,10 +1253,13 @@ INSTANTIATE_TEST_CASE_P(
                        VarianceParams(6, 5, &vpx_variance64x32_neon),
                        VarianceParams(5, 6, &vpx_variance32x64_neon),
                        VarianceParams(5, 5, &vpx_variance32x32_neon),
+                      VarianceParams(5, 4, &vpx_variance32x16_neon),
+                      VarianceParams(4, 5, &vpx_variance16x32_neon),
                        VarianceParams(4, 4, &vpx_variance16x16_neon),
                        VarianceParams(4, 3, &vpx_variance16x8_neon),
                        VarianceParams(3, 4, &vpx_variance8x16_neon),
-                      VarianceParams(3, 3, &vpx_variance8x8_neon)));
+                      VarianceParams(3, 3, &vpx_variance8x8_neon),
+                      VarianceParams(3, 2, &vpx_variance8x4_neon)));
  
  INSTANTIATE_TEST_CASE_P(
      NEON, VpxSubpelVarianceTest,
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c

index b6d7f86..18b26ad 100644 (file)
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -70,29 +70,27 @@ void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
    variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
  }
  
-unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - ((sum * sum) >> 6);
-}
-
-unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-}
+#define varianceNxM(n, m, shift)                                            \
+  unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            unsigned int *sse) {            \
+    int sum;                                                                \
+    variance_neon_w8(a, a_stride, b, b_stride, n, m, sse, &sum);            \
+    if (n * m < 16 * 16)                                                    \
+      return *sse - ((sum * sum) >> shift);                                 \
+    else                                                                    \
+      return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);              \
+  }
  
-unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
+varianceNxM(8, 4, 5);
+varianceNxM(8, 8, 6);
+// TODO(johannkoenig) Investigate why the implementation below is faster.
+// varianceNxM(8, 16, 7);
+varianceNxM(16, 8, 7);
+varianceNxM(16, 16, 8);
+varianceNxM(16, 32, 9);
+varianceNxM(32, 16, 9);
+varianceNxM(32, 32, 10);
  
  unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
@@ -144,82 +142,6 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
    return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
  }
  
-unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 4; i++) {
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
  unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
                                     int source_stride,
                                     const unsigned char *ref_ptr,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index 24e5b8b..61f7c88 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1177,10 +1177,10 @@ add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int sourc
    specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+  specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 msa/;
+  specialize qw/vpx_variance16x32 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
    specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
@@ -1195,12 +1195,14 @@ add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_
    specialize qw/vpx_variance8x8 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 msa/;
+  specialize qw/vpx_variance8x4 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
    specialize qw/vpx_variance4x8 sse2 msa/;
  
  add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
    specialize qw/vpx_variance4x4 sse2 msa/;
  
  #
author	Johann <johannkoenig@google.com>
	Mon, 1 May 2017 16:10:06 +0000 (09:10 -0700)
committer	Johann <johannkoenig@google.com>
	Thu, 4 May 2017 15:59:42 +0000 (08:59 -0700)
test/variance_test.cc		patch \| blob \| history
vpx_dsp/arm/variance_neon.c		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history