Fix bug and re-enable vpx_int_pro_row/col_neon
authorJonathan Wright <jonathan.wright@arm.com>
Thu, 10 Aug 2023 14:33:54 +0000 (15:33 +0100)
committerJonathan Wright <jonathan.wright@arm.com>
Thu, 10 Aug 2023 23:08:56 +0000 (00:08 +0100)
Fix a bug in vpx_int_pro_row_neon (increment pointer after peeled
first loop iteration) and re-enable both vpx_int_pro_row/col_neon
paths.

Also fix IntProRowTest to use width_ (instead of 0) as the src_stride
for the input data block. The test's use of 0 for src_stride is the
reason the tests passed with the buggy Neon implementation noted in
the listed bugs. (The old buggy Neon implementation fails the
adjusted unit tests.)

BUG=webm:1800
BUG=webm:1809

Change-Id: I1f4572ee155653a7596fe2c10b5938ea7a3f63ae

test/avg_test.cc
vpx_dsp/arm/avg_neon.c
vpx_dsp/vpx_dsp_rtcd_defs.pl

index b885c4d..ede9c0b 100644 (file)
@@ -190,8 +190,9 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   }
 
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_));
+    ASM_REGISTER_STATE_CHECK(
+        asm_func_(hbuf_asm_, source_data_, width_, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
         << "Output mismatch";
   }
@@ -681,25 +682,19 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
                       make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
 
-// Disabled neon optimization since it caused mismatch. See details in:
-// https://bugs.chromium.org/p/webm/issues/detail?id=1809
-// INSTANTIATE_TEST_SUITE_P(
-//    NEON, IntProRowTest,
-//    ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon,
-//    &vpx_int_pro_row_c),
-//                      make_tuple(32, &vpx_int_pro_row_neon,
-//                      &vpx_int_pro_row_c), make_tuple(64,
-//                      &vpx_int_pro_row_neon,
-//                                 &vpx_int_pro_row_c)));
-//
-// INSTANTIATE_TEST_SUITE_P(
-//    NEON, IntProColTest,
-//    ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon,
-//    &vpx_int_pro_col_c),
-//                      make_tuple(32, &vpx_int_pro_col_neon,
-//                      &vpx_int_pro_col_c), make_tuple(64,
-//                      &vpx_int_pro_col_neon,
-//                                 &vpx_int_pro_col_c)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProRowTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+                      make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+                      make_tuple(64, &vpx_int_pro_row_neon,
+                                 &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProColTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+                      make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+                      make_tuple(64, &vpx_int_pro_col_neon,
+                                 &vpx_int_pro_col_c)));
 
 INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
                          ::testing::Values(make_tuple(16, &vpx_satd_neon),
index 0cb102f..1b17a32 100644 (file)
@@ -67,71 +67,73 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) {
   return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
 }
 
-// void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
-//                           const int ref_stride, const int height) {
-//   int i;
-//   uint8x16_t r0, r1, r2, r3;
-//   uint16x8_t sum_lo[2], sum_hi[2];
-//   uint16x8_t tmp_lo[2], tmp_hi[2];
-//   int16x8_t avg_lo, avg_hi;
-//
-//   const int norm_factor = (height >> 5) + 3;
-//   const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
-//
-//   assert(height >= 4 && height % 4 == 0);
-//
-//   r0 = vld1q_u8(ref + 0 * ref_stride);
-//   r1 = vld1q_u8(ref + 1 * ref_stride);
-//   r2 = vld1q_u8(ref + 2 * ref_stride);
-//   r3 = vld1q_u8(ref + 3 * ref_stride);
-//
-//   sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
-//   sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
-//   sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
-//   sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
-//
-//   for (i = 4; i < height; i += 4) {
-//     r0 = vld1q_u8(ref + 0 * ref_stride);
-//     r1 = vld1q_u8(ref + 1 * ref_stride);
-//     r2 = vld1q_u8(ref + 2 * ref_stride);
-//     r3 = vld1q_u8(ref + 3 * ref_stride);
-//
-//     tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
-//     tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
-//     tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
-//     tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
-//
-//     sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]);
-//     sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]);
-//     sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]);
-//     sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]);
-//
-//     ref += 4 * ref_stride;
-//   }
-//
-//   sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
-//   sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
-//
-//   avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
-//   avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
-//
-//   vst1q_s16(hbuf, avg_lo);
-//   vst1q_s16(hbuf + 8, avg_hi);
-// }
-
-// int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
-//   uint16x8_t sum;
-//   int i;
-//
-//   assert(width >= 16 && width % 16 == 0);
-//
-//   sum = vpaddlq_u8(vld1q_u8(ref));
-//   for (i = 16; i < width; i += 16) {
-//     sum = vpadalq_u8(sum, vld1q_u8(ref + i));
-//   }
-//
-//   return (int16_t)horizontal_add_uint16x8(sum);
-// }
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint8x16_t r0, r1, r2, r3;
+  uint16x8_t sum_lo[2], sum_hi[2];
+  uint16x8_t tmp_lo[2], tmp_hi[2];
+  int16x8_t avg_lo, avg_hi;
+
+  const int norm_factor = (height >> 5) + 3;
+  const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+
+  assert(height >= 4 && height % 4 == 0);
+
+  r0 = vld1q_u8(ref + 0 * ref_stride);
+  r1 = vld1q_u8(ref + 1 * ref_stride);
+  r2 = vld1q_u8(ref + 2 * ref_stride);
+  r3 = vld1q_u8(ref + 3 * ref_stride);
+
+  sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+  sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+  sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+  sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+  ref += 4 * ref_stride;
+
+  for (i = 4; i < height; i += 4) {
+    r0 = vld1q_u8(ref + 0 * ref_stride);
+    r1 = vld1q_u8(ref + 1 * ref_stride);
+    r2 = vld1q_u8(ref + 2 * ref_stride);
+    r3 = vld1q_u8(ref + 3 * ref_stride);
+
+    tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+    tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+    tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+    tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+    sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]);
+    sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]);
+    sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]);
+    sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]);
+
+    ref += 4 * ref_stride;
+  }
+
+  sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+  sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
+
+  avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+  avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+  vst1q_s16(hbuf, avg_lo);
+  vst1q_s16(hbuf + 8, avg_hi);
+}
+
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  uint16x8_t sum;
+  int i;
+
+  assert(width >= 16 && width % 16 == 0);
+
+  sum = vpaddlq_u8(vld1q_u8(ref));
+  for (i = 16; i < width; i += 16) {
+    sum = vpadalq_u8(sum, vld1q_u8(ref + i));
+  }
+
+  return (int16_t)horizontal_add_uint16x8(sum);
+}
 
 // ref, src = [0, 510] - max diff = 16-bits
 // bwl = {2, 3, 4}, width = {16, 32, 64}
index 798bd93..8033b4a 100644 (file)
@@ -876,12 +876,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     specialize qw/vpx_satd avx2 sse2 neon msa/;
   }
 
-  # Disabled neon optimization since it caused mismatch. See details in:
-  # https://bugs.chromium.org/p/webm/issues/detail?id=1809
   add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/vpx_int_pro_row sse2 msa/;
+  specialize qw/vpx_int_pro_row neon sse2 msa/;
   add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/vpx_int_pro_col sse2 msa/;
+  specialize qw/vpx_int_pro_col neon sse2 msa/;
 
   add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   specialize qw/vpx_vector_var neon sse2 msa/;