vp9_reconintra_neon: add DC 16x16 predictors
authorJames Zern <jzern@google.com>
Thu, 28 May 2015 03:17:27 +0000 (20:17 -0700)
committerJames Zern <jzern@google.com>
Fri, 29 May 2015 22:41:44 +0000 (15:41 -0700)
85-89% faster over 20M pixels

Change-Id: I9b320ed6b9e67f27df738b84c8b43b65a93c50c2

test/test_intra_pred_speed.cc
vp9/common/arm/neon/vp9_reconintra_neon.c
vp9/common/vp9_rtcd_defs.pl

index d10c8ec..a684ea4 100644 (file)
@@ -291,9 +291,12 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred16, vp9_dc_predictor_16x16_dspr2, NULL,
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred16, NULL, NULL, NULL, NULL,
-                vp9_v_predictor_16x16_neon, vp9_h_predictor_16x16_neon, NULL,
-                NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred16, vp9_dc_predictor_16x16_neon,
+                vp9_dc_left_predictor_16x16_neon,
+                vp9_dc_top_predictor_16x16_neon,
+                vp9_dc_128_predictor_16x16_neon, vp9_v_predictor_16x16_neon,
+                vp9_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_16x16_neon)
 #endif  // HAVE_NEON
 
 // -----------------------------------------------------------------------------
index 82d11d6..387439f 100644 (file)
@@ -85,6 +85,82 @@ void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
   dc_8x8(dst, stride, NULL, NULL, 0, 0);
 }
 
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A = vld1q_u8(above);  // top row
+    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_top = vcombine_u16(p3, p3);
+  }
+
+  if (do_left) {
+    const uint8x16_t L = vld1q_u8(left);  // left row
+    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_left = vcombine_u16(p3, p3);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 5);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 4);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 4);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+    }
+  }
+}
+
+void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
 #if !HAVE_NEON_ASM
 
 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
index d4c9070..5cb1cd5 100644 (file)
@@ -162,16 +162,16 @@ add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_16x16 dspr2 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";