Add high bitdepth intra prediction NEON optimization (h and v)
authorLinfeng Zhang <linfengz@google.com>
Fri, 28 Oct 2016 16:42:11 +0000 (09:42 -0700)
committerJames Zern <jzern@google.com>
Sat, 12 Nov 2016 20:00:19 +0000 (12:00 -0800)
BUG=webm:1316

Change-Id: I47eeac698a98a31d1af5f72441052302e9fa4f46

test/test_intra_pred_speed.cc
test/vp9_intrapred_test.cc
vpx_dsp/arm/highbd_intrapred_neon.c
vpx_dsp/vpx_dsp_rtcd_defs.pl

index f23e946..a1adef2 100644 (file)
@@ -476,38 +476,32 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
 #endif  // HAVE_SSE2
 
 #if HAVE_NEON
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred4,
-                       vpx_highbd_dc_predictor_4x4_neon,
-                       vpx_highbd_dc_left_predictor_4x4_neon,
-                       vpx_highbd_dc_top_predictor_4x4_neon,
-                       vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL,
-                       vpx_highbd_d45_predictor_4x4_neon,
-                       vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL,
-                       NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8,
-                       vpx_highbd_dc_predictor_8x8_neon,
-                       vpx_highbd_dc_left_predictor_8x8_neon,
-                       vpx_highbd_dc_top_predictor_8x8_neon,
-                       vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL,
-                       vpx_highbd_d45_predictor_8x8_neon,
-                       vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL,
-                       NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
-                       vpx_highbd_dc_predictor_16x16_neon,
-                       vpx_highbd_dc_left_predictor_16x16_neon,
-                       vpx_highbd_dc_top_predictor_16x16_neon,
-                       vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL,
-                       vpx_highbd_d45_predictor_16x16_neon,
-                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
-                       NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
-                       vpx_highbd_dc_predictor_32x32_neon,
-                       vpx_highbd_dc_left_predictor_32x32_neon,
-                       vpx_highbd_dc_top_predictor_32x32_neon,
-                       vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL,
-                       vpx_highbd_d45_predictor_32x32_neon,
-                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
-                       NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon,
+    vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
+    vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
+    vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
+    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
+    vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
+    vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
+    vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
+    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
+    vpx_highbd_dc_left_predictor_16x16_neon,
+    vpx_highbd_dc_top_predictor_16x16_neon,
+    vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
+    vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
+    vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
+    vpx_highbd_dc_left_predictor_32x32_neon,
+    vpx_highbd_dc_top_predictor_32x32_neon,
+    vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
+    vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
+    vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
index d573bae..74107be 100644 (file)
@@ -463,7 +463,23 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
                              &vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
-                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 8)));
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 8)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON_TO_C_10, VP9HighbdIntraPredTest,
@@ -515,7 +531,23 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
                              &vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
-                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 10)));
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 10)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON_TO_C_12, VP9HighbdIntraPredTest,
@@ -567,7 +599,23 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
                              &vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
-                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 12)));
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 12)));
 #endif  // HAVE_NEON
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
index 9177fb4..ea95958 100644 (file)
@@ -693,3 +693,205 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
     row_6 = row_7;
   }
 }
+
+//------------------------------------------------------------------------------
+
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t row = vld1_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, dst += stride) {
+    vst1_u16(dst, row);
+  }
+}
+
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t row = vld1q_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 8; i++, dst += stride) {
+    vst1q_u16(dst, row);
+  }
+}
+
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8x2_t row = vld2q_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 16; i++, dst += stride) {
+    vst2q_u16(dst, row);
+  }
+}
+
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8x2_t row0 = vld2q_u16(above);
+  const uint16x8x2_t row1 = vld2q_u16(above + 16);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 32; i++) {
+    vst2q_u16(dst, row0);
+    dst += 16;
+    vst2q_u16(dst, row1);
+    dst += stride - 16;
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t left_u16 = vld1_u16(left);
+  uint16x4_t row;
+  (void)above;
+  (void)bd;
+
+  row = vdup_lane_u16(left_u16, 0);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 1);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 2);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 3);
+  vst1_u16(dst, row);
+}
+
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t left_u16 = vld1q_u16(left);
+  const uint16x4_t left_low = vget_low_u16(left_u16);
+  const uint16x4_t left_high = vget_high_u16(left_u16);
+  uint16x8_t row;
+  (void)above;
+  (void)bd;
+
+  row = vdupq_lane_u16(left_low, 0);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 1);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 2);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 3);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 0);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 1);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 2);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 3);
+  vst1q_u16(dst, row);
+}
+
+static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
+                              const uint16x8_t row) {
+  // Note: vst1q is faster than vst2q
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += stride - 8;
+}
+
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 2; i++, left += 8) {
+    const uint16x8_t left_u16q = vld1q_u16(left);
+    const uint16x4_t left_low = vget_low_u16(left_u16q);
+    const uint16x4_t left_high = vget_high_u16(left_u16q);
+    uint16x8_t row;
+
+    row = vdupq_lane_u16(left_low, 0);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 1);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 2);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 3);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 0);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 1);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 2);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 3);
+    h_store_16(&dst, stride, row);
+  }
+}
+
+static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
+                              const uint16x8_t row) {
+  // Note: vst1q is faster than vst2q
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += stride - 24;
+}
+
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, left += 8) {
+    const uint16x8_t left_u16q = vld1q_u16(left);
+    const uint16x4_t left_low = vget_low_u16(left_u16q);
+    const uint16x4_t left_high = vget_high_u16(left_u16q);
+    uint16x8_t row;
+
+    row = vdupq_lane_u16(left_low, 0);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 1);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 2);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 3);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 0);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 1);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 2);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 3);
+    h_store_32(&dst, stride, row);
+  }
+}
index 73da9ef..10e632a 100644 (file)
@@ -220,6 +220,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -229,7 +230,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
@@ -260,6 +261,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -269,7 +271,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_8x8 sse2/;
+  specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
@@ -300,6 +302,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -309,7 +312,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_16x16 sse2/;
+  specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
@@ -340,6 +343,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -349,7 +353,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_32x32 sse2/;
+  specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;