From a49f896352671870f38c1374f3d5329e3b60193f Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konma@vectorcamp.gr>
Date: Thu, 6 Oct 2022 16:00:43 +0000
Subject: [PATCH] [NEON] Add highbd FDCT 8x8 function

50% faster than C version in best/rt profiles

Change-Id: I0f9504ed52b5d5f7722407e91108ed4056d66bc2
---
 test/dct_test.cc             |  12 ++--
 vpx_dsp/arm/fdct8x8_neon.c   |  78 +++++++++++++++++++++++
 vpx_dsp/arm/fdct_neon.h      | 144 +++++++++++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   2 +-
 4 files changed, 229 insertions(+), 7 deletions(-)

diff --git a/test/dct_test.cc b/test/dct_test.cc
index e34122a..ff97fc7 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -543,12 +543,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT,
 static const FuncInfo dct_neon_func_info[] = {
   { &fdct_wrapper<vpx_highbd_fdct4x4_neon>,
     &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_neon>, 4, 2 },
-  /*  { &fdct_wrapper<vpx_highbd_fdct8x8_neon>,
-      &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 },
-    { &fdct_wrapper<vpx_highbd_fdct16x16_neon>,
-     &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 },
-    { &fdct_wrapper<vpx_highbd_fdct32x32_neon>,
-      &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/
+  { &fdct_wrapper<vpx_highbd_fdct8x8_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 },
+  /*    { &fdct_wrapper<vpx_highbd_fdct16x16_neon>,
+       &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 },
+      { &fdct_wrapper<vpx_highbd_fdct32x32_neon>,
+        &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/
 };
 #else
 static const FuncInfo dct_neon_func_info[4] = {
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
index d9161c6..3fb15cc 100644
--- a/vpx_dsp/arm/fdct8x8_neon.c
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -66,3 +66,81 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
     store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  int i;
+
+  // input[M * stride] * 16
+  int32x4_t left[8], right[8];
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+
+  left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+  for (i = 0; i < 2; ++i) {
+    vpx_highbd_fdct8x8_pass1_neon(left, right);
+  }
+  {
+    left[0] = highbd_add_round_shift_s32(left[0]);
+    left[1] = highbd_add_round_shift_s32(left[1]);
+    left[2] = highbd_add_round_shift_s32(left[2]);
+    left[3] = highbd_add_round_shift_s32(left[3]);
+    left[4] = highbd_add_round_shift_s32(left[4]);
+    left[5] = highbd_add_round_shift_s32(left[5]);
+    left[6] = highbd_add_round_shift_s32(left[6]);
+    left[7] = highbd_add_round_shift_s32(left[7]);
+    right[0] = highbd_add_round_shift_s32(right[0]);
+    right[1] = highbd_add_round_shift_s32(right[1]);
+    right[2] = highbd_add_round_shift_s32(right[2]);
+    right[3] = highbd_add_round_shift_s32(right[3]);
+    right[4] = highbd_add_round_shift_s32(right[4]);
+    right[5] = highbd_add_round_shift_s32(right[5]);
+    right[6] = highbd_add_round_shift_s32(right[6]);
+    right[7] = highbd_add_round_shift_s32(right[7]);
+
+    // store results
+    vst1q_s32(final_output, left[0]);
+    vst1q_s32(final_output + 4, right[0]);
+    vst1q_s32(final_output + 8, left[1]);
+    vst1q_s32(final_output + 12, right[1]);
+    vst1q_s32(final_output + 16, left[2]);
+    vst1q_s32(final_output + 20, right[2]);
+    vst1q_s32(final_output + 24, left[3]);
+    vst1q_s32(final_output + 28, right[3]);
+    vst1q_s32(final_output + 32, left[4]);
+    vst1q_s32(final_output + 36, right[4]);
+    vst1q_s32(final_output + 40, left[5]);
+    vst1q_s32(final_output + 44, right[5]);
+    vst1q_s32(final_output + 48, left[6]);
+    vst1q_s32(final_output + 52, right[6]);
+    vst1q_s32(final_output + 56, left[7]);
+    vst1q_s32(final_output + 60, right[7]);
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index 68aeab3..c100e70 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -342,6 +342,20 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) {
+  const int32x2_t x_lo = vget_low_s32(x);
+  const int32x2_t x_hi = vget_high_s32(x);
+  const int64x2_t x64_lo = vmovl_s32(x_lo);
+  const int64x2_t x64_hi = vmovl_s32(x_hi);
+
+  const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63);
+  const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63);
+
+  const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo);
+  const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi);
+  return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1));
+}
+
 static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a,
                                                   const int32x4_t b,
                                                   const tran_high_t c,
@@ -413,5 +427,135 @@ static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
   in[3] = out[3];
 }
 
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // t0 = (x0 + x1) * cospi_16_64;
+  // t1 = (x0 - x1) * cospi_16_64;
+  // out[0] = (tran_low_t)fdct_round_shift(t0);
+  // out[4] = (tran_low_t)fdct_round_shift(t1);
+  highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]);
+  highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
+                                 &right[4]);
+  // t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+  // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+  // out[2] = (tran_low_t)fdct_round_shift(t2);
+  // out[6] = (tran_low_t)fdct_round_shift(t3);
+  highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
+                                 &left[2], &left[6]);
+  highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
+                                 &right[2], &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]);
+  highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+  // out[1] = (tran_low_t)fdct_round_shift(t0);
+  // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+  // out[7] = (tran_low_t)fdct_round_shift(t3);
+  highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
+                                 &left[1], &left[7]);
+  highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
+                                 &right[1], &right[7]);
+
+  // t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+  // out[5] = (tran_low_t)fdct_round_shift(t1);
+  // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  // out[3] = (tran_low_t)fdct_round_shift(t2);
+  highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
+                                 &left[5], &left[3]);
+  highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
+                                 &right[5], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  int32x4x2_t out[8];
+  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  left[0] = out[0].val[0];
+  right[0] = out[0].val[1];
+  left[1] = out[1].val[0];
+  right[1] = out[1].val[1];
+  left[2] = out[2].val[0];
+  right[2] = out[2].val[1];
+  left[3] = out[3].val[0];
+  right[3] = out[3].val[1];
+  left[4] = out[4].val[0];
+  right[4] = out[4].val[1];
+  left[5] = out[5].val[0];
+  right[5] = out[5].val[1];
+  left[6] = out[6].val[0];
+  right[6] = out[6].val[1];
+  left[7] = out[7].val[0];
+  right[7] = out[7].val[1];
+}
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index c5514b1..e886c0a 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -555,7 +555,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct8x8 sse2/;
+  specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_highbd_fdct8x8_1 neon/;
-- 
2.7.4