From a49f896352671870f38c1374f3d5329e3b60193f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 16:00:43 +0000 Subject: [PATCH] [NEON] Add highbd FDCT 8x8 function 50% faster than C version in best/rt profiles Change-Id: I0f9504ed52b5d5f7722407e91108ed4056d66bc2 --- test/dct_test.cc | 12 ++-- vpx_dsp/arm/fdct8x8_neon.c | 78 +++++++++++++++++++++++ vpx_dsp/arm/fdct_neon.h | 144 +++++++++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 229 insertions(+), 7 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index e34122a..ff97fc7 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -543,12 +543,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, static const FuncInfo dct_neon_func_info[] = { { &fdct_wrapper, &highbd_idct_wrapper, 4, 2 }, - /* { &fdct_wrapper, - &highbd_idct_wrapper, 8, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 16, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 32, 2 },*/ + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ }; #else static const FuncInfo dct_neon_func_info[4] = { diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c index d9161c6..3fb15cc 100644 --- a/vpx_dsp/arm/fdct8x8_neon.c +++ b/vpx_dsp/arm/fdct8x8_neon.c @@ -66,3 +66,81 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 7 * 8, in[7]); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + + // input[M * stride] * 16 + int32x4_t left[8], right[8]; + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + + left[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + right[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + for (i = 0; i < 2; ++i) { + vpx_highbd_fdct8x8_pass1_neon(left, right); + } + { + left[0] = highbd_add_round_shift_s32(left[0]); + left[1] = highbd_add_round_shift_s32(left[1]); + left[2] = highbd_add_round_shift_s32(left[2]); + left[3] = highbd_add_round_shift_s32(left[3]); + left[4] = highbd_add_round_shift_s32(left[4]); + left[5] = highbd_add_round_shift_s32(left[5]); + left[6] = highbd_add_round_shift_s32(left[6]); + left[7] = highbd_add_round_shift_s32(left[7]); + right[0] = highbd_add_round_shift_s32(right[0]); + right[1] = highbd_add_round_shift_s32(right[1]); + right[2] = highbd_add_round_shift_s32(right[2]); + right[3] = highbd_add_round_shift_s32(right[3]); + right[4] = highbd_add_round_shift_s32(right[4]); + right[5] = highbd_add_round_shift_s32(right[5]); + right[6] = highbd_add_round_shift_s32(right[6]); + right[7] = highbd_add_round_shift_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 68aeab3..c100e70 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -342,6 +342,20 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { } #if CONFIG_VP9_HIGHBITDEPTH +static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) { + const int32x2_t x_lo = vget_low_s32(x); + const int32x2_t x_hi = vget_high_s32(x); + const int64x2_t x64_lo = vmovl_s32(x_lo); + const int64x2_t x64_hi = vmovl_s32(x_hi); + + const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63); + const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63); + + const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo); + const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi); + return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1)); +} + static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, const int32x4_t b, const tran_high_t c, @@ -413,5 +427,135 @@ static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { in[3] = out[3]; } +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // t0 = (x0 + x1) * cospi_16_64; + // t1 = (x0 - x1) * cospi_16_64; + // out[0] = (tran_low_t)fdct_round_shift(t0); + // out[4] = (tran_low_t)fdct_round_shift(t1); + highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]); + highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], + &right[4]); + // t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + // out[2] = (tran_low_t)fdct_round_shift(t2); + // out[6] = (tran_low_t)fdct_round_shift(t3); + highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, + &left[2], &left[6]); + highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, + &right[2], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]); + highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + // out[1] = (tran_low_t)fdct_round_shift(t0); + // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + // out[7] = (tran_low_t)fdct_round_shift(t3); + highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, + &left[1], &left[7]); + highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, + &right[1], &right[7]); + + // t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + // out[5] = (tran_low_t)fdct_round_shift(t1); + // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + // out[3] = (tran_low_t)fdct_round_shift(t2); + highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, + &left[5], &left[3]); + highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, + &right[5], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c5514b1..e886c0a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -555,7 +555,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct8x8 sse2/; + specialize qw/vpx_highbd_fdct8x8 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8_1 neon/; -- 2.7.4