From fff4e76b55900767083434248c67c3e041bab97b Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Mar 2023 13:58:16 +0000 Subject: [PATCH] Add Neon implementation of vpx_highbd_minmax_8x8_c Add Neon implementation of vpx_highbd_minmax_8x8_c as well as the corresponding tests. Change-Id: I5d9444a239fb1baa53634c1bdb5292b44067d90c --- test/minmax_test.cc | 4 +++ vpx_dsp/arm/highbd_avg_neon.c | 77 +++++++++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 1 + 3 files changed, 82 insertions(+) diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 663b359..e710af6 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -234,6 +234,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif #endif #if HAVE_MSA diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index 3ba58b8..b84a787 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -38,3 +38,80 @@ int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); } + +void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int *min, + int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * a_stride); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * a_stride); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * a_stride); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * a_stride); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * a_stride); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * a_stride); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * a_stride); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * a_stride); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * b_stride); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * b_stride); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * b_stride); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * b_stride); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * b_stride); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * b_stride); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * b_stride); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * b_stride); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if defined(__aarch64__) + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2a01ec1..2780333 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1000,6 +1000,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_avg_4x4 sse2/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; + specialize qw/vpx_highbd_minmax_8x8 neon/; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; -- 2.7.4