From a55e248349a1d549dbb3d988ced7ff93ac20ca0d Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 2 Aug 2022 11:22:04 -0700 Subject: [PATCH] VPX: Add vp9_highbd_quantize_fp_avx2(). Up to 5.37x faster than vp9_highbd_quantize_fp_c() for full calculations. ~1.6% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: I584fd1f60a3e02f1ded092de98970725fc66c5b8 --- test/vp9_quantize_test.cc | 3 + vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/x86/vp9_quantize_avx2.c | 108 ++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4bd573b..a2dbfc5 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -580,6 +580,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, 16, + true), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index a4d28f0..b956877 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -196,6 +196,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_highbd_quantize_fp avx2/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 6ded091..bd93e71 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -259,3 +259,111 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob_max); } + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { + const __m128i v = _mm_load_si128((const __m128i *)val_ptr); + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(v, zero); + const __m128i ac = _mm_unpackhi_epi16(v, zero); + return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void highbd_load_fp_values( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(round_ptr); + *quant = highbd_init_256(quant_ptr); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} +#endif // CONFIG_VP9_HIGHBITDEPTH -- 2.7.4