From 02fd7d6aeb1ea6d8eeef17315a6a7c4ffa6d7352 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 12 Nov 2022 08:23:17 +0900 Subject: [PATCH] Reland "quantize: simplify highbd 32x32_b args" This is a reland of commit 573f5e662b544dbc553d73fa2b61055c30dfe8cc Alignment issue with tests fixed in crrev.com/c/webm/libvpx/+/4305500 Original change's description: > quantize: simplify highbd 32x32_b args > > Change-Id: I431a41279c4c4193bc70cfe819da6ea7e1d2fba1 Change-Id: Ic868b6f987c99d88672858fedd092fa49c125e19 --- test/vp9_quantize_test.cc | 54 +++++++++++++++---------------- vp9/encoder/vp9_encodemb.c | 10 +++--- vpx_dsp/arm/highbd_quantize_neon.c | 21 ++++++------ vpx_dsp/quantize.c | 16 +++++---- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 ++++---- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 ++++++------ 7 files changed, 69 insertions(+), 68 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 6a8f1da..bff2fa5 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -557,15 +557,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantWrapper, &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( @@ -634,15 +634,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 
- 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -680,15 +680,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4910dc2..6a5f628 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,9 +511,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -856,9 +855,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - 
vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index b9f72a9..3b1fec3 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -224,11 +225,9 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -237,12 +236,13 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,8 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and + // Need this here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45..c464281 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,14 +272,16 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2a01ec1..ab86b9c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd6..6041d72 100644 --- 
a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -222,17 +223,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, + mb_plane->quant_shift, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a..6a8f42b 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, @@ -93,18 +94,17 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + const intptr_t n_coeffs = 32 * 32; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); @@ -140,10 +140,11 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; -- 2.7.4