From 848f6e733789c627b6606baf1c85e32be997e36f Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 5 Nov 2022 09:53:07 +0900 Subject: [PATCH] quantize: simplify 32x32_b args Now that all the implementations of the 32x32 quantize are in intrinsics we can reference struct members directly. Saves pushing them to the stack. n_coeffs is not used at all for this function. Change-Id: I2104fea3fa20c455087e21b347d6abd7ea1f3e1e --- test/vp9_quantize_test.cc | 285 +++++++++++++++++++++++++------------------ vp9/encoder/vp9_block.h | 1 + vp9/encoder/vp9_encodemb.c | 6 +- vpx_dsp/arm/quantize_neon.c | 17 ++- vpx_dsp/quantize.c | 17 +-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- vpx_dsp/x86/quantize_avx.c | 30 +---- vpx_dsp/x86/quantize_avx2.c | 15 +-- vpx_dsp/x86/quantize_sse2.h | 28 +++++ vpx_dsp/x86/quantize_ssse3.c | 35 +----- 10 files changed, 238 insertions(+), 201 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec6..ecb6116 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,8 +39,7 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,6 +47,41 @@ typedef std::tuple QuantizeParam; +// Wrapper which takes a macroblock_plane. +typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, + const int16_t *quant_shift, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template +void QuantWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); + +template +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +150,16 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + zbin_ptr_ = mb_plane_.zbin = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast( - vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast( + round_fp_ptr_ = mb_plane_.round_fp; + quant_fp_ptr_ = mb_plane_.quant_fp = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_.round = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_.quant = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast( + quant_shift_ptr_ = mb_plane_.quant_shift = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -140,7 +170,6 @@ class VP9QuantizeBase : public AbstractBench { ~VP9QuantizeBase() { vpx_free(zbin_ptr_); - vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); vpx_free(round_ptr_); vpx_free(quant_ptr_); @@ -157,6 +186,7 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane mb_plane_; int16_t *zbin_ptr_; int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; @@ -193,10 +223,9 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -266,8 +295,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -275,10 +304,9 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -417,15 +445,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +502,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,28 +536,35 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -541,11 +574,12 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -555,13 +589,14 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +612,29 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +644,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -615,22 +658,29 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, @@ -639,11 +689,12 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -683,9 +734,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1786952..fc27a0f 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9..4910dc2 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d5..e81738a 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64..212db45 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index eef7224..639c18b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,9 @@ print <zbin, &v_zbin, mb_plane->round, &v_round, + mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, + mb_plane->quant_shift, &v_quant_shift, 1); // Do DC and first 15 AC. v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e..fe42fee 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 4762302..6fe54d7 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); -- 2.7.4