From eeea3daacbf0c3f8e1bbfd2f9b67e4eda1badafc Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 1 Oct 2022 11:18:09 +0900 Subject: [PATCH] vp9 quantize: change index In assembly it made sense to iterate using n_coeffs. In intrinsics it's just as fast to use index and easier to read. Change-Id: I403c959709309dad68123d0a3d0efe183874543d --- vp9/encoder/x86/vp9_quantize_sse2.c | 99 ++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 57 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index da4cd9e..272e5fb 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -26,72 +26,58 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; - __m128i eob; + int index = 16; __m128i round, quant, dequant; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i eob; (void)scan; - coeff_ptr += n_coeffs; - iscan += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - // Setup global values. load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); - { - __m128i coeff0, coeff1; - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - // Do DC and first 15 AC. - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - // Poor man's abs(). - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); - round = _mm_unpackhi_epi64(round, round); - quant = _mm_unpackhi_epi64(quant, quant); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - // Reinsert signs. - qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero); - - n_coeffs += 8 * 2; - } + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); thr = _mm_srai_epi16(dequant, 1); // AC only loop. - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); // Poor man's abs(). coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -112,28 +98,27 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); } if (nzflag) { - const __m128i eob0 = - scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero); + const __m128i eob0 = scan_for_eob(&coeff0, &coeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } - n_coeffs += 8 * 2; + index += 16; } *eob_ptr = accumulate_eob(eob); -- 2.7.4