From 43a30d3a1a6b627fa05ba63f4c51414ced781ccb Mon Sep 17 00:00:00 2001
From: Johann
Date: Mon, 12 Nov 2018 11:30:03 -0800
Subject: [PATCH] quantize: use aarch64 vmaxv
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simplify the max value calculation on aarch64 by using vmaxv. This is
much faster for 4x4, with diminishing returns as the block size grows.

Only the vp9 quantize has a speed test hooked up. Anticipate similar
results for the other quantize versions.
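For reference, the two forms of the eob reduction are sketched below as
a standalone helper (illustrative only, not part of the change itself;
the helper name horizontal_max_u16 is made up). On Armv8 the whole
8-lane reduction is a single vmaxv; the Armv7 fallback folds the vector
and takes pairwise maxima, as the vp9 and vpx_dsp paths do:

  #include <arm_neon.h>
  #include <stdint.h>

  /* Horizontal max of 8 unsigned 16-bit lanes. */
  static uint16_t horizontal_max_u16(const uint16x8_t v) {
  #ifdef __aarch64__
    /* Armv8: vmaxv reduces all 8 lanes in one instruction. */
    return vmaxvq_u16(v);
  #else
    /* Armv7: fold the two halves, then take pairwise maxima twice. */
    uint16x4_t m = vmax_u16(vget_low_u16(v), vget_high_u16(v));
    m = vpmax_u16(m, m); /* lanes: {max01, max23, max01, max23} */
    m = vpmax_u16(m, m); /* every lane now holds the overall max */
    return vget_lane_u16(m, 0);
  #endif
  }

The fallback spends three vector max operations plus a lane extract on
what vmaxv does in one instruction. That fixed per-call cost is the
largest share of the work on the smallest blocks, which is why 4x4
improves the most.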
Before:
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.2 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.2 ms ( ±0.0 ms )
[       OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1906 ms)
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )

After:
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.1 ms ( ±0.0 ms )
[       OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1803 ms)
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )

Change-Id: Ic95812b3fdbd4e47b4dcb8ed46c68a9617de38d2
---
 vp8/encoder/arm/neon/fastquantizeb_neon.c | 10 ++++++++--
 vp9/encoder/arm/neon/vp9_quantize_neon.c  |  8 ++++++++
 vpx_dsp/arm/quantize_neon.c               |  8 ++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index c42005d..d066be1 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -26,9 +26,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
       zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
+#ifndef __aarch64__
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
+#endif  // __aarch64__
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +68,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
 
+#ifdef __aarch64__
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
   eob_q32 = vmovl_u16(eob_d16);
   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // __aarch64__
+
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
   vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +86,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* dqcoeff = x * dequant */
   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 2cec8bd..8b62b45 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
     store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
   }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
   {
     const int16x4_t v_eobmax_3210 = vmax_s16(
         vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
@@ -111,6 +114,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
 
     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
   }
+#endif  // __aarch64__
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -226,6 +230,9 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
     dqcoeff_ptr += 8;
   }
 
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -233,5 +240,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }
 }
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index 1e33851..b5d1e7e 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -135,6 +135,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     } while (n_coeffs > 0);
   }
 
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -142,6 +145,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -288,6 +292,9 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     }
   }
 
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -295,4 +302,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }
-- 
2.7.4