From 9824167ad292ee42c9c97f3e6ce1d9ca90bf679f Mon Sep 17 00:00:00 2001
From: George Steed
Date: Wed, 22 Mar 2023 11:49:33 +0000
Subject: [PATCH] Avoid LD2/ST2 instructions in highbd v predictors in Neon

The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4)
are useful if we are dealing with interleaved data (e.g. real/imag
components of complex numbers), but for simply loading or storing larger
quantities of data it is preferable to simply use the normal load/store
instructions.

This patch replaces such occurrences in the two larger block sizes:
vpx_highbd_v_predictor_16x16_neon and vpx_highbd_v_predictor_32x32_neon.

Change-Id: Ie4ffa298a2466ceaf893566fd0aefe3f66f439e4
---
 vpx_dsp/arm/highbd_intrapred_neon.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
index b2aea14..ec97094 100644
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -2166,30 +2166,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row = vld2q_u16(above);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
   int i;
   (void)left;
   (void)bd;
 
-  for (i = 0; i < 16; i++, dst += stride) {
-    vst2q_u16(dst, row);
+  for (i = 0; i < 16; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    dst += stride;
   }
 }
 
 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row0 = vld2q_u16(above);
-  const uint16x8x2_t row1 = vld2q_u16(above + 16);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  const uint16x8_t row2 = vld1q_u16(above + 16);
+  const uint16x8_t row3 = vld1q_u16(above + 24);
   int i;
   (void)left;
   (void)bd;
 
   for (i = 0; i < 32; i++) {
-    vst2q_u16(dst, row0);
-    dst += 16;
-    vst2q_u16(dst, row1);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    vst1q_u16(dst + 16, row2);
+    vst1q_u16(dst + 24, row3);
+    dst += stride;
   }
 }
 
-- 
2.7.4