From 472c839c9f6e88e976faebdc05848e64e7f3945d Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Tue, 31 Jan 2023 13:32:33 +0000
Subject: [PATCH] Use load_unaligned mem_neon.h helpers in SAD and SAD4D

Use the load_unaligned helper functions in mem_neon.h to load strided
sequences of 4 bytes where alignment is not guaranteed in the Neon SAD
and SAD4D paths.

Change-Id: I941d226ef94fd7a633b09fc92165a00ba68a1501
---
 vpx_dsp/arm/sad4d_neon.c | 39 ++++++++++-----------------------------
 vpx_dsp/arm/sad_neon.c   | 48 ++++++++++++------------------------------------
 2 files changed, 22 insertions(+), 65 deletions(-)

diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 5064770..85f6c1e 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -285,35 +285,16 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
 
   int i = 0;
   do {
-    uint32x2_t s, r0, r1, r2, r3;
-    uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi;
-
-    memcpy(&s_lo, src + i * src_stride, 4);
-    memcpy(&r0_lo, ref[0] + i * ref_stride, 4);
-    memcpy(&r1_lo, ref[1] + i * ref_stride, 4);
-    memcpy(&r2_lo, ref[2] + i * ref_stride, 4);
-    memcpy(&r3_lo, ref[3] + i * ref_stride, 4);
-    s = vdup_n_u32(s_lo);
-    r0 = vdup_n_u32(r0_lo);
-    r1 = vdup_n_u32(r1_lo);
-    r2 = vdup_n_u32(r2_lo);
-    r3 = vdup_n_u32(r3_lo);
-
-    memcpy(&s_hi, src + (i + 1) * src_stride, 4);
-    memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4);
-    memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4);
-    memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4);
-    memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4);
-    s = vset_lane_u32(s_hi, s, 1);
-    r0 = vset_lane_u32(r0_hi, r0, 1);
-    r1 = vset_lane_u32(r1_hi, r1, 1);
-    r2 = vset_lane_u32(r2_hi, r2, 1);
-    r3 = vset_lane_u32(r3_hi, r3, 1);
-
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]);
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]);
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]);
-    sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]);
+    uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+    uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+    uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+    uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+    uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
+
+    sad8_neon(s, r0, &sum[0]);
+    sad8_neon(s, r1, &sum[1]);
+    sad8_neon(s, r2, &sum[2]);
+    sad8_neon(s, r3, &sum[3]);
 
     i += 2;
   } while (i < h);
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index 7336edb..9382b80 100644
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
 
   int i = h / 2;
   do {
-    uint32x2_t s, r;
-    uint32_t s0, s1, r0, r1;
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
 
-    memcpy(&s0, src_ptr, 4);
-    memcpy(&r0, ref_ptr, 4);
-    s = vdup_n_u32(s0);
-    r = vdup_n_u32(r0);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    memcpy(&s1, src_ptr, 4);
-    memcpy(&r1, ref_ptr, 4);
-    s = vset_lane_u32(s1, s, 1);
-    r = vset_lane_u32(r1, r, 1);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
+    sum = vabal_u8(sum, s, r);
 
-    sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
   } while (--i != 0);
 
   return horizontal_add_uint16x8(sum);
@@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
 
   int i = h / 2;
   do {
-    uint32x2_t s, r;
-    uint32_t s0, s1, r0, r1;
-    uint8x8_t p, avg;
-
-    memcpy(&s0, src_ptr, 4);
-    memcpy(&r0, ref_ptr, 4);
-    s = vdup_n_u32(s0);
-    r = vdup_n_u32(r0);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    memcpy(&s1, src_ptr, 4);
-    memcpy(&r1, ref_ptr, 4);
-    s = vset_lane_u32(s1, s, 1);
-    r = vset_lane_u32(r1, r, 1);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    uint8x8_t p = vld1_u8(second_pred);
 
-    p = vld1_u8(second_pred);
-    avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
+    uint8x8_t avg = vrhadd_u8(r, p);
+    sum = vabal_u8(sum, s, avg);
 
-    sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
     second_pred += 8;
   } while (--i != 0);
-- 
2.7.4
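
Note (not part of the patch): for readers unfamiliar with the mem_neon.h
helpers, the sketch below shows how a load_unaligned_u8-style helper gathers
two 4-byte rows, `stride` bytes apart, into one uint8x8_t. It is reconstructed
from the memcpy/vdup_n_u32/vset_lane_u32 sequence this patch removes from the
SAD kernels; the name load_unaligned_u8_sketch is illustrative only, and the
real helper in mem_neon.h may differ in detail (e.g. parameter types or a fast
path for stride == 4).

  #include <arm_neon.h>
  #include <string.h>

  // Load two unaligned groups of 4 bytes, `stride` bytes apart, into a single
  // 8-byte Neon vector: lane 0 holds row 0 and lane 1 holds row 1.
  static inline uint8x8_t load_unaligned_u8_sketch(const uint8_t *buf,
                                                   int stride) {
    uint32_t lo, hi;
    uint32x2_t v;
    memcpy(&lo, buf, 4);           // row 0 (alignment not guaranteed)
    memcpy(&hi, buf + stride, 4);  // row 1
    v = vdup_n_u32(lo);
    v = vset_lane_u32(hi, v, 1);
    return vreinterpret_u8_u32(v);
  }

Collecting the sequence into a helper lets the SAD and SAD4D loops work
directly on uint8x8_t values, which is what removes the per-call
vreinterpret_u8_u32 casts in the hunks above.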