From 0ce866562fb9c70d5825a6279f3aa3a10f7a9289 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 16 Jan 2023 16:44:04 +0000 Subject: [PATCH] Refactor Neon implementation of variance functions Refactor and optimize the Neon implementation of variance functions - effectively backporting these libaom changes[1,2]. After this change, the only differences between the code in libvpx and libaom are due to libvpx being compiled with ISO C90, which forbids mixing declarations and code [-Wdeclaration-after-statement]. [1] https://aomedia-review.googlesource.com/c/aom/+/162241 [2] https://aomedia-review.googlesource.com/c/aom/+/162262 Change-Id: Ia4e8fff4d53297511d1a1e43bca8053bf811e551 --- vpx_dsp/arm/variance_neon.c | 538 ++++++++++++++++++++++---------------------- 1 file changed, 275 insertions(+), 263 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index f9969ed..3ccc4e8 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -22,236 +22,310 @@ #if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; i += 4) { - const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t abs_diff = vabdq_u8(a, b); + const uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); src_ptr += 4 * src_stride; ref_ptr += 4 * ref_stride; - } + i -= 4; + } while (i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +// Process a block of width 8 two rows at a time. 
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a = vld1q_u8(src_ptr + j); - const uint8x16_t b = vld1q_u8(ref_ptr + j); + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - const uint8x16_t abs_diff = vabdq_u8(a, b); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); - } src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - uint32x2_t sum_a = vdup_n_u32(0); - uint32x2_t sum_b = vdup_n_u32(0); - uint32x2_t sse_lo_u32 = vdup_n_u32(0); - uint32x2_t sse_hi_u32 = vdup_n_u32(0); +// Process a block of any size where the width is divisible by 16. 
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int i = h; do { - const uint8x8_t a_0 = vld1_u8(src_ptr); - const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0 = vld1_u8(ref_ptr); - const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); - - const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); - const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); - sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); - sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); - - sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); - sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); - - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; - } while (i < h); + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); - *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); - *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); } -#else // !defined(__ARM_FEATURE_DOTPROD) +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} -// The variance helper functions use int16_t for sum. 8 values are accumulated -// and then added (at which point they expand up to int32_t). To avoid overflow, -// there can be no more than 32767 / 255 ~= 128 values accumulated in each -// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 -// rows = 128. Asserts have been added to each function to warn against reaching -// this limit. +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -// Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; +#else // !defined(__ARM_FEATURE_DOTPROD) + +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; - // Since width is only 4, sum_s16 only loads a half row per loop. + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. 
assert(h <= 256); - for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); + sum_s16 = vaddq_s16(sum_s16, diff); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - } + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; +// Process a block of width 8 one row at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); - - // The loop loads 16 values at a time but doubles them up when accumulating - // into sum_s16. 
- assert(w / 8 * h <= 128); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); - - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); - - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); - - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); - } + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; - // Each column has it's own accumulator entry in sum_s16. + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. 
assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(src_ptr); - const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); - const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); - const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); - const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); - const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); - const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); - sum_s16 = vaddq_s16(sum_s16, diff_0_s16); - sum_s16 = vaddq_s16(sum_s16, diff_1_s16); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), - vget_low_s16(diff_0_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), - vget_low_s16(diff_1_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), - vget_high_s16(diff_0_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), - vget_high_s16(diff_1_s16)); - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; } while (i < h); - *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } #endif // defined(__ARM_FEATURE_DOTPROD) @@ -259,103 +333,41 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); } -#define VARIANCENXM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else if (n == 8) \ - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else \ - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ - &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -VARIANCENXM(4, 4, 4) 
-VARIANCENXM(4, 8, 5) -VARIANCENXM(8, 4, 5) -VARIANCENXM(8, 8, 6) -VARIANCENXM(8, 16, 7) -VARIANCENXM(16, 8, 7) -VARIANCENXM(16, 16, 8) -VARIANCENXM(16, 32, 9) -VARIANCENXM(32, 16, 9) -VARIANCENXM(32, 32, 10) - -unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, - &sum1); - variance_neon_w16(src_ptr + (32 * src_stride), src_stride, - ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) -unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) -unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, - ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, - ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -} +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) #if defined(__ARM_FEATURE_DOTPROD) -- 2.7.4
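Notes:

For reference, every vpx_variance{w}x{h}_neon kernel above computes the same
quantity as the scalar model below: accumulate the sum of differences and the
sum of squared differences over the block, then return SSE - sum^2 / (w * h),
where the division is the right shift in VARIANCE_WXH_NEON because w * h is
always a power of two. This is a minimal sketch; variance_ref is an
illustrative name, not code from the patch.

#include <stdint.h>

/* Scalar model of the Neon variance kernels: variance = SSE - sum^2 / N,
 * with N = w * h a power of two, so the division is a right shift. */
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride, int w,
                                 int h, unsigned int *sse) {
  int64_t sum = 0;
  uint32_t sse_acc = 0;
  int i, j, shift = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += diff;
      sse_acc += (uint32_t)(diff * diff);
    }
  }
  while ((1 << shift) < w * h) ++shift; /* shift = log2(w * h) */
  *sse = sse_acc;
  return sse_acc - (uint32_t)((sum * sum) >> shift);
}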
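The dotprod kernels compute the cross-block sums with UDOT against a vector
of ones: vdotq_u32(acc, x, vdupq_n_u8(1)) adds each group of four bytes of x
into one 32-bit lane of acc, so no widening shuffles are needed. A standalone
sketch of the trick, assuming an AArch64 target for vaddvq_u32 (the patch
itself uses libvpx's horizontal_add_uint32x4 helper):

#if defined(__ARM_FEATURE_DOTPROD)
#include <arm_neon.h>
#include <stdint.h>

/* Sum all 16 uint8 lanes of x with a single UDOT against a vector of ones,
 * as the kernels above do for src_sum and ref_sum. */
static uint32_t sum_u8x16(uint8x16_t x) {
  const uint32x4_t partial = vdotq_u32(vdupq_n_u32(0), x, vdupq_n_u8(1));
  return vaddvq_u32(partial); /* add the four 32-bit partial sums */
}
#endif /* __ARM_FEATURE_DOTPROD */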
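The 4xh kernels depend on load_unaligned_u8q (vpx_dsp/arm/mem_neon.h) to pack
four 4-byte rows into a single uint8x16_t so a 4-wide block can be processed
as one full vector. The sketch below shows the shape of that access pattern;
it is an assumed equivalent, and the real helper's implementation may differ:

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

/* Gather four 4-byte rows (any alignment, any stride) into one 16-byte
 * vector, which is what the 4xh kernels require of load_unaligned_u8q. */
static uint8x16_t load_4x4_rows(const uint8_t *buf, int stride) {
  uint32_t rows[4];
  memcpy(&rows[0], buf + 0 * stride, 4);
  memcpy(&rows[1], buf + 1 * stride, 4);
  memcpy(&rows[2], buf + 2 * stride, 4);
  memcpy(&rows[3], buf + 3 * stride, 4);
  return vreinterpretq_u8_u32(vld1q_u32(rows));
}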
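The h_limit values in the non-dotprod path follow from the overflow comments
in the kernels: the 16 int16 lanes across sum_s16[2] can each absorb at most
32767 / 255 ~= 128 byte-range differences, and a w-wide row contributes
w / 16 differences per lane. A worked check of the 64 and 32 passed by
variance_32xh_neon and variance_64xh_neon (illustrative code, not from the
patch):

#include <assert.h>

/* Rows of width w that fit in the 16-bit accumulators before possible
 * overflow: 128 accumulations per lane, w / 16 accumulations per row. */
static int h_limit_for_width(int w) {
  const int per_lane_budget = 32767 / 255; /* = 128 */
  return per_lane_budget / (w / 16);
}

int main(void) {
  assert(h_limit_for_width(32) == 64); /* variance_32xh_neon passes 64 */
  assert(h_limit_for_width(64) == 32); /* variance_64xh_neon passes 32 */
  return 0;
}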