i++;
} while (i < h);
- res[0] = horizontal_add_uint16x8(sum[0]);
- res[1] = horizontal_add_uint16x8(sum[1]);
- res[2] = horizontal_add_uint16x8(sum[2]);
- res[3] = horizontal_add_uint16x8(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
}
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
i += 2;
} while (i < h);
- res[0] = horizontal_add_uint16x8(sum[0]);
- res[1] = horizontal_add_uint16x8(sum[1]);
- res[2] = horizontal_add_uint16x8(sum[2]);
- res[3] = horizontal_add_uint16x8(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
}
#define SAD_WXH_4D_NEON(w, h) \
#endif
}
+static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {  // Reduce four u16x8 vectors at once: output lane k is the total of sum[k].
+#if defined(__aarch64__)  // AArch64 path: full-width pairwise adds across two vectors.
+ const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);  // lanes 0-3: pair sums of sum[0]; lanes 4-7: pair sums of sum[1]
+ const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);  // same layout for sum[2] / sum[3]
+ const uint16x8_t b0 = vpaddq_u16(a0, a1);  // two lanes per input, in order sum[0]..sum[3]; still 16-bit
+ return vpaddlq_u16(b0);  // final pairwise add widens to u32, one lane per input vector
+#else  // Fallback path: fold each vector's halves, then pairwise-add the 64-bit halves.
+ const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+ const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+ const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+ const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+ const uint16x4_t b0 = vpadd_u16(a0, a1);  // two lanes from sum[0], then two from sum[1]
+ const uint16x4_t b1 = vpadd_u16(a2, a3);  // two lanes from sum[2], then two from sum[3]
+ return vpaddlq_u16(vcombine_u16(b0, b1));  // widening pairwise add yields the same lane order as the AArch64 path
+#endif  // NOTE(review): on both paths four-lane partial sums are 16-bit and wrap on overflow; confirm callers keep them <= 65535.
+}
+
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
#if defined(__aarch64__)