*sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
}
-static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int height) {
- uint32x4_t sse[16];
- highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
- highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
-
- src += src_stride;
- ref += ref_stride;
-
- while (--height != 0) {
- highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
- highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
-
- src += src_stride;
- ref += ref_stride;
- }
-
- return horizontal_long_add_uint32x4_x16(sse);
-}
-
static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int height) {
return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
case 64:
return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
- case 128:
- return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
default:
return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
height);
return horizontal_add_uint32x4(sse);
}
-static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int height) {
- uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = height;
- do {
- sse_16x1_neon(src, ref, &sse[0]);
- sse_16x1_neon(src + 16, ref + 16, &sse[1]);
- sse_16x1_neon(src + 32, ref + 32, &sse[0]);
- sse_16x1_neon(src + 48, ref + 48, &sse[1]);
- sse_16x1_neon(src + 64, ref + 64, &sse[0]);
- sse_16x1_neon(src + 80, ref + 80, &sse[1]);
- sse_16x1_neon(src + 96, ref + 96, &sse[0]);
- sse_16x1_neon(src + 112, ref + 112, &sse[1]);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
-}
-
static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int height) {
case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
- case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
default:
return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
}
return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1]));
}
-static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
- int src_stride,
- const uint8_t *ref,
- int ref_stride, int height) {
- uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = height;
- do {
- sse_16x1_neon_dotprod(src, ref, &sse[0]);
- sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
- sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
- sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
- sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]);
- sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]);
- sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]);
- sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
-}
-
static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int height) {
return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
case 64:
return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
- case 128:
- return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
default:
return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
height);
} while (y < height);
sse = summary_all_avx2(&sum);
break;
- case 128:
- do {
- sse_w32_avx2(&sum, a, b);
- sse_w32_avx2(&sum, a + 32, b + 32);
- sse_w32_avx2(&sum, a + 64, b + 64);
- sse_w32_avx2(&sum, a + 96, b + 96);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
default:
if ((width & 0x07) == 0) {
do {
} while (y < height);
sse = summary_4x64_avx2(sum);
break;
- case 128:
- do {
- int l = 0;
- __m256i sum32 = _mm256_setzero_si256();
- do {
- highbd_sse_w16_avx2(&sum32, a, b);
- highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
- highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
- highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
- highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4);
- highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5);
- highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6);
- highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- l += 1;
- } while (l < 16 && l < (height - y));
- summary_32_avx2(&sum32, &sum);
- y += 16;
- } while (y < height);
- sse = summary_4x64_avx2(sum);
- break;
default:
if (width & 0x7) {
do {
} while (y < height);
sse = summary_all_sse4(&sum);
break;
- case 128:
- do {
- sse_w16_sse4_1(&sum, a, b);
- sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
- sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
- sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
- sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
- sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
- sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
- sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
default:
if (width & 0x07) {
do {
_mm_storel_epi64((__m128i *)&sse,
_mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
break;
- case 128:
- do {
- int l = 0;
- __m128i sum32 = _mm_setzero_si128();
- do {
- highbd_sse_w8_sse4_1(&sum32, a, b);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15);
- a += a_stride;
- b += b_stride;
- l += 1;
- } while (l < 8 && l < (height - y));
- summary_32_sse4(&sum32, &sum);
- y += 8;
- } while (y < height);
- _mm_storel_epi64((__m128i *)&sse,
- _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
- break;
default:
if (width & 0x7) {
do {