From 714aa9f3c072624186df161589bacbb778369312 Mon Sep 17 00:00:00 2001 From: Jim Bankoski Date: Thu, 28 Feb 2013 08:32:14 -0800 Subject: [PATCH] Convert all SAD result pointers to uint32_t. The sse4_1 code used uint16_t for returning SAD values, but that won't work for 32x32 or 64x64 blocks, whose sums can exceed 16 bits. This commit updates the sse4_1 assembly to store its results as 32-bit integers to match, and also re-enables sse4_1 on Linux. Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81 --- build/make/configure.sh | 23 +-- vp9/common/vp9_rtcd_defs.sh | 15 +- vp9/encoder/vp9_mcomp.c | 2 +- vp9/encoder/vp9_sad_c.c | 350 +++++++++++++++++++-------------------- vp9/encoder/vp9_variance.h | 2 +- vp9/encoder/x86/vp9_sad_sse4.asm | 118 ++++++------- 6 files changed, 258 insertions(+), 252 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 9e1d7ed..e558ff2 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -997,17 +997,6 @@ process_common_toolchain() { #error "not x32" #endif EOF - soft_enable runtime_cpu_detect - soft_enable mmx - soft_enable sse - soft_enable sse2 - soft_enable sse3 - soft_enable ssse3 - if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 " - else - soft_enable sse4_1 - fi case ${tgt_os} in win*) @@ -1061,6 +1050,18 @@ EOF ;; esac + soft_enable runtime_cpu_detect + soft_enable mmx + soft_enable sse + soft_enable sse2 + soft_enable sse3 + soft_enable ssse3 + if enabled gcc && ! disabled sse4_1 && ! 
check_cflags -msse4; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 " + else + soft_enable sse4_1 + fi + case "${AS}" in auto|"") which nasm >/dev/null 2>&1 && AS=nasm diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index bdeea3f..3ea0b06 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -449,25 +449,25 @@ specialize vp9_sad8x8x3 sse3 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x3 sse3 -prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad64x64x8 -prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad32x32x8 -prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad16x16x8 sse4 -prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad16x8x8 sse4 -prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" 
specialize vp9_sad8x16x8 sse4 -prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad8x8x8 sse4 -prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad4x4x8 sse4 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" @@ -490,7 +490,6 @@ specialize vp9_sad8x8x4d sse2 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse - prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" specialize vp9_sub_pixel_mse16x16 sse2 mmx diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 64d8d7d..5287f97 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1782,7 +1782,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int col_min = ref_col - distance; int col_max = ref_col + distance; - DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8); + DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8); unsigned int sad_array[3]; int_mv fcenter_mv; diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c index daff0c9..dc21f02 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@ -103,62 +103,62 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = 
(uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + unsigned int *sad_array) { + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 
0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + unsigned int *sad_array) { + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad16x16x3_c(const uint8_t *src_ptr, @@ -178,31 +178,31 @@ void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 
0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad16x8x3_c(const uint8_t *src_ptr, @@ -222,31 +222,31 @@ void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - 
sad_array[7] = (uint16_t)vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad8x8x3_c(const uint8_t *src_ptr, @@ -266,31 +266,31 @@ void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = 
vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad8x16x3_c(const uint8_t *src_ptr, @@ -310,31 +310,31 @@ void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 3, 
ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad4x4x3_c(const uint8_t *src_ptr, @@ -354,31 +354,31 @@ void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad4x4(src_ptr, 
src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad64x64x4d_c(const uint8_t *src_ptr, diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index eb903bf..8b32524 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -29,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, - unsigned short *sad_array); + unsigned int *sad_array); typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr, int source_stride, diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm index b42982a..faf1768 100644 --- a/vp9/encoder/x86/vp9_sad_sse4.asm +++ b/vp9/encoder/x86/vp9_sad_sse4.asm @@ -154,6 +154,16 @@ paddw xmm1, xmm5 %endmacro +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results + pxor xmm0, xmm0 + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm2 +%endmacro ;void vp9_sad16x16x8_sse4( ; const unsigned char *src_ptr, @@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) 
;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd 
rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi -- 2.7.4