This commit converts all SAD result-array pointers to uint32_t
author: Jim Bankoski <jimbankoski@google.com>
Thu, 28 Feb 2013 16:32:14 +0000 (08:32 -0800)
committer: Jim Bankoski <jimbankoski@google.com>
Thu, 28 Feb 2013 16:46:35 +0000 (08:46 -0800)
The sse4_1 code used uint16_t for returning SAD values, but that
won't work for 32x32 or 64x64 blocks, whose SAD can exceed 65535.
This commit fixes the assembly to write its results as 32-bit
integers and also re-enables sse4_1 on Linux.

Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81

build/make/configure.sh
vp9/common/vp9_rtcd_defs.sh
vp9/encoder/vp9_mcomp.c
vp9/encoder/vp9_sad_c.c
vp9/encoder/vp9_variance.h
vp9/encoder/x86/vp9_sad_sse4.asm

index 9e1d7ed..e558ff2 100755 (executable)
@@ -997,17 +997,6 @@ process_common_toolchain() {
 #error "not x32"
 #endif
 EOF
-        soft_enable runtime_cpu_detect
-        soft_enable mmx
-        soft_enable sse
-        soft_enable sse2
-        soft_enable sse3
-        soft_enable ssse3
-        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
-            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
-        else
-            soft_enable sse4_1
-        fi
 
         case  ${tgt_os} in
             win*)
@@ -1061,6 +1050,18 @@ EOF
             ;;
         esac
 
+        soft_enable runtime_cpu_detect
+        soft_enable mmx
+        soft_enable sse
+        soft_enable sse2
+        soft_enable sse3
+        soft_enable ssse3
+        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
+        else
+            soft_enable sse4_1
+        fi
+
         case "${AS}" in
             auto|"")
                 which nasm >/dev/null 2>&1 && AS=nasm
index bdeea3f..3ea0b06 100644 (file)
@@ -449,25 +449,25 @@ specialize vp9_sad8x8x3 sse3
 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x3 sse3
 
-prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad64x64x8
 
-prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad32x32x8
 
-prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad16x16x8 sse4
 
-prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad16x8x8 sse4
 
-prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x16x8 sse4
 
-prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x8x8 sse4
 
-prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
@@ -490,7 +490,6 @@ specialize vp9_sad8x8x4d sse2
 
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
-
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
 
index 64d8d7d..5287f97 100644 (file)
@@ -1782,7 +1782,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
 
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
+  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
   unsigned int sad_array[3];
   int_mv fcenter_mv;
 
index daff0c9..dc21f02 100644 (file)
@@ -103,62 +103,62 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad32x32x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad16x16x3_c(const uint8_t *src_ptr,
@@ -178,31 +178,31 @@ void vp9_sad16x16x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                      uint32_t *sad_array) {
+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad16x8x3_c(const uint8_t *src_ptr,
@@ -222,31 +222,31 @@ void vp9_sad16x8x8_c(const uint8_t *src_ptr,
                      int  src_stride,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
-                     uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr, ref_stride,
-                                       0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 1, ref_stride,
-                                       0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 2, ref_stride,
-                                       0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 3, ref_stride,
-                                       0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 4, ref_stride,
-                                       0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 5, ref_stride,
-                                       0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 6, ref_stride,
-                                       0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 7, ref_stride,
-                                       0x7fffffff);
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad8x8x3_c(const uint8_t *src_ptr,
@@ -266,31 +266,31 @@ void vp9_sad8x8x8_c(const uint8_t *src_ptr,
                     int  src_stride,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
-                    uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr, ref_stride,
-                                      0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 1, ref_stride,
-                                      0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 2, ref_stride,
-                                      0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 3, ref_stride,
-                                      0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 4, ref_stride,
-                                      0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 5, ref_stride,
-                                      0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 6, ref_stride,
-                                      0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 7, ref_stride,
-                                      0x7fffffff);
+                    uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr, ref_stride,
+                            0x7fffffff);
+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride,
+                            0x7fffffff);
+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride,
+                            0x7fffffff);
+  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 3, ref_stride,
+                            0x7fffffff);
+  sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 4, ref_stride,
+                            0x7fffffff);
+  sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 5, ref_stride,
+                            0x7fffffff);
+  sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 6, ref_stride,
+                            0x7fffffff);
+  sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 7, ref_stride,
+                            0x7fffffff);
 }
 
 void vp9_sad8x16x3_c(const uint8_t *src_ptr,
@@ -310,31 +310,31 @@ void vp9_sad8x16x8_c(const uint8_t *src_ptr,
                      int  src_stride,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
-                     uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr, ref_stride,
-                                       0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 1, ref_stride,
-                                       0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 2, ref_stride,
-                                       0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 3, ref_stride,
-                                       0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 4, ref_stride,
-                                       0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 5, ref_stride,
-                                       0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 6, ref_stride,
-                                       0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 7, ref_stride,
-                                       0x7fffffff);
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad4x4x3_c(const uint8_t *src_ptr,
@@ -354,31 +354,31 @@ void vp9_sad4x4x8_c(const uint8_t *src_ptr,
                     int  src_stride,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
-                    uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr, ref_stride,
-                                      0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 1, ref_stride,
-                                      0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 2, ref_stride,
-                                      0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 3, ref_stride,
-                                      0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 4, ref_stride,
-                                      0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 5, ref_stride,
-                                      0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 6, ref_stride,
-                                      0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 7, ref_stride,
-                                      0x7fffffff);
+                    uint32_t *sad_array) {
+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr, ref_stride,
+                            0x7fffffff);
+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride,
+                            0x7fffffff);
+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride,
+                            0x7fffffff);
+  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 3, ref_stride,
+                            0x7fffffff);
+  sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 4, ref_stride,
+                            0x7fffffff);
+  sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 5, ref_stride,
+                            0x7fffffff);
+  sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 6, ref_stride,
+                            0x7fffffff);
+  sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 7, ref_stride,
+                            0x7fffffff);
 }
 
 void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
index eb903bf..8b32524 100644 (file)
@@ -29,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
                                     int  ref_stride,
-                                    unsigned short *sad_array);
+                                    unsigned int *sad_array);
 
 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
index b42982a..faf1768 100644 (file)
         paddw           xmm1,       xmm5
 %endmacro
 
+%macro WRITE_AS_INTS 0
+    mov             rdi,        arg(4)           ;Results
+    pxor            xmm0, xmm0
+    movdqa          xmm2, xmm1
+    punpcklwd       xmm1, xmm0
+    punpckhwd       xmm2, xmm0
+
+    movdqa          [rdi],    xmm1
+    movdqa          [rdi + 16],    xmm2
+%endmacro
 
 ;void vp9_sad16x16x8_sse4(
 ;    const unsigned char *src_ptr,
@@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
+    PROCESS_16X2X8 1
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
+    PROCESS_16X2X8 1
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
+    PROCESS_8X2X8 1
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    PROCESS_8X2X8 1
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
 
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
+    PROCESS_4X2X8 1
+    PROCESS_4X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi