From 0c481f4d1824866527a82ab60f28d7c9d304679d Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Wed, 17 Apr 2013 10:31:59 -0700
Subject: [PATCH] Add SSE2 versions for rectangular sad and sad4d functions.

About 11% overall encoder speedup with the sbsegment experiment enabled.

Change-Id: Iffb1bdba6932d9f11a6c791cda8697ccf9327183
---
 vp9/common/vp9_rtcd_defs.sh        | 40 +++++++++++++++++++++++---------------
 vp9/encoder/x86/vp9_sad4d_sse2.asm |  4 ++++
 vp9/encoder/x86/vp9_sad_sse2.asm   | 24 +++++++++++++++++------
 3 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index d787a26..6db44a0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -376,7 +376,8 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
 # variance
 [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
 
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
+
 prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x16
 
@@ -388,7 +389,8 @@ specialize vp9_variance64x32
 
 prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x64
-#endif
+
+fi
 
 prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x32
@@ -424,7 +426,8 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
+
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64
 
@@ -436,7 +439,8 @@ specialize vp9_sub_pixel_variance32x16
 
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32
-#endif
+
+fi
 
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
@@ -464,19 +468,21 @@ vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
 
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
+
 prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x64
+specialize vp9_sad32x64 sse2
 
 prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad64x32
+specialize vp9_sad64x32 sse2
 
 prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x16
+specialize vp9_sad32x16 sse2
 
 prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad16x32
-#endif
+specialize vp9_sad16x32 sse2
+
+fi
 
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad32x32 sse2
@@ -571,19 +577,21 @@ specialize vp9_sad4x4x8 sse4
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad64x64x4d sse2
 
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
+
 prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x64x4d
+specialize vp9_sad32x64x4d sse2
 
 prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x32x4d
+specialize vp9_sad64x32x4d sse2
 
 prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x16x4d
+specialize vp9_sad32x16x4d sse2
 
 prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x32x4d
-#endif
+specialize vp9_sad16x32x4d sse2
+
+fi
 
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d sse2
diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm
index 3716d91..25dd064 100644
--- a/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -215,7 +215,11 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 
 INIT_XMM sse2
 SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
 SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
 SADNXN4D 16, 16
 SADNXN4D 16, 8
 SADNXN4D 8, 16
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index ea482e0..ea92377 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -14,11 +14,11 @@ SECTION .text
 
 ; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD64XN 1
+cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 64
+  mov              n_rowsd, %1
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -42,14 +42,19 @@ cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
 
 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD32XN 1
+cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 16
+  mov              n_rowsd, %1/2
   pxor                  m0, m0
 
 .loop:
@@ -74,6 +79,12 @@ cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
 
 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -112,6 +123,7 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
 %endmacro
 
 INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
 SAD16XN 16 ; sad16x16_sse2
 SAD16XN 8 ; sad16x8_sse2
-- 
2.7.4
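
Reviewer note on what the new kernels compute: each vp9_sadWxH function is a
plain sum of absolute differences over a W-by-H pixel block. The SAD64XN macro
runs one 64-byte row per loop iteration (so n_rows is loaded with %1), while
the SAD32XN loop covers two 32-byte rows per iteration, which is why it loads
n_rows with %1/2 (the old sad32x32 used 16 for a 32-row block). A minimal
scalar sketch of the same computation follows; sad_wxh and sad64x32_c are
hypothetical names for illustration, not libvpx's actual C reference code, and
max_sad is accepted but unused, matching the plain SSE2 kernels above.

  #include <stdint.h>
  #include <stdlib.h>

  /* Hypothetical scalar equivalent of a vp9_sadWxH kernel: accumulate the
   * absolute pixel differences between a WxH source block and a reference
   * block, each addressed with its own row stride. */
  static unsigned int sad_wxh(const uint8_t *src_ptr, int source_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              int width, int height) {
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c)
        sad += abs(src_ptr[c] - ref_ptr[c]);
      src_ptr += source_stride;  /* advance one row in each plane */
      ref_ptr += ref_stride;
    }
    return sad;
  }

  /* One of the new rectangular sizes, shaped like the prototype registered
   * in vp9_rtcd_defs.sh; max_sad is ignored here, as in the SSE2 versions. */
  static unsigned int sad64x32_c(const uint8_t *src_ptr, int source_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
                                 unsigned int max_sad) {
    (void)max_sad;
    return sad_wxh(src_ptr, source_stride, ref_ptr, ref_stride, 64, 32);
  }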