From 4db2076594be3a48c6c1b3755c1d9621f5ad1c5b Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 18 Oct 2010 14:15:15 -0400 Subject: [PATCH] Add SSE2 subtract functions Instead of doing 8-bit data unpack and 16-bit subtraction, use psubb to do 16 8-bit subtractions and pcmpgtb to preserve the sign information. This does not bring noticeable gain since these functions are not called frequently. Change-Id: I90a0dfaa3db9d422e4ada324076596ffb178548e --- vp8/encoder/x86/encodemb_x86.h | 13 +- vp8/encoder/x86/subtract_mmx.asm | 2 +- vp8/encoder/x86/subtract_sse2.asm | 348 +++++++++++++++++++++++++++++++++ vp8/encoder/x86/x86_csystemdependent.c | 18 +- vp8/vp8cx.mk | 1 + 5 files changed, 377 insertions(+), 5 deletions(-) create mode 100644 vp8/encoder/x86/subtract_sse2.asm diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h index d090b2d..69b3edd 100644 --- a/vp8/encoder/x86/encodemb_x86.h +++ b/vp8/encoder/x86/encodemb_x86.h @@ -55,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx); extern prototype_berr(vp8_block_error_xmm); extern prototype_mberr(vp8_mbblock_error_xmm); extern prototype_mbuverr(vp8_mbuverror_xmm); - +extern prototype_subb(vp8_subtract_b_sse2); +extern prototype_submby(vp8_subtract_mby_sse2); +extern prototype_submbuv(vp8_subtract_mbuv_sse2); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_encodemb_berr @@ -67,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm); #undef vp8_encodemb_mbuverr #define vp8_encodemb_mbuverr vp8_mbuverror_xmm +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp8_subtract_b_sse2 + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp8_subtract_mby_sse2 + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2 + #endif #endif diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index 8fe3ee1..a47e1f0 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -12,7 +12,7 @@ %include
"vpx_ports/x86_abi_support.asm" ;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; unsigned short *diff, unsigned char *Predictor, +; short *diff, unsigned char *Predictor, ; int pitch); global sym(vp8_subtract_b_mmx_impl) sym(vp8_subtract_b_mmx_impl): diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm new file mode 100644 index 0000000..ef329de --- /dev/null +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -0,0 +1,348 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, +; short *diff, unsigned char *Predictor, +; int pitch); +global sym(vp8_subtract_b_sse2_impl) +sym(vp8_subtract_b_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rdi, arg(2) ;diff + mov rax, arg(3) ;Predictor + mov rsi, arg(0) ;z + movsxd rdx, dword ptr arg(1);src_stride; + movsxd rcx, dword ptr arg(4);pitch + pxor mm7, mm7 + + movd mm0, [rsi] + movd mm1, [rax] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi], mm0 + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2],mm0 + + movd mm0, [rsi+rdx*2] + movd mm1, [rax+rcx*2] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*4], mm0 + + lea rsi, [rsi+rdx*2] + lea rcx, [rcx+rcx*2] + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + 
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) +global sym(vp8_subtract_mby_sse2) +sym(vp8_subtract_mby_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(1) ;src + mov rdi, arg(0) ;diff + + mov rax, arg(2) ;pred + movsxd rdx, dword ptr arg(3) ;stride + + mov rcx, 8 ; do two lines at one time + +submby_loop: + movdqa xmm0, [rsi] ; src + movdqa xmm1, [rax] ; pred + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 + + movdqa xmm4, [rsi + rdx] + movdqa xmm5, [rax + 16] + + movdqa xmm6, xmm4 + psubb xmm4, xmm5 + + pxor xmm5, [GLOBAL(t80)] ;convert to signed values + pxor xmm6, [GLOBAL(t80)] + pcmpgtb xmm5, xmm6 ; obtain sign information + + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + punpcklbw xmm4, xmm5 ; put sign back to subtraction + punpckhbw xmm6, xmm7 ; put sign back to subtraction + + movdqa [rdi +32], xmm4 + movdqa [rdi +48], xmm6 + + add rdi, 64 + add rax, 32 + lea rsi, [rsi+rdx*2] + + sub rcx, 1 + jnz submby_loop + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +global sym(vp8_subtract_mbuv_sse2) +sym(vp8_subtract_mbuv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rdi, arg(0) ;diff + mov rax, arg(3) ;pred + mov rsi, arg(1) ;z = usrc + add rdi, 256*2 ;diff = diff + 256 (shorts) + add rax, 256 ;Predictor = pred + 256 + movsxd rdx, dword ptr arg(4) ;stride; + lea rcx, [rdx + rdx*2] + + ;u + ;line 0 1 + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] + 
movdqa xmm1, [rax] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 + + ;line 2 3 + movq xmm0, [rsi+rdx*2] ; src + movq xmm2, [rsi+rcx] + movdqa xmm1, [rax+16] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi + 32], xmm0 + movdqa [rdi + 48], xmm2 + + ;line 4 5 + lea rsi, [rsi + rdx*4] + + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] + movdqa xmm1, [rax + 32] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi + 64], xmm0 + movdqa [rdi + 80], xmm2 + + ;line 6 7 + movq xmm0, [rsi+rdx*2] ; src + movq xmm2, [rsi+rcx] + movdqa xmm1, [rax+ 48] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to 
subtraction + + movdqa [rdi + 96], xmm0 + movdqa [rdi + 112], xmm2 + + ;v + mov rsi, arg(2) ;z = vsrc + add rdi, 64*2 ;diff = diff + 320 (shorts) + add rax, 64 ;Predictor = pred + 320 + + ;line 0 1 + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] + movdqa xmm1, [rax] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 + + ;line 2 3 + movq xmm0, [rsi+rdx*2] ; src + movq xmm2, [rsi+rcx] + movdqa xmm1, [rax+16] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi + 32], xmm0 + movdqa [rdi + 48], xmm2 + + ;line 4 5 + lea rsi, [rsi + rdx*4] + + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] + movdqa xmm1, [rax + 32] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi + 64], xmm0 + movdqa [rdi + 80], xmm2 + + ;line 6 7 + movq xmm0, [rsi+rdx*2] ; src + movq xmm2, [rsi+rcx] + movdqa xmm1, [rax+ 48] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, 
[GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi + 96], xmm0 + movdqa [rdi + 112], xmm2 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t80: + times 16 db 0x80 diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 6d36aff..9b753bf 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -165,6 +165,18 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb) return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); } +void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, + short *diff, unsigned char *predictor, + int pitch); +void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *z = *(be->base_src) + be->src; + unsigned int src_stride = be->src_stride; + short *diff = &be->src_diff[0]; + unsigned char *predictor = &bd->predictor[0]; + vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); +} + #endif void vp8_arch_x86_encoder_init(VP8_COMP *cpi) @@ -282,12 +294,12 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.berr = vp8_block_error_xmm; cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm; cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm; - /* cpi->rtcd.encodemb.sub* not implemented for wmt */ + cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/ - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; - } #endif diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 4ce18b6..2a84402 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -104,6 +104,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm 
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm -- 2.7.4