From 48c84d138f4a5f66e982c49d4972f85aaae532a8 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 14 Jun 2010 14:07:56 -0400 Subject: [PATCH] sse2 version of vp8_regular_quantize_b Added sse2 version of vp8_regular_quantize_b which improved encode performance(for the clip used) by ~10% for 32 bit builds and ~3% for 64 bit builds. Also updated SHADOW_ARGS_TO_STACK to allow for more than 9 arguments. Change-Id: I62f78eabc8040b39f3ffdf21be175811e96b39af --- vp8/encoder/quantize.h | 4 + vp8/encoder/x86/quantize_sse2.asm | 254 +++++++++++++++++++++++++++++++++ vp8/encoder/x86/quantize_x86.h | 38 +++++ vp8/encoder/x86/x86_csystemdependent.c | 35 +++++ vp8/vp8cx.mk | 1 + vpx_ports/x86_abi_support.asm | 15 +- 6 files changed, 338 insertions(+), 9 deletions(-) create mode 100644 vp8/encoder/x86/quantize_sse2.asm create mode 100644 vp8/encoder/x86/quantize_x86.h diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h index f5ee9d7..ca073ef 100644 --- a/vp8/encoder/quantize.h +++ b/vp8/encoder/quantize.h @@ -17,6 +17,10 @@ #define prototype_quantize_block(sym) \ void (sym)(BLOCK *b,BLOCKD *d) +#if ARCH_X86 || ARCH_X86_64 +#include "x86/quantize_x86.h" +#endif + #if ARCH_ARM #include "arm/quantize_arm.h" #endif diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm new file mode 100644 index 0000000..c64a8ba --- /dev/null +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -0,0 +1,254 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; Quantizes one block of 16 coefficients: zbin threshold test, round and scale,
+; zig-zag ordered selection of the surviving values, then dequantization.
+;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
+;               short *qcoeff_ptr,short *dequant_ptr,
+;               const int *default_zig_zag, short *round_ptr,
+;               short *quant_ptr, short *dqcoeff_ptr,
+;               unsigned short zbin_oq_value,
+;               short *zbin_boost_ptr);
+; Returns eob + 1, where eob is the last scan index stored to qcoeff (-1 if none).
+global sym(vp8_regular_quantize_b_impl_sse2)
+sym(vp8_regular_quantize_b_impl_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 10
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    ; Scratch area layout: byte offsets from rsp once the frame is reserved.
+    %define abs_minus_zbin_lo 0
+    %define abs_minus_zbin_hi 16
+    %define temp_qcoeff_lo 32
+    %define temp_qcoeff_hi 48
+    %define save_xmm6 64
+    %define save_xmm7 80
+    %define eob 96
+
+    %define vp8_regularquantizeb_stack_size eob + 16
+
+    sub         rsp, vp8_regularquantizeb_stack_size
+    ; xmm6/xmm7 are overwritten below, so the incoming values are kept here.
+    movdqa      DQWORD PTR[rsp + save_xmm6], xmm6
+    movdqa      DQWORD PTR[rsp + save_xmm7], xmm7
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         eax, arg(8)                 ;zbin_oq_value
+
+    mov         rcx, arg(1)                 ;zbin_ptr
+    movd        xmm7, eax
+    ; xmm0/xmm1 take coefficients 0-7, xmm4/xmm5 take coefficients 8-15.
+    movdqa      xmm0, DQWORD PTR[rdx]
+    movdqa      xmm4, DQWORD PTR[rdx + 16]
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pxor        xmm1, xmm0                  ;z ^ sz
+    pxor        xmm5, xmm4                  ;z ^ sz
+
+    movdqa      xmm2, DQWORD PTR[rcx]       ;load zbin_ptr
+    movdqa      xmm3, DQWORD PTR[rcx + 16]  ;load zbin_ptr
+
+    pshuflw     xmm7, xmm7, 0               ;copy word 0 into the low four words
+    psubw       xmm1, xmm0                  ;x = abs(z)
+
+    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value
+    psubw       xmm5, xmm4                  ;x = abs(z)
+
+    paddw       xmm2, xmm7                  ;zbin_ptr + zbin_oq_value
+    paddw       xmm3, xmm7                  ;zbin_ptr + zbin_oq_value
+
+    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
+    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)
+
+    mov         rdi, arg(5)                 ;round_ptr
+    mov         rsi, arg(6)                 ;quant_ptr
+    ; Kept on the stack for the word-at-a-time zig-zag pass below.
+    movdqa      DQWORD PTR[rsp + abs_minus_zbin_lo], xmm1
+    movdqa      DQWORD PTR[rsp + abs_minus_zbin_hi], xmm5
+
+    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
+    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back
+
+    movdqa      xmm2, DQWORD PTR[rdi]       ;round, words 0-7
+    movdqa      xmm3, DQWORD PTR[rsi]       ;quant, words 0-7
+
+    movdqa      xmm6, DQWORD PTR[rdi + 16]  ;round, words 8-15
+    movdqa      xmm7, DQWORD PTR[rsi + 16]  ;quant, words 8-15
+
+    paddw       xmm1, xmm2                  ;x + round
+    paddw       xmm5, xmm6                  ;x + round
+
+    pmulhw      xmm1, xmm3                  ;y = ((x + round) * quant) >> 16
+    pmulhw      xmm5, xmm7                  ;y = ((x + round) * quant) >> 16
+
+    mov         rsi, arg(2)                 ;qcoeff_ptr
+    pxor        xmm6, xmm6                  ;xmm6 = 0
+
+    pxor        xmm1, xmm0                  ;y ^ sz
+    pxor        xmm5, xmm4                  ;y ^ sz
+
+    psubw       xmm1, xmm0                  ;(y ^ sz) - sz puts the sign of z back
+    psubw       xmm5, xmm4                  ;(y ^ sz) - sz puts the sign of z back
+
+    movdqa      DQWORD PTR[rsp + temp_qcoeff_lo], xmm1
+    movdqa      DQWORD PTR[rsp + temp_qcoeff_hi], xmm5
+
+    movdqa      DQWORD PTR[rsi], xmm6       ;zero qcoeff
+    movdqa      DQWORD PTR[rsi + 16], xmm6  ;zero qcoeff
+
+    xor         rax, rax                    ;i = 0
+    mov         rcx, -1
+
+    mov         [rsp + eob], rcx            ;eob = -1
+    mov         rsi, arg(9)                 ;zbin_boost_ptr
+
+    mov         rbx, arg(4)                 ;default_zig_zag
+    ; Four scan positions per pass; rsi restarts at arg(9) after every stored value.
+rq_zigzag_loop:
+    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1                 ;below threshold: qcoeff[rc] stays 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+    cmp         edx, 0
+    je          rq_zigzag_1                 ;quantized value is 0: nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;zbin_boost_ptr
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1:
+    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for scan position i + 1
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+    lea         rax, [rax + 1]              ;i++
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1a                ;below threshold: qcoeff[rc] stays 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+    cmp         edx, 0
+    je          rq_zigzag_1a                ;quantized value is 0: nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;zbin_boost_ptr
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1a:
+    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for scan position i + 1
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+    lea         rax, [rax + 1]              ;i++
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1b                ;below threshold: qcoeff[rc] stays 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+    cmp         edx, 0
+    je          rq_zigzag_1b                ;quantized value is 0: nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;zbin_boost_ptr
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1b:
+    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for scan position i + 1
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+    lea         rax, [rax + 1]              ;i++
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1c                ;below threshold: qcoeff[rc] stays 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+    cmp         edx, 0
+    je          rq_zigzag_1c                ;quantized value is 0: nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;zbin_boost_ptr
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1c:
+    lea         rax, [rax + 1]              ;i++
+
+    cmp         rax, 16
+    jl          rq_zigzag_loop
+    ; dqcoeff = qcoeff * dequant, keeping the low 16 bits of each product.
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+    mov         rcx, arg(3)                 ;dequant_ptr
+    mov         rsi, arg(7)                 ;dqcoeff_ptr
+
+    movdqa      xmm2, DQWORD PTR[rdi]
+    movdqa      xmm3, DQWORD PTR[rdi + 16]
+
+    movdqa      xmm0, DQWORD PTR[rcx]
+    movdqa      xmm1, DQWORD PTR[rcx + 16]
+
+    pmullw      xmm0, xmm2
+    pmullw      xmm1, xmm3
+
+    movdqa      DQWORD PTR[rsi], xmm0       ;store dqcoeff
+    movdqa      DQWORD PTR[rsi + 16], xmm1  ;store dqcoeff
+
+    mov         rax, [rsp + eob]
+
+    movdqa      xmm6, DQWORD PTR[rsp + save_xmm6]
+    movdqa      xmm7, DQWORD PTR[rsp + save_xmm7]
+
+    add         rax, 1                      ;return value is eob + 1
+
+    add         rsp, vp8_regularquantizeb_stack_size
+    pop         rsp                         ;NOTE(review): presumably pairs with ALIGN_STACK -- confirm
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
new file mode 100644
index 0000000..37d69a8
--- /dev/null
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */ + +#ifndef QUANTIZE_X86_H +#define QUANTIZE_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX + +#endif + + +#if HAVE_SSE2 +extern prototype_quantize_block(vp8_regular_quantize_b_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_quantize_quantb +#define vp8_quantize_quantb vp8_regular_quantize_b_sse2 + +#endif + +#endif + + +#endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index ad10a9e..f6123a8 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -121,6 +121,40 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d) ); } +int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr,short *dequant_ptr, + const int *default_zig_zag, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr, + unsigned short zbin_oq_value, + short *zbin_boost_ptr); + +void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d) +{ + short *zbin_boost_ptr = &b->zrun_zbin_boost[0]; + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + short zbin_oq_value = b->zbin_extra; + + d->eob = vp8_regular_quantize_b_impl_sse2( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + vp8_default_zig_zag1d, + + round_ptr, + quant_ptr, + dqcoeff_ptr, + zbin_oq_value, + zbin_boost_ptr + ); +} + int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) { @@ -251,6 +285,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) /* cpi->rtcd.encodemb.sub* not implemented for wmt */ cpi->rtcd.quantize.fastquantb = 
vp8_fast_quantize_b_sse; + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2; } #endif diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 9496ef0..971a175 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -98,6 +98,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm index 7840e35..a1622e6 100644 --- a/vpx_ports/x86_abi_support.asm +++ b/vpx_ports/x86_abi_support.asm @@ -199,16 +199,13 @@ push r9 %endif %if %1 > 6 - mov rax,[rbp+16] - push rax - %endif - %if %1 > 7 - mov rax,[rbp+24] - push rax - %endif - %if %1 > 8 - mov rax,[rbp+32] + %assign i %1-6 + %assign off 16 + %rep i + mov rax,[rbp+off] push rax + %assign off off+8 + %endrep %endif %endm %endif -- 2.7.4