From c32e0ecc592d12573199c992f0fb710b7785c5eb Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 24 Mar 2011 13:31:10 -0400 Subject: [PATCH] use asm_offsets with vp8_fast_quantize_b_sse2 on the same order as the regular quantize change: ~2% Change-Id: I5c9eec18e89ae7345dd96945cb740e6f349cee86 --- vp8/encoder/x86/quantize_sse2.asm | 125 ++++++++++++++++++++------------- vp8/encoder/x86/quantize_x86.h | 4 ++ vp8/encoder/x86/x86_csystemdependent.c | 27 +------ 3 files changed, 83 insertions(+), 73 deletions(-) diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index e00faeb..5e40dc7 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -233,72 +233,97 @@ ZIGZAG_LOOP 15 pop rbp ret -; int vp8_fast_quantize_b_impl_sse2 | arg -; (short *coeff_ptr, | 0 -; short *qcoeff_ptr, | 1 -; short *dequant_ptr, | 2 -; short *inv_scan_order, | 3 -; short *round_ptr, | 4 -; short *quant_ptr, | 5 -; short *dqcoeff_ptr) | 6 - -global sym(vp8_fast_quantize_b_impl_sse2) -sym(vp8_fast_quantize_b_impl_sse2): +; void vp8_fast_quantize_b_sse2 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 + +global sym(vp8_fast_quantize_b_sse2) +sym(vp8_fast_quantize_b_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + +%if ABI_IS_32BIT + push rdi push rsi +%else + %ifidn __OUTPUT_FORMAT__,x64 push rdi + push rsi + %else + ; these registers are used for passing arguments + %endif +%endif + ; end prolog - mov rdx, arg(0) ;coeff_ptr - mov rcx, arg(2) ;dequant_ptr - mov rdi, arg(4) ;round_ptr - mov rsi, arg(5) ;quant_ptr +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif - movdqa xmm0, XMMWORD PTR[rdx] - movdqa xmm4, XMMWORD PTR[rdx + 16] + mov rax, [rdi + vp8_block_coeff] + mov rcx, [rdi + vp8_block_round] + mov rdx, [rdi + vp8_block_quant_fast] - movdqa xmm2, XMMWORD PTR[rdi] ;round lo - movdqa xmm3, XMMWORD PTR[rdi + 16] ;round hi + ; z = coeff + movdqa xmm0, [rax] + movdqa xmm4, [rax + 16] + ; dup z so we can save sz movdqa xmm1, xmm0 movdqa xmm5, xmm4 - psraw xmm0, 15 ;sign of z (aka sz) - psraw xmm4, 15 ;sign of z (aka sz) + ; sz = z >> 15 + psraw xmm0, 15 + psraw xmm4, 15 + ; x = abs(z) = (z ^ sz) - sz pxor xmm1, xmm0 pxor xmm5, xmm4 - psubw xmm1, xmm0 ;x = abs(z) - psubw xmm5, xmm4 ;x = abs(z) - - paddw xmm1, xmm2 - paddw xmm5, xmm3 + psubw xmm1, xmm0 + psubw xmm5, xmm4 - pmulhw xmm1, XMMWORD PTR[rsi] - pmulhw xmm5, XMMWORD PTR[rsi + 16] + ; x += round + paddw xmm1, [rcx] + paddw xmm5, [rcx + 16] - mov rdi, arg(1) ;qcoeff_ptr - mov rsi, arg(6) ;dqcoeff_ptr + mov rax, [rsi + vp8_blockd_qcoeff] + mov rcx, [rsi + vp8_blockd_dequant] + mov rdi, [rsi + vp8_blockd_dqcoeff] - movdqa xmm2, XMMWORD PTR[rcx] - movdqa xmm3, XMMWORD PTR[rcx + 16] + ; y = x * quant >> 16 + pmulhw xmm1, [rdx] + pmulhw xmm5, [rdx + 16] + ; x = (y ^ sz) - sz pxor xmm1, xmm0 pxor xmm5, xmm4 psubw xmm1, xmm0 psubw xmm5, xmm4 - movdqa XMMWORD PTR[rdi], xmm1 - movdqa XMMWORD PTR[rdi + 16], xmm5 + ; qcoeff = x + movdqa [rax], xmm1 + movdqa [rax + 16], xmm5 - pmullw xmm2, xmm1 - pmullw xmm3, xmm5 + ; x * dequant + movdqa xmm2, xmm1 + movdqa xmm3, xmm5 + pmullw xmm2, [rcx] + pmullw xmm3, [rcx + 16] - mov rdi, arg(3) ;inv_scan_order + ; dqcoeff = x * dequant + movdqa [rdi], xmm2 + movdqa [rdi + 16], xmm3 - ; Start with 16 pxor xmm4, xmm4 ;clear all bits pcmpeqw xmm1, xmm4 pcmpeqw xmm5, xmm4 @@ -307,8 +332,8 @@ sym(vp8_fast_quantize_b_impl_sse2): pxor xmm1, xmm4 pxor xmm5, xmm4 - pand xmm1, XMMWORD PTR[rdi] - pand xmm5, XMMWORD PTR[rdi+16] + pand xmm1, [GLOBAL(inv_zig_zag)] + pand xmm5, [GLOBAL(inv_zig_zag + 16)] pmaxsw xmm1, xmm5 @@ -327,16 +352,22 @@ sym(vp8_fast_quantize_b_impl_sse2): pmaxsw xmm1, xmm5 - movd rax, xmm1 - and rax, 0xff - - movdqa XMMWORD PTR[rsi], xmm2 ;store dqcoeff - movdqa XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff + movd eax, xmm1 + and eax, 0xff + mov [rsi + vp8_blockd_eob], eax ; begin epilog +%if ABI_IS_32BIT + pop rsi pop rdi +%else + %ifidn __OUTPUT_FORMAT__,x64 pop rsi - UNSHADOW_ARGS + pop rdi + %endif +%endif + + RESTORE_GOT pop rbp ret diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h index 6f54bec..df2e0bc 100644 --- a/vp8/encoder/x86/quantize_x86.h +++ b/vp8/encoder/x86/quantize_x86.h @@ -24,12 +24,16 @@ #if HAVE_SSE2 extern prototype_quantize_block(vp8_regular_quantize_b_sse2); +extern prototype_quantize_block(vp8_fast_quantize_b_sse2); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_quantize_quantb #define vp8_quantize_quantb vp8_regular_quantize_b_sse2 +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_sse2 + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 2b6bd98..8bceece 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -81,31 +81,6 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #endif #if HAVE_SSE2 -int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *inv_scan_order, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) -{ - short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; - short *coeff_ptr = b->coeff; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - d->eob = vp8_fast_quantize_b_impl_sse2( - coeff_ptr, - qcoeff_ptr, - dequant_ptr, - vp8_default_inv_zig_zag, - round_ptr, - quant_ptr, - dqcoeff_ptr - ); -} - int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); static int mbblock_error_xmm(MACROBLOCK *mb, int dc) { @@ -294,7 +269,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2; - cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; #if !(CONFIG_REALTIME_ONLY) cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2; -- 2.7.4