From d860f685b85ffafb32dfc20da53aaa81cb62c5c5 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Thu, 7 Oct 2010 11:43:19 -0400
Subject: [PATCH] Added vp8_fast_quantize_b_sse2

Moved vp8_fast_quantize_b_sse from quantize_mmx.asm into
quantize_sse2.asm and renamed.  Updated the assembly code to match
the C version.

The assembly implementation no longer takes a zbin_ptr argument, and
vp8_fast_quantize_b_sse2 is now installed as the fastquantb handler in
vp8_arch_x86_encoder_init().

Change-Id: I1766d9e1ca60e173f65badc0ca0c160c2b51b200
---
 vp8/encoder/x86/quantize_mmx.asm       | 153 ---------------------------------
 vp8/encoder/x86/quantize_sse2.asm      | 134 +++++++++++++++++++++++++++++
 vp8/encoder/x86/x86_csystemdependent.c |  15 ++--
 3 files changed, 142 insertions(+), 160 deletions(-)

diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index 51cd940..f29a54e 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -284,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_sse)
-sym(vp8_fast_quantize_b_impl_sse):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-    mov             rsi,        arg(0) ;coeff_ptr
-    movdqa          xmm0,       [rsi]
-
-    mov             rax,        arg(1) ;zbin_ptr
-    movdqa          xmm1,       [rax]
-
-    movdqa          xmm3,       xmm0
-    psraw           xmm0,       15
-
-    pxor            xmm3,       xmm0
-    psubw           xmm3,       xmm0            ; abs
-
-    movdqa          xmm2,       xmm3
-    pcmpgtw         xmm1,       xmm2
-
-    pandn           xmm1,       xmm2
-    movdqa          xmm3,       xmm1
-
-    mov             rdx,        arg(6) ; quant_ptr
-    movdqa          xmm1,       [rdx]
-
-    mov             rcx,        arg(5) ; round_ptr
-    movdqa          xmm2,       [rcx]
-
-    paddw           xmm3,       xmm2
-    pmulhuw         xmm3,       xmm1
-
-    pxor            xmm3,       xmm0
-    psubw           xmm3,       xmm0        ;gain the sign back
-
-    mov             rdi,        arg(2) ;qcoeff_ptr
-    movdqa          xmm0,       xmm3
-
-    movdqa          [rdi],      xmm3
-
-    mov             rax,        arg(3) ;dequant_ptr
-    movdqa          xmm2,       [rax]
-
-    pmullw          xmm3,       xmm2
-    mov             rax,        arg(7) ;dqcoeff_ptr
-
-    movdqa          [rax],      xmm3
-
-    ; next 8
-    movdqa          xmm4,       [rsi+16]
-
-    mov             rax,        arg(1) ;zbin_ptr
-    movdqa          xmm5,       [rax+16]
-
-    movdqa          xmm7,       xmm4
-    psraw           xmm4,       15
-
-    pxor            xmm7,       xmm4
-    psubw           xmm7,       xmm4            ; abs
-
-    movdqa          xmm6,       xmm7
-    pcmpgtw         xmm5,       xmm6
-
-    pandn           xmm5,       xmm6
-    movdqa          xmm7,       xmm5
-
-    movdqa          xmm5,       [rdx+16]
-    movdqa          xmm6,       [rcx+16]
-
-
-    paddw           xmm7,       xmm6
-    pmulhuw         xmm7,       xmm5
-
-    pxor            xmm7,       xmm4
-    psubw           xmm7,       xmm4;gain the sign back
-
-    mov             rdi,        arg(2) ;qcoeff_ptr
-
-    movdqa          xmm1,       xmm7
-    movdqa          [rdi+16],   xmm7
-
-    mov             rax,        arg(3) ;dequant_ptr
-    movdqa          xmm6,       [rax+16]
-
-    pmullw          xmm7,       xmm6
-    mov             rax,        arg(7) ;dqcoeff_ptr
-
-    movdqa          [rax+16],   xmm7
-    mov             rdi,        arg(4) ;scan_mask
-
-    pxor            xmm7,       xmm7
-    movdqa          xmm2,       [rdi]
-
-    movdqa          xmm3,       [rdi+16];
-    pcmpeqw         xmm0,       xmm7
-
-    pcmpeqw         xmm1,       xmm7
-    pcmpeqw         xmm6,       xmm6
-
-    pxor            xmm0,       xmm6
-    pxor            xmm1,       xmm6
-
-    psrlw           xmm0,       15
-    psrlw           xmm1,       15
-
-    pmaddwd         xmm0,       xmm2
-    pmaddwd         xmm1,       xmm3
-
-    movq            xmm2,       xmm0
-    movq            xmm3,       xmm1
-
-    psrldq          xmm0,       8
-    psrldq          xmm1,       8
-
-    paddd           xmm0,       xmm1
-    paddd           xmm2,       xmm3
-
-    paddd           xmm0,       xmm2
-    movq            xmm1,       xmm0
-
-    psrldq          xmm0,       4
-    paddd           xmm1,       xmm0
-
-    movq            rcx,        xmm1
-    and             rcx,        0xffff
-
-    xor             rdx,        rdx
-    sub             rdx,        rcx
-
-    bsr             rax,        rcx
-    inc             rax
-
-    sar             rdx,        31
-    and             rax,        rdx
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index a1b1c40..3248813 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -252,3 +252,137 @@ rq_zigzag_1c:
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+
+;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse2)
+sym(vp8_fast_quantize_b_impl_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+
+    %define save_xmm6  0
+    %define save_xmm7  16
+
+    %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+
+    sub         rsp, vp8_fastquantizeb_stack_size
+
+    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6  ;xmm6/xmm7 are restored before returning
+    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rax, arg(3)                 ;scan_mask
+    mov         rdi, arg(4)                 ;round_ptr
+    mov         rsi, arg(5)                 ;quant_ptr
+
+    movdqa      xmm0, XMMWORD PTR[rdx]      ;z, coefficients 0-7
+    movdqa      xmm4, XMMWORD PTR[rdx + 16] ;z, coefficients 8-15
+
+    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
+    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0                  ;x = abs(z)
+    psubw       xmm5, xmm4                  ;x = abs(z)
+
+    paddw       xmm1, xmm6                  ;x + round
+    paddw       xmm5, xmm7                  ;x + round
+
+    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = ((x + round) * quant) >> 16 (signed)
+    pmulhw      xmm5, XMMWORD PTR[rsi + 16] ;y = ((x + round) * quant) >> 16 (signed)
+
+    mov         rdi, arg(1)                 ;qcoeff_ptr
+    mov         rsi, arg(6)                 ;dqcoeff_ptr
+
+    movdqa      xmm6, XMMWORD PTR[rcx]      ;dequant lo
+    movdqa      xmm7, XMMWORD PTR[rcx + 16] ;dequant hi
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0                  ;(y ^ sz) - sz: gain the sign back
+    psubw       xmm5, xmm4                  ;(y ^ sz) - sz: gain the sign back
+
+    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
+    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff
+
+    pmullw      xmm6, xmm1                  ;dqcoeff = qcoeff * dequant
+    pmullw      xmm7, xmm5                  ;dqcoeff = qcoeff * dequant
+
+    movdqa      xmm2, XMMWORD PTR[rax]      ;scan_mask lo
+    movdqa      xmm3, XMMWORD PTR[rax+16];
+
+    pxor        xmm4, xmm4                  ;clear all bits
+    pcmpeqw     xmm1, xmm4                  ;0xffff where qcoeff == 0
+    pcmpeqw     xmm5, xmm4
+
+    pcmpeqw     xmm4, xmm4                  ;set all bits
+    pxor        xmm1, xmm4                  ;invert: 0xffff where qcoeff != 0
+    pxor        xmm5, xmm4
+
+    psrlw       xmm1, 15                    ;1 where qcoeff != 0, else 0
+    psrlw       xmm5, 15
+
+    pmaddwd     xmm1, xmm2                  ;scan_mask terms of the non-zero coefficients
+    pmaddwd     xmm5, xmm3
+
+    movq        xmm2, xmm1
+    movq        xmm3, xmm5
+
+    psrldq      xmm1, 8
+    psrldq      xmm5, 8
+
+    paddd       xmm1, xmm5
+    paddd       xmm2, xmm3
+
+    paddd       xmm1, xmm2
+    movq        xmm5, xmm1
+
+    psrldq      xmm1, 4
+    paddd       xmm5, xmm1                  ;horizontal sum of all mask terms
+
+    movq        rcx, xmm5
+    and         rcx, 0xffff
+
+    xor         rdx, rdx
+    sub         rdx, rcx                    ;rdx = -mask (zero only when mask is zero)
+
+    bsr         rax, rcx                    ;index of the highest set mask bit
+    inc         rax
+
+    sar         rdx, 31                     ;all ones if mask != 0, else zero
+    and         rax, rdx                    ;return 0 when no coefficient is non-zero
+
+    movdqa      XMMWORD PTR[rsi], xmm6      ;store dqcoeff
+    movdqa      XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
+
+    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
+    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]
+
+    add         rsp, vp8_fastquantizeb_stack_size
+    pop         rsp
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 18dc49c..7490a8a 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -88,24 +88,22 @@ void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
     vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
 }
 
-int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
     short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
     short *coeff_ptr = &b->coeff[0];
-    short *zbin_ptr = &b->zbin[0][0];
     short *round_ptr = &b->round[0][0];
     short *quant_ptr = &b->quant[0][0];
     short *qcoeff_ptr = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = &d->dequant[0][0];
 
-    d->eob = vp8_fast_quantize_b_impl_sse(
+    d->eob = vp8_fast_quantize_b_impl_sse2(
                  coeff_ptr,
-                 zbin_ptr,
                  qcoeff_ptr,
                  dequant_ptr,
                  scan_mask,
@@ -116,6 +114,7 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
              );
 }
 
+
 int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
                                short *qcoeff_ptr,short *dequant_ptr,
                                const int *default_zig_zag, short *round_ptr,
@@ -285,8 +284,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
         /* cpi->rtcd.encodemb.sub* not implemented for wmt */
 
-        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse;
-        cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
+        /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
+
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
+
     }
 #endif
 
-- 
2.7.4