From d96ba65a23f383c64ea2b244503636c96d1e8437 Mon Sep 17 00:00:00 2001
From: Yunqing Wang
Date: Tue, 22 Feb 2011 18:01:08 -0500
Subject: [PATCH] Add prefetch before variance calculation

This improved encoding performance by 0.5% (good, speed 1) to 1.5% (good, speed 5).

Change-Id: I843d72a0d68a90b5f694adf770943e4a4618f50e
---
Review notes (commentary only: this region, between the "---" separator and
the start of the diff, is neither commit message nor patch content):

 * Prologue/epilogue: "GET_GOT rbx" / "RESTORE_GOT" are replaced by a plain
   "push rbx" / "pop rbx", and the 16-byte stack reservation
   ("sub rsp, 16" / "add rsp, 16") is removed.
   NOTE(review): this presumes the parts of vp8_get16x16var_sse2 that are
   not shown in these hunks use neither the GOT nor that stack space - confirm.

 * rcx and rbx are scratch registers in the new up-front prefetch code:
   rcx = 3 * stride and rbx = base + 4 * stride, so the sixteen prefetcht0
   instructions touch rows 0..7 relative to rsi (stride rax, loaded from
   arg(0)/arg(1)) and rows 0..7 relative to rdi (stride rdx, loaded from
   arg(3)).

 * Inside var16loop, every pass prefetches the address eight strides beyond
   the current rsi and the current rdi.
   NOTE(review): presumably rsi/rdi advance by one row per iteration and the
   block is 16 rows tall, in which case the later iterations prefetch rows
   past the end of the block. Prefetch hints do not fault, but confirm the
   extra memory traffic is acceptable.

 * NOTE(review): the column alignment inside each assembly line is not
   preserved in this copy of the patch; if context lines fail to match,
   apply with whitespace-insensitive matching (git apply --ignore-whitespace).

 vp8/encoder/x86/variance_impl_sse2.asm | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7178e7e..6cdc47b 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,10 +85,9 @@ sym(vp8_get16x16var_sse2):
     push rbp
     mov rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    GET_GOT rbx
+    push rbx
     push rsi
     push rdi
-    sub rsp, 16
     ; end prolog
 
     mov rsi, arg(0) ;[src_ptr]
@@ -97,6 +96,29 @@ sym(vp8_get16x16var_sse2):
     movsxd rax, DWORD PTR arg(1) ;[source_stride]
     movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
 
+    ; Prefetch data
+    lea rcx, [rax+rax*2]
+    prefetcht0 [rsi]
+    prefetcht0 [rsi+rax]
+    prefetcht0 [rsi+rax*2]
+    prefetcht0 [rsi+rcx]
+    lea rbx, [rsi+rax*4]
+    prefetcht0 [rbx]
+    prefetcht0 [rbx+rax]
+    prefetcht0 [rbx+rax*2]
+    prefetcht0 [rbx+rcx]
+
+    lea rcx, [rdx+rdx*2]
+    prefetcht0 [rdi]
+    prefetcht0 [rdi+rdx]
+    prefetcht0 [rdi+rdx*2]
+    prefetcht0 [rdi+rcx]
+    lea rbx, [rdi+rdx*4]
+    prefetcht0 [rbx]
+    prefetcht0 [rbx+rdx]
+    prefetcht0 [rbx+rdx*2]
+    prefetcht0 [rbx+rcx]
+
     pxor xmm0, xmm0 ; clear xmm0 for unpack
     pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
 
@@ -107,6 +129,9 @@ var16loop:
     movdqu xmm1, XMMWORD PTR [rsi]
     movdqu xmm2, XMMWORD PTR [rdi]
 
+    prefetcht0 [rsi+rax*8]
+    prefetcht0 [rdi+rdx*8]
+
     movdqa xmm3, xmm1
     movdqa xmm4, xmm2
 
@@ -178,10 +203,9 @@ var16loop:
 
 
     ; begin epilog
-    add rsp, 16
     pop rdi
     pop rsi
-    RESTORE_GOT
+    pop rbx
     UNSHADOW_ARGS
     pop rbp
     ret
-- 
2.7.4