From: Yunqing Wang Date: Thu, 20 Jan 2011 18:01:30 +0000 (-0500) Subject: Modify sub-pixel filters to eliminate unnecessary calculations X-Git-Tag: 1.0_branch~711^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0822a62f4051289fb3853c997b797ae3b6a006f5;p=profile%2Fivi%2Flibvpx.git Modify sub-pixel filters to eliminate unnecessary calculations In sub-pixel calculation, xoffset and yoffset mostly take some specific values. Modified sub-pixel filter functions according to these possible values to improve performance. Change-Id: I83083570af8b00ff65093467914fbb97a4e9ea21 --- diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index cefa0a9..7178e7e 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2): ; unsigned char *src_ptr, ; int src_pixels_per_line, ; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, +; int xoffset, +; int yoffset, ; int *sum, ; unsigned int *sumsquared;; ; @@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM GET_GOT rbx push rsi push rdi - sub rsp, 16 + push rbx ; end prolog pxor xmm6, xmm6 ; pxor xmm7, xmm7 ; - mov rax, arg(5) ;HFilter ; - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; + lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding + movdqa xmm4, XMMWORD PTR [rsi] - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; + lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_sse2_sp_only + + shl rax, 5 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_sse2_fp_only + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; - movq xmm3, QWORD PTR [rsi+1] ; punpcklbw xmm1, xmm0 ; - - pmullw xmm1, [rax] ; + pmullw xmm1, [rax] ; punpcklbw xmm3, xmm0 - ; pmullw xmm3, [rax+16] ; - paddw xmm1, xmm3 ; - - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; - psraw xmm1, xmm_filter_shift ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; movdqa xmm5, xmm1 -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line %endif -filter_block2d_bil_var_sse2_loop: +filter_block2d_bil_var_sse2_loop: movq xmm1, QWORD PTR [rsi] ; movq xmm3, QWORD PTR [rsi+1] ; punpcklbw xmm1, xmm0 ; pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; pmullw xmm3, [rax+16] ; paddw xmm1, xmm3 ; - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; - + paddw xmm1, xmm4 ; psraw xmm1, xmm_filter_shift ; - movdqa xmm3, xmm5 ; + movdqa xmm3, xmm5 ; movdqa xmm5, xmm1 ; - pmullw xmm3, [rdx] ; + pmullw xmm3, [rdx] ; pmullw xmm1, [rdx+16] ; paddw xmm1, xmm3 ; - - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; + paddw xmm1, xmm4 ; psraw xmm1, xmm_filter_shift ; movq xmm3, QWORD PTR [rdi] ; @@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop: pmaddwd xmm1, xmm1 ; paddd xmm7, xmm1 ; + lea rsi, [rsi + rbx] ;ref_pixels_per_line %if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line %else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 + lea rdi, [rdi + r9] %endif sub rcx, 1 ; jnz filter_block2d_bil_var_sse2_loop ; + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movq xmm3, QWORD PTR [rsi] ; + punpcklbw xmm3, xmm0 ; + movdqa xmm5, xmm3 + + pmullw xmm1, [rdx] ; + pmullw xmm3, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + movdqa xmm1, xmm5 ; + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_sp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_fp_only_loop ; + + jmp filter_block2d_bil_variance +filter_block2d_bil_variance: movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop: movd [rsi], mm2 ; xsum movd [rdi], mm4 ; xxsum - ; begin epilog - add rsp, 16 + pop rbx pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -974,3 +1069,13 @@ SECTION_RODATA align 16 xmm_bi_rd: times 8 dw 64 +align 16 +vp8_bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 006e0a2..6f79f0d 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2 const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, - const short *HFilter, - const short *VFilter, + int xoffset, + int yoffset, int *sum, unsigned int *sumsquared ); @@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt } -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) = -{ - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; unsigned int vp8_sub_pixel_variance4x4_wmt ( const unsigned char *src_ptr, @@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt unsigned int *sse ) { - int xsum; unsigned int xxsum; - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum, &xxsum - ); + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum, &xxsum); + } *sse = xxsum; return (xxsum - ((xsum * xsum) >> 6)); @@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt vp8_filter_block2d_bil_var_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + xoffset, yoffset, &xsum0, &xxsum0 ); @@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt vp8_filter_block2d_bil_var_sse2( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 16, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + xoffset, yoffset, &xsum1, &xxsum1 ); } @@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt int xsum0, xsum1; unsigned int xxsum0, xxsum1; + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum0, &xxsum0 - ); + vp8_half_horiz_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + &xsum1, &xxsum1); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + + vp8_half_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + &xsum1, &xxsum1); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + &xsum1, &xxsum1); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); - vp8_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum1, &xxsum1 - ); + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum1, &xxsum1); + } xsum0 += xsum1; xxsum0 += xxsum1; @@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt { int xsum; unsigned int xxsum; - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum, &xxsum - ); + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum, &xxsum); + } *sse = xxsum; return (xxsum - ((xsum * xsum) >> 7));