From e0a80519c726f3097c3896b6fc155741f64f68b0 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 12 Apr 2012 14:22:47 -0400 Subject: [PATCH] loopfilter improvements Local variable offsets are now consistent for the functions, removed unused parameters, reworked the assembly to eliminate stalls/instructions. Change-Id: Iaa37668f8a9bb8754df435f6a51c3a08d547f879 --- vp8/common/loopfilter.c | 99 +++-- vp8/common/x86/loopfilter_sse2.asm | 750 ++++++++++++++++--------------------- vp8/common/x86/loopfilter_x86.c | 28 +- 3 files changed, 402 insertions(+), 475 deletions(-) diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index 66b280d..3f05efe 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -210,6 +210,8 @@ void vp8_loop_filter_frame int mb_row; int mb_col; + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; int filter_level; @@ -217,6 +219,8 @@ void vp8_loop_filter_frame /* Point at base of Mb MODE_INFO list */ const MODE_INFO *mode_info_context = cm->mi; + int post_y_stride = post->y_stride; + int post_uv_stride = post->uv_stride; /* Initialize the loop filter for this frame. 
*/ vp8_loop_filter_frame_init(cm, mbd, cm->filter_level); @@ -227,23 +231,23 @@ void vp8_loop_filter_frame v_ptr = post->v_buffer; /* vp8_filter each macro block */ - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + if (cm->filter_type == NORMAL_LOOPFILTER) { - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + for (mb_row = 0; mb_row < mb_rows; mb_row++) { - int skip_lf = (mode_info_context->mbmi.mode != B_PRED && - mode_info_context->mbmi.mode != SPLITMV && - mode_info_context->mbmi.mb_skip_coeff); + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; - const int seg = mode_info_context->mbmi.segment_id; - const int ref_frame = mode_info_context->mbmi.ref_frame; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; - if (filter_level) - { - if (cm->filter_type == NORMAL_LOOPFILTER) + if (filter_level) { const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; lfi.mblim = lfi_n->mblim[filter_level]; @@ -253,54 +257,87 @@ void vp8_loop_filter_frame if (mb_col > 0) vp8_loop_filter_mbv - (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); if (!skip_lf) vp8_loop_filter_bv - (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); /* don't apply across umv border */ if (mb_row > 0) vp8_loop_filter_mbh - (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); if (!skip_lf) vp8_loop_filter_bh - 
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); } - else + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } + else /* SIMPLE_LOOPFILTER */ + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if (filter_level) { + const unsigned char * mblim = lfi_n->mblim[filter_level]; + const unsigned char * blim = lfi_n->blim[filter_level]; + if (mb_col > 0) vp8_loop_filter_simple_mbv - (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + (y_ptr, post_y_stride, mblim); if (!skip_lf) vp8_loop_filter_simple_bv - (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + (y_ptr, post_y_stride, blim); /* don't apply across umv border */ if (mb_row > 0) vp8_loop_filter_simple_mbh - (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + (y_ptr, post_y_stride, mblim); if (!skip_lf) vp8_loop_filter_simple_bh - (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + (y_ptr, post_y_stride, blim); } - } - y_ptr += 16; - u_ptr += 8; - v_ptr += 8; + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; - mode_info_context++; /* step to next MB */ - } + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; - y_ptr += 
post->y_stride * 16 - post->y_width; - u_ptr += post->uv_stride * 8 - post->uv_width; - v_ptr += post->uv_stride * 8 - post->uv_width; + mode_info_context++; /* Skip border mb */ - mode_info_context++; /* Skip border mb */ + } } } diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 2ad010a..9944c33 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -10,6 +10,17 @@ %include "vpx_ports/x86_abi_support.asm" +%define _t0 0 +%define _t1 _t0 + 16 +%define _p3 _t1 + 16 +%define _p2 _p3 + 16 +%define _p1 _p2 + 16 +%define _p0 _p1 + 16 +%define _q0 _p0 + 16 +%define _q1 _q0 + 16 +%define _q2 _q1 + 16 +%define _q3 _q2 + 16 +%define lf_var_size 160 ; Use of pmaxub instead of psubusb to compute filter mask was seen ; in ffvp8 @@ -35,9 +46,10 @@ lea rsi, [rsi + rax*4] lea rdi, [rdi + rax*4] - movdqa XMMWORD PTR [rsp], xmm1 ; store q2 - movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 + movdqa [rsp+_q2], xmm1 ; store q2 + movdqa [rsp+_q1], xmm4 ; store q1 %endif + movdqa xmm7, [rdx] ;limit movdqa xmm6, xmm1 ; q2 movdqa xmm3, xmm4 ; q1 @@ -58,7 +70,7 @@ psubusb xmm3, xmm0 ; q1-=q0 por xmm5, xmm3 ; abs(q0-q1) - movdqa t0, xmm5 ; save to t0 + movdqa [rsp+_t0], xmm5 ; save to t0 pmaxub xmm1, xmm5 @@ -75,8 +87,8 @@ movhps xmm4, [rdi] movhps xmm6, [rdi + rcx] - movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 - movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 + movdqa [rsp+_p2], xmm4 ; store p2 + movdqa [rsp+_p1], xmm6 ; store p1 %endif movdqa xmm5, xmm4 ; p2 @@ -101,7 +113,7 @@ %else movlps xmm4, [rsi + rcx*2] ; p0 movhps xmm4, [rdi + rcx*2] - movdqa xmm3, q1 ; q1 + movdqa xmm3, [rsp+_q1] ; q1 %endif movdqa xmm5, xmm4 ; p0 @@ -112,7 +124,7 @@ por xmm6, xmm4 ; abs(p1 - p0) mov rdx, arg(2) ; get blimit - movdqa t1, xmm6 ; save to t1 + movdqa [rsp+_t1], xmm6 ; save to t1 movdqa xmm4, xmm3 ; q1 pmaxub xmm1, xmm6 @@ -123,30 +135,27 @@ psubusb xmm1, xmm7 por xmm2, xmm3 ; abs(p1-q1) - movdqa xmm7, XMMWORD PTR [rdx] ; 
blimit + movdqa xmm7, [rdx] ; blimit + mov rdx, arg(4) ; hev get thresh movdqa xmm3, xmm0 ; q0 pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - mov rdx, arg(4) ; hev get thresh - movdqa xmm6, xmm5 ; p0 psrlw xmm2, 1 ; abs(p1-q1)/2 psubusb xmm5, xmm3 ; p0-=q0 - psubusb xmm3, xmm6 ; q0-=p0 por xmm5, xmm3 ; abs(p0 - q0) paddusb xmm5, xmm5 ; abs(p0-q0)*2 - movdqa xmm4, t0 ; hev get abs (q1 - q0) - - movdqa xmm3, t1 ; get abs (p1 - p0) + movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0) + movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0) paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - movdqa xmm2, XMMWORD PTR [rdx] ; hev + movdqa xmm2, [rdx] ; hev psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit psubusb xmm4, xmm2 ; hev @@ -165,43 +174,37 @@ %endmacro %macro B_FILTER 1 + movdqa xmm3, [GLOBAL(t80)] %if %1 == 0 - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 %elif %1 == 1 movdqa xmm2, [rsi+2*rax] ; p1 movdqa xmm7, [rdi] ; q1 %elif %1 == 2 - lea rdx, srct - - movdqa xmm2, [rdx] ; p1 - movdqa xmm7, [rdx+48] ; q1 - movdqa xmm6, [rdx+16] ; p0 - movdqa xmm0, [rdx+32] ; q0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 %endif - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor xmm6, xmm3 ; offset to convert to signed values pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values + pxor xmm0, xmm3 ; offset to convert to signed values movdqa xmm3, xmm0 ; q0 psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - 
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand xmm1, xmm2 ; mask filter values we don't care about movdqa xmm2, xmm1 - paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 @@ -221,47 +224,49 @@ movdqa xmm5, xmm0 ; save results packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw xmm5, [GLOBAL(ones)] - paddsw xmm1, [GLOBAL(ones)] - psraw xmm5, 1 ; partial shifted one more time for 2nd tap + paddsb xmm6, xmm2 ; p0+= p0 add + movdqa xmm2, [GLOBAL(ones)] + paddsw xmm5, xmm2 + paddsw xmm1, xmm2 + psraw xmm5, 1 ; partial shifted one more time for 2nd tap psraw xmm1, 1 ; partial shifted one more time for 2nd tap - - paddsb xmm6, xmm2 ; p0+= p0 add packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + movdqa xmm2, [GLOBAL(t80)] %if %1 == 0 - movdqa xmm1, p1 ; p1 + movdqa xmm1, [rsp+_p1] ; p1 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] %elif %1 == 1 movdqa xmm1, [rsi+2*rax] ; p1 %elif %1 == 2 - movdqa xmm1, [rdx] ; p1 + movdqa xmm1, [rsp+_p1] ; p1 %endif + pandn xmm4, xmm5 ; high edge variance additive - pxor xmm6, [GLOBAL(t80)] ; unoffset + pxor xmm6, xmm2 ; unoffset - pxor xmm1, [GLOBAL(t80)] ; reoffset + pxor xmm1, xmm2 ; reoffset psubsb xmm3, xmm0 ; q0-= q0 add paddsb xmm1, xmm4 ; p1+= p1 add - pxor xmm3, [GLOBAL(t80)] ; unoffset + pxor xmm3, xmm2 ; unoffset - pxor xmm1, [GLOBAL(t80)] ; unoffset + pxor xmm1, xmm2 ; unoffset psubsb xmm7, xmm4 ; q1-= q1 add - pxor xmm7, [GLOBAL(t80)] ; unoffset + pxor xmm7, xmm2 ; unoffset %if %1 == 0 - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] - movq MMWORD PTR [rsi], xmm6 ; p0 - movhps MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rax], xmm1 ; p1 - movhps MMWORD PTR [rdi + rax], xmm1 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - movhps MMWORD PTR [rdi + rcx], xmm3 - movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 - movhps MMWORD PTR [rdi + rcx*2],xmm7 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rax], xmm1 ; p1 + movhps 
[rdi + rax], xmm1 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + movq [rsi + rcx*2], xmm7 ; q1 + movhps [rdi + rcx*2], xmm7 %elif %1 == 1 movdqa [rsi+rax], xmm6 ; write back movdqa [rsi+2*rax], xmm1 ; write back @@ -280,13 +285,12 @@ ; const char *blimit, ; const char *limit, ; const char *thresh, -; int count ;) global sym(vp8_loop_filter_horizontal_edge_sse2) sym(vp8_loop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 SAVE_XMM 7 GET_GOT rbx push rsi @@ -294,15 +298,12 @@ sym(vp8_loop_filter_horizontal_edge_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + sub rsp, lf_var_size mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing @@ -311,7 +312,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): ; filter and write back the result B_FILTER 1 - add rsp, 32 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -345,13 +346,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; - %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; - %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; - %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; - %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; + sub rsp, lf_var_size mov rsi, arg(0) ; u mov rdi, arg(5) ; v @@ -360,7 +355,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): neg rax ; negate pitch to deal with above border mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] lea rsi, [rsi + rcx] lea rdi, [rdi + rcx] @@ -370,7 +364,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): 
; filter and write back the result B_FILTER 0 - add rsp, 96 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -383,9 +377,10 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): %macro MB_FILTER_AND_WRITEBACK 1 + movdqa xmm3, [GLOBAL(t80)] %if %1 == 0 - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 %elif %1 == 1 movdqa xmm2, [rsi+2*rax] ; p1 movdqa xmm7, [rdi] ; q1 @@ -393,30 +388,24 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): mov rcx, rax neg rcx %elif %1 == 2 - lea rdx, srct - - movdqa xmm2, [rdx+32] ; p1 - movdqa xmm7, [rdx+80] ; q1 - movdqa xmm6, [rdx+48] ; p0 - movdqa xmm0, [rdx+64] ; q0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 %endif - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + pxor xmm6, xmm3 ; offset to convert to signed values + pxor xmm0, xmm3 ; offset to convert to signed values psubsb xmm2, xmm7 ; p1 - q1 - movdqa xmm3, xmm0 ; q0 + movdqa xmm3, xmm0 ; q0 psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb xmm2, xmm0 ; 2 * (q0 - p0) - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) - pand xmm1, xmm2 ; mask filter values we don't care about movdqa xmm2, xmm1 ; vp8_filter @@ -428,19 +417,20 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): pxor xmm1, xmm1 punpcklbw xmm0, xmm4 ; Filter 2 (hi) + punpckhbw xmm1, xmm4 ; Filter 2 (lo) + movdqa xmm5, xmm2 - punpckhbw xmm1, xmm4 ; Filter 2 (lo) + movdqa xmm4, [GLOBAL(s9)] paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) + paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pmulhw xmm1, 
[GLOBAL(s9)] ; Filter 2 (lo) * 9 - - pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9 + pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9 + pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9 punpckhbw xmm7, xmm5 ; axbxcxdx - paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - punpcklbw xmm5, xmm5 ; exfxgxhx + psraw xmm7, 11 ; sign extended shift right by 3 psraw xmm5, 11 ; sign extended shift right by 3 @@ -453,18 +443,19 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): psraw xmm2, 11 ; sign extended shift right by 3 packsswb xmm2, xmm4 ; Filter1 >>=3; - movdqa xmm7, xmm1 paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 - movdqa xmm4, xmm1 psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 - movdqa xmm5, xmm0 + movdqa xmm7, xmm1 + movdqa xmm4, [GLOBAL(s63)] + movdqa xmm5, xmm0 movdqa xmm2, xmm5 - paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63 + paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63 + paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63 + movdqa xmm4, xmm7 - paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63 paddw xmm5, xmm5 ; Filter 2 (hi) * 18 paddw xmm7, xmm7 ; Filter 2 (lo) * 18 @@ -472,99 +463,91 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 - - paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 + paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 + psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 - packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - - psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) - paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) + packsswb xmm2, xmm4 
; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + movdqa xmm7, [GLOBAL(t80)] %if %1 == 0 - movdqa xmm5, q2 ; q2 - movdqa xmm1, q1 ; q1 - movdqa xmm4, p1 ; p1 - movdqa xmm7, p2 ; p2 + movdqa xmm1, [rsp+_q1] ; q1 + movdqa xmm4, [rsp+_p1] ; p1 + lea rsi, [rsi+rcx*2] + lea rdi, [rdi+rcx*2] %elif %1 == 1 - movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2 - movdqa xmm1, XMMWORD PTR [rdi] ; q1 - movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1 - movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2 + movdqa xmm1, [rdi] ; q1 + movdqa xmm4, [rsi+rax*2] ; p1 %elif %1 == 2 - movdqa xmm5, XMMWORD PTR [rdx+96] ; q2 - movdqa xmm1, XMMWORD PTR [rdx+80] ; q1 - movdqa xmm4, XMMWORD PTR [rdx+32] ; p1 - movdqa xmm7, XMMWORD PTR [rdx+16] ; p2 + movdqa xmm4, [rsp+_p1] ; p1 + movdqa xmm1, [rsp+_q1] ; q1 %endif - pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80 - pxor xmm6, [GLOBAL(t80)] ; *oq0 = sp^0x80 - - pxor xmm1, [GLOBAL(t80)] - pxor xmm4, [GLOBAL(t80)] + pxor xmm1, xmm7 + pxor xmm4, xmm7 + psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) + paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) - pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80; - pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80; - - pxor xmm7, [GLOBAL(t80)] - pxor xmm5, [GLOBAL(t80)] +%if %1 == 1 + movdqa xmm2, [rdi+rax*4] ; p2 + movdqa xmm5, [rdi+rcx] ; q2 +%else + movdqa xmm2, [rsp+_p2] ; p2 + movdqa xmm5, [rsp+_q2] ; q2 +%endif - paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) + pxor xmm1, xmm7 ; *oq1 = sq^0x80; + pxor xmm4, xmm7 ; *op1 = sp^0x80; + pxor xmm2, xmm7 + pxor xmm5, xmm7 + paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) - - pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80; - pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80; - + pxor xmm2, xmm7 ; *op2 = sp^0x80; + pxor xmm5, xmm7 ; *oq2 = sq^0x80; + pxor xmm3, xmm7 ; *oq0 = sq^0x80 + pxor xmm6, xmm7 ; *oq0 = 
sp^0x80 %if %1 == 0 - lea rsi, [rsi+rcx*2] - lea rdi, [rdi+rcx*2] - - movq MMWORD PTR [rsi], xmm6 ; p0 - movhps MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - movhps MMWORD PTR [rdi + rcx], xmm3 - - movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1 - movhps MMWORD PTR [rdi+rcx*2], xmm1 - - movq MMWORD PTR [rsi + rax], xmm4 ; p1 - movhps MMWORD PTR [rdi + rax], xmm4 - - movq MMWORD PTR [rsi+rax*2], xmm7 ; p2 - movhps MMWORD PTR [rdi+rax*2], xmm7 - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2 - movhps MMWORD PTR [rdi+rcx*2], xmm5 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + lea rdx, [rcx + rcx*2] + movq [rsi+rcx*2], xmm1 ; q1 + movhps [rdi+rcx*2], xmm1 + + movq [rsi + rax], xmm4 ; p1 + movhps [rdi + rax], xmm4 + + movq [rsi+rax*2], xmm2 ; p2 + movhps [rdi+rax*2], xmm2 + + movq [rsi+rdx], xmm5 ; q2 + movhps [rdi+rdx], xmm5 %elif %1 == 1 - movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2 - movdqa XMMWORD PTR [rdi], xmm1 ; q1 - movdqa XMMWORD PTR [rsi], xmm3 ; q0 - movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0 - movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1 - movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2 + movdqa [rdi+rcx], xmm5 ; q2 + movdqa [rdi], xmm1 ; q1 + movdqa [rsi], xmm3 ; q0 + movdqa [rsi+rax ], xmm6 ; p0 + movdqa [rsi+rax*2], xmm4 ; p1 + movdqa [rdi+rax*4], xmm2 ; p2 %elif %1 == 2 - movdqa XMMWORD PTR [rdx+80], xmm1 ; q1 - movdqa XMMWORD PTR [rdx+64], xmm3 ; q0 - movdqa XMMWORD PTR [rdx+48], xmm6 ; p0 - movdqa XMMWORD PTR [rdx+32], xmm4 ; p1 + movdqa [rsp+_p1], xmm4 ; p1 + movdqa [rsp+_p0], xmm6 ; p0 + movdqa [rsp+_q0], xmm3 ; q0 + movdqa [rsp+_q1], xmm1 ; q1 %endif %endmacro @@ -577,13 +560,12 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ; const char *blimit, ; const char *limit, ; const char *thresh, -; int count ;) global sym(vp8_mbloop_filter_horizontal_edge_sse2) sym(vp8_mbloop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + 
SHADOW_ARGS_TO_STACK 5 SAVE_XMM 7 GET_GOT rbx push rsi @@ -591,15 +573,11 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + sub rsp, lf_var_size mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing @@ -608,7 +586,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): ; filter and write back the results MB_FILTER_AND_WRITEBACK 1 - add rsp, 32 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -641,22 +619,14 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; - %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; - %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; - %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; - %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; + sub rsp, lf_var_size mov rsi, arg(0) ; u mov rdi, arg(5) ; v movsxd rax, dword ptr arg(1) ; src_pixel_step mov rcx, rax neg rax ; negate pitch to deal with above border - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] lea rsi, [rsi + rcx] lea rdi, [rdi + rcx] @@ -666,7 +636,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ; filter and write back the results MB_FILTER_AND_WRITEBACK 0 - add rsp, 96 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -679,46 +649,39 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): %macro TRANSPOSE_16X8 2 - movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 - movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 
26 25 24 23 22 21 20 - movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 - movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 - movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 - movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 %if %1 lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] %else mov rsi, arg(5) ; v_ptr %endif movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 - punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 -%if %1 - lea rdi, [rdi+rax*8] -%else - lea rsi, [rsi - 4] -%endif - punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 -%if %1 - lea rdx, srct -%else - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + +%if %1 == 0 + lea rdi, [rsi + rax - 4] ; rdi points to row +1 for 
indirect addressing + lea rsi, [rsi - 4] %endif movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 @@ -733,24 +696,25 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa t0, xmm2 ; save to free XMM2 - movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 - movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 - movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 - movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + movdqa [rsp+_t0], xmm2 ; save to free XMM2 + + movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 - punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 - movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 - movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 @@ -778,64 +742,38 @@ 
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 -%if %2 - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - movdqa [rdx], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+16], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+32], xmm4 ; save 4 - movdqa [rdx+48], xmm5 ; save 5 - movdqa xmm1, t0 ; get - - movdqa xmm2, xmm1 ; - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 -%else - movdqa [rdx+112], xmm7 ; save 7 - - movdqa [rdx+96], xmm6 ; save 6 +%if %2 == 0 + movdqa [rsp+_q3], xmm7 ; save 7 + movdqa [rsp+_q2], xmm6 ; save 6 +%endif movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - movdqa [rdx+32], xmm2 ; save 2 + movdqa [rsp+_p1], xmm2 ; save 2 movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+48], xmm3 ; save 3 + movdqa [rsp+_p0], xmm3 ; save 3 punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - movdqa [rdx+64], xmm4 ; save 4 - movdqa [rdx+80], xmm5 ; save 5 - movdqa xmm1, t0 ; get + movdqa [rsp+_q0], xmm4 ; save 4 + movdqa [rsp+_q1], xmm5 ; save 5 + movdqa xmm1, [rsp+_t0] - movdqa xmm2, xmm1 + movdqa xmm2, xmm1 ; punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 
01 - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - movdqa [rdx+16], xmm1 - - movdqa [rdx], xmm2 +%if %2 == 0 + movdqa [rsp+_p2], xmm1 + movdqa [rsp+_p3], xmm2 %endif + %endmacro -%macro LFV_FILTER_MASK_HEV_MASK 1 +%macro LFV_FILTER_MASK_HEV_MASK 0 movdqa xmm0, xmm6 ; q2 psubusb xmm0, xmm7 ; q2-q3 @@ -853,14 +791,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): psubusb xmm2, xmm1 ; p3 - p2; por xmm0, xmm2 ; abs(p2-p3) -%if %1 - movdqa xmm2, [rdx] ; p1 -%else - movdqa xmm2, [rdx+32] ; p1 -%endif - movdqa xmm5, xmm2 ; p1 + + movdqa xmm5, [rsp+_p1] ; p1 pmaxub xmm0, xmm7 + movdqa xmm2, xmm5 ; p1 psubusb xmm5, xmm1 ; p1-p2 psubusb xmm1, xmm2 ; p2-p1 @@ -874,43 +809,33 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm1, xmm2 ; p1 psubusb xmm2, xmm3 ; p1-p0 - lea rdx, srct por xmm2, xmm7 ; abs(p1-p0) - movdqa t0, xmm2 ; save abs(p1-p0) - pmaxub xmm0, xmm2 -%if %1 - movdqa xmm5, [rdx+32] ; q0 - movdqa xmm7, [rdx+48] ; q1 -%else - movdqa xmm5, [rdx+64] ; q0 - movdqa xmm7, [rdx+80] ; q1 -%endif + movdqa xmm5, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 + mov rdx, arg(3) ; limit movdqa xmm6, xmm5 ; q0 - movdqa xmm2, xmm7 ; q1 + movdqa xmm4, xmm7 ; q1 psubusb xmm5, xmm7 ; q0-q1 psubusb xmm7, xmm6 ; q1-q0 por xmm7, xmm5 ; abs(q1-q0) - movdqa t1, xmm7 ; save abs(q1-q0) + pmaxub xmm0, xmm7 - movdqa xmm4, XMMWORD PTR [rdx]; limit + psubusb xmm0, [rdx] ; limit - pmaxub xmm0, xmm7 mov rdx, arg(2) ; blimit - - psubusb xmm0, xmm4 - movdqa xmm5, xmm2 ; q1 + movdqa xmm5, xmm4 ; q1 psubusb xmm5, xmm1 ; q1-=p1 - psubusb xmm1, xmm2 ; p1-=q1 + psubusb xmm1, xmm4 ; p1-=q1 por xmm5, xmm1 ; abs(p1-q1) movdqa xmm1, xmm3 ; p0 @@ -918,39 +843,32 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psubusb xmm1, xmm6 ; p0-q0 + movdqa xmm4, [rdx] ; blimit + mov rdx, arg(4) ; get thresh + psrlw xmm5, 1 ; abs(p1-q1)/2 psubusb xmm6, xmm3 ; q0-p0 - movdqa xmm4, XMMWORD PTR [rdx]; blimit - - mov rdx, 
arg(4) ; get thresh - por xmm1, xmm6 ; abs(q0-p0) - - movdqa xmm6, t0 ; get abs (q1 - q0) - paddusb xmm1, xmm1 ; abs(q0-p0)*2 - - movdqa xmm3, t1 ; get abs (p1 - p0) - - movdqa xmm7, XMMWORD PTR [rdx] + movdqa xmm3, [rdx] paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh + psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh - psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh + psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh por xmm1, xmm0 ; mask - pcmpeqb xmm6, xmm0 + pcmpeqb xmm2, xmm0 pxor xmm0, xmm0 pcmpeqb xmm4, xmm4 pcmpeqb xmm1, xmm0 - pxor xmm4, xmm6 + pxor xmm4, xmm2 %endmacro %macro BV_TRANSPOSE 0 @@ -985,25 +903,18 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): %macro BV_WRITEBACK 2 movd [rsi+2], %1 - psrldq %1, 4 - - movd [rdi+2], %1 - psrldq %1, 4 - - movd [rsi+2*rax+2], %1 - psrldq %1, 4 - - movd [rdi+2*rax+2], %1 - movd [rsi+4*rax+2], %2 + psrldq %1, 4 psrldq %2, 4 - + movd [rdi+2], %1 movd [rdi+4*rax+2], %2 + psrldq %1, 4 psrldq %2, 4 - + movd [rsi+2*rax+2], %1 movd [rsi+2*rcx+2], %2 + psrldq %1, 4 psrldq %2, 4 - + movd [rdi+2*rax+2], %1 movd [rdi+2*rcx+2], %2 %endmacro @@ -1016,13 +927,12 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ; const char *blimit, ; const char *limit, ; const char *thresh, -; int count ;) global sym(vp8_loop_filter_vertical_edge_sse2) sym(vp8_loop_filter_vertical_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 SAVE_XMM 7 GET_GOT rbx push rsi @@ -1030,10 +940,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + sub rsp, 
lf_var_size mov rsi, arg(0) ; src_ptr movsxd rax, dword ptr arg(1) ; src_pixel_step @@ -1046,7 +953,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): TRANSPOSE_16X8 1, 1 ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 + LFV_FILTER_MASK_HEV_MASK ; start work on filters B_FILTER 2 @@ -1064,7 +971,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): lea rdi, [rdi+rdx*8] BV_WRITEBACK xmm2, xmm6 - add rsp, 96 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -1098,10 +1005,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + sub rsp, lf_var_size mov rsi, arg(0) ; u_ptr movsxd rax, dword ptr arg(1) ; src_pixel_step @@ -1110,13 +1014,11 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing lea rcx, [rax+2*rax] - lea rdx, srct - ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
TRANSPOSE_16X8 0, 1 ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 + LFV_FILTER_MASK_HEV_MASK ; start work on filters B_FILTER 2 @@ -1134,7 +1036,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing BV_WRITEBACK xmm2, xmm6 - add rsp, 96 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -1146,92 +1048,89 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ret %macro MBV_TRANSPOSE 0 - movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 
91 90 81 80 punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 + punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 - movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 + movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 - punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 + punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 - punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 + punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 + punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 %endmacro %macro MBV_WRITEBACK_1 0 - movq QWORD PTR [rsi], xmm0 - movhps MMWORD PTR [rdi], xmm0 + movq [rsi], xmm0 + movhps [rdi], xmm0 - movq QWORD PTR [rsi+2*rax], xmm6 - movhps MMWORD PTR [rdi+2*rax], xmm6 + movq [rsi+2*rax], xmm6 + movhps [rdi+2*rax], xmm6 movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 
67 66 65 64 63 62 61 60 - punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + movq [rsi+4*rax], xmm0 + movhps [rdi+4*rax], xmm0 - movq QWORD PTR [rsi+4*rax], xmm0 - movhps MMWORD PTR [rdi+4*rax], xmm0 + movq [rsi+2*rcx], xmm3 + movhps [rdi+2*rcx], xmm3 - movq QWORD PTR [rsi+2*rcx], xmm3 - movhps MMWORD PTR [rdi+2*rcx], xmm3 - - movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - - punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 - movdqa xmm0, xmm2 + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + movdqa xmm0, xmm7 punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 - punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 - punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 %endmacro %macro MBV_WRITEBACK_2 0 - movq QWORD PTR [rsi], xmm1 - movhps MMWORD PTR [rdi], xmm1 + movq [rsi], xmm1 + movhps [rdi], xmm1 - movq QWORD PTR [rsi+2*rax], xmm5 - movhps MMWORD PTR [rdi+2*rax], xmm5 + movq [rsi+2*rax], xmm5 + movhps [rdi+2*rax], xmm5 movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 - punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 + punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 + punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 - movq QWORD PTR [rsi+4*rax], xmm1 - movhps MMWORD PTR [rdi+4*rax], xmm1 + movq [rsi+4*rax], xmm1 
+ movhps [rdi+4*rax], xmm1 - movq QWORD PTR [rsi+2*rcx], xmm4 - movhps MMWORD PTR [rdi+2*rcx], xmm4 + movq [rsi+2*rcx], xmm4 + movhps [rdi+2*rcx], xmm4 %endmacro @@ -1242,13 +1141,12 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ; const char *blimit, ; const char *limit, ; const char *thresh, -; int count ;) global sym(vp8_mbloop_filter_vertical_edge_sse2) sym(vp8_mbloop_filter_vertical_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 SAVE_XMM 7 GET_GOT rbx push rsi @@ -1256,10 +1154,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 160 ; reserve 160 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; + sub rsp, lf_var_size mov rsi, arg(0) ; src_ptr movsxd rax, dword ptr arg(1) ; src_pixel_step @@ -1272,7 +1167,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): TRANSPOSE_16X8 1, 0 ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 0 + LFV_FILTER_MASK_HEV_MASK neg rax ; start work on filters @@ -1288,11 +1183,12 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): MBV_WRITEBACK_1 + lea rsi, [rsi+rax*8] lea rdi, [rdi+rax*8] MBV_WRITEBACK_2 - add rsp, 160 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -1325,10 +1221,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): ; end prolog ALIGN_STACK 16, rax - sub rsp, 160 ; reserve 160 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; + sub rsp, lf_var_size mov rsi, arg(0) ; u_ptr movsxd rax, dword ptr arg(1) ; src_pixel_step @@ -1337,13 +1230,11 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing lea rcx, [rax+2*rax] - lea rdx, srct - ; Transpose TRANSPOSE_16X8 0, 0 ; calculate filter mask 
and high edge variance - LFV_FILTER_MASK_HEV_MASK 0 + LFV_FILTER_MASK_HEV_MASK ; start work on filters MB_FILTER_AND_WRITEBACK 2 @@ -1360,7 +1251,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): lea rdi, [rsi + rax] MBV_WRITEBACK_2 - add rsp, 160 + add rsp, lf_var_size pop rsp ; begin epilog pop rdi @@ -1389,7 +1280,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): mov rcx, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - + movdqa xmm6, [GLOBAL(tfe)] lea rdx, [rcx + rax] neg rax @@ -1399,15 +1290,15 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): movdqa xmm1, [rcx+2*rax] ; p1 movdqa xmm2, xmm1 - movdqa xmm7, xmm0 + movdqa xmm3, xmm0 psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm7 ; p1-=q1 + psubusb xmm1, xmm3 ; p1-=q1 por xmm1, xmm0 ; abs(p1-q1) - pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero + pand xmm1, xmm6 ; set lsb of each byte to zero psrlw xmm1, 1 ; abs(p1-q1)/2 - movdqa xmm3, XMMWORD PTR [rdx] + movdqa xmm7, XMMWORD PTR [rdx] movdqa xmm5, [rcx+rax] ; p0 movdqa xmm4, [rcx] ; q0 @@ -1421,15 +1312,15 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm3, xmm3 - pcmpeqb xmm5, xmm3 + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; start work on filters pxor xmm2, xmm4 ; p1 offset to convert to signed values - pxor xmm7, xmm4 ; q1 offset to convert to signed values - psubsb xmm2, xmm7 ; p1 - q1 + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm2, xmm3 ; p1 - q1 pxor xmm6, xmm4 ; offset to convert to signed values pxor xmm0, xmm4 ; offset to convert to signed values @@ -1440,14 +1331,14 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) pand xmm5, xmm2 ; mask filter values we don't care about - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - 
p0) + (p1 - q1) + 4 movdqa xmm0, xmm5 - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3 + paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 movdqa xmm1, [GLOBAL(te0)] movdqa xmm2, [GLOBAL(t1f)] - pxor xmm7, xmm7 +; pxor xmm7, xmm7 pcmpgtb xmm7, xmm0 ;save sign pand xmm7, xmm1 ;preserve the upper 3 bits psrlw xmm0, 3 @@ -1605,29 +1496,26 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): pxor xmm3, xmm4 ; q1 offset to convert to signed values psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 -; movdqa xmm7, xmm2 ; q0 - - pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm1, xmm4 ; offset to convert to signed values pxor xmm2, xmm4 ; offset to convert to signed values movdqa xmm3, xmm2 ; offseted ; q0 - psubsb xmm2, xmm6 ; q0 - p0 + psubsb xmm2, xmm1 ; q0 - p0 paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) pand xmm5, xmm0 ; mask filter values we don't care about - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 movdqa xmm0, xmm5 - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3 + paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - movdqa xmm1, [GLOBAL(te0)] + movdqa xmm6, [GLOBAL(te0)] movdqa xmm2, [GLOBAL(t1f)] - pxor xmm7, xmm7 +; pxor xmm7, xmm7 pcmpgtb xmm7, xmm0 ;save sign - pand xmm7, xmm1 ;preserve the upper 3 bits + pand xmm7, xmm6 ;preserve the upper 3 bits psrlw xmm0, 3 pand xmm0, xmm2 ;clear out upper 3 bits por xmm0, xmm7 ;add sign @@ -1635,26 +1523,29 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): pxor xmm7, xmm7 pcmpgtb xmm7, xmm5 ;save sign - pand xmm7, xmm1 ;preserve the upper 3 bits + pand xmm7, xmm6 ;preserve the upper 3 bits psrlw xmm5, 3 pand xmm5, xmm2 ;clear out upper 3 bits por xmm5, xmm7 ;add sign - paddsb xmm6, xmm5 ; p0+= p0 add + paddsb xmm1, xmm5 ; p0+= p0 add pxor xmm3, xmm4 ; unoffset q0 - pxor xmm6, xmm4 ; unoffset p0 +
pxor xmm1, xmm4 ; unoffset p0 movdqa xmm0, t0 ; p1 movdqa xmm4, t1 ; q1 + ; write out order: xmm0 xmm2 xmm1 xmm3 + lea rdx, [rsi + rax*4] + ; transpose back to write out ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + movdqa xmm6, xmm0 + punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 movdqa xmm5, xmm3 punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 @@ -1664,27 +1555,23 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + movdqa xmm3, xmm6 + punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - ; write out order: xmm0 xmm2 xmm1 xmm3 - lea rdx, [rsi + rax*4] - - movd [rsi], xmm1 ; write the second 8-line result - psrldq xmm1, 4 - movd [rdi], xmm1 - psrldq xmm1, 4 - movd [rsi + rax*2], xmm1 - psrldq xmm1, 4 - movd [rdi + rax*2], xmm1 - + movd [rsi], xmm6 ; write the second 8-line result movd [rdx], xmm3 + psrldq xmm6, 4 psrldq xmm3, 4 + movd [rdi], xmm6 movd [rcx], xmm3 + psrldq xmm6, 4 psrldq xmm3, 4 + movd [rsi + rax*2], xmm6 movd [rdx + rax*2], xmm3 + psrldq xmm6, 4 psrldq xmm3, 4 + movd [rdi + rax*2], xmm6 movd [rcx + rax*2], xmm3 neg rax @@ -1695,19 +1582,18 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): lea rcx, [rdx + rax] movd [rsi], xmm0 ; write the first 8-line result - psrldq xmm0, 4 - movd 
[rdi], xmm0 - psrldq xmm0, 4 - movd [rsi + rax*2], xmm0 - psrldq xmm0, 4 - movd [rdi + rax*2], xmm0 - movd [rdx], xmm2 + psrldq xmm0, 4 psrldq xmm2, 4 + movd [rdi], xmm0 movd [rcx], xmm2 + psrldq xmm0, 4 psrldq xmm2, 4 + movd [rsi + rax*2], xmm0 movd [rdx + rax*2], xmm2 + psrldq xmm0, 4 psrldq xmm2, 4 + movd [rdi + rax*2], xmm0 movd [rcx + rax*2], xmm2 add rsp, 32 diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c index 066df43..6586004 100644 --- a/vp8/common/x86/loopfilter_x86.c +++ b/vp8/common/x86/loopfilter_x86.c @@ -16,6 +16,10 @@ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ const unsigned char *limit, const unsigned char *thresh, int count) +#define prototype_loopfilter_nc(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh) + #define prototype_simple_loopfilter(sym) \ void sym(unsigned char *y, int ystride, const unsigned char *blimit) @@ -30,11 +34,11 @@ prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); prototype_loopfilter(vp8_loop_filter_bv_y_sse2); prototype_loopfilter(vp8_loop_filter_bh_y_sse2); #else -prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2); -prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2); +prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2); #endif -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2); +prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2); extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; @@ -124,7 +128,7 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned 
char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); if (u_ptr) vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); @@ -135,7 +139,7 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); if (u_ptr) vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); @@ -149,9 +153,9 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne #if ARCH_X86_64 vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); #else - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); #endif if (u_ptr) @@ -174,9 +178,9 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne #if ARCH_X86_64 vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, 
lfi->lim, lfi->hev_thr, 2); #else - vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); #endif if (u_ptr) -- 2.7.4