From 0964ef0e7195fdf990162da9ba3efa911180bb02 Mon Sep 17 00:00:00 2001 From: Fritz Koenig Date: Tue, 28 Sep 2010 12:01:34 -0700 Subject: [PATCH] Optimizations on the loopfilters. - Scheduling for Atom processors - Combining of macros to allow for better interleaving - Change from multiplies to adds for main filter - Use of movhps/movlps to fill xmm registers without shifting and orring Change-Id: I0b3500a5f58abf7085253ec92d64c8a96723040b --- vp8/common/x86/loopfilter_sse2.asm | 1165 +++++++++++++----------------------- 1 file changed, 432 insertions(+), 733 deletions(-) diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index de801df..dc8167d 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -14,175 +14,172 @@ ; Use of pmaxub instead of psubusb to compute filter mask was seen ; in ffvp8 -%macro LFH_FILTER_MASK 1 +%macro LFH_FILTER_AND_HEV_MASK 1 %if %1 movdqa xmm2, [rdi+2*rax] ; q3 movdqa xmm1, [rsi+2*rax] ; q2 + movdqa xmm4, [rsi+rax] ; q1 + movdqa xmm5, [rsi] ; q0 + neg rax ; negate pitch to deal with above border %else - movq xmm0, [rsi + rcx*2] ; q3 - movq xmm2, [rdi + rcx*2] - pslldq xmm2, 8 - por xmm2, xmm0 - movq xmm1, [rsi + rcx] ; q2 - movq xmm3, [rdi + rcx] - pslldq xmm3, 8 - por xmm1, xmm3 + movlps xmm2, [rsi + rcx*2] ; q3 + movlps xmm1, [rsi + rcx] ; q2 + movlps xmm4, [rsi] ; q1 + movlps xmm5, [rsi + rax] ; q0 + + movhps xmm2, [rdi + rcx*2] + movhps xmm1, [rdi + rcx] + movhps xmm4, [rdi] + movhps xmm5, [rdi + rax] + + lea rsi, [rsi + rax*4] + lea rdi, [rdi + rax*4] + movdqa XMMWORD PTR [rsp], xmm1 ; store q2 + movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 %endif movdqa xmm6, xmm1 ; q2 + movdqa xmm3, xmm4 ; q1 + psubusb xmm1, xmm2 ; q2-=q3 psubusb xmm2, xmm6 ; q3-=q2 - por xmm1, xmm2 ; abs(q3-q2) -%if %1 - movdqa xmm4, [rsi+rax] ; q1 -%else - movq xmm0, [rsi] ; q1 - movq xmm4, [rdi] - pslldq xmm4, 8 - por xmm4, xmm0 - movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 -%endif - - movdqa xmm3, xmm4 ; q1 psubusb xmm4, xmm6 ; q1-=q2 psubusb xmm6, xmm3 ; q2-=q1 + por xmm4, xmm6 ; abs(q2-q1) - pmaxub xmm1, xmm4 + por xmm1, xmm2 ; abs(q3-q2) -%if %1 - movdqa xmm4, [rsi] ; q0 -%else - movq xmm4, [rsi + rax] ; q0 - movq xmm0, [rdi + rax] - pslldq xmm0, 8 - por xmm4, xmm0 -%endif + movdqa xmm0, xmm5 ; q0 + pmaxub xmm1, xmm4 - movdqa xmm0, xmm4 ; q0 - psubusb xmm4, xmm3 ; q0-=q1 + psubusb xmm5, xmm3 ; q0-=q1 psubusb xmm3, xmm0 ; q1-=q0 - por xmm4, xmm3 ; abs(q0-q1) - movdqa t0, xmm4 ; save to t0 - pmaxub xmm1, xmm4 -%if %1 - neg rax ; negate pitch to deal with above border + por xmm5, xmm3 ; abs(q0-q1) + movdqa t0, xmm5 ; save to t0 + + pmaxub xmm1, xmm5 +%if %1 movdqa xmm2, [rsi+4*rax] ; p3 movdqa xmm4, [rdi+4*rax] ; p2 + movdqa xmm6, [rsi+2*rax] ; p1 %else - lea rsi, [rsi + rax*4] - lea rdi, [rdi + rax*4] + movlps xmm2, [rsi + rax] ; p3 + movlps xmm4, [rsi] ; p2 + movlps xmm6, [rsi + rcx] ; p1 + + movhps xmm2, [rdi + rax] + movhps xmm4, [rdi] + movhps xmm6, [rdi + rcx] - movq xmm2, [rsi + rax] ; p3 - movq xmm3, [rdi + rax] - pslldq xmm3, 8 - por xmm2, xmm3 - movq xmm4, [rsi] ; p2 - movq xmm5, [rdi] - pslldq xmm5, 8 - por xmm4, xmm5 movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 + movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 %endif movdqa xmm5, xmm4 ; p2 + movdqa xmm3, xmm6 ; p1 + psubusb xmm4, xmm2 ; p2-=p3 psubusb xmm2, xmm5 ; p3-=p2 - por xmm4, xmm2 ; abs(p3 - p2) - pmaxub xmm1, xmm4 -%if %1 - movdqa xmm4, [rsi+2*rax] ; p1 -%else - movq xmm4, [rsi + rcx] ; p1 - movq xmm3, [rdi + rcx] - pslldq xmm3, 8 - por 
xmm4, xmm3 - movdqa XMMWORD PTR [rsp + 48], xmm4 ; store p1 -%endif + psubusb xmm3, xmm5 ; p1-=p2 + pmaxub xmm1, xmm4 ; abs(p3 - p2) - movdqa xmm3, xmm4 ; p1 - psubusb xmm4, xmm5 ; p1-=p2 - psubusb xmm5, xmm3 ; p2-=p1 - por xmm4, xmm5 ; abs(p2 - p1) - pmaxub xmm1, xmm4 + psubusb xmm5, xmm6 ; p2-=p1 + pmaxub xmm1, xmm2 ; abs(p3 - p2) - movdqa xmm2, xmm3 ; p1 + pmaxub xmm1, xmm5 ; abs(p2 - p1) + movdqa xmm2, xmm6 ; p1 + pmaxub xmm1, xmm3 ; abs(p2 - p1) %if %1 movdqa xmm4, [rsi+rax] ; p0 + movdqa xmm3, [rdi] ; q1 %else - movq xmm4, [rsi + rcx*2] ; p0 - movq xmm5, [rdi + rcx*2] - pslldq xmm5, 8 - por xmm4, xmm5 + movlps xmm4, [rsi + rcx*2] ; p0 + movhps xmm4, [rdi + rcx*2] + movdqa xmm3, q1 ; q1 %endif movdqa xmm5, xmm4 ; p0 - psubusb xmm4, xmm3 ; p0-=p1 - psubusb xmm3, xmm5 ; p1-=p0 - por xmm4, xmm3 ; abs(p1 - p0) - movdqa t1, xmm4 ; save to t1 + psubusb xmm4, xmm6 ; p0-=p1 - pmaxub xmm1, xmm4 - psubusb xmm1, xmm7 + psubusb xmm6, xmm5 ; p1-=p0 -%if %1 - movdqa xmm3, [rdi] ; q1 -%else - movdqa xmm3, q1 ; q1 -%endif + por xmm6, xmm4 ; abs(p1 - p0) + mov rdx, arg(2) ; get flimit + + movdqa t1, xmm6 ; save to t1 movdqa xmm4, xmm3 ; q1 + pmaxub xmm1, xmm6 + psubusb xmm3, xmm2 ; q1-=p1 psubusb xmm2, xmm4 ; p1-=q1 + + psubusb xmm1, xmm7 por xmm2, xmm3 ; abs(p1-q1) + + movdqa xmm4, XMMWORD PTR [rdx] ; flimit + + movdqa xmm3, xmm0 ; q0 pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero - psrlw xmm2, 1 ; abs(p1-q1)/2 + + mov rdx, arg(4) ; hev get thresh movdqa xmm6, xmm5 ; p0 - movdqa xmm3, xmm0 ; q0 + psrlw xmm2, 1 ; abs(p1-q1)/2 + psubusb xmm5, xmm3 ; p0-=q0 + paddb xmm4, xmm4 ; flimit*2 (less than 255) + psubusb xmm3, xmm6 ; q0-=p0 por xmm5, xmm3 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255) + + movdqa xmm4, t0 ; hev get abs (q1 - q0) + + movdqa xmm3, t1 ; get abs (p1 - p0) + paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ; get flimit - movdqa xmm2, XMMWORD PTR [rdx] - paddb xmm2, xmm2 ; flimit*2 (less than 255) - paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255) + movdqa xmm2, XMMWORD PTR [rdx] ; hev psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm4, xmm2 ; hev + + psubusb xmm3, xmm2 ; hev por xmm1, xmm5 - pxor xmm5, xmm5 - pcmpeqb xmm1, xmm5 ; mask mm1 -%endmacro -%macro LFH_HEV_MASK 0 - mov rdx, arg(4) ; get thresh - movdqa xmm7, XMMWORD PTR [rdx] + pxor xmm7, xmm7 + paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - movdqa xmm4, t0 ; get abs (q1 - q0) - psubusb xmm4, xmm7 - movdqa xmm3, t1 ; get abs (p1 - p0) - psubusb xmm3, xmm7 - paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb xmm4, xmm5 + pcmpeqb xmm4, xmm5 ; hev + pcmpeqb xmm3, xmm3 ; hev - pcmpeqb xmm5, xmm5 - pxor xmm4, xmm5 + pcmpeqb xmm1, xmm7 ; mask xmm1 + pxor xmm4, xmm3 ; hev %endmacro -%macro BH_FILTER 1 -%if %1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 -%else +%macro B_FILTER 1 +%if %1 == 0 movdqa xmm2, p1 ; p1 movdqa xmm7, q1 ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 +%elif %1 == 2 + lea rdx, srct + + movdqa xmm2, [rdx] ; p1 + movdqa xmm7, [rdx+48] ; q1 + movdqa xmm6, [rdx+16] ; p0 + movdqa xmm0, [rdx+32] ; q0 %endif pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values @@ -196,88 +193,84 @@ movdqa xmm3, xmm0 ; q0 psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + 
pand xmm1, xmm2 ; mask filter values we don't care about + movdqa xmm2, xmm1 + paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 punpckhbw xmm5, xmm2 ; axbxcxdx punpcklbw xmm2, xmm2 ; exfxgxhx + punpcklbw xmm0, xmm1 ; exfxgxhx psraw xmm5, 11 ; sign extended shift right by 3 - psraw xmm2, 11 ; sign extended shift right by 3 - packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - punpcklbw xmm0, xmm1 ; exfxgxhx punpckhbw xmm1, xmm1 ; axbxcxdx + psraw xmm2, 11 ; sign extended shift right by 3 + packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; psraw xmm0, 11 ; sign extended shift right by 3 - psraw xmm1, 11 ; sign extended shift right by 3 + psraw xmm1, 11 ; sign extended shift right by 3 movdqa xmm5, xmm0 ; save results - packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 paddsw xmm5, [ones GLOBAL] - paddsw xmm1, [ones GLOBAL] + paddsw xmm1, [ones GLOBAL] psraw xmm5, 1 ; partial shifted one more time for 2nd tap - psraw xmm1, 1 ; partial shifted one more time for 2nd tap - packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - pandn xmm4, xmm5 ; high edge variance additive -%endmacro + psraw xmm1, 1 ; partial shifted one more time for 2nd tap -%macro BH_WRITEBACK 1 paddsb xmm6, xmm2 ; p0+= p0 add - pxor xmm6, [t80 GLOBAL] ; unoffset -%if %1 - movdqa [rsi+rax], xmm6 ; write back -%else - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] - movq MMWORD PTR [rsi], xmm6 ; p0 - psrldq xmm6, 8 - movq MMWORD PTR [rdi], xmm6 -%endif + packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 -%if %1 - movdqa xmm6, [rsi+2*rax] ; p1 -%else - movdqa xmm6, p1 ; p1 +%if %1 == 0 + movdqa xmm1, p1 ; p1 +%elif %1 == 1 + movdqa xmm1, [rsi+2*rax] ; p1 +%elif %1 == 2 + movdqa xmm1, [rdx] ; p1 %endif - pxor xmm6, [t80 GLOBAL] ; reoffset - paddsb xmm6, xmm4 ; p1+= p1 add + pandn xmm4, xmm5 ; high edge variance additive pxor xmm6, [t80 GLOBAL] ; unoffset -%if %1 - movdqa [rsi+2*rax], xmm6 ; write back -%else - movq MMWORD PTR [rsi + rax], xmm6 ; p1 - psrldq xmm6, 8 - movq MMWORD PTR [rdi + rax], xmm6 -%endif + pxor xmm1, [t80 GLOBAL] ; reoffset psubsb xmm3, xmm0 ; q0-= q0 add + + paddsb xmm1, xmm4 ; p1+= p1 add pxor xmm3, [t80 GLOBAL] ; unoffset -%if %1 - movdqa [rsi], xmm3 ; write back -%else - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - psrldq xmm3, 8 - movq MMWORD PTR [rdi + rcx], xmm3 -%endif + pxor xmm1, [t80 GLOBAL] ; unoffset psubsb xmm7, xmm4 ; q1-= q1 add + pxor xmm7, [t80 GLOBAL] ; unoffset -%if %1 - movdqa [rdi], xmm7 ; write back -%else +%if %1 == 0 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] + movq MMWORD PTR [rsi], xmm6 ; p0 + movhps MMWORD PTR [rdi], xmm6 + movq MMWORD PTR [rsi + rax], xmm1 ; p1 + movhps MMWORD PTR [rdi + rax], xmm1 + movq MMWORD PTR [rsi + rcx], xmm3 ; q0 + movhps MMWORD PTR [rdi + rcx], xmm3 movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 - psrldq xmm7, 8 - movq MMWORD PTR [rdi + rcx*2],xmm7 + movhps MMWORD PTR [rdi + rcx*2],xmm7 +%elif %1 == 1 + movdqa [rsi+rax], xmm6 ; write back + movdqa [rsi+2*rax], xmm1 ; write back + movdqa [rsi], xmm3 ; write back + movdqa [rdi], xmm7 ; write back %endif + %endmacro @@ -314,16 +307,10 @@ sym(vp8_loop_filter_horizontal_edge_sse2): lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - ; calculate breakout conditions - LFH_FILTER_MASK 1 - - ; calculate high edge variance - LFH_HEV_MASK - - ; start work on filters - BH_FILTER 1 - ; write back the result - BH_WRITEBACK 1 
+ ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the result + B_FILTER 1 add rsp, 32 pop rsp @@ -378,15 +365,10 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): lea rsi, [rsi + rcx] lea rdi, [rdi + rcx] - ; calculate breakout conditions - LFH_FILTER_MASK 0 - ; calculate high edge variance - LFH_HEV_MASK - - ; start work on filters - BH_FILTER 0 - ; write back the result - BH_WRITEBACK 0 + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the result + B_FILTER 0 add rsp, 96 pop rsp @@ -400,208 +382,191 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ret -%macro MBH_FILTER 1 -%if %1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 -%else - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 -%endif - pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values - pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values +%macro MB_FILTER_AND_WRITEBACK 1 +%if %1 == 0 + movdqa xmm2, p1 ; p1 + movdqa xmm7, q1 ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 - psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values - pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb xmm2, xmm0 ; 2 * (q0 - p0) - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) + mov rcx, rax + neg rcx +%elif %1 == 2 + lea rdx, srct + + movdqa xmm2, [rdx+32] ; p1 + movdqa xmm7, [rdx+80] ; q1 + movdqa xmm6, [rdx+48] ; p0 + movdqa xmm0, [rdx+64] ; q0 +%endif - pand xmm1, xmm2 ; mask filter values we don't care about - movdqa xmm2, xmm1 ; vp8_filter - pand xmm2, xmm4; ; Filter2 = vp8_filter & hev + pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values + pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values + pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values + pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values - movdqa xmm5, xmm2 - paddsb xmm5, [t3 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 3) + psubsb xmm2, xmm7 ; p1 - q1 + movdqa xmm3, xmm0 ; q0 - punpckhbw xmm7, xmm5 ; axbxcxdx - punpcklbw xmm5, xmm5 ; exfxgxhx + psubsb xmm0, xmm6 ; q0 - p0 - psraw xmm7, 11 ; sign extended shift right by 3 - psraw xmm5, 11 ; sign extended shift right by 3 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) - packsswb xmm5, xmm7 ; Filter2 >>=3; - paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) - punpckhbw xmm7, xmm2 ; axbxcxdx - punpcklbw xmm0, xmm2 ; exfxgxhx + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) - psraw xmm7, 11 ; sign extended shift right by 3 - psraw xmm0, 11 ; sign extended shift right by 3 + pand xmm1, xmm2 ; mask filter values we don't care about - packsswb xmm0, xmm7 ; Filter2 >>=3; - paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 + movdqa xmm2, xmm1 ; vp8_filter - psubsb xmm3, xmm0 ; qs0 =qs0 - filter1 - pandn xmm4, xmm1 ; vp8_filter&=~hev -%endmacro + pand xmm2, xmm4 ; Filter2 = vp8_filter & hev + pxor xmm0, xmm0 -%macro MBH_WRITEBACK 1 - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; + pandn xmm4, xmm1 ; vp8_filter&=~hev pxor xmm1, xmm1 - pxor xmm2, xmm2 - punpcklbw xmm1, xmm4 + punpcklbw xmm0, xmm4 ; Filter 2 (hi) + movdqa xmm5, xmm2 - punpckhbw xmm2, xmm4 - pmulhw xmm1, [s27 GLOBAL] + punpckhbw xmm1, xmm4 ; Filter 2 
(lo) + paddsb xmm5, [t3 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 3) - pmulhw xmm2, [s27 GLOBAL] - paddw xmm1, [s63 GLOBAL] + pmulhw xmm1, [s9 GLOBAL] ; Filter 2 (lo) * 9 - paddw xmm2, [s63 GLOBAL] - psraw xmm1, 7 + pmulhw xmm0, [s9 GLOBAL] ; Filter 2 (hi) * 9 - psraw xmm2, 7 - packsswb xmm1, xmm2 + punpckhbw xmm7, xmm5 ; axbxcxdx + paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) - psubsb xmm3, xmm1 - paddsb xmm6, xmm1 + punpcklbw xmm5, xmm5 ; exfxgxhx + psraw xmm7, 11 ; sign extended shift right by 3 - pxor xmm3, [t80 GLOBAL] - pxor xmm6, [t80 GLOBAL] + psraw xmm5, 11 ; sign extended shift right by 3 + punpckhbw xmm4, xmm2 ; axbxcxdx -%if %1 - movdqa XMMWORD PTR [rsi+rax], xmm6 - movdqa XMMWORD PTR [rsi], xmm3 -%else - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] + punpcklbw xmm2, xmm2 ; exfxgxhx + psraw xmm4, 11 ; sign extended shift right by 3 - movq MMWORD PTR [rsi], xmm6 ; p0 - psrldq xmm6, 8 - movq MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - psrldq xmm3, 8 - movq MMWORD PTR [rdi + rcx], xmm3 -%endif + packsswb xmm5, xmm7 ; Filter2 >>=3; + psraw xmm2, 11 ; sign extended shift right by 3 - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor xmm1, xmm1 - pxor xmm2, xmm2 + packsswb xmm2, xmm4 ; Filter1 >>=3; + movdqa xmm7, xmm1 - punpcklbw xmm1, xmm4 - punpckhbw xmm2, xmm4 + paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 + movdqa xmm4, xmm1 - pmulhw xmm1, [s18 GLOBAL] - pmulhw xmm2, [s18 GLOBAL] + psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 + movdqa xmm5, xmm0 - paddw xmm1, [s63 GLOBAL] - paddw xmm2, [s63 GLOBAL] + movdqa xmm2, xmm5 + paddw xmm0, [s63 GLOBAL] ; Filter 2 (hi) * 9 + 63 - psraw xmm1, 7 - psraw xmm2, 7 + paddw xmm1, [s63 GLOBAL] ; Filter 2 (lo) * 9 + 63 + paddw xmm5, xmm5 ; Filter 2 (hi) * 18 - packsswb xmm1, xmm2 + paddw xmm7, xmm7 ; Filter 2 (lo) * 18 + paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 -%if %1 - movdqa xmm3, XMMWORD PTR [rdi] - movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1 -%else - movdqa xmm3, q1 ; q1 - movdqa xmm6, p1 ; p1 -%endif + paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 + paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 - pxor xmm3, [t80 GLOBAL] - pxor xmm6, [t80 GLOBAL] + paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 + psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 - paddsb xmm6, xmm1 - psubsb xmm3, xmm1 + psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 + psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 - pxor xmm6, [t80 GLOBAL] - pxor xmm3, [t80 GLOBAL] + packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 -%if %1 - movdqa XMMWORD PTR [rdi], xmm3 - movdqa XMMWORD PTR [rsi+rax*2],xmm6 -%else - movq MMWORD PTR [rsi + rcx*2],xmm3 ; q1 - psrldq xmm3, 8 - movq MMWORD PTR [rdi + rcx*2],xmm3 + psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 + packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 + + packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - movq MMWORD PTR [rsi + rax], xmm6 ; p1 - psrldq xmm6, 8 - movq MMWORD PTR [rdi + rax], xmm6 + psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) + paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) + +%if %1 == 0 + movdqa xmm5, q2 ; q2 + movdqa xmm1, q1 ; q1 + movdqa xmm4, p1 ; p1 + movdqa xmm7, p2 ; p2 + +%elif %1 == 1 + movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2 + movdqa 
xmm1, XMMWORD PTR [rdi] ; q1 + movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1 + movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2 +%elif %1 == 2 + movdqa xmm5, XMMWORD PTR [rdx+96] ; q2 + movdqa xmm1, XMMWORD PTR [rdx+80] ; q1 + movdqa xmm4, XMMWORD PTR [rdx+32] ; p1 + movdqa xmm7, XMMWORD PTR [rdx+16] ; p2 %endif - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor xmm1, xmm1 - pxor xmm2, xmm2 - punpcklbw xmm1, xmm4 - punpckhbw xmm2, xmm4 + pxor xmm3, [t80 GLOBAL] ; *oq0 = sq^0x80 + pxor xmm6, [t80 GLOBAL] ; *oq0 = sp^0x80 - pmulhw xmm1, [s9 GLOBAL] - pmulhw xmm2, [s9 GLOBAL] + pxor xmm1, [t80 GLOBAL] + pxor xmm4, [t80 GLOBAL] - paddw xmm1, [s63 GLOBAL] - paddw xmm2, [s63 GLOBAL] + psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) + paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) - psraw xmm1, 7 - psraw xmm2, 7 + pxor xmm1, [t80 GLOBAL] ; *oq1 = sq^0x80; + pxor xmm4, [t80 GLOBAL] ; *op1 = sp^0x80; - packsswb xmm1, xmm2 + pxor xmm7, [t80 GLOBAL] + pxor xmm5, [t80 GLOBAL] -%if %1 - movdqa xmm6, XMMWORD PTR [rdi+rax*4] - neg rax + paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) + psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) - movdqa xmm3, XMMWORD PTR [rdi+rax] -%else - movdqa xmm6, p2 ; p2 - movdqa xmm3, q2 ; q2 -%endif + pxor xmm7, [t80 GLOBAL] ; *op2 = sp^0x80; + pxor xmm5, [t80 GLOBAL] ; *oq2 = sq^0x80; - pxor xmm6, [t80 GLOBAL] - pxor xmm3, [t80 GLOBAL] +%if %1 == 0 + lea rsi, [rsi+rcx*2] + lea rdi, [rdi+rcx*2] - paddsb xmm6, xmm1 - psubsb xmm3, xmm1 + movq MMWORD PTR [rsi], xmm6 ; p0 + movhps MMWORD PTR [rdi], xmm6 + movq MMWORD PTR [rsi + rcx], xmm3 ; q0 + movhps MMWORD PTR [rdi + rcx], xmm3 - pxor xmm6, [t80 GLOBAL] - pxor xmm3, [t80 GLOBAL] -%if %1 - movdqa XMMWORD PTR [rdi+rax ],xmm3 - neg rax + movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1 + movhps MMWORD PTR [rdi+rcx*2], xmm1 - movdqa XMMWORD PTR [rdi+rax*4],xmm6 -%else - movq MMWORD PTR [rsi+rax*2], xmm6 ; p2 - psrldq xmm6, 8 - movq MMWORD PTR [rdi+rax*2], xmm6 + movq MMWORD PTR [rsi + rax], xmm4 ; p1 + movhps MMWORD PTR [rdi + rax], xmm4 + + movq MMWORD PTR [rsi+rax*2], xmm7 ; p2 + movhps MMWORD PTR [rdi+rax*2], xmm7 lea rsi, [rsi + rcx] lea rdi, [rdi + rcx] - movq MMWORD PTR [rsi+rcx*2 ],xmm3 ; q2 - psrldq xmm3, 8 - movq MMWORD PTR [rdi+rcx*2 ],xmm3 + movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2 + movhps MMWORD PTR [rdi+rcx*2], xmm5 +%elif %1 == 1 + movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2 + movdqa XMMWORD PTR [rdi], xmm1 ; q1 + movdqa XMMWORD PTR [rsi], xmm3 ; q0 + movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0 + movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1 + movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2 +%elif %1 == 2 + movdqa XMMWORD PTR [rdx+80], xmm1 ; q1 + movdqa XMMWORD PTR [rdx+64], xmm3 ; q0 + movdqa XMMWORD PTR [rdx+48], xmm6 ; p0 + movdqa XMMWORD PTR [rdx+32], xmm4 ; p1 %endif + %endmacro @@ -638,16 +603,10 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - ; calculate breakout conditions - LFH_FILTER_MASK 1 - - ; calculate high edge variance - LFH_HEV_MASK - - ; start work on filters - MBH_FILTER 1 - ; write back the result - MBH_WRITEBACK 1 + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 1 add rsp, 32 pop rsp @@ -702,16 +661,10 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): lea rsi, [rsi 
+ rcx] lea rdi, [rdi + rcx] - ; calculate breakout conditions - LFH_FILTER_MASK 0 - - ; calculate high edge variance - LFH_HEV_MASK - - ; start work on filters - MBH_FILTER 0 - ; write back the result - MBH_WRITEBACK 0 + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 0 add rsp, 96 pop rsp @@ -725,64 +678,80 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ret -%macro TRANSPOSE_16X8_1 0 - movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 - movq xmm7, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 - - punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 +%macro TRANSPOSE_16X8 2 + movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 - movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 - movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 - movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 - movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 +%if %1 + lea rsi, [rsi+rax*8] +%else + mov rsi, arg(5) ; v_ptr +%endif - movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 - punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 +%if %1 + lea rdi, [rdi+rax*8] +%else + lea rsi, [rsi - 4] +%endif punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 +%if %1 + lea rdx, srct +%else + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing +%endif + + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 movdqa t0, xmm2 ; save to free XMM2 -%endmacro + movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 
84 83 82 81 80 + movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 -%macro TRANSPOSE_16X8_2 1 - movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 - movq xmm5, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 - movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 - movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 - movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 - movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 movdqa xmm6, xmm1 ; @@ -792,75 +761,81 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 movdqa xmm0, xmm5 punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 - punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 -%if %1 +%if %2 movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa [rdx], xmm2 ; save 2 movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 movdqa [rdx+16], xmm3 ; save 3 + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 movdqa [rdx+32], xmm4 ; save 4 movdqa [rdx+48], xmm5 ; save 5 - movdqa xmm1, t0 ; get - movdqa xmm2, xmm1 ; + movdqa xmm2, xmm1 ; punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 %else movdqa [rdx+112], xmm7 ; save 7 - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 movdqa [rdx+96], xmm6 ; save 6 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm2, xmm3 ; 73 
63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa [rdx+32], xmm2 ; save 2 movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 movdqa [rdx+48], xmm3 ; save 3 + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 movdqa [rdx+64], xmm4 ; save 4 movdqa [rdx+80], xmm5 ; save 5 - movdqa xmm1, t0 ; get - movdqa xmm2, xmm1 + movdqa xmm2, xmm1 punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 movdqa [rdx+16], xmm1 + movdqa [rdx], xmm2 %endif %endmacro -%macro LFV_FILTER_MASK 1 +%macro LFV_FILTER_MASK_HEV_MASK 1 movdqa xmm0, xmm6 ; q2 psubusb xmm0, xmm7 ; q2-q3 @@ -899,10 +874,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm1, xmm2 ; p1 psubusb xmm2, xmm3 ; p1-p0 + lea rdx, srct + por xmm2, xmm7 ; abs(p1-p0) movdqa t0, xmm2 ; save abs(p1-p0) - lea rdx, srct pmaxub xmm0, xmm2 @@ -913,136 +889,70 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm5, [rdx+64] ; q0 movdqa xmm7, [rdx+80] ; q1 %endif + mov rdx, arg(3) ; limit + movdqa xmm6, xmm5 ; q0 movdqa xmm2, xmm7 ; q1 - psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm5, xmm7 ; q0-q1 psubusb xmm7, xmm6 ; q1-q0 + por xmm7, xmm5 ; abs(q1-q0) movdqa t1, xmm7 ; save abs(q1-q0) - mov rdx, arg(3) ; limit - movdqa xmm4, [rdx] ; limit + movdqa xmm4, XMMWORD PTR [rdx]; limit pmaxub xmm0, xmm7 - psubusb xmm0, xmm4 + mov rdx, arg(2) ; flimit + psubusb xmm0, xmm4 movdqa xmm5, xmm2 ; q1 + psubusb xmm5, xmm1 ; q1-=p1 psubusb xmm1, xmm2 ; p1-=q1 + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 - mov rdx, arg(2) ; flimit - movdqa xmm2, [rdx] ; flimit + movdqa xmm2, XMMWORD PTR [rdx]; flimit - movdqa xmm1, xmm3 ; p0 - movdqa xmm7, xmm6 ; q0 - psubusb xmm1, xmm7 ; p0-q0 - psubusb xmm7, xmm3 ; q0-p0 - por xmm1, xmm7 ; abs(q0-p0) - paddusb xmm1, xmm1 ; abs(q0-p0)*2 - paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + mov rdx, arg(4) ; get thresh + por xmm1, xmm6 ; abs(q0-p0) paddb xmm2, xmm2 ; flimit*2 (less than 255) - paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit - por xmm1, xmm0; ; mask - pxor xmm0, xmm0 - pcmpeqb xmm1, xmm0 -%endmacro - -%macro LFV_HEV_MASK 0 - mov rdx, arg(4) ; get thresh - movdqa xmm7, XMMWORD PTR [rdx] + movdqa xmm6, t0 ; get abs (q1 - q0) - movdqa xmm4, t0 ; get abs (q1 - q0) - psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh + paddusb xmm1, xmm1 ; abs(q0-p0)*2 movdqa xmm3, t1 ; get abs (p1 - p0) - psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - - por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb xmm4, xmm0 - - pcmpeqb xmm0, xmm0 - pxor xmm4, xmm0 -%endmacro - -%macro BV_FILTER 0 - lea rdx, srct - - movdqa xmm2, [rdx] ; p1 lea rsi, [rsi+rcx*8] - movdqa xmm7, [rdx+48] ; q1 - movdqa xmm6, [rdx+16] ; p0 - movdqa xmm0, [rdx+32] ; q0 - - pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values - pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) - - pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values - pxor xmm0, [t80 GLOBAL] ; 
offset to convert to signed values - - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 - paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - - paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - punpckhbw xmm5, xmm2 - punpcklbw xmm2, xmm2 - - psraw xmm5, 11 - psraw xmm2, 11 - - packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - punpcklbw xmm0, xmm1 ; exfxgxhx - - punpckhbw xmm1, xmm1 ; axbxcxdx - psraw xmm0, 11 ; sign extended shift right by 3 - - psraw xmm1, 11 ; sign extended shift right by 3 - movdqa xmm5, xmm0 ; save results - - packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw xmm5, [ones GLOBAL] - - paddsw xmm1, [ones GLOBAL] - psraw xmm5, 1 ; partial shifted one more time for 2nd tap - - psraw xmm1, 1 ; partial shifted one more time for 2nd tap - packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + movdqa xmm7, XMMWORD PTR [rdx] - pandn xmm4, xmm5 ; high edge variance additive + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - paddsb xmm6, xmm2 ; p0+= p0 add - pxor xmm6, [t80 GLOBAL] ; unoffset + paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) + psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - movdqa xmm1, [rdx] ; p1 - pxor xmm1, [t80 GLOBAL] ; reoffset + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - paddsb xmm1, xmm4 ; p1+= p1 add - pxor xmm1, [t80 GLOBAL] ; unoffset + por xmm1, xmm0 ; mask + pcmpeqb xmm6, xmm0 - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [t80 GLOBAL] ; unoffset + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 - psubsb xmm7, xmm4 ; q1-= q1 add - pxor xmm7, [t80 GLOBAL] ; unoffset + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm6 %endmacro %macro BV_TRANSPOSE 0 @@ -1057,6 +967,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 @@ -1066,6 +977,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 @@ -1132,20 +1044,13 @@ sym(vp8_loop_filter_vertical_edge_sse2): lea rcx, [rax*2+rax] ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
- TRANSPOSE_16X8_1 + TRANSPOSE_16X8 1, 1 - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] - lea rdx, srct - TRANSPOSE_16X8_2 1 - - ; calculate filter mask - LFV_FILTER_MASK 1 - ; calculate high edge variance - LFV_HEV_MASK + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 1 ; start work on filters - BV_FILTER + B_FILTER 2 ; tranpose and write back - only work on q1, q0, p0, p1 BV_TRANSPOSE @@ -1205,23 +1110,16 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing lea rcx, [rax+2*rax] - ;transpose 16x8 to 8x16, and store the 8-line result on stack. - TRANSPOSE_16X8_1 - - mov rsi, arg(5) ; v_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rdx, srct - TRANSPOSE_16X8_2 1 - ; calculate filter mask - LFV_FILTER_MASK 1 - ; calculate high edge variance - LFV_HEV_MASK + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 1 ; start work on filters - BV_FILTER + B_FILTER 2 ; tranpose and write back - only work on q1, q0, p0, p1 BV_TRANSPOSE @@ -1247,174 +1145,12 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): pop rbp ret - -%macro MBV_FILTER 0 - lea rdx, srct - - movdqa xmm2, [rdx+32] ; p1 - movdqa xmm7, [rdx+80] ; q1 - movdqa xmm6, [rdx+48] ; p0 - movdqa xmm0, [rdx+64] ; q0 - - pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values - pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values - pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values - pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - - movdqa xmm3, xmm0 ; q0 - - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) - - paddsb xmm2, xmm0 ; 2 * (q0 - p0) - paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1) - - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 ; vp8_filter - pand xmm2, xmm4; ; Filter2 = vp8_filter & hev - - movdqa xmm5, xmm2 - paddsb xmm5, [t3 GLOBAL] - - punpckhbw xmm7, xmm5 ; axbxcxdx - punpcklbw xmm5, xmm5 ; exfxgxhx - - psraw xmm7, 11 ; sign extended shift right by 3 - psraw xmm5, 11 ; sign extended shift right by 3 - - packsswb xmm5, xmm7 ; Filter2 >>=3; - - paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) - - punpcklbw xmm0, xmm2 ; exfxgxhx - punpckhbw xmm7, xmm2 ; axbxcxdx - - psraw xmm0, 11 ; sign extended shift right by 3 - psraw xmm7, 11 ; sign extended shift right by 3 - - packsswb xmm0, xmm7 ; Filter2 >>=3; - - psubsb xmm3, xmm0 ; qs0 =qs0 - filter1 - paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 - - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn xmm4, xmm1 ; vp8_filter&=~hev - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor xmm1, xmm1 - - pxor xmm2, xmm2 - punpcklbw xmm1, xmm4 - - punpckhbw xmm2, xmm4 - pmulhw xmm1, [s27 GLOBAL] - - pmulhw xmm2, [s27 GLOBAL] - paddw xmm1, [s63 GLOBAL] - - paddw xmm2, [s63 GLOBAL] - psraw xmm1, 7 - - psraw xmm2, 7 - packsswb xmm1, xmm2 - - psubsb xmm3, xmm1 - paddsb xmm6, xmm1 - - pxor xmm3, [t80 GLOBAL] - pxor xmm6, [t80 GLOBAL] - - movdqa [rdx+48], xmm6 - movdqa [rdx+64], xmm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; 
*op1 = s^0x80; - pxor xmm1, xmm1 - pxor xmm2, xmm2 - - punpcklbw xmm1, xmm4 - punpckhbw xmm2, xmm4 - - pmulhw xmm1, [s18 GLOBAL] - pmulhw xmm2, [s18 GLOBAL] - - paddw xmm1, [s63 GLOBAL] - paddw xmm2, [s63 GLOBAL] - - psraw xmm1, 7 - psraw xmm2, 7 - - packsswb xmm1, xmm2 - - movdqa xmm3, [rdx + 80] ; q1 - movdqa xmm6, [rdx + 32] ; p1 - - pxor xmm3, [t80 GLOBAL] - pxor xmm6, [t80 GLOBAL] - - paddsb xmm6, xmm1 - psubsb xmm3, xmm1 - - pxor xmm6, [t80 GLOBAL] - pxor xmm3, [t80 GLOBAL] - - movdqa [rdx + 80], xmm3 - movdqa [rdx + 32], xmm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor xmm1, xmm1 - pxor xmm2, xmm2 - - punpcklbw xmm1, xmm4 - punpckhbw xmm2, xmm4 - - pmulhw xmm1, [s9 GLOBAL] - pmulhw xmm2, [s9 GLOBAL] - - paddw xmm1, [s63 GLOBAL] - paddw xmm2, [s63 GLOBAL] - - psraw xmm1, 7 - psraw xmm2, 7 - - packsswb xmm1, xmm2 - - movdqa xmm6, [rdx+16] - movdqa xmm3, [rdx+96] - - pxor xmm6, [t80 GLOBAL] - pxor xmm3, [t80 GLOBAL] - - paddsb xmm6, xmm1 - psubsb xmm3, xmm1 - - pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 15 06 -%endmacro - %macro MBV_TRANSPOSE 0 movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 @@ -1422,10 +1158,10 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 @@ -1434,7 +1170,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 @@ -1449,70 +1185,53 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): %macro MBV_WRITEBACK_1 0 movq QWORD PTR [rsi], xmm0 - psrldq xmm0, 8 - - movq QWORD PTR [rdi], xmm0 + movhps QWORD PTR [rdi], xmm0 movq QWORD PTR [rsi+2*rax], xmm6 - psrldq xmm6, 8 - - movq QWORD PTR [rdi+2*rax], xmm6 + movhps QWORD PTR [rdi+2*rax], xmm6 - movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + movdqa xmm0, xmm3 
; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 - punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 movq QWORD PTR [rsi+4*rax], xmm0 - psrldq xmm0, 8 + movhps QWORD PTR [rdi+4*rax], xmm0 - movq QWORD PTR [rdi+4*rax], xmm0 - - movq QWORD PTR [rsi+2*rcx], xmm5 - psrldq xmm5, 8 - - movq QWORD PTR [rdi+2*rcx], xmm5 + movq QWORD PTR [rsi+2*rcx], xmm3 + movhps QWORD PTR [rdi+2*rcx], xmm3 movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 movdqa xmm0, xmm2 - punpcklwd xmm0, xmm3 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 - punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 + punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 - movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 - punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 + punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 %endmacro %macro MBV_WRITEBACK_2 0 - movq QWORD PTR [rsi], xmm1 - psrldq xmm1, 8 - - movq QWORD PTR [rdi], xmm1 + movq QWORD PTR [rsi], xmm1 + movhps QWORD PTR [rdi], xmm1 - movq QWORD PTR [rsi+2*rax], xmm3 - psrldq xmm3, 8 - - movq QWORD PTR [rdi+2*rax], xmm3 + movq QWORD PTR [rsi+2*rax], xmm5 + movhps QWORD PTR [rdi+2*rax], xmm5 movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 - punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 - movq QWORD PTR [rsi+4*rax], xmm1 - psrldq xmm1, 8 - - movq QWORD PTR [rdi+4*rax], xmm1 + movq QWORD PTR [rsi+4*rax], xmm1 + movhps QWORD PTR [rdi+4*rax], xmm1 movq QWORD PTR [rsi+2*rcx], xmm4 - psrldq xmm4, 8 - - movq QWORD PTR [rdi+2*rcx], xmm4 + movhps QWORD PTR [rdi+2*rcx], xmm4 %endmacro @@ -1550,21 +1269,14 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): lea rcx, [rax*2+rax] ; Transpose - TRANSPOSE_16X8_1 + TRANSPOSE_16X8 1, 0 - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] - lea rdx, srct - TRANSPOSE_16X8_2 0 - - ; calculate filter mask - LFV_FILTER_MASK 0 - ; calculate high edge variance - LFV_HEV_MASK + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 0 neg rax ; start work on filters - MBV_FILTER + MB_FILTER_AND_WRITEBACK 2 lea rsi, [rsi+rax*8] lea rdi, [rdi+rax*8] @@ -1625,23 +1337,16 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing lea rcx, [rax+2*rax] - ; Transpose - TRANSPOSE_16X8_1 - - ; XMM3 XMM4 XMM7 in use - mov rsi, arg(5) ; v_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] lea rdx, srct - TRANSPOSE_16X8_2 0 - ; calculate filter mask - LFV_FILTER_MASK 0 - ; calculate high edge variance - LFV_HEV_MASK + ; Transpose + TRANSPOSE_16X8 0, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 0 ; start work on filters - MBV_FILTER + MB_FILTER_AND_WRITEBACK 2 ; transpose and write back MBV_TRANSPOSE @@ -2068,12 +1773,6 @@ align 16 ones: 
times 8 dw 0x0001 align 16 -s27: - times 8 dw 0x1b00 -align 16 -s18: - times 8 dw 0x1200 -align 16 s9: times 8 dw 0x0900 align 16 -- 2.7.4
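
For reference, the "multiplies to adds" bullet corresponds to dropping the s27 and s18 constants in MB_FILTER_AND_WRITEBACK: the filter keeps a single pmulhw by s9 and derives the 18x and 27x taps from that 9x product with paddw. A minimal scalar sketch of the equivalent per-pixel arithmetic, assuming hypothetical helper names (clamp_s8, mb_filter_taps) that are not part of the patch:

    #include <stdio.h>

    /* Clamp to the signed 8-bit range, matching packsswb saturation. */
    static int clamp_s8(int v)
    {
        if (v < -128) return -128;
        if (v > 127)  return 127;
        return v;
    }

    /* filter2 is the hev-masked filter value for one pixel (signed).
     * The old code used three pmulhw constants (s27, s18, s9); the new
     * code keeps one 9x product and builds the rest with additions,
     * which this scalar model mirrors. */
    static void mb_filter_taps(int filter2, int *u1, int *u2, int *u3)
    {
        int f9  = filter2 * 9;       /* single multiply (pmulhw by s9) */
        int f18 = f9 + f9;           /* 18x built by addition          */
        int f27 = f18 + f9;          /* 27x built by addition          */

        *u1 = clamp_s8((f9  + 63) >> 7);   /* applied to p2/q2 */
        *u2 = clamp_s8((f18 + 63) >> 7);   /* applied to p1/q1 */
        *u3 = clamp_s8((f27 + 63) >> 7);   /* applied to p0/q0 */
    }

    int main(void)
    {
        int u1, u2, u3;
        mb_filter_taps(40, &u1, &u2, &u3);
        printf("u1=%d u2=%d u3=%d\n", u1, u2, u3);  /* 3 6 8 */
        return 0;
    }

The movlps/movhps bullet is the analogous simplification on the load side: each movq + pslldq + por triple that stitched two 8-byte rows into one xmm register is replaced by a movlps/movhps pair that fills the low and high quadwords directly.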