From: Scott LaVarnway Date: Thu, 8 Dec 2011 19:37:59 +0000 (-0500) Subject: Improved mmx/sse2 versions of iwalsh X-Git-Tag: 1.0_branch~205^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9fa6132fc5852ce7f2a03331545e36439404dd95;p=profile%2Fivi%2Flibvpx.git Improved mmx/sse2 versions of iwalsh Removed unnecessary transposes. Change-Id: I029fbaf8afafee34d54a4f3333c22023c15003c3 --- diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm index 3ab066b..6582687 100644 --- a/vp8/common/x86/iwalsh_mmx.asm +++ b/vp8/common/x86/iwalsh_mmx.asm @@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi ; end prolog - mov rax, 3 - mov rsi, arg(0) - mov rdi, arg(1) - shl rax, 16 + mov rdx, arg(0) + mov rax, 30003h - movq mm0, [rsi + 0] ;ip[0] - movq mm1, [rsi + 8] ;ip[4] - or rax, 3 ;00030003h + movq mm0, [rdx + 0] ;ip[0] + movq mm1, [rdx + 8] ;ip[4] + movd mm7, rax - movq mm2, [rsi + 16] ;ip[8] - movq mm3, [rsi + 24] ;ip[12] + movq mm2, [rdx + 16] ;ip[8] + movq mm3, [rdx + 24] ;ip[12] + punpcklwd mm7, mm7 ;0003000300030003h + mov rdx, arg(1) - movq mm7, rax - movq mm4, mm0 + movq mm4, mm0 + movq mm5, mm1 - punpcklwd mm7, mm7 ;0003000300030003h - movq mm5, mm1 + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl + movq mm6, mm4 ;temp al + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl - movq mm6, mm4 ;temp al + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl + movq mm5, mm0 ;temp dl + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl ; 03 02 01 00 ; 13 12 11 10 ; 23 22 21 20 ; 33 32 31 30 - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] ;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl + movq mm1, mm0 + movq mm5, mm4 + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + paddw mm1, mm7 + paddw mm6, mm7 + psraw mm1, 3 + psraw mm6, 3 + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl + paddw mm0, mm7 + paddw mm5, mm7 + psraw mm0, 3 + psraw mm5, 3 ;~~~~~~~~~~~~~~~~~~~~~ - movq mm3, mm1 ; 03 02 01 00 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm4, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm4, mm5 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] - - paddw mm0, mm7 - paddw mm1, mm7 - paddw mm2, mm7 - paddw mm3, mm7 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - -; movq [rdi + 0], mm0 -; movq [rdi + 8], mm1 -; movq [rdi + 16], mm2 -; movq [rdi + 24], mm3 - - movd eax, mm0 - psrlq mm0, 32 - mov word ptr[rdi+32*0], ax - shr eax, 16 - mov word ptr[rdi+32*1], ax - movd eax, mm0 - mov word ptr[rdi+32*2], ax - shr eax, 16 - mov word ptr[rdi+32*3], ax - - movd ecx, mm1 - psrlq mm1, 32 - mov word ptr[rdi+32*4], cx - shr ecx, 16 - mov word ptr[rdi+32*5], cx - movd ecx, mm1 - mov word ptr[rdi+32*6], cx - shr ecx, 16 - mov word ptr[rdi+32*7], cx - - movd eax, mm2 - psrlq mm2, 32 - mov word ptr[rdi+32*8], ax - shr eax, 16 - mov word ptr[rdi+32*9], ax - movd eax, mm2 - mov word ptr[rdi+32*10], ax - shr eax, 16 - mov word ptr[rdi+32*11], ax - - movd ecx, mm3 - psrlq mm3, 32 - mov word ptr[rdi+32*12], cx - shr ecx, 16 - mov word ptr[rdi+32*13], cx - movd ecx, mm3 - mov word ptr[rdi+32*14], cx - shr ecx, 16 - mov word ptr[rdi+32*15], cx + + movd eax, mm1 + movd ecx, mm0 + psrlq mm0, 32 + psrlq mm1, 32 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*1], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*5], cx + movd eax, mm1 + movd ecx, mm0 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*9], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*13], cx + + movd eax, mm6 + movd ecx, mm5 + psrlq mm5, 32 + psrlq mm6, 32 + mov word ptr[rdx+32*2], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*6], ax + mov word ptr[rdx+32*7], cx + movd eax, mm6 + movd ecx, mm5 + mov word ptr[rdx+32*10], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*14], ax + mov word ptr[rdx+32*15], cx ; begin epilog - pop rdi - pop rsi UNSHADOW_ARGS pop rbp ret diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm index 5a7133d..51cb5e2 100644 --- a/vp8/common/x86/iwalsh_sse2.asm +++ b/vp8/common/x86/iwalsh_sse2.asm @@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - SAVE_XMM 6 - push rsi - push rdi ; end prolog - mov rsi, arg(0) - mov rdi, arg(1) - mov rax, 3 + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h - movdqa xmm0, [rsi + 0] ;ip[4] ip[0] - movdqa xmm1, [rsi + 16] ;ip[12] ip[8] + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] - shl rax, 16 - or rax, 3 ;00030003h - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - movdqa xmm4, xmm0 + movdqa xmm4, xmm0 punpcklqdq xmm0, xmm3 ;d1 a1 punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm6, eax - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;;;temp output -;; movdqu [rdi + 0], xmm4 -;; movdqu [rdi + 16], xmm3 - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; 13 12 11 10 03 02 01 00 ; ; 33 32 31 30 23 22 21 20 ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] - pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - movdqa xmm5, xmm4 + movdqa xmm5, xmm4 punpcklqdq xmm4, xmm3 ;d1 a1 punpckhqdq xmm5, xmm3 ;c1 b1 - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm6 - paddw xmm1, xmm6 - - psraw xmm5, 3 - psraw xmm1, 3 - -;; movdqa [rdi + 0], xmm5 -;; movdqa [rdi + 16], xmm1 - - movd eax, xmm5 - psrldq xmm5, 4 - mov word ptr[rdi+32*0], ax - shr eax, 16 - mov word ptr[rdi+32*1], ax - movd eax, xmm5 - psrldq xmm5, 4 - mov word ptr[rdi+32*2], ax - shr eax, 16 - mov word ptr[rdi+32*3], ax - - movd eax, xmm5 - psrldq xmm5, 4 - mov word ptr[rdi+32*4], ax - shr eax, 16 - mov word ptr[rdi+32*5], ax - movd eax, xmm5 - mov word ptr[rdi+32*6], ax - shr eax, 16 - mov word ptr[rdi+32*7], ax - - movd eax, xmm1 - psrldq xmm1, 4 - mov word ptr[rdi+32*8], ax - shr eax, 16 - mov word ptr[rdi+32*9], ax - movd eax, xmm1 - psrldq xmm1, 4 - mov word ptr[rdi+32*10], ax - shr eax, 16 - mov word ptr[rdi+32*11], ax - - movd eax, xmm1 - psrldq xmm1, 4 - mov word ptr[rdi+32*12], ax - shr eax, 16 - mov word ptr[rdi+32*13], ax - movd eax, xmm1 - mov word ptr[rdi+32*14], ax - shr eax, 16 - mov word ptr[rdi+32*15], ax + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx ; begin epilog - pop rdi - pop rsi - RESTORE_XMM UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004