From 40dab58941695ee52b70e0bcb4b93da668e42d6b Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Wed, 4 Nov 2015 20:01:34 -0800
Subject: [PATCH] convolve_copy_sse2: replace SSE w/SSE2 code

this should be neutral or slightly faster on modern (P4+) architectures

Change-Id: Iec4c080275941eb8c9e05a66a2daf0405d86a69b
---
 vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 70 ++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 9c5b414..abc0270 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -13,15 +13,21 @@
 SECTION .text
 
 %macro convolve_fn 1-2
-INIT_XMM sse2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
 %ifidn %2, highbd
 %define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                                 fx, fxs, fy, fys, w, h, bd
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                              dst, dst_stride, \
+                                              fx, fxs, fy, fys, w, h, bd
 %else
 %define pavg pavgb
-cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                              fx, fxs, fy, fys, w, h
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                           dst, dst_stride, \
+                                           fx, fxs, fy, fys, w, h
 %endif
   mov r4d, dword wm
 %ifidn %2, highbd
@@ -152,27 +158,30 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
   jnz .loop16
   RET
 
-INIT_MMX sse
 .w8:
   mov r4d, dword hm
   lea r5q, [src_strideq*3]
   lea r6q, [dst_strideq*3]
 .loop8:
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-  movu m2, [srcq+src_strideq*2]
-  movu m3, [srcq+r5q]
+  movh m0, [srcq]
+  movh m1, [srcq+src_strideq]
+  movh m2, [srcq+src_strideq*2]
+  movh m3, [srcq+r5q]
   lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavg m0, [dstq]
-  pavg m1, [dstq+dst_strideq]
-  pavg m2, [dstq+dst_strideq*2]
-  pavg m3, [dstq+r6q]
+  movh m4, [dstq]
+  movh m5, [dstq+dst_strideq]
+  movh m6, [dstq+dst_strideq*2]
+  movh m7, [dstq+r6q]
+  pavg m0, m4
+  pavg m1, m5
+  pavg m2, m6
+  pavg m3, m7
 %endif
-  mova [dstq              ], m0
-  mova [dstq+dst_strideq  ], m1
-  mova [dstq+dst_strideq*2], m2
-  mova [dstq+r6q          ], m3
+  movh [dstq              ], m0
+  movh [dstq+dst_strideq  ], m1
+  movh [dstq+dst_strideq*2], m2
+  movh [dstq+r6q          ], m3
   lea dstq, [dstq+dst_strideq*4]
   sub r4d, 4
   jnz .loop8
@@ -184,25 +193,25 @@ INIT_MMX sse
   lea r5q, [src_strideq*3]
   lea r6q, [dst_strideq*3]
.loop4:
-  movh m0, [srcq]
-  movh m1, [srcq+src_strideq]
-  movh m2, [srcq+src_strideq*2]
-  movh m3, [srcq+r5q]
+  movd m0, [srcq]
+  movd m1, [srcq+src_strideq]
+  movd m2, [srcq+src_strideq*2]
+  movd m3, [srcq+r5q]
   lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  movh m4, [dstq]
-  movh m5, [dstq+dst_strideq]
-  movh m6, [dstq+dst_strideq*2]
-  movh m7, [dstq+r6q]
+  movd m4, [dstq]
+  movd m5, [dstq+dst_strideq]
+  movd m6, [dstq+dst_strideq*2]
+  movd m7, [dstq+r6q]
   pavg m0, m4
   pavg m1, m5
   pavg m2, m6
   pavg m3, m7
 %endif
-  movh [dstq              ], m0
-  movh [dstq+dst_strideq  ], m1
-  movh [dstq+dst_strideq*2], m2
-  movh [dstq+r6q          ], m3
+  movd [dstq              ], m0
+  movd [dstq+dst_strideq  ], m1
+  movd [dstq+dst_strideq*2], m2
+  movd [dstq+r6q          ], m3
   lea dstq, [dstq+dst_strideq*4]
   sub r4d, 4
   jnz .loop4
@@ -210,6 +219,7 @@ INIT_MMX sse
 %endif
 %endmacro
 
+INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
 %if CONFIG_VP9_HIGHBITDEPTH
-- 
2.7.4
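
For reference, a minimal C-intrinsics sketch of the pattern the reworked
8-wide (8-bit) averaging path follows after this change: each 8-byte row is
loaded into an XMM register (movh), averaged against the destination with
pavgb, and written back with a 64-bit store, instead of going through MMX
registers as the removed "INIT_MMX sse" path did. This is illustration only,
not code from the patch; the helper name and signature below are
hypothetical, while _mm_loadl_epi64, _mm_avg_epu8, and _mm_storel_epi64 are
the standard SSE2 intrinsics for movq loads, pavgb, and movq stores.

#include <emmintrin.h> /* SSE2 */
#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of the non-highbd .loop8 "avg" body (one row per
 * iteration; the asm unrolls four rows). */
static void avg_w8_sketch(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    const __m128i s = _mm_loadl_epi64((const __m128i *)src); /* movh m0, [srcq] */
    const __m128i d = _mm_loadl_epi64((const __m128i *)dst); /* movh m4, [dstq] */
    _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(s, d));    /* pavg + movh */
    src += src_stride;
    dst += dst_stride;
  }
}

A 128-bit pavgb with a memory operand would read a full 16 bytes and require
16-byte alignment, so the 8- and 4-byte paths first load the destination rows
into m4-m7; holding those rows is what the extra 4+AUX_XMM_REGS registers in
the avg variant are reserved for.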