From c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a Mon Sep 17 00:00:00 2001
From: Johann
Date: Thu, 7 Apr 2011 13:17:22 -0400
Subject: [PATCH] Add save/restore xmm registers in x86 assembly code

On Windows x64, xmm6 through xmm15 are callee-saved, so any routine
that clobbers them must save and restore their contents. Went through
the assembly code and fixed the routines that were missing this.
Verified on Windows.

Where possible, remove dependencies on xmm6 and xmm7 instead of saving
them.

The current SAVE_XMM code relies on pushing rbp to the stack to get
16-byte alignment. This broke when rbp wasn't pushed
(vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned
memory accesses. Revisit this, and the stack offsets in
vp8/encoder/x86/sad_sse3.asm, in another change to SAVE_XMM.

Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877
---
 vp8/common/x86/idctllm_sse2.asm        | 50 +++++++++++----------
 vp8/common/x86/subpixel_ssse3.asm      | 12 +++++
 vp8/encoder/x86/dct_sse2.asm           |  2 +
 vp8/encoder/x86/encodeopt.asm          | 82 +++++++++++++++++-----------------
 vp8/encoder/x86/sad_sse2.asm           | 18 ++++----
 vp8/encoder/x86/sad_sse3.asm           | 10 +++--
 vp8/encoder/x86/sad_ssse3.asm          |  4 ++
 vp8/encoder/x86/variance_impl_sse2.asm | 12 +++++
 vpx_ports/x86_abi_support.asm          |  8 ++--
 9 files changed, 120 insertions(+), 78 deletions(-)

diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index edee157..c873869 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
     mov         rdx,        arg(1) ; dequant
     mov         rax,        arg(0) ; qcoeff
 
-    ; Zero out xmm7, for use unpacking
-    pxor        xmm7,       xmm7
-
     movd        xmm4,       [rax]
     movd        xmm5,       [rdx]
 
@@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
 
     pmullw      xmm4,       xmm5
 
+    ; Zero out xmm5, for use unpacking
+    pxor        xmm5,       xmm5
+
     ; clear coeffs
-    movd        [rax],      xmm7
-    movd        [rax+32],   xmm7
+    movd        [rax],      xmm5
+    movd        [rax+32],   xmm5
 ;pshufb
     pshuflw     xmm4,       xmm4,       00000000b
     pshufhw     xmm4,       xmm4,       00000000b
@@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
     lea         rcx,        [3*rcx]
     movq        xmm3,       [rax+rcx]
 
-    punpcklbw   xmm0,       xmm7
-    punpcklbw   xmm1,       xmm7
-    punpcklbw   xmm2,       xmm7
-    punpcklbw   xmm3,       xmm7
+    punpcklbw   xmm0,       xmm5
+    punpcklbw   xmm1,       xmm5
+    punpcklbw   xmm2,       xmm5
+    punpcklbw   xmm3,       xmm5
 
     mov         rax,        arg(3) ; dst
     movsxd      rdx,        dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
     paddw       xmm3,       xmm4
 
     ; pack up before storing
-    packuswb    xmm0,       xmm7
-    packuswb    xmm1,       xmm7
-    packuswb    xmm2,       xmm7
-    packuswb    xmm3,       xmm7
+    packuswb    xmm0,       xmm5
+    packuswb    xmm1,       xmm5
+    packuswb    xmm2,       xmm5
+    packuswb    xmm3,       xmm5
 
     ; store blocks back out
     movq        [rax],      xmm0
@@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
     mov         rdi,        arg(3) ; dst
     mov         rdx,        arg(5) ; dc
 
-    ; Zero out xmm7, for use unpacking
-    pxor        xmm7,       xmm7
+    ; Zero out xmm5, for use unpacking
+    pxor        xmm5,       xmm5
 
     ; load up 2 dc words here == 2*16 = doubleword
     movd        xmm4,       [rdx]
@@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2):
     psraw       xmm4,       3
 
     ; Predict buffer needs to be expanded from bytes to words
-    punpcklbw   xmm0,       xmm7
-    punpcklbw   xmm1,       xmm7
-    punpcklbw   xmm2,       xmm7
-    punpcklbw   xmm3,       xmm7
+    punpcklbw   xmm0,       xmm5
+    punpcklbw   xmm1,       xmm5
+    punpcklbw   xmm2,       xmm5
+    punpcklbw   xmm3,       xmm5
 
     ; Add to predict buffer
     paddw       xmm0,       xmm4
@@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2):
     paddw       xmm3,       xmm4
 
     ; pack up before storing
-    packuswb    xmm0,       xmm7
-    packuswb    xmm1,       xmm7
-    packuswb    xmm2,       xmm7
-    packuswb    xmm3,       xmm7
+    packuswb    xmm0,       xmm5
+    packuswb    xmm1,       xmm5
+    packuswb    xmm2,       xmm5
+    packuswb    xmm3,       xmm5
 
     ; Load destination stride before writing out,
     ;   doesn't need to persist
@@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 0ec18de..1db3d62 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -286,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -393,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -413,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -508,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -580,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -598,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -670,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -718,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -808,6 +819,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 652dd98..287ad48 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -33,6 +33,7 @@
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
+    SAVE_XMM
   %else
     %define     input       rdi
     %define     output      rsi
@@ -53,6 +54,7 @@
     pop         rbp
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
   %endif
 %endif
     ret
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index c0f06bb..e142a75 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -22,33 +22,33 @@ sym(vp8_block_error_xmm):
     ; end prologue
 
     mov         rsi,        arg(0) ;coeff_ptr
-
     mov         rdi,        arg(1) ;dcoef_ptr
-    movdqa      xmm3,       [rsi]
 
-    movdqa      xmm4,       [rdi]
-    movdqa      xmm5,       [rsi+16]
+    movdqa      xmm0,       [rsi]
+    movdqa      xmm1,       [rdi]
+
+    movdqa      xmm2,       [rsi+16]
+    movdqa      xmm3,       [rdi+16]
 
-    movdqa      xmm6,       [rdi+16]
-    psubw       xmm3,       xmm4
+    psubw       xmm0,       xmm1
+    psubw       xmm2,       xmm3
 
-    psubw       xmm5,       xmm6
-    pmaddwd     xmm3,       xmm3
-    pmaddwd     xmm5,       xmm5
+    pmaddwd     xmm0,       xmm0
+    pmaddwd     xmm2,       xmm2
 
-    paddd       xmm3,       xmm5
+    paddd       xmm0,       xmm2
 
-    pxor        xmm7,       xmm7
-    movdqa      xmm0,       xmm3
+    pxor        xmm5,       xmm5
+    movdqa      xmm1,       xmm0
 
-    punpckldq   xmm0,       xmm7
-    punpckhdq   xmm3,       xmm7
+    punpckldq   xmm0,       xmm5
+    punpckhdq   xmm1,       xmm5
 
-    paddd       xmm0,       xmm3
-    movdqa      xmm3,       xmm0
+    paddd       xmm0,       xmm1
+    movdqa      xmm1,       xmm0
 
     psrldq      xmm0,       8
-    paddd       xmm0,       xmm3
+    paddd       xmm0,       xmm1
 
     movq        rax,        xmm0
@@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM ; 6
     push        rsi
     push        rdi
     ; end prolog
 
     mov         rsi,        arg(0) ;coeff_ptr
-    pxor        xmm7,       xmm7
+    pxor        xmm6,       xmm6
 
     mov         rdi,        arg(1) ;dcoef_ptr
-    pxor        xmm2,       xmm2
+    pxor        xmm4,       xmm4
 
-    movd        xmm1,       dword ptr arg(2) ;dc
-    por         xmm1,       xmm2
+    movd        xmm5,       dword ptr arg(2) ;dc
+    por         xmm5,       xmm4
 
-    pcmpeqw     xmm1,       xmm7
+    pcmpeqw     xmm5,       xmm6
     mov         rcx,        16
 
 mberror_loop:
-    movdqa      xmm3,       [rsi]
-    movdqa      xmm4,       [rdi]
+    movdqa      xmm0,       [rsi]
+    movdqa      xmm1,       [rdi]
 
-    movdqa      xmm5,       [rsi+16]
-    movdqa      xmm6,       [rdi+16]
+    movdqa      xmm2,       [rsi+16]
+    movdqa      xmm3,       [rdi+16]
 
-    psubw       xmm5,       xmm6
-    pmaddwd     xmm5,       xmm5
+    psubw       xmm2,       xmm3
+    pmaddwd     xmm2,       xmm2
 
-    psubw       xmm3,       xmm4
-    pand        xmm3,       xmm1
+    psubw       xmm0,       xmm1
+    pand        xmm0,       xmm5
 
-    pmaddwd     xmm3,       xmm3
+    pmaddwd     xmm0,       xmm0
     add         rsi,        32
 
     add         rdi,        32
     sub         rcx,        1
 
-    paddd       xmm2,       xmm5
+    paddd       xmm4,       xmm2
 
-    paddd       xmm2,       xmm3
+    paddd       xmm4,       xmm0
     jnz         mberror_loop
 
-    movdqa      xmm0,       xmm2
-    punpckldq   xmm0,       xmm7
+    movdqa      xmm0,       xmm4
+    punpckldq   xmm0,       xmm6
 
-    punpckhdq   xmm2,       xmm7
-    paddd       xmm0,       xmm2
+    punpckhdq   xmm4,       xmm6
+    paddd       xmm0,       xmm4
 
     movdqa      xmm1,       xmm0
     psrldq      xmm0,       8
@@ -265,6 +266,7 @@ mberror_loop:
     pop         rdi
     pop         rsi
     ; begin epilog
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl):
     mov         rdi,        arg(1) ;d_ptr
     mov         rcx,        16
 
-    pxor        xmm7,       xmm7
+    pxor        xmm3,       xmm3
 
 mbuverror_loop:
 
@@ -352,7 +354,7 @@ mbuverror_loop:
     psubw       xmm1,       xmm2
     pmaddwd     xmm1,       xmm1
 
-    paddd       xmm7,       xmm1
+    paddd       xmm3,       xmm1
 
     add         rsi,        16
     add         rdi,        16
@@ -361,7 +363,7 @@ mbuverror_loop:
     jnz         mbuverror_loop
 
     pxor        xmm0,       xmm0
-    movdqa      xmm1,       xmm7
+    movdqa      xmm1,       xmm3
 
     movdqa      xmm2,       xmm1
     punpckldq   xmm1,       xmm0
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index cc6bc3c..d9ac3ff 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM ; 6
     push        rsi
     push        rdi
     ; end prolog
@@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
     lea         rcx,        [rsi+rax*8]
 
     lea         rcx,        [rcx+rax*8]
-    pxor        xmm7,       xmm7
+    pxor        xmm6,       xmm6
 
 x16x16sad_wmt_loop:
 
@@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
     punpcklbw   xmm1,       xmm3
 
     psadbw      xmm0,       xmm1
-    movq        xmm6,       QWORD PTR [rsi+rax+8]
+    movq        xmm2,       QWORD PTR [rsi+rax+8]
 
     movq        xmm3,       QWORD PTR [rdi+rdx+8]
     lea         rsi,        [rsi+rax*2]
 
     lea         rdi,        [rdi+rdx*2]
-    punpcklbw   xmm4,       xmm6
+    punpcklbw   xmm4,       xmm2
 
     punpcklbw   xmm5,       xmm3
     psadbw      xmm4,       xmm5
 
-    paddw       xmm7,       xmm0
-    paddw       xmm7,       xmm4
+    paddw       xmm6,       xmm0
+    paddw       xmm6,       xmm4
 
     cmp         rsi,        rcx
     jne         x16x16sad_wmt_loop
 
-    movq        xmm0,       xmm7
-    psrldq      xmm7,       8
+    movq        xmm0,       xmm6
+    psrldq      xmm6,       8
 
-    paddw       xmm0,       xmm7
+    paddw       xmm0,       xmm6
     movq        rax,        xmm0
 
     ; begin epilog
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index f0336ab..6668792 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -39,8 +39,9 @@
         %define     ref_stride  r9
         %define     end_ptr     r10
         %define     ret_var     r11
-        %define     result_ptr  [rsp+8+4*8]
-        %define     max_err     [rsp+8+4*8]
+        %define     result_ptr  [rsp+40+4*8]
+        %define     max_err     [rsp+40+4*8]
+        SAVE_XMM
     %else
         %define     src_ptr     rdi
         %define     src_stride  rsi
@@ -72,6 +73,7 @@
     pop         rbp
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
   %endif
 %endif
     ret
@@ -113,7 +115,8 @@
         %define     r2_ptr      r11
         %define     r3_ptr      r8
         %define     ref_stride  r9
-        %define     result_ptr  [rsp+16+4*8]
+        %define     result_ptr  [rsp+48+4*8]
+        SAVE_XMM
         push        rsi
 
         LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
@@ -151,6 +154,7 @@
 %else
   %ifidn __OUTPUT_FORMAT__,x64
     pop         rsi
+    RESTORE_XMM
  %endif
 %endif
     ret
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 69c5eae..7c7cd0a 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM
     push        rsi
     push        rdi
     push        rcx
@@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off:
     pop         rcx
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM
     push        rsi
     push        rdi
     push        rcx
@@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off:
     pop         rcx
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index c2c30de..2c0e170 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     push        rbx
     push        rsi
     push        rdi
@@ -206,6 +207,7 @@ var16loop:
     pop         rdi
     pop         rsi
     pop         rbx
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -321,6 +324,7 @@ var16peloop:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2):
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1:
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index be64cd7..60dff49 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -260,12 +260,12 @@
 %ifidn __OUTPUT_FORMAT__,x64
 %macro SAVE_XMM 0
   sub rsp, 32
-  movdqa XMMWORD PTR [rsp], xmm6
-  movdqa XMMWORD PTR [rsp+16], xmm7
+  movdqu XMMWORD PTR [rsp], xmm6
+  movdqu XMMWORD PTR [rsp+16], xmm7
 %endmacro
 %macro RESTORE_XMM 0
-  movdqa xmm6, XMMWORD PTR [rsp]
-  movdqa xmm7, XMMWORD PTR [rsp+16]
+  movdqu xmm6, XMMWORD PTR [rsp]
+  movdqu xmm7, XMMWORD PTR [rsp+16]
   add rsp, 32
 %endmacro
 %else
-- 
2.7.4
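
Note (not part of the patch): below is a minimal sketch of the
prologue/epilogue pattern this change applies, using the
SAVE_XMM/RESTORE_XMM macros from vpx_ports/x86_abi_support.asm shown
above. The function name sym(example_sse2_func) and the argument count
are hypothetical. On Windows x64, xmm6 through xmm15 are callee-saved,
while xmm0 through xmm5 are volatile and need no saving.

    global sym(example_sse2_func)
    sym(example_sse2_func):
        push        rbp                ; the call left rsp % 16 == 8; this
        mov         rbp, rsp           ;   push restores 16-byte alignment
        SHADOW_ARGS_TO_STACK 4         ; copy register args to the shadow space
        SAVE_XMM                       ; spill callee-saved xmm6/xmm7
        push        rsi
        push        rdi
        ; ... the body may now clobber xmm6 and xmm7 freely ...
        pop         rdi
        pop         rsi
        RESTORE_XMM                    ; reload the caller's xmm6/xmm7
        UNSHADOW_ARGS
        pop         rbp
        ret

Because SAVE_XMM stores with movdqu after this patch, the sketch stays
correct even in routines that never push rbp (as in sad_sse3.asm); with
the old movdqa stores, [rsp] had to be 16-byte aligned, which only the
push of rbp guaranteed. Note also that every exit path following a
SAVE_XMM needs its own RESTORE_XMM, which is why the subpixel filters
above gain one per ret.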