From 4a2b684ef4b361b805be8e0db972cbe9b7e24752 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 15 Apr 2011 10:05:20 -0400 Subject: [PATCH] modify SAVE_XMM for potential 64bit use the win64 abi requires saving and restoring xmm6:xmm15. currently SAVE_XMM and RESTORE_XMM only allow for saving xmm6:xmm7. allow specifying the highest register used and whether the stack is unaligned. Change-Id: Ica5699622ffe3346d3a486f48eef0206c51cf867 --- vp8/common/x86/idctllm_sse2.asm | 4 +-- vp8/common/x86/iwalsh_sse2.asm | 10 +++--- vp8/common/x86/loopfilter_sse2.asm | 20 +++++------ vp8/common/x86/postproc_sse2.asm | 6 ++-- vp8/common/x86/recon_sse2.asm | 2 +- vp8/common/x86/subpixel_sse2.asm | 20 +++++------ vp8/common/x86/subpixel_ssse3.asm | 15 ++++---- vp8/encoder/x86/dct_sse2.asm | 2 +- vp8/encoder/x86/encodeopt.asm | 2 +- vp8/encoder/x86/fwalsh_sse2.asm | 2 +- vp8/encoder/x86/quantize_sse2.asm | 2 +- vp8/encoder/x86/sad_sse2.asm | 2 +- vp8/encoder/x86/sad_sse3.asm | 10 +++--- vp8/encoder/x86/sad_ssse3.asm | 4 +-- vp8/encoder/x86/ssim_opt.asm | 4 +-- vp8/encoder/x86/subtract_sse2.asm | 2 +- vp8/encoder/x86/temporal_filter_apply_sse2.asm | 2 +- vp8/encoder/x86/variance_impl_sse2.asm | 20 +++++------ vp8/encoder/x86/variance_impl_ssse3.asm | 2 +- vpx_ports/x86_abi_support.asm | 47 ++++++++++++++++++++------ 20 files changed, 101 insertions(+), 77 deletions(-) diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm index c873869..34a7e18 100644 --- a/vp8/common/x86/idctllm_sse2.asm +++ b/vp8/common/x86/idctllm_sse2.asm @@ -102,7 +102,7 @@ sym(idct_dequant_full_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -443,7 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm index 83c97df..1da4fd8 100644 --- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm @@ -17,7 +17,7 @@ sym(vp8_short_inv_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - SAVE_XMM + SAVE_XMM 6 push rsi push rdi ; end prolog @@ -41,7 +41,7 @@ sym(vp8_short_inv_walsh4x4_sse2): movdqa xmm4, xmm0 punpcklqdq xmm0, xmm3 ;d1 a1 punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm7, eax + movd xmm6, eax movdqa xmm1, xmm4 ;c1 b1 paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] @@ -66,7 +66,7 @@ sym(vp8_short_inv_walsh4x4_sse2): pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] movdqa xmm3, xmm4 ;ip[4] ip[0] - pshufd xmm7, xmm7, 0 ;03 03 03 03 03 03 03 03 + pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 @@ -90,8 +90,8 @@ sym(vp8_short_inv_walsh4x4_sse2): punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm7 - paddw xmm1, xmm7 + paddw xmm5, xmm6 + paddw xmm1, xmm6 psraw xmm5, 3 psraw xmm1, 3 diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 849133d..c2ce1a1 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -288,7 +288,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -338,7 +338,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -584,7 +584,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -634,7 +634,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1024,7 +1024,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - 
SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1091,7 +1091,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1249,7 +1249,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1318,7 +1318,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1386,7 +1386,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1503,7 +1503,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): push rbp ; save old base pointer value. mov rbp, rsp ; set new base pointer value. SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx ; save callee-saved reg push rsi push rdi diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm index 30b4bf5..06d51ec 100644 --- a/vp8/common/x86/postproc_sse2.asm +++ b/vp8/common/x86/postproc_sse2.asm @@ -26,7 +26,7 @@ sym(vp8_post_proc_down_and_across_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -256,7 +256,7 @@ sym(vp8_mbpost_proc_down_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -456,7 +456,7 @@ sym(vp8_mbpost_proc_across_ip_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 4ad3973..67b6420 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -67,7 +67,7 @@ sym(vp8_recon4b_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 - SAVE_XMM + SAVE_XMM 7 push rsi push rdi ; end prolog diff --git a/vp8/common/x86/subpixel_sse2.asm 
b/vp8/common/x86/subpixel_sse2.asm index b87cad2..83e3b14 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -37,7 +37,7 @@ sym(vp8_filter_block1d8_h6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -157,7 +157,7 @@ sym(vp8_filter_block1d16_h6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -333,7 +333,7 @@ sym(vp8_filter_block1d8_v6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -428,7 +428,7 @@ sym(vp8_filter_block1d16_v6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -538,7 +538,7 @@ sym(vp8_filter_block1d8_h6_only_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -651,7 +651,7 @@ sym(vp8_filter_block1d16_h6_only_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -816,7 +816,7 @@ sym(vp8_filter_block1d8_v6_only_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -908,7 +908,6 @@ sym(vp8_unpack_block1d16_h6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - ;SAVE_XMM ;xmm6, xmm7 are not used here. 
GET_GOT rbx push rsi push rdi @@ -948,7 +947,6 @@ unpack_block1d16_h6_sse2_rowloop: pop rdi pop rsi RESTORE_GOT - ;RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -969,7 +967,7 @@ sym(vp8_bilinear_predict16x16_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1238,7 +1236,7 @@ sym(vp8_bilinear_predict8x8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm index 1db3d62..1ddbc54 100644 --- a/vp8/common/x86/subpixel_ssse3.asm +++ b/vp8/common/x86/subpixel_ssse3.asm @@ -39,7 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -182,7 +182,7 @@ sym(vp8_filter_block1d16_h6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -289,7 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -418,7 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -606,7 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -819,7 +819,6 @@ vp8_filter_block1d4_v6_ssse3_loop: pop rdi pop rsi RESTORE_GOT - RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -886,7 +885,7 @@ sym(vp8_bilinear_predict16x16_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1149,7 +1148,7 @@ sym(vp8_bilinear_predict8x8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index 287ad48..3d52a5d 100644 --- 
a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -33,7 +33,7 @@ %define input rcx %define output rdx %define pitch r8 - SAVE_XMM + SAVE_XMM 7, u %else %define input rdi %define output rsi diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm index e142a75..9946294 100644 --- a/vp8/encoder/x86/encodeopt.asm +++ b/vp8/encoder/x86/encodeopt.asm @@ -208,7 +208,7 @@ sym(vp8_mbblock_error_xmm_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 - SAVE_XMM ; 6 + SAVE_XMM 6 push rsi push rdi ; end prolog diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm index 39439f0..71efd56 100644 --- a/vp8/encoder/x86/fwalsh_sse2.asm +++ b/vp8/encoder/x86/fwalsh_sse2.asm @@ -17,7 +17,7 @@ sym(vp8_short_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 7b7ae70..056b64c 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -20,7 +20,7 @@ global sym(vp8_regular_quantize_b_sse2) sym(vp8_regular_quantize_b_sse2): push rbp mov rbp, rsp - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx %if ABI_IS_32BIT diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index d9ac3ff..04ee72f 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -21,7 +21,7 @@ sym(vp8_sad16x16_wmt): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 - SAVE_XMM ; 6 + SAVE_XMM 6 push rsi push rdi ; end prolog diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index 6668792..2dbcc7d 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -33,15 +33,15 @@ movsxd rdx, dword ptr arg(3) ; ref_stride %else %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 7, u %define src_ptr rcx %define src_stride rdx %define ref_ptr r8 %define ref_stride r9 %define end_ptr r10 %define ret_var r11 - %define result_ptr [rsp+40+4*8] - 
%define max_err [rsp+40+4*8] - SAVE_XMM + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_err [rsp+xmm_stack_space+8+4*8] %else %define src_ptr rdi %define src_stride rsi @@ -108,6 +108,7 @@ xchg rbx, rax %else %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 7, u %define src_ptr rcx %define src_stride rdx %define r0_ptr rsi @@ -115,8 +116,7 @@ %define r2_ptr r11 %define r3_ptr r8 %define ref_stride r9 - %define result_ptr [rsp+48+4*8] - SAVE_XMM + %define result_ptr [rsp+xmm_stack_space+16+4*8] push rsi LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm index 7c7cd0a..6ecf081 100644 --- a/vp8/encoder/x86/sad_ssse3.asm +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -157,7 +157,7 @@ sym(vp8_sad16x16x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - SAVE_XMM + SAVE_XMM 7 push rsi push rdi push rcx @@ -270,7 +270,7 @@ sym(vp8_sad16x8x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - SAVE_XMM + SAVE_XMM 7 push rsi push rdi push rcx diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index d6cebf3..d5d267a 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -66,7 +66,7 @@ sym(vp8_ssim_parms_16x16_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 - SAVE_XMM + SAVE_XMM 15 push rsi push rdi ; end prolog @@ -156,7 +156,7 @@ sym(vp8_ssim_parms_8x8_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 - SAVE_XMM + SAVE_XMM 15 push rsi push rdi ; end prolog diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 3fb23d0..95888f6 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -77,7 +77,7 @@ sym(vp8_subtract_mby_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm index 0127b01..30674c8 100644 --- 
a/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -26,7 +26,7 @@ sym(vp8_temporal_filter_apply_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 2c0e170..5becc73 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -85,7 +85,7 @@ sym(vp8_get16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 push rbx push rsi push rdi @@ -225,7 +225,7 @@ sym(vp8_get16x16pred_error_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -345,7 +345,7 @@ sym(vp8_get8x8var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -534,7 +534,7 @@ sym(vp8_filter_block2d_bil_var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -811,7 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -933,7 +933,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1049,7 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1156,7 +1156,7 @@ sym(vp8_half_vert_variance16x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1264,7 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1369,7 +1369,7 @@ sym(vp8_half_horiz_variance16x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 
7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm index 3c0fef9..a582f8d 100644 --- a/vp8/encoder/x86/variance_impl_ssse3.asm +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -34,7 +34,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm index 60dff49..37a3205 100644 --- a/vpx_ports/x86_abi_support.asm +++ b/vpx_ports/x86_abi_support.asm @@ -255,21 +255,48 @@ %define UNSHADOW_ARGS mov rsp, rbp %endif -; must keep XMM6:XMM15 (libvpx uses XMM6 and XMM7) on Win64 ABI -; rsp register has to be aligned +; Win64 ABI requires that XMM6:XMM15 are callee saved +; SAVE_XMM n, [u] +; store registers 6-n on the stack +; if u is specified, use unaligned movs. +; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return +; address. Typically we follow this up with 'push rbp' - re-aligning the stack - +; but in some cases this is not done and unaligned movs must be used. %ifidn __OUTPUT_FORMAT__,x64 -%macro SAVE_XMM 0 - sub rsp, 32 - movdqu XMMWORD PTR [rsp], xmm6 - movdqu XMMWORD PTR [rsp+16], xmm7 +%macro SAVE_XMM 1-2 a + %if %1 < 6 + %error Only xmm registers 6-15 must be preserved + %else + %assign last_xmm %1 + %define movxmm movdq %+ %2 + %assign xmm_stack_space ((last_xmm - 5) * 16) + sub rsp, xmm_stack_space + %assign i 6 + %rep (last_xmm - 5) + movxmm [rsp + ((i - 6) * 16)], xmm %+ i + %assign i i+1 + %endrep + %endif %endmacro %macro RESTORE_XMM 0 - movdqu xmm6, XMMWORD PTR [rsp] - movdqu xmm7, XMMWORD PTR [rsp+16] - add rsp, 32 + %ifndef last_xmm + %error RESTORE_XMM must be paired with SAVE_XMM n + %else + %assign i last_xmm + %rep (last_xmm - 5) + movxmm xmm %+ i, [rsp +((i - 6) * 16)] + %assign i i-1 + %endrep + add rsp, xmm_stack_space + ; there are a couple functions which return from multiple places.
+ ; otherwise, we could uncomment these: + ; %undef last_xmm + ; %undef xmm_stack_space + ; %undef movxmm + %endif %endmacro %else -%macro SAVE_XMM 0 +%macro SAVE_XMM 1-2 %endmacro %macro RESTORE_XMM 0 %endmacro -- 2.7.4