From c852ce398109e12f9b18871102295f29eecb1f1b Mon Sep 17 00:00:00 2001 From: traits Date: Tue, 18 Oct 2011 10:23:17 +0800 Subject: [PATCH] Ref #65. Fixed 64-bit Windows calling convention bug in cdot and zdot. According to the 64-bit Windows calling convention, the cdot kernel must return its single-precision complex result (8 bytes) in %rax instead of %xmm0. For zdot, whose double-precision complex result is 16 bytes, the caller allocates memory for the return value and passes its address as a hidden first parameter (in %rcx). Thus, the callee (zdot) must store the result into that memory and return its address in %rax. --- kernel/x86_64/zdot_sse.S | 4 ++++ kernel/x86_64/zdot_sse2.S | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S index 3302b90..13804e0 100644 --- a/kernel/x86_64/zdot_sse.S +++ b/kernel/x86_64/zdot_sse.S @@ -3483,6 +3483,10 @@ subss %xmm3, %xmm1 #endif unpcklps %xmm1, %xmm0 + +#ifdef WINDOWS_ABI + movq %xmm0, %rax +#endif RESTOREREGISTERS diff --git a/kernel/x86_64/zdot_sse2.S b/kernel/x86_64/zdot_sse2.S index 77fa8e3..63acecc 100644 --- a/kernel/x86_64/zdot_sse2.S +++ b/kernel/x86_64/zdot_sse2.S @@ -39,14 +39,19 @@ #define ASSEMBLER #include "common.h" +#ifndef WINDOWS_ABI #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ -#ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else -#define INCY %r10 +#define RESULT_ADDRESS ARG1 /*rcx*/ +#define N ARG2 /* rdx */ +#define X ARG3 /* r8 */ +#define INCX ARG4 /* r9*/ +#define Y %r10 +#define INCY %r11 #endif #include "l1param.h" @@ -64,7 +69,8 @@ PROFCODE #ifdef WINDOWS_ABI - movq 40(%rsp), INCY + movq 40(%rsp), Y + movq 48(%rsp), INCY #endif SAVEREGISTERS @@ -1544,6 +1550,12 @@ subsd %xmm3, %xmm1 #endif +#ifdef WINDOWS_ABI + movq RESULT_ADDRESS, %rax + movsd %xmm0, (%rax) + movsd %xmm1, 8(%rax) +#endif + RESTOREREGISTERS ret -- 2.7.4