According to the 64-bit Windows calling convention, the cdot kernel must return its result in %rax instead of %xmm0.
In zdot, the caller allocates memory for the return value and passes the address of that memory as a hidden first parameter. The callee (zdot) must therefore store the result in that memory and return its address in %rax.
subss %xmm3, %xmm1
#endif
unpcklps %xmm1, %xmm0
+
+#ifdef WINDOWS_ABI
+ movq %xmm0, %rax
+#endif
RESTOREREGISTERS
#define ASSEMBLER
#include "common.h"
+#ifndef WINDOWS_ABI
#define N ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
-#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
-#define INCY %r10
+#define RESULT_ADDRESS ARG1 /*rcx*/
+#define N ARG2 /* rdx */
+#define X ARG3 /* r8 */
+#define INCX ARG4 /* r9*/
+#define Y %r10
+#define INCY %r11
#endif
#include "l1param.h"
PROFCODE
#ifdef WINDOWS_ABI
- movq 40(%rsp), INCY
+ movq 40(%rsp), Y
+ movq 48(%rsp), INCY
#endif
SAVEREGISTERS
subsd %xmm3, %xmm1
#endif
+#ifdef WINDOWS_ABI
+ movq RESULT_ADDRESS, %rax
+ movsd %xmm0, (%rax)
+ movsd %xmm1, 8(%rax)
+#endif
+
RESTOREREGISTERS
ret