It overflowed the internal buffer. Thus, we split vector x into blocks when m is very large.
Thanks to @wangqian for this patch.
#ifndef WINDOWS_ABI
-#define STACKSIZE 64
+#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
#define STACK_Y 16 + STACKSIZE(%rsp)
#define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
-
+#define MMM 56(%rsp) /* stack slot: receives OLD_M at entry; decremented by the block size on each pass through .L0x */
+#define NN 64(%rsp) /* stack slot: receives OLD_N at entry; reloaded into N at .L00 for every block */
+#define AA 72(%rsp) /* stack slot: receives OLD_A at entry; advanced by M * SIZE at .L999 after each block */
+#define LDAX 80(%rsp) /* stack slot: receives OLD_LDA at entry; reloaded into LDA at .L00 for every block */
#else
#define STACKSIZE 256
movq OLD_LDA, LDA
movq OLD_X, X
#else
- movq OLD_M, M
- movq OLD_N, N
- movq OLD_A, A
- movq OLD_LDA, LDA
+ movq OLD_M, MMM
+ movq OLD_N, NN
+ movq OLD_A, AA
+ movq OLD_LDA, LDAX
+#endif
+#ifdef HAVE_SSE3
+#ifndef WINDOWS_ABI
+ movddup %xmm0, ALPHA
+#else
+ movddup %xmm3, ALPHA
#endif
+#else
+#ifndef WINDOWS_ABI
+ movapd %xmm0, ALPHA
+#else
+ movapd %xmm3, ALPHA
+#endif
+ unpcklpd ALPHA, ALPHA
+#endif
+
+
+.L0x: /* head of the blocking loop: choose how many M-direction elements to process in this pass */
+ xorq M,M /* M = 0 */
+ addq $1,M /* M = 1 */
+ salq $22,M /* M = 1 << 22, the maximum block size */
+ subq M,MMM /* MMM -= block size; the flags of this subtraction drive the branch below */
+ jge .L00 /* signed result >= 0: at least one full block was left, keep M = 1 << 22 */
+
+ movq MMM,%rax /* MMM is negative here: less than one full block was left */
+ addq M,%rax /* rax = MMM + M, i.e. the count that was left before the subtraction above */
+ jle .L999x /* nothing left (<= 0): leave the loop and go to the exit label */
+ movq %rax,M /* final, partial block: M = leftover count */
+
+.L00: /* per-block setup: reload the working registers from their stack copies */
+ movq LDAX,LDA /* LDA = saved leading dimension */
+ movq NN,N /* N = saved N; reloaded for every block (presumably because the kernel body modifies N - confirm) */
+ movq AA,A /* A = current block's base pointer (AA is advanced at .L999) */
movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
subq $-16 * SIZE, A
-#ifdef HAVE_SSE3
-#ifndef WINDOWS_ABI
- movddup %xmm0, ALPHA
-#else
- movddup %xmm3, ALPHA
-#endif
-#else
-#ifndef WINDOWS_ABI
- movapd %xmm0, ALPHA
-#else
- movapd %xmm3, ALPHA
-#endif
- unpcklpd ALPHA, ALPHA
-#endif
-
testq M, M
jle .L999
testq N, N
.L21:
#endif
-
subq $4, N
leaq 16 * SIZE(BUFFER), X1
ALIGN_4
.L999: /* end of one block; also the target of the early "jle .L999" taken when M <= 0 */
+ leaq (, M, SIZE), %rax /* rax = M * SIZE (SIZE presumably the element size in bytes - confirm) */
+ addq %rax,AA /* advance the saved A pointer past the M elements just processed */
+ jmp .L0x; /* go back and size the next block. NOTE(review): only AA is advanced here, while Y is reloaded from STACK_Y at .L00 on every pass - confirm that the Y pointer is advanced per block elsewhere */
+ ALIGN_4
+
+.L999x: /* loop exit, reached from .L0x when no elements are left; the instructions that follow reload registers from the stack */
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12