#endif
#define STACKSIZE 16
-
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 20 + STACKSIZE(%esp)
-#define STACK_LDA 24 + STACKSIZE(%esp)
-#define STACK_X 28 + STACKSIZE(%esp)
-#define STACK_INCX 32 + STACKSIZE(%esp)
-#define Y 36 + STACKSIZE(%esp)
-#define STACK_INCY 40 + STACKSIZE(%esp)
-#define BUFFER 44 + STACKSIZE(%esp)
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 20 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
+#define STACK_X 28 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
+#define Y 36 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
+#define BUFFER 44 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+STACKSIZE(%esp)
+#define NN 4+STACKSIZE(%esp)
+#define AA 8+STACKSIZE(%esp)
+#define LDAX 12+STACKSIZE(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
movl STACK_LDA, LDA
+ movl LDA,LDAX # backup LDA
+ movl N,J
+ movl J,NN # backup N
+ movl A,J
+ movl J,AA # backup A
+ movl M,J
+ movl J,MMM # mov M to MMM
+.L0t:
+ xorl J,J
+ addl $1,J
+	sall $23,J				# J=2^23 (row-block size)
+ subl J,MMM # MMM=MMM-J
+ movl J,M
+ jge .L00t
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax
+ jle .L999x
+ movl %eax,M
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # mov AA to A
+
+ movl NN,%eax
+ movl %eax,N # reset N
+
+
+ movl LDAX, LDA # reset LDA
+
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
ALIGN_4
.L999:
+ movl M,J
+ leal (,J,SIZE),%eax
+ addl %eax,AA
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+
+ addl $ARGS,%esp
ret
EPILOGUE
#endif
#define STACKSIZE 16
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 24 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
+#define STACK_X 32 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
+#define Y 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
+#define BUFFER 48 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+STACKSIZE(%esp)
+#define AA 4+STACKSIZE(%esp)
+#define LDAX 8+STACKSIZE(%esp)
+#define NN 12+STACKSIZE(%esp)
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 24 + STACKSIZE(%esp)
-#define STACK_LDA 28 + STACKSIZE(%esp)
-#define STACK_X 32 + STACKSIZE(%esp)
-#define STACK_INCX 36 + STACKSIZE(%esp)
-#define Y 40 + STACKSIZE(%esp)
-#define STACK_INCY 44 + STACKSIZE(%esp)
-#define BUFFER 48 + STACKSIZE(%esp)
-
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp
+
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+
movl STACK_LDA, LDA
+ movl LDA,LDAX # backup LDA
+ movl N,J
+ movl J,NN # backup N
+ movl A,J
+ movl J,AA # backup A
+ movl M,J
+ movl J,MMM # mov M to MMM
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $22,J # J=2^22
+ subl J,MMM # MMM=MMM-J
+ movl J,M
+ jge .L00t
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax
+ jle .L999x
+ movl %eax,M
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # mov AA to A
+
+ movl NN,%eax
+ movl %eax,N # reset N
+
+
+ movl LDAX, LDA # reset LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
leal (,INCY, SIZE), INCY
leal (,LDA, SIZE), LDA
+
subl $-16 * SIZE, A
cmpl $0, N
ALIGN_4
.L999:
+ movl M,J
+ leal (,J,SIZE),%eax
+ addl %eax,AA
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+
+ addl $ARGS,%esp
ret
EPILOGUE
#ifndef WINDOWS_ABI
-#define STACKSIZE 64
+#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
#define STACK_Y 16 + STACKSIZE(%rsp)
#define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
+#define MMM 56(%rsp)
+#define NN 64(%rsp)
+#define AA 72(%rsp)
+#define LDAX 80(%rsp)
#else
#define STACK_Y 72 + STACKSIZE(%rsp)
#define STACK_INCY 80 + STACKSIZE(%rsp)
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
+#define MMM	216(%rsp)
+#define NN	224(%rsp)
+#define AA 232(%rsp)
+#define LDAX 240(%rsp)
#endif
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)
- movq OLD_M, M
- movq OLD_N, N
- movq OLD_A, A
- movq OLD_LDA, LDA
+ movq OLD_M, MMM
+ movq OLD_N, NN
+ movq OLD_A, AA
+ movq OLD_LDA, LDAX
movq OLD_X, X
#else
- movq OLD_M, M
- movq OLD_N, N
- movq OLD_A, A
- movq OLD_LDA, LDA
+ movq OLD_M, MMM
+ movq OLD_N, NN
+ movq OLD_A, AA
+ movq OLD_LDA, LDAX
#endif
-
- movq STACK_INCX, INCX
- movq STACK_Y, Y
- movq STACK_INCY, INCY
- movq STACK_BUFFER, BUFFER
-
#ifndef WINDOWS_ABI
pshufd $0, %xmm0, ALPHA
#else
pshufd $0, %xmm3, ALPHA
#endif
+
+.L0t:
+ xorq M,M
+ addq $1,M
+	salq $22,M				# M=2^22 (row-block size)
+ subq M,MMM
+ jge .L00t
+ ALIGN_4
+
+ movq MMM,%rax
+ addq M,%rax
+ jle .L999x
+ movq %rax,M
+
+.L00t:
+ movq LDAX,LDA
+ movq NN,N
+ movq AA,A
+ movq STACK_INCX, INCX
+ movq STACK_Y, Y
+ movq STACK_INCY, INCY
+ movq STACK_BUFFER, BUFFER
+
leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA
ALIGN_4
.L999:
+ leaq (,M,SIZE),%rax
+ addq %rax,AA
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12