#define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
+
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
-#define LDAX 12+ARGS(%esp)
#define I %eax
#define J %ebx
movl YY,J
movl J,Y
- movl STACK_LDA, LDA
+ movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
- movl YY,J
- addl %eax,J
- movl J,YY
+ movl STACK_INCY,INCY
+ imull INCY,%eax
+ addl %eax,YY
jmp .L0t
ALIGN_4
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
- movl YY,J
- addl %eax,J
- movl J,YY
+ movl STACK_INCY,INCY
+ imull INCY,%eax
+ addl %eax,YY
jmp .L0t
ALIGN_4
/* Patch hunks for a kernel that walks M in fixed-size blocks: MMM is reduced by the block size J on each pass, while AA and XX hold the saved A and X pointers and are advanced after each block. */
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
-#define MMM 0+STACKSIZE(%esp)
-#define NN 4+STACKSIZE(%esp)
-#define AA 8+STACKSIZE(%esp)
-#define LDAX 12+STACKSIZE(%esp)
-#define XX 16+STACKSIZE(%esp)
+#define MMM 0+ARGS(%esp)
+#define AA 4+ARGS(%esp)
+#define XX 8+ARGS(%esp)
#define I %eax
#define J %ebx
PROFCODE
- movl STACK_LDA, LDA
- movl LDA,LDAX # backup LDA
movl STACK_X, X
movl X,XX # backup X
- movl N,J
- movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
addl $1,J
sall $22,J # J<<=22; 2^22*sizeof(float)=2^24 bytes=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer.
- # Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M # M = block size for this pass; movl leaves the flags of the subl untouched
jge .L00t # signed test on MMM-J: taken while the result is >= 0
movl AA,%eax
movl %eax,A # mov AA to A
- movl NN,%eax
- movl %eax,N # reset N
-
-
- movl LDAX, LDA # reset LDA
- movl XX,X
+ movl XX,%eax
+ movl %eax,X # restore X from the saved slot XX
+ movl STACK_LDA, LDA # reload LDA from its stack argument instead of a backup slot
movl STACK_INCX, INCX
movl STACK_INCY, INCY
movl M,J
leal (,J,SIZE),%eax # %eax = M*SIZE
addl %eax,AA # advance the saved A pointer by that offset
- movl XX,J
- addl %eax,J
- movl J,XX
+ movl STACK_INCX,INCX
+ imull INCX,%eax # scale the offset by INCX
+ addl %eax,XX # advance the saved X pointer by M*SIZE*INCX
jmp .L0t # next block; the .L0t label is outside this view
ALIGN_4
#endif
/* Patch hunks for the double-precision variant of the blocked kernel: ARGS grows from 16 to 20 and the scratch slots become MMM, AA and XX, addressed at ARGS+0/4/8 from %esp. */
#define STACKSIZE 16
-#define ARGS 16
+#define ARGS 20
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
-#define MMM 0+STACKSIZE(%esp)
-#define AA 4+STACKSIZE(%esp)
-#define LDAX 8+STACKSIZE(%esp)
-#define NN 12+STACKSIZE(%esp)
+#define MMM 0+ARGS(%esp)
+#define AA 4+ARGS(%esp)
+#define XX 8+ARGS(%esp)
#define I %eax
#define J %ebx
PROFCODE
- movl STACK_LDA, LDA
- movl LDA,LDAX # backup LDA
- movl N,J
- movl J,NN # backup N
+ movl STACK_X, X
+ movl X,XX # backup X
movl A,J
movl J,AA # backup A
movl M,J
addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer.
- # Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M # M = block size for this pass; movl leaves the flags of the subl untouched
jge .L00t # signed test on MMM-J: taken while the result is >= 0
movl %eax,M # M = value held in %eax (computed by lines outside this view)
.L00t:
+ movl XX,%eax
+ movl %eax, X # restore X from the saved slot XX
+
movl AA,%eax
movl %eax,A # mov AA to A
- movl NN,%eax
- movl %eax,N # reset N
-
-
- movl LDAX, LDA # reset LDA
- movl STACK_X, X
+ movl STACK_LDA, LDA # reload LDA from its stack argument instead of a backup slot
movl STACK_INCX, INCX
movl STACK_INCY, INCY
movl M,J
leal (,J,SIZE),%eax # %eax = M*SIZE
addl %eax,AA # advance the saved A pointer by that offset
+ movl STACK_INCX,INCX
+ imull INCX,%eax # scale the offset by INCX
+ addl %eax,XX # advance the saved X pointer by M*SIZE*INCX
jmp .L0t # next block; the .L0t label is outside this view
ALIGN_4
#endif
/* Patch adding a block loop around the kernel: ARGS bytes of scratch are reserved on entry, all argument offsets are shifted by ARGS, and the slots MMM, YY and AA carry the outstanding count and the running Y and A pointers between blocks. */
/* NOTE(review): three pushl are visible below against four popl at .L999x and STACKSIZE 16 - presumably a pushl %ebx sits in context not shown here; confirm. */
#define STACKSIZE 16
-
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA_R 16 + STACKSIZE(%esp)
-#define ALPHA_I 20 + STACKSIZE(%esp)
-#define A 24 + STACKSIZE(%esp)
-#define STACK_LDA 28 + STACKSIZE(%esp)
-#define STACK_X 32 + STACKSIZE(%esp)
-#define STACK_INCX 36 + STACKSIZE(%esp)
-#define Y 40 + STACKSIZE(%esp)
-#define STACK_INCY 44 + STACKSIZE(%esp)
-#define BUFFER 48 + STACKSIZE(%esp)
+#define ARGS 20
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
+#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
+#define A 24 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
+#define STACK_X 32 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
+#define Y 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
+#define BUFFER 48 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+ARGS(%esp)
+#define YY 4+ARGS(%esp)
+#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp # reserve ARGS bytes of scratch before the register saves
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+ movl Y,J
+ movl J,YY # YY = running copy of the Y argument
+ movl A,J
+ movl J,AA # AA = running copy of the A argument
+ movl M,J
+ movl J,MMM # MMM = copy of M, counted down per block
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $20,J # J = 1<<20, the block size
+ subl J,MMM # MMM -= J
+ movl J,M # M = J for this pass; movl leaves the flags of the subl untouched
+ jge .L00t # MMM still >= 0 (signed): run a full block
+ ALIGN_3
+
+ movl MMM,%eax
+ addl J,%eax # %eax = MMM + J, the count left before the subtraction
+ jle .L999x # nothing left: leave the loop
+ movl %eax,M # otherwise run one last, shorter block
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # A argument slot = current block pointer
+
+ movl YY,J
+ movl J,Y # Y argument slot = current block pointer
+
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
ALIGN_3
.L999:
+ movl M,%eax
+ sall $ZBASE_SHIFT,%eax # %eax = M << ZBASE_SHIFT
+ addl %eax,AA # advance the running A pointer
+ movl STACK_INCY,INCY
+ imull INCY,%eax # scale the offset by INCY
+ addl %eax,YY # advance the running Y pointer
+ jmp .L0t # next block
+ ALIGN_3
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+ addl $ARGS,%esp # release the scratch area reserved in the prologue
ret
EPILOGUE
#endif
/* Patch adding a block loop around the kernel: ARGS (16) bytes of scratch are reserved on entry, all argument offsets are shifted by ARGS, and the slots MMM, YY and AA carry the outstanding count and the running Y and A pointers between blocks. */
/* NOTE(review): three pushl are visible below against four popl at .L999x and STACKSIZE 16 - presumably a pushl %ebx sits in context not shown here; confirm. */
#define STACKSIZE 16
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
+#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
+#define A 32 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
+#define STACK_X 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
+#define Y 48 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
+#define BUFFER 56 + STACKSIZE+ARGS(%esp)
+#define MMM 0 + ARGS(%esp)
+#define YY 4 + ARGS(%esp)
+#define AA 8 + ARGS(%esp)
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA_R 16 + STACKSIZE(%esp)
-#define ALPHA_I 24 + STACKSIZE(%esp)
-#define A 32 + STACKSIZE(%esp)
-#define STACK_LDA 36 + STACKSIZE(%esp)
-#define STACK_X 40 + STACKSIZE(%esp)
-#define STACK_INCX 44 + STACKSIZE(%esp)
-#define Y 48 + STACKSIZE(%esp)
-#define STACK_INCY 52 + STACKSIZE(%esp)
-#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp # reserve ARGS bytes of scratch before the register saves
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+ movl Y,J
+ movl J,YY # YY = running copy of the Y argument
+ movl A,J
+ movl J,AA # AA = running copy of the A argument
+ movl M,J
+ movl J,MMM # MMM = copy of M, counted down per block
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $18,J # J = 1<<18, the block size
+ subl J,MMM # MMM -= J
+ movl J,M # M = J for this pass; movl leaves the flags of the subl untouched
+ jge .L00t # MMM still >= 0 (signed): run a full block
+ ALIGN_3
+
+ movl MMM,%eax
+ addl J,%eax # %eax = MMM + J, the count left before the subtraction
+ jle .L999x # nothing left: leave the loop
+ movl %eax,M # otherwise run one last, shorter block
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # A argument slot = current block pointer
+
+ movl YY,J
+ movl J,Y # Y argument slot = current block pointer
+
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
ALIGN_3
.L999:
+ movl M,%eax
+ sall $ZBASE_SHIFT,%eax # %eax = M << ZBASE_SHIFT
+ addl %eax,AA # advance the running A pointer
+ movl STACK_INCY,INCY
+ imull INCY,%eax # scale the offset by INCY
+ addl %eax,YY # advance the running Y pointer
+ jmp .L0t # next block
+ ALIGN_3
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+ addl $ARGS,%esp # release the scratch area reserved in the prologue
ret
EPILOGUE
#endif
/* Patch adding a block loop around the kernel: ARGS bytes of scratch are reserved on entry, all argument offsets are shifted by ARGS, and the slots MMM, XX and AA carry the outstanding count and the running X and A pointers between blocks. */
/* NOTE(review): three pushl are visible below against four popl at .L999x and STACKSIZE 16 - presumably a pushl %ebx sits in context not shown here; confirm. */
#define STACKSIZE 16
-
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA_R 16 + STACKSIZE(%esp)
-#define ALPHA_I 20 + STACKSIZE(%esp)
-#define A 24 + STACKSIZE(%esp)
-#define STACK_LDA 28 + STACKSIZE(%esp)
-#define STACK_X 32 + STACKSIZE(%esp)
-#define STACK_INCX 36 + STACKSIZE(%esp)
-#define Y 40 + STACKSIZE(%esp)
-#define STACK_INCY 44 + STACKSIZE(%esp)
-#define BUFFER 48 + STACKSIZE(%esp)
+#define ARGS 20
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
+#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
+#define A 24 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
+#define STACK_X 32 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
+#define Y 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
+#define BUFFER 48 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+ARGS(%esp)
+#define XX 4+ARGS(%esp)
+#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp # reserve ARGS bytes of scratch before the register saves
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
- movl STACK_LDA, LDA
movl STACK_X, X
+ movl X,XX # XX = running copy of the X pointer
+ movl A,J
+ movl J,AA #backup A
+ movl M,J
+ movl J,MMM # MMM = copy of M, counted down per block
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $20,J # J = 1<<20
+ subl $8,J # block size = (1<<20) - 8
+ subl J,MMM #MMM-=J
+ movl J,M # M = J for this pass; movl leaves the flags of the subl untouched
+ jge .L00t # MMM still >= 0 (signed): run a full block
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax # %eax = MMM + J, the count left before the subtraction
+ jle .L999x # nothing left: leave the loop
+ movl %eax,M # otherwise run one last, shorter block
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # A argument slot = current block pointer
+
+ movl XX,%eax
+ movl %eax,X # X = current block pointer
+
+ movl STACK_LDA,LDA # LDA is now loaded on every pass, after the block setup
movl STACK_INCX, INCX
movl STACK_INCY, INCY
ALIGN_4
.L999:
+ movl M,%eax
+ sall $ZBASE_SHIFT, %eax # %eax = M << ZBASE_SHIFT
+ addl %eax,AA # advance the running A pointer
+ movl STACK_INCX,INCX
+ imull INCX,%eax # scale the offset by INCX
+ addl %eax,XX # advance the running X pointer
+ jmp .L0t # next block
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+
+ addl $ARGS,%esp # release the scratch area reserved in the prologue
ret
EPILOGUE
#endif
/* Patch adding a block loop around the kernel: ARGS bytes of scratch are reserved on entry, all argument offsets are shifted by ARGS, and the slots MMM, AA and XX carry the outstanding count and the running A and X pointers between blocks. */
/* NOTE(review): three pushl are visible below against four popl at .L999x and STACKSIZE 16 - presumably a pushl %ebx sits in context not shown here; confirm. */
#define STACKSIZE 16
+#define ARGS 20
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
+#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
+#define A 32 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
+#define STACK_X 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
+#define Y 48 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
+#define BUFFER 56 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0 + ARGS(%esp)
+#define AA 4 + ARGS(%esp)
+#define XX 8 + ARGS(%esp)
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA_R 16 + STACKSIZE(%esp)
-#define ALPHA_I 24 + STACKSIZE(%esp)
-#define A 32 + STACKSIZE(%esp)
-#define STACK_LDA 36 + STACKSIZE(%esp)
-#define STACK_X 40 + STACKSIZE(%esp)
-#define STACK_INCX 44 + STACKSIZE(%esp)
-#define Y 48 + STACKSIZE(%esp)
-#define STACK_INCY 52 + STACKSIZE(%esp)
-#define BUFFER 56 + STACKSIZE(%esp)
-
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp # reserve ARGS bytes of scratch before the register saves
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+ movl STACK_X, X
+ movl X, XX # XX = running copy of the X pointer
+ movl A,J
+ movl J,AA # AA = running copy of the A argument
+ movl M,J
+ movl J,MMM # MMM = copy of M, counted down per block
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $18,J # J = 1<<18
+ subl $4,J # block size = (1<<18) - 4
+ subl J,MMM # MMM -= J
+ movl J,M # M = J for this pass; movl leaves the flags of the subl untouched
+ jge .L00t # MMM still >= 0 (signed): run a full block
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax # %eax = MMM + J, the count left before the subtraction
+ jle .L999x # nothing left: leave the loop
+ movl %eax, M # otherwise run one last, shorter block
+
+.L00t:
+ movl XX, %eax
+ movl %eax, X # X = current block pointer
+
+ movl AA,%eax
+ movl %eax,A # A argument slot = current block pointer
+
movl STACK_LDA, LDA
- movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
movl Y, Y1
movl N, J
- ALIGN_3
+ ALIGN_4
.L11:
movl BUFFER, X # X = BUFFER argument
ALIGN_4
.L999:
+ movl M,%eax
+ sall $ZBASE_SHIFT,%eax # %eax = M << ZBASE_SHIFT
+ addl %eax,AA # advance the running A pointer
+ movl STACK_INCX,INCX
+ imull INCX,%eax # scale the offset by INCX
+ addl %eax,XX # advance the running X pointer
+ jmp .L0t # next block
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+ addl $ARGS,%esp # release the scratch area reserved in the prologue
ret
EPILOGUE