From 6a728409450ebb018dc19db0fa40ff3c54385136 Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 29 May 2013 13:23:12 +0800 Subject: [PATCH] Fixed internal buffer overflow bug in (s/d/c/z)gemv on x86. --- kernel/x86/gemv_n_sse.S | 10 +++---- kernel/x86/gemv_n_sse2.S | 6 ++-- kernel/x86/gemv_t_sse.S | 28 ++++++------------ kernel/x86/gemv_t_sse2.S | 29 +++++++++---------- kernel/x86/zgemv_n_sse.S | 68 ++++++++++++++++++++++++++++++++++++-------- kernel/x86/zgemv_n_sse2.S | 66 +++++++++++++++++++++++++++++++++++-------- kernel/x86/zgemv_t_sse.S | 71 +++++++++++++++++++++++++++++++++++++--------- kernel/x86/zgemv_t_sse2.S | 72 ++++++++++++++++++++++++++++++++++++++--------- 8 files changed, 257 insertions(+), 93 deletions(-) diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index 3ff9203..f3a388f 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -101,10 +101,10 @@ #define Y 36 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp) + #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) -#define LDAX 12+ARGS(%esp) #define I %eax #define J %ebx @@ -153,8 +153,8 @@ movl YY,J movl J,Y - movl STACK_LDA, LDA + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -688,9 +688,9 @@ movl M,J leal (,J,SIZE),%eax addl %eax,AA - movl YY,J - addl %eax,J - movl J,YY + movl STACK_INCY,INCY + imull INCY,%eax + addl %eax,YY jmp .L0t ALIGN_4 diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 980797d..eeb3c25 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -714,9 +714,9 @@ movl M,J leal (,J,SIZE),%eax addl %eax,AA - movl YY,J - addl %eax,J - movl J,YY + movl STACK_INCY,INCY + imull INCY,%eax + addl %eax,YY jmp .L0t ALIGN_4 diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 326584b..48193f1 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -102,11 +102,9 @@ #define STACK_INCY 40 + 
STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp) -#define MMM 0+STACKSIZE(%esp) -#define NN 4+STACKSIZE(%esp) -#define AA 8+STACKSIZE(%esp) -#define LDAX 12+STACKSIZE(%esp) -#define XX 16+STACKSIZE(%esp) +#define MMM 0+ARGS(%esp) +#define AA 4+ARGS(%esp) +#define XX 8+ARGS(%esp) #define I %eax #define J %ebx @@ -129,12 +127,8 @@ PROFCODE - movl STACK_LDA, LDA - movl LDA,LDAX # backup LDA movl STACK_X, X movl X,XX - movl N,J - movl J,NN # backup N movl A,J movl J,AA # backup A movl M,J @@ -144,7 +138,6 @@ addl $1,J sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) subl $8, J # Don't use last 8 float in the buffer. - # Now, split M by block J subl J,MMM # MMM=MMM-J movl J,M jge .L00t @@ -159,13 +152,10 @@ movl AA,%eax movl %eax,A # mov AA to A - movl NN,%eax - movl %eax,N # reset N - - - movl LDAX, LDA # reset LDA - movl XX,X + movl XX,%eax + movl %eax,X + movl STACK_LDA, LDA movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -688,9 +678,9 @@ movl M,J leal (,J,SIZE),%eax addl %eax,AA - movl XX,J - addl %eax,J - movl J,XX + movl STACK_INCX,INCX + imull INCX,%eax + addl %eax,XX jmp .L0t ALIGN_4 diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index 60d6ef2..75ed89a 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -76,7 +76,7 @@ #endif #define STACKSIZE 16 -#define ARGS 16 +#define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) @@ -89,10 +89,9 @@ #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp) -#define MMM 0+STACKSIZE(%esp) -#define AA 4+STACKSIZE(%esp) -#define LDAX 8+STACKSIZE(%esp) -#define NN 12+STACKSIZE(%esp) +#define MMM 0+ARGS(%esp) +#define AA 4+ARGS(%esp) +#define XX 8+ARGS(%esp) #define I %eax #define J %ebx @@ -117,10 +116,8 @@ PROFCODE - movl STACK_LDA, LDA - movl LDA,LDAX # backup LDA - movl N,J - movl J,NN # backup N + movl STACK_X, X + movl X,XX movl A,J movl J,AA # backup A movl M,J @@ -130,7 +127,6 @@ addl $1,J sall $21,J # 
J=2^21*sizeof(double)=buffer size(16MB) subl $4, J # Don't use last 4 double in the buffer. - # Now, split M by block J subl J,MMM # MMM=MMM-J movl J,M jge .L00t @@ -142,15 +138,13 @@ movl %eax,M .L00t: + movl XX,%eax + movl %eax, X + movl AA,%eax movl %eax,A # mov AA to A - movl NN,%eax - movl %eax,N # reset N - - - movl LDAX, LDA # reset LDA - movl STACK_X, X + movl STACK_LDA, LDA movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -605,6 +599,9 @@ movl M,J leal (,J,SIZE),%eax addl %eax,AA + movl STACK_INCX,INCX + imull INCX,%eax + addl %eax,XX jmp .L0t ALIGN_4 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 0087ac6..b0f686a 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -89,18 +89,23 @@ #endif #define STACKSIZE 16 - -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA_R 16 + STACKSIZE(%esp) -#define ALPHA_I 20 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) +#define ARGS 20 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA_R 16 + STACKSIZE+ARGS(%esp) +#define ALPHA_I 20 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) #define I %eax #define J %ebx @@ -123,6 +128,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -130,6 +136,33 @@ PROFCODE + movl Y,J + movl J,YY + movl A,J + movl J,AA + movl M,J + movl J,MMM +.L0t: + xorl J,J + addl $1,J + sall $20,J + subl J,MMM + movl J,M 
+ jge .L00t + ALIGN_3 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -595,10 +628,21 @@ ALIGN_3 .L999: + movl M,%eax + sall $ZBASE_SHIFT,%eax + addl %eax,AA + movl STACK_INCY,INCY + imull INCY,%eax + addl %eax,YY + jmp .L0t + ALIGN_3 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index f0f2dc0..bb33d26 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -76,18 +76,23 @@ #endif #define STACKSIZE 16 +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA_R 16 + STACKSIZE+ARGS(%esp) +#define ALPHA_I 24 + STACKSIZE+ARGS(%esp) +#define A 32 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 36 + STACKSIZE+ARGS(%esp) +#define STACK_X 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 44 + STACKSIZE+ARGS(%esp) +#define Y 48 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 52 + STACKSIZE+ARGS(%esp) +#define BUFFER 56 + STACKSIZE+ARGS(%esp) +#define MMM 0 + ARGS(%esp) +#define YY 4 + ARGS(%esp) +#define AA 8 + ARGS(%esp) -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA_R 16 + STACKSIZE(%esp) -#define ALPHA_I 24 + STACKSIZE(%esp) -#define A 32 + STACKSIZE(%esp) -#define STACK_LDA 36 + STACKSIZE(%esp) -#define STACK_X 40 + STACKSIZE(%esp) -#define STACK_INCX 44 + STACKSIZE(%esp) -#define Y 48 + STACKSIZE(%esp) -#define STACK_INCY 52 + STACKSIZE(%esp) -#define BUFFER 56 + STACKSIZE(%esp) #define I %eax #define J %ebx @@ -110,6 +115,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -117,6 +123,33 @@ PROFCODE + movl Y,J + movl J,YY + movl A,J + movl J,AA + movl M,J + movl J,MMM +.L0t: + xorl J,J + addl $1,J + sall $18,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_3 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl 
AA,%eax + movl %eax,A + + movl YY,J + movl J,Y + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -458,10 +491,21 @@ ALIGN_3 .L999: + movl M,%eax + sall $ZBASE_SHIFT,%eax + addl %eax,AA + movl STACK_INCY,INCY + imull INCY,%eax + addl %eax,YY + jmp .L0t + ALIGN_3 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index c7ad912..a7a7abd 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -89,18 +89,23 @@ #endif #define STACKSIZE 16 - -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA_R 16 + STACKSIZE(%esp) -#define ALPHA_I 20 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) +#define ARGS 20 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA_R 16 + STACKSIZE+ARGS(%esp) +#define ALPHA_I 20 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+ARGS(%esp) +#define XX 4+ARGS(%esp) +#define AA 8+ARGS(%esp) #define I %eax #define J %ebx @@ -123,6 +128,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -130,8 +136,35 @@ PROFCODE - movl STACK_LDA, LDA movl STACK_X, X + movl X,XX + movl A,J + movl J,AA #backup A + movl M,J + movl J,MMM +.L0t: + xorl J,J + addl $1,J + sall $20,J + subl $8,J + subl J,MMM #MMM-=J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl XX,%eax + movl %eax,X 
+ + movl STACK_LDA,LDA movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -513,10 +546,22 @@ ALIGN_4 .L999: + movl M,%eax + sall $ZBASE_SHIFT, %eax + addl %eax,AA + movl STACK_INCX,INCX + imull INCX,%eax + addl %eax,XX + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index 6c48428..86f5976 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -76,19 +76,24 @@ #endif #define STACKSIZE 16 +#define ARGS 20 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA_R 16 + STACKSIZE+ARGS(%esp) +#define ALPHA_I 24 + STACKSIZE+ARGS(%esp) +#define A 32 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 36 + STACKSIZE+ARGS(%esp) +#define STACK_X 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 44 + STACKSIZE+ARGS(%esp) +#define Y 48 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 52 + STACKSIZE+ARGS(%esp) +#define BUFFER 56 + STACKSIZE+ARGS(%esp) + +#define MMM 0 + ARGS(%esp) +#define AA 4 + ARGS(%esp) +#define XX 8 + ARGS(%esp) -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA_R 16 + STACKSIZE(%esp) -#define ALPHA_I 24 + STACKSIZE(%esp) -#define A 32 + STACKSIZE(%esp) -#define STACK_LDA 36 + STACKSIZE(%esp) -#define STACK_X 40 + STACKSIZE(%esp) -#define STACK_INCX 44 + STACKSIZE(%esp) -#define Y 48 + STACKSIZE(%esp) -#define STACK_INCY 52 + STACKSIZE(%esp) -#define BUFFER 56 + STACKSIZE(%esp) - #define I %eax #define J %ebx @@ -110,6 +115,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -117,8 +123,35 @@ PROFCODE + movl STACK_X, X + movl X, XX + movl A,J + movl J,AA + movl M,J + movl J,MMM +.L0t: + xorl J,J + addl $1,J + sall $18,J + subl $4,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax, M + +.L00t: + movl XX, %eax + movl %eax, X + + movl AA,%eax + movl %eax,A + movl STACK_LDA, LDA - movl STACK_X, X movl STACK_INCX, INCX 
movl STACK_INCY, INCY @@ -188,7 +221,7 @@ movl Y, Y1 movl N, J - ALIGN_3 + ALIGN_4 .L11: movl BUFFER, X @@ -395,10 +428,21 @@ ALIGN_4 .L999: + movl M,%eax + sall $ZBASE_SHIFT,%eax + addl %eax,AA + movl STACK_INCX,INCX + imull INCX,%eax + addl %eax,XX + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE -- 2.7.4