From 3a5d8dbff986f6fcda11075057119bcdfa5c5c15 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Wed, 3 Sep 2014 15:34:30 +0200
Subject: [PATCH] optimized sgemv_n_4.c

---
 kernel/x86_64/sgemv_n_4.c                | 34 ++++++++++++++----------
 kernel/x86_64/sgemv_n_microk_nehalem-4.c |  5 +++-
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index f8401607..31d841dd 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -174,9 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	BLASLONG n1;
 	BLASLONG m1;
 	BLASLONG m2;
+	BLASLONG m3;
 	BLASLONG n2;
-	BLASLONG lda4 = 4 * lda;
-	BLASLONG lda8 = 8 * lda;
+	BLASLONG lda4 = lda << 2;
+	BLASLONG lda8 = lda << 3;
 	FLOAT xbuffer[8],*ybuffer;
 
 	if ( m < 1 ) return(0);
@@ -186,19 +187,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 
 	if ( inc_x == 1 )
 	{
-		n1 = n / 8 ;
-		n2 = n % 8 ;
+		n1 = n >> 3 ;
+		n2 = n & 7 ;
 	}
 	else
 	{
-		n1 = n / 4 ;
-		n2 = n % 4 ;
+		n1 = n >> 2 ;
+		n2 = n & 3 ;
 
 	}
 
-	m1 = m - ( m % 4 );
-	m2 = (m % NBMAX) - (m % 4) ;
-	
+	m3 = m & 3 ;
+	m1 = m & -4 ;
+	m2 = (m & (NBMAX-1)) - m3 ;
+
+	y_ptr = y;
 
 	BLASLONG NB = NBMAX;
 
@@ -237,8 +240,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 				x_ptr += 8;
 			}
 
-/*
-			for( i = 0; i < n1 ; i++)
+
+			if ( n2 & 4 )
 			{
 				sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
 				ap[0] += lda4;
@@ -248,8 +251,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 				a_ptr += lda4;
 				x_ptr += 4;
 			}
-*/
-			for( i = 0; i < n2 ; i++)
+
+			for( i = 0; i < ( n2 & 3 ) ; i++)
 			{
 				xbuffer[0] = x_ptr[0];
 				x_ptr += inc_x;
@@ -296,8 +299,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 		a     += NB;
 		y_ptr += NB * inc_y;
 	}
+
+	if ( m3 == 0 ) return;
+
 	j=0;
-	while ( j < (m % 4))
+	while ( j < m3 )
 	{
 		a_ptr = a;
 		x_ptr = x;
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
index accc529b..f87cfa42 100644
--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c
+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
@@ -58,13 +58,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 
 	".align 16				 \n\t"
 	".L01LOOP%=:				 \n\t"
-	"movups	(%3,%0,4), %%xmm4	 \n\t"	// 4 * y
 	"xorps	%%xmm5 , %%xmm5		 \n\t"
+	"movups	(%3,%0,4), %%xmm4	 \n\t"	// 4 * y
+	".align 2				 \n\t"
 	"movups	(%4,%0,4), %%xmm8	 \n\t"
 	"movups	(%5,%0,4), %%xmm9	 \n\t"
 	"movups	(%6,%0,4), %%xmm10	 \n\t"
 	"movups	(%7,%0,4), %%xmm11	 \n\t"
 
+	".align 2				 \n\t"
 	"mulps	%%xmm12, %%xmm8		 \n\t"
 	"mulps	%%xmm13, %%xmm9		 \n\t"
 	"mulps	%%xmm14, %%xmm10	 \n\t"
@@ -78,6 +80,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 	"movups	(%5,%8,4), %%xmm9	 \n\t"
 	"movups	(%6,%8,4), %%xmm10	 \n\t"
 	"movups	(%7,%8,4), %%xmm11	 \n\t"
+	".align 2				 \n\t"
 	"mulps	%%xmm0 , %%xmm8		 \n\t"
 	"mulps	%%xmm1 , %%xmm9		 \n\t"
 	"mulps	%%xmm2 , %%xmm10	 \n\t"
-- 
2.34.1
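
Note (reviewer sketch, not part of the patch): the integer rewrites in sgemv_n_4.c are standard strength reductions that hold for the non-negative sizes BLAS passes in: n >> 3 == n / 8, n & 7 == n % 8, m & -4 == m - (m % 4), and m & (NBMAX-1) == m % NBMAX, the last one only when NBMAX is a power of two. A minimal standalone check of those identities follows; the value 4096 for NBMAX is only a stand-in for the block size defined in the file and is assumed here to be a power of two.

#include <assert.h>
#include <stdio.h>

#define NBMAX 4096	/* stand-in block size; the mask rewrite requires a power of two */

int main(void)
{
	for (long m = 0; m <= 100000; m++)
	{
		long n = m;

		/* shift/mask forms match division/modulo for non-negative operands */
		assert((n >> 3) == n / 8);
		assert((n & 7)  == n % 8);
		assert((n >> 2) == n / 4);
		assert((n & 3)  == n % 4);

		/* m & -4 rounds m down to a multiple of 4, i.e. m - (m % 4) */
		assert((m & -4) == m - (m % 4));

		/* valid only because NBMAX is a power of two */
		assert((m & (NBMAX - 1)) == m % NBMAX);
	}
	printf("all identities hold for 0..100000\n");
	return 0;
}

Since BLASLONG is a signed type, the compiler cannot apply these reductions on its own: for signed operands, n / 8 and n % 8 compile to shift-plus-fixup sequences that also handle negative values. Writing the shift and mask forms by hand encodes the assumption that m and n are non-negative.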