USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-* LAPACK-TEST : OK
-**************************************************************************************/
-
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
);
}
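+
+/* Compute y[0..n) += alpha * A[:,0..7] * x[0..7] over eight columns at a
+   time. Rows are processed four per iteration using paired vector loads
+   (lxvp/lxvpx) and double-precision fused multiply-adds (xvmaddadp). */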
+static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
+{
+
+ double *a0;
+ double *a1;
+ double *a2;
+ double *a3;
+ double *a4;
+ double *a5;
+ double *a6;
+ double *a7;
+ long tmp;
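+
+	// Register map: vs58 holds the splatted alpha; vs32-35, vs38/39 and
+	// vs48/49 each hold one x[j]*alpha splatted across both lanes;
+	// vs40-47 and vs50-57 stream the eight column pointers a0..a7;
+	// vs36/37 accumulate four elements of y per iteration.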
+ __asm__
+ (
+ "lxvp 34, 0( %15) \n\t" // x0, x1
+ "lxvp 38, 32( %15) \n\t" // x4, x5
+
+ XXSPLTD_S(58,%x14,0) // alpha, alpha
+ "sldi %10, %17, 3 \n\t" // lda * sizeof (double)
+ "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
+ "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
+ "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
+ "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
+
+ "li %11, 32 \n\t"
+
+ "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
+ "add %10, %10, %10 \n\t" // 2 * lda
+ XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
+ XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
+ XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
+ XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
+ XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha
+ XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
+ XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
+ XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
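+	// note: lxvp pairs and xxspltd lane indices follow the little-endian
+	// register-pair ordering, so each splat above lands the x[j]*alpha
+	// that matches the column it multiplies in the FMA loop below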
+
+ "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
+ "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
+ "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda
+ "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda
+ "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda
+ "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda
+
+ "lxvp 40, 0( %3) \n\t" // a0[0], a0[1]
+ "lxvp 42, 0( %4) \n\t" // a1[0], a1[1]
+ "lxvp 44, 0( %5) \n\t" // a2[0], a2[1]
+ "lxvp 46, 0( %6) \n\t" // a3[0], a3[1]
+		"lxvp		50, 0( %7)	\n\t"	// a4[0], a4[1]
+		"lxvp		52, 0( %8)	\n\t"	// a5[0], a5[1]
+		"lxvp		54, 0( %9)	\n\t"	// a6[0], a6[1]
+		"lxvp		56, 0( %10)	\n\t"	// a7[0], a7[1]
+
+
+ "addic. %1, %1, -4 \n\t"
+ "ble two%= \n\t"
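+	// if only the four preloaded rows remain, skip straight to the tail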
+
+ ".align 5 \n"
+ "one%=: \n\t"
+
+ "lxvp 36, 0( %2) \n\t" // y0, y1
+
+ "xvmaddadp 36, 40, 34 \n\t"
+ "xvmaddadp 37, 41, 34 \n\t"
+ "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
+ "xvmaddadp 36, 42, 35 \n\t"
+ "xvmaddadp 37, 43, 35 \n\t"
+ "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
+ "xvmaddadp 36, 44, 32 \n\t"
+ "xvmaddadp 37, 45, 32 \n\t"
+ "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
+ "xvmaddadp 36, 46, 33 \n\t"
+ "xvmaddadp 37, 47, 33 \n\t"
+ "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
+ "xvmaddadp 36, 50, 48 \n\t"
+ "xvmaddadp 37, 51, 48 \n\t"
+		"lxvpx		50, %7, %11	\n\t"	// a4[0], a4[1]
+ "xvmaddadp 36, 52, 49 \n\t"
+ "xvmaddadp 37, 53, 49 \n\t"
+		"lxvpx		52, %8, %11	\n\t"	// a5[0], a5[1]
+ "xvmaddadp 36, 54, 38 \n\t"
+ "xvmaddadp 37, 55, 38 \n\t"
+		"lxvpx		54, %9, %11	\n\t"	// a6[0], a6[1]
+ "xvmaddadp 36, 56, 39 \n\t"
+ "xvmaddadp 37, 57, 39 \n\t"
+		"lxvpx		56, %10, %11	\n\t"	// a7[0], a7[1]
+ "addi %11, %11, 32 \n\t"
+
+ "stxvp 36, 0( %2) \n\t" // y0, y1
+ "addi %2, %2, 32 \n\t"
+
+ "addic. %1, %1, -4 \n\t"
+ "bgt one%= \n"
+
+ "two%=: \n\t"
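+	// tail: accumulate the final four rows from the preloaded column data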
+
+ "lxvp 36, 0( %2) \n\t" // y0, y1
+ "xvmaddadp 36, 40, 34 \n\t"
+ "xvmaddadp 37, 41, 34 \n\t"
+ "xvmaddadp 36, 42, 35 \n\t"
+ "xvmaddadp 37, 43, 35 \n\t"
+ "xvmaddadp 36, 44, 32 \n\t"
+ "xvmaddadp 37, 45, 32 \n\t"
+ "xvmaddadp 36, 46, 33 \n\t"
+ "xvmaddadp 37, 47, 33 \n\t"
+ "xvmaddadp 36, 50, 48 \n\t"
+ "xvmaddadp 37, 51, 48 \n\t"
+ "xvmaddadp 36, 52, 49 \n\t"
+ "xvmaddadp 37, 53, 49 \n\t"
+ "xvmaddadp 36, 54, 38 \n\t"
+ "xvmaddadp 37, 55, 38 \n\t"
+ "xvmaddadp 36, 56, 39 \n\t"
+ "xvmaddadp 37, 57, 39 \n\t"
+ "stxvp 36, 0( %2) \n\t" // y0, y1
+
+ :
+		"+m" (*y),	// 0
+ "+r" (n), // 1
+ "+b" (y), // 2
+ "=b" (a0), // 3
+ "=b" (a1), // 4
+ "=&b" (a2), // 5
+ "=&b" (a3), // 6
+ "=&b" (a4), // 7
+ "=&b" (a5), // 8
+ "=&b" (a6), // 9
+ "=&b" (a7), // 10
+		"=b" (tmp)	// 11
+ :
+		"m" (*x),	// 12
+		"m" (*ap),	// 13
+ "d" (alpha), // 14
+ "r" (x), // 15
+ "3" (ap), // 16
+ "4" (lda) // 17
+ :
+ "cr0",
+		"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
+ "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
+ );
+}
*****************************************************************************/
#include "common.h"
-#include <altivec.h>
-
-typedef __vector unsigned char vec_t;
-typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef __vector_pair __attribute__((aligned(8))) vecp_t;
#include "dgemv_n_microk_power10.c"
-#define MMA(X, APTR, ACC) \
- rX = (vec_t *) & X; \
- rowA = *((vecp_t*)((void*)&APTR)); \
- __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
-
-#define SAVE(ACC, Z) \
- rowC = (v4sf_t *) &y[Z]; \
- __builtin_mma_disassemble_acc ((void *)result, ACC); \
- result[0][1] = result[1][0]; \
- result[2][1] = result[3][0]; \
- rowC[0] += valpha * result[0]; \
- rowC[1] += valpha * result[2];
-
-void
-dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
- FLOAT * y, FLOAT alpha)
-{
- BLASLONG i, j, tmp;
- FLOAT *a0 = a_ptr;
- FLOAT *x1 = xo;
- vector double valpha = { alpha, alpha };
- v4sf_t *rowC;
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- v4sf_t result[4];
- vecp_t rowA;
- vec_t *rX;
- tmp = (n / 32) * 32;
- for (i = 0; i < tmp; i += 32)
- {
- xo = x1;
- a0 = a_ptr;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
- __builtin_mma_xxsetaccz (&acc2);
- __builtin_mma_xxsetaccz (&acc3);
- __builtin_mma_xxsetaccz (&acc4);
- __builtin_mma_xxsetaccz (&acc5);
- __builtin_mma_xxsetaccz (&acc6);
- __builtin_mma_xxsetaccz (&acc7);
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + 0 + j * lda], &acc0);
- MMA (xo[j], a0[i + 4 + j * lda], &acc1);
- MMA (xo[j], a0[i + 8 + j * lda], &acc2);
- MMA (xo[j], a0[i + 12 + j * lda], &acc3);
- MMA (xo[j], a0[i + 16 + j * lda], &acc4);
- MMA (xo[j], a0[i + 20 + j * lda], &acc5);
- MMA (xo[j], a0[i + 24 + j * lda], &acc6);
- MMA (xo[j], a0[i + 28 + j * lda], &acc7);
- }
- xo += 32;
- a0 += lda << 5;
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + 0 + j * lda], &acc0);
- MMA (xo[j], a0[i + 4 + j * lda], &acc1);
- MMA (xo[j], a0[i + 8 + j * lda], &acc2);
- MMA (xo[j], a0[i + 12 + j * lda], &acc3);
- MMA (xo[j], a0[i + 16 + j * lda], &acc4);
- MMA (xo[j], a0[i + 20 + j * lda], &acc5);
- MMA (xo[j], a0[i + 24 + j * lda], &acc6);
- MMA (xo[j], a0[i + 28 + j * lda], &acc7);
- }
- xo += 32;
- a0 += lda << 5;
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + 0 + j * lda], &acc0);
- MMA (xo[j], a0[i + 4 + j * lda], &acc1);
- MMA (xo[j], a0[i + 8 + j * lda], &acc2);
- MMA (xo[j], a0[i + 12 + j * lda], &acc3);
- MMA (xo[j], a0[i + 16 + j * lda], &acc4);
- MMA (xo[j], a0[i + 20 + j * lda], &acc5);
- MMA (xo[j], a0[i + 24 + j * lda], &acc6);
- MMA (xo[j], a0[i + 28 + j * lda], &acc7);
- }
- xo += 32;
- a0 += lda << 5;
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + 0 + j * lda], &acc0);
- MMA (xo[j], a0[i + 4 + j * lda], &acc1);
- MMA (xo[j], a0[i + 8 + j * lda], &acc2);
- MMA (xo[j], a0[i + 12 + j * lda], &acc3);
- MMA (xo[j], a0[i + 16 + j * lda], &acc4);
- MMA (xo[j], a0[i + 20 + j * lda], &acc5);
- MMA (xo[j], a0[i + 24 + j * lda], &acc6);
- MMA (xo[j], a0[i + 28 + j * lda], &acc7);
- }
- xo += 32;
- a0 += lda << 5;
- SAVE (&acc0, i + 0);
- SAVE (&acc1, i + 4);
- SAVE (&acc2, i + 8);
- SAVE (&acc3, i + 12);
- SAVE (&acc4, i + 16);
- SAVE (&acc5, i + 20);
- SAVE (&acc6, i + 24);
- SAVE (&acc7, i + 28);
-
- }
- for (i = tmp; i < n; i += 4)
- {
- xo = x1;
- a0 = a_ptr;
- __builtin_mma_xxsetaccz (&acc0);
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + j * lda], &acc0);
- }
- xo += 32;
- a0 += lda << 5;
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + j * lda], &acc0);
- }
- xo += 32;
- a0 += lda << 5;
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + j * lda], &acc0);
- }
- xo += 32;
- a0 += lda << 5;
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + j * lda], &acc0);
- }
- xo += 32;
- a0 += lda << 5;
- SAVE (&acc0, i);
- }
-}
-
-
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x4
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
- BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
- BLASLONG lda128 = lda << 7;
+ BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__ ((aligned (16)));
FLOAT *ybuffer;
if ( n < 1 ) return(0);
ybuffer = buffer;
- BLASLONG n128 = n >> 7;
- n1 = (n - (n128 * 128)) >> 2;
- n2 = (n - (n128 * 128)) & 3;
+ BLASLONG n8 = n >> 3;
+ n2 = n & 3;
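+	// columns are consumed as n8 blocks of eight, one optional block of
+	// four when (n & 4) is set, then n2 = (n & 3) leftover columns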
m3 = m & 3 ;
m1 = m & -4 ;
if ( inc_x == 1 )
{
- for( i = 0; i < n128 ; i++)
+ for( i = 0; i < n8 ; i++)
{
- dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
- a_ptr += lda128;
- x_ptr += 128;
+ dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
+ a_ptr += lda8;
+ x_ptr += 8;
}
- for( i = 0; i < n1 ; i++)
+ if( n & 4 )
{
dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda4;
}
else
{
- for( i = 0; i < n128 ; i++)
+ for( i = 0; i < n8 ; i++)
{
- FLOAT xbuffer[128] __attribute__ ((aligned (16)));
BLASLONG j;
- for ( j = 0; j < 128 ; j++)
+ for ( j = 0; j < 8 ; j++)
{
xbuffer[j] = x_ptr[0];
x_ptr += inc_x;
}
- dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
- a_ptr += lda128;
+ dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
+ a_ptr += lda8;
}
- for( i = 0; i < n1 ; i++)
+ if( n & 4 )
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;