From 81daf6bc380c22bcc7ce228952e5435bc79bb0ce Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 07:30:38 +0200 Subject: [PATCH] [ZARCH] Format source code, Fix constraints --- kernel/zarch/camax.c | 370 ++++++------ kernel/zarch/camin.c | 370 ++++++------ kernel/zarch/casum.c | 236 ++++---- kernel/zarch/caxpy.c | 232 ++++---- kernel/zarch/ccopy.c | 102 ++-- kernel/zarch/cdot.c | 254 ++++----- kernel/zarch/cgemv_n_4.c | 1263 ++++++++++++++++++++--------------------- kernel/zarch/cgemv_t_4.c | 1179 +++++++++++++++++++------------------- kernel/zarch/crot.c | 413 +++++++------- kernel/zarch/cscal.c | 684 +++++++++++------------ kernel/zarch/cswap.c | 263 ++++----- kernel/zarch/damax.c | 220 ++++---- kernel/zarch/damax_z13.c | 292 +++++----- kernel/zarch/damin.c | 220 ++++---- kernel/zarch/damin_z13.c | 292 +++++----- kernel/zarch/dasum.c | 248 ++++---- kernel/zarch/daxpy.c | 253 ++++----- kernel/zarch/dcopy.c | 76 ++- kernel/zarch/ddot.c | 196 +++---- kernel/zarch/dgemv_n_4.c | 1200 ++++++++++++++++++--------------------- kernel/zarch/dgemv_t_4.c | 1397 ++++++++++++++++++++++------------------------ kernel/zarch/dmax.c | 214 ++++--- kernel/zarch/dmax_z13.c | 252 ++++----- kernel/zarch/dmin.c | 214 ++++--- kernel/zarch/dmin_z13.c | 252 ++++----- kernel/zarch/drot.c | 381 ++++++------- kernel/zarch/dscal.c | 278 +++++---- kernel/zarch/dsdot.c | 246 ++++---- kernel/zarch/dswap.c | 228 ++++---- kernel/zarch/icamax.c | 515 +++++++++-------- kernel/zarch/icamin.c | 515 +++++++++-------- kernel/zarch/idamax.c | 411 +++++++------- kernel/zarch/idamin.c | 411 +++++++------- kernel/zarch/idmax.c | 385 ++++++------- kernel/zarch/idmin.c | 385 ++++++------- kernel/zarch/isamax.c | 496 ++++++++-------- kernel/zarch/isamin.c | 496 ++++++++-------- kernel/zarch/ismax.c | 458 ++++++++------- kernel/zarch/ismin.c | 458 ++++++++------- kernel/zarch/izamax.c | 409 +++++++------- kernel/zarch/izamin.c | 409 +++++++------- kernel/zarch/samax.c | 225 ++++---- kernel/zarch/samin.c | 225 ++++---- kernel/zarch/sasum.c | 252 ++++----- kernel/zarch/saxpy.c | 253 ++++----- kernel/zarch/scopy.c | 76 ++- kernel/zarch/sdot.c | 188 ++++--- kernel/zarch/sgemv_n_4.c | 1157 ++++++++++++++++++-------------------- kernel/zarch/sgemv_t_4.c | 1380 ++++++++++++++++++++++----------------------- kernel/zarch/smax.c | 219 ++++---- kernel/zarch/smin.c | 219 ++++---- kernel/zarch/srot.c | 381 ++++++------- kernel/zarch/sscal.c | 268 ++++----- kernel/zarch/sswap.c | 230 ++++---- kernel/zarch/zamax.c | 333 ++++++----- kernel/zarch/zamax_z13.c | 352 ++++++------ kernel/zarch/zamin.c | 317 +++++------ kernel/zarch/zamin_z13.c | 336 ++++++----- kernel/zarch/zasum.c | 232 ++++---- kernel/zarch/zaxpy.c | 232 ++++---- kernel/zarch/zcopy.c | 102 ++-- kernel/zarch/zdot.c | 246 ++++---- kernel/zarch/zgemv_n_4.c | 1147 ++++++++++++++++++------------------- kernel/zarch/zgemv_t_4.c | 1099 ++++++++++++++++++------------------ kernel/zarch/zrot.c | 413 +++++++------- kernel/zarch/zscal.c | 676 +++++++++++----------- kernel/zarch/zswap.c | 263 ++++----- 67 files changed, 13393 insertions(+), 14601 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2e96486..40a9903 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
-#if defined(DOUBLE)
-#define ABS fabs
-#else
-#define ABS fabsf
-#endif
-
-#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
-
-static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
-{
- FLOAT amax;
-
- __asm__ volatile (
- "vlef %%v0,0(%2),0 \n\t"
- "vlef %%v16,4(%2),0 \n\t"
- "vlef %%v0,8(%2),1 \n\t"
- "vlef %%v16,12(%2),1 \n\t"
- "vlef %%v0,16(%2),2 \n\t"
- "vlef %%v16,20(%2),2 \n\t"
- "vlef %%v0,24(%2),3 \n\t"
- "vlef %%v16,28(%2),3 \n\t"
- "vflpsb %%v0,%%v0 \n\t"
- "vflpsb %%v16,%%v16 \n\t"
- "vfasb %%v0,%%v0,%%v16 \n\t"
- "vleib %%v1,0,0 \n\t"
- "vleib %%v1,1,1 \n\t"
- "vleib %%v1,2,2 \n\t"
- "vleib %%v1,3,3 \n\t"
- "vleib %%v1,8,4 \n\t"
- "vleib %%v1,9,5 \n\t"
- "vleib %%v1,10,6 \n\t"
- "vleib %%v1,11,7 \n\t"
- "vleib %%v1,16,8 \n\t"
- "vleib %%v1,17,9 \n\t"
- "vleib %%v1,18,10 \n\t"
- "vleib %%v1,19,11 \n\t"
- "vleib %%v1,24,12 \n\t"
- "vleib %%v1,25,13 \n\t"
- "vleib %%v1,26,14 \n\t"
- "vleib %%v1,27,15 \n\t"
- "srlg %%r0,%1,5 \n\t"
- "xgr %%r1,%%r1 \n\t"
- "0: \n\t"
- "pfd 1, 1024(%%r1,%2) \n\t"
-
- "vl %%v16,0(%%r1,%2) \n\t"
- "vl %%v2,16(%%r1,%2) \n\t"
- "vpkg %%v17,%%v16,%%v2 \n\t"
- "vperm %%v16,%%v16,%%v2,%%v1 \n\t"
-
- "vl %%v18,32(%%r1,%2) \n\t"
- "vl %%v2,48(%%r1,%2) \n\t"
- "vpkg %%v19,%%v18,%%v2 \n\t"
- "vperm %%v18,%%v18,%%v2,%%v1 \n\t"
-
- "vl %%v20,64(%%r1,%2) \n\t"
- "vl %%v2,80(%%r1,%2) \n\t"
- "vpkg %%v21,%%v20,%%v2 \n\t"
- "vperm %%v20,%%v20,%%v2,%%v1 \n\t"
-
- "vl %%v22,96(%%r1,%2) \n\t"
- "vl %%v2,112(%%r1,%2) \n\t"
- "vpkg %%v23,%%v22,%%v2 \n\t"
- "vperm %%v22,%%v22,%%v2,%%v1 \n\t"
-
- "vl %%v24,128(%%r1,%2) \n\t"
- "vl %%v2,144(%%r1,%2) \n\t"
- "vpkg %%v25,%%v24,%%v2 \n\t"
- "vperm %%v24,%%v24,%%v2,%%v1 \n\t"
-
- "vl %%v26,160(%%r1,%2) \n\t"
- "vl %%v2,176(%%r1,%2) \n\t"
- "vpkg %%v27,%%v26,%%v2 \n\t"
- "vperm %%v26,%%v26,%%v2,%%v1 \n\t"
-
- "vl %%v28,192(%%r1,%2) \n\t"
- "vl %%v2,208(%%r1,%2) \n\t"
- "vpkg %%v29,%%v28,%%v2 \n\t"
- "vperm %%v28,%%v28,%%v2,%%v1 \n\t"
-
- "vl %%v30,224(%%r1,%2) \n\t"
- "vl %%v2,240(%%r1,%2) \n\t"
- "vpkg %%v31,%%v30,%%v2 \n\t"
- "vperm %%v30,%%v30,%%v2,%%v1 \n\t"
-
- "vflpsb %%v16,%%v16 \n\t"
- "vflpsb %%v17,%%v17 \n\t"
- "vflpsb %%v18,%%v18 \n\t"
- "vflpsb %%v19,%%v19 \n\t"
- "vflpsb %%v20,%%v20 \n\t"
- "vflpsb %%v21,%%v21 \n\t"
- "vflpsb %%v22,%%v22 \n\t"
- "vflpsb %%v23,%%v23 \n\t"
- "vflpsb %%v24,%%v24 \n\t"
- "vflpsb %%v25,%%v25 \n\t"
- "vflpsb %%v26,%%v26 \n\t"
- "vflpsb %%v27,%%v27 \n\t"
- "vflpsb %%v28,%%v28 \n\t"
- "vflpsb %%v29,%%v29 \n\t"
- "vflpsb %%v30,%%v30 \n\t"
- "vflpsb %%v31,%%v31 \n\t"
-
- "vfasb %%v16,%%v16,%%v17 \n\t"
- "vfasb %%v18,%%v18,%%v19 \n\t"
- "vfasb %%v20,%%v20,%%v21 \n\t"
- "vfasb %%v22,%%v22,%%v23 \n\t"
- "vfasb %%v24,%%v24,%%v25 \n\t"
- "vfasb %%v26,%%v26,%%v27 \n\t"
- "vfasb %%v28,%%v28,%%v29 \n\t"
- "vfasb %%v30,%%v30,%%v31 \n\t"
-
- "vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
- "vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
- "vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
- "vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
-
- "vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
- "vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
-
- "vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
-
- "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
-
- "agfi %%r1, 256 \n\t"
- "brctg %%r0, 0b \n\t"
-
- "veslg %%v16,%%v0,32 \n\t"
- "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
-
- "vrepf %%v16,%%v0,2 \n\t"
- "wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
- "ler %0,%%f0 "
- :"=f"(amax)
- 
:"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amax; 
} - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = camax_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + maxf = camax_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index aec5905..842635a 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb %%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabsf(x[i]) + 
fabsf(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - 
if (n1 > 0) { - - minf = camin_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + minf = camin_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f4ebc21..f59e5a2 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 
224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if ( inc_x == 1 ) - { + if (n <= 0 || inc_x <= 0) + return (sumf); - n1 = n & -32; - if ( n1 > 0 ) - { + if (inc_x == 1) { - sumf = casum_kernel_32(n1, x); - i=n1; - ip=2*n1; - } + n1 = n & -32; + if (n1 > 0) { - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = casum_kernel_32(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index fe5568c..d86342b 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,148 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%3) \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v1,4(%3),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%3),1 \n\t" - "vlef %%v1,4(%3),3 \n\t" -#else - "vlef %%v0,0(%3),1 \n\t" - "vlef %%v0,0(%3),3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v0,0(%3),2 \n\t" - "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" +#else + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl 
%%v23,112(%%r1,%[y])\n\t"
+ "verllg %%v24,%%v8,32\n\t"
+ "verllg %%v25,%%v9,32\n\t"
+ "verllg %%v26,%%v10,32\n\t"
+ "verllg %%v27,%%v11,32\n\t"
+ "verllg %%v28,%%v16,32\n\t"
+ "verllg %%v29,%%v17,32\n\t"
+ "verllg %%v30,%%v18,32\n\t"
+ "verllg %%v31,%%v19,32\n\t"
+ "vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
+ "vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
+ "vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
+ "vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
+ "vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
+ "vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
+ "vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
+ "vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
+ "vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
+ "vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
+ "vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
+ "vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
+ "vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
+ "vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
+ "vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
+ "vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
+ "vst %%v8,0(%%r1,%[y])\n\t"
+ "vst %%v9,16(%%r1,%[y])\n\t"
+ "vst %%v10,32(%%r1,%[y])\n\t"
+ "vst %%v11,48(%%r1,%[y])\n\t"
+ "vst %%v16,64(%%r1,%[y])\n\t"
+ "vst %%v17,80(%%r1,%[y])\n\t"
+ "vst %%v18,96(%%r1,%[y])\n\t"
+ "vst %%v19,112(%%r1,%[y])\n\t"
+ "agfi %%r1,128\n\t"
+ "brctg %[n],0b"
+ : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
+ : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
+ "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
+ : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
- BLASLONG i = 0;
- BLASLONG ix = 0, iy = 0;
- FLOAT da[2] __attribute__ ((aligned(16)));
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+ FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+ BLASLONG dummy2) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0, iy = 0;
+ FLOAT da[2] __attribute__ ((aligned(16)));
- if (n <= 0) return (0);
+ if (n <= 0)
+ return (0);
- if ((inc_x == 1) && (inc_y == 1)) {
+ if ((inc_x == 1) && (inc_y == 1)) {
- BLASLONG n1 = n & -16;
+ BLASLONG n1 = n & -16;
- if (n1) {
- da[0] = da_r;
- da[1] = da_i;
- caxpy_kernel_16(n1, x, y, da);
- ix = 2 * n1;
- }
- i = n1;
- while (i < n) {
+ if (n1) {
+ da[0] = da_r;
+ da[1] = da_i;
+ caxpy_kernel_16(n1, x, y, da);
+ ix = 2 * n1;
+ }
+ i = n1;
+ while (i < n) {
#if !defined(CONJ)
- y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
- y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+ y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
+ y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
- y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
- y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+ y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
+ y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
- i++;
- ix += 2;
-
- }
- return (0);
-
+ i++;
+ ix += 2;
}
+ return (0);
- inc_x *= 2;
- inc_y *= 2;
+ }
- while (i < n) {
+ inc_x *= 2;
+ inc_y *= 2;
+
+ while (i < n) {
#if !defined(CONJ)
- y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
- y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+ y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
+ y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
- y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
- y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+ y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
+ y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
- ix += inc_x;
- iy += inc_y;
- i++;
+ ix += inc_x;
+ iy += inc_y;
+ i++;
- }
- return (0);
+ }
+ return (0);
}
-
-
diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index fc0b8d6..1b93a81 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -1,5 +1,5 @@ 
/***************************************************************************
-Copyright (c) 2013-2018, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
-{
- __asm__ volatile (
- "lgr %%r1,%1 \n\t"
- "lgr %%r2,%2 \n\t"
- "srlg %%r0,%0,5 \n\t"
- "0: \n\t"
- "pfd 1, 1024(%%r1) \n\t"
- "pfd 2, 1024(%%r2) \n\t"
- "mvc 0(256,%%r2),0(%%r1) \n\t"
- "agfi %%r1,256 \n\t"
- "agfi %%r2,256 \n\t"
- "brctg %%r0,0b "
- :
- :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
- :"memory","cc","r0","r1","r2"
- );
+static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
+ __asm__("srlg %[n],%[n],5\n\t"
+ "0:\n\t"
+ "pfd 1, 1024(%[x])\n\t"
+ "pfd 2, 1024(%[y])\n\t"
+ "mvc 0(256,%[y]),0(%[x])\n\t"
+ "la %[x],256(%[x])\n\t"
+ "la %[y],256(%[y])\n\t"
+ "brctg %[n],0b"
+ : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
+ : "m"(*(const FLOAT (*)[n * 2]) x)
+ : "cc");
}
-int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0, iy = 0;
- if ( n <= 0 ) return(0);
+ if (n <= 0)
+ return (0);
- if ( (inc_x == 1) && (inc_y == 1 ))
- {
+ if ((inc_x == 1) && (inc_y == 1)) {
- BLASLONG n1 = n & -32;
- if ( n1 > 0 )
- {
- ccopy_kernel_32(n1, x, y);
- i=n1;
- ix=n1*2;
- iy=n1*2;
- }
-
- while(i < n)
- {
- y[iy] = x[iy] ;
- y[iy+1] = x[ix+1] ;
- ix+=2;
- iy+=2;
- i++ ;
-
- }
+ BLASLONG n1 = n & -32;
+ if (n1 > 0) {
+ ccopy_kernel_32(n1, x, y);
+ i = n1;
+ ix = n1 * 2;
+ iy = n1 * 2;
+ }
+ while (i < n) {
+ y[iy] = x[iy];
+ y[iy + 1] = x[ix + 1];
+ ix += 2;
+ iy += 2;
+ i++;
}
- else
- {
- BLASLONG inc_x2 = 2 * inc_x;
- BLASLONG inc_y2 = 2 * inc_y;
+ } else {
- while(i < n)
- {
- y[iy] = x[ix] ;
- y[iy+1] = x[ix+1] ;
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ BLASLONG inc_x2 = 2 * inc_x;
+ BLASLONG inc_y2 = 2 * inc_y;
- }
+ while (i < n) {
+ y[iy] = x[ix];
+ y[iy + 1] = x[ix + 1];
+ ix += inc_x2;
+ iy += inc_y2;
+ i++;
}
-
- return(0);
+
+ }
+
+ return (0);
}
diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 3eda297..64d81ae 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -1,5 +1,5 @@
/***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" -static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v24,%%v24,%%v28 \n\t" - "vfasb %%v24,%%v24,%%v30 \n\t" - "vrepg %%v26,%%v24,1 \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vfasb %%v25,%%v25,%%v29 \n\t" - "vfasb %%v25,%%v25,%%v31 \n\t" - "vrepg %%v27,%%v25,1 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vstef %%v24,0(%3),0 \n\t" - "vstef %%v24,4(%3),1 \n\t" - "vstef %%v25,8(%3),1 \n\t" - "vstef %%v25,12(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 
64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -16; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - cdot_kernel_16(n1, x, y, dot); + BLASLONG n1 = n & -16; - i = n1; - BLASLONG j = i * 2; + if (n1) + cdot_kernel_16(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git 
a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index ed81325..db91d90 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,719 +25,720 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" #define NBMAX 2048 -static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef 
%%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%3),0 \n\t" - "vlef %%v28,0(%%r1,%3),1 \n\t" - "vlef %%v28,8(%%r1,%3),2 \n\t" - "vlef %%v28,8(%%r1,%3),3 \n\t" - "vlef %%v29,4(%%r1,%3),0 \n\t" - "vlef %%v29,4(%%r1,%3),1 \n\t" - "vlef %%v29,12(%%r1,%3),2 \n\t" - "vlef %%v29,12(%%r1,%3),3 \n\t" - "vlef %%v30,0(%%r1,%4),0 \n\t" - "vlef %%v30,0(%%r1,%4),1 \n\t" - "vlef %%v30,8(%%r1,%4),2 \n\t" - "vlef %%v30,8(%%r1,%4),3 \n\t" - "vlef %%v31,4(%%r1,%4),0 \n\t" - "vlef %%v31,4(%%r1,%4),1 \n\t" - "vlef %%v31,12(%%r1,%4),2 \n\t" - "vlef %%v31,12(%%r1,%4),3 \n\t" - - "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm 
%%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "vleib 
%%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" - - "vl %%v0,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib 
%%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" - "vlef %%v1,%4,3 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" - "vlrepf %%v1,%4 \n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,0(%%r1,%2) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - - "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" - - "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" - - "vst %%v22,0(%%r1,%2) \n\t" - "vst %%v23,16(%%r1,%2) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] 
"m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, ap, xbuffer, 
y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = 
x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; 
+ y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return(0); - } + return (0); + } + if (m3 == 1) { - if ( m3 == 1 ) - { + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c 
index f04a624..669d78a 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - 
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb 
%%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb 
%%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - crot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 0c15c5a..a2d5bf2 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,430 +27,400 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "vlef %%v1,4(%1),0 \n\t" - "vlef %%v1,4(%1),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%1),1 \n\t" - "vlef %%v1,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - "verllg %%v28,%%v20,32 \n\t" - "verllg %%v29,%%v21,32 \n\t" - "verllg %%v30,%%v22,32 \n\t" - "verllg %%v31,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlef %%v0,4(%1),0 \n\t" - "vlef %%v0,4(%1),2 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,4(%1),1 \n\t" - "vlef %%v0,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v16,%%v16,32 \n\t" - "verllg %%v17,%%v17,32 \n\t" - "verllg %%v18,%%v18,32 \n\t" - "verllg %%v19,%%v19,32 \n\t" - "verllg %%v20,%%v20,32 \n\t" - "verllg %%v21,%%v21,32 \n\t" - "verllg %%v22,%%v22,32 \n\t" - "verllg %%v23,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + 
"vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlef %%v0,4(%[alpha]),0\n\t" + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl 
%%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; 
- x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while 
(j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -16; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -16; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - cscal_kernel_16_zero(n1, x); - else - cscal_kernel_16_zero_r(n1, alpha, x); - else - if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); - else - cscal_kernel_16(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 256995d..92a8159 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst 
%%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - cswap_kernel_32(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + cswap_kernel_32(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 8274671..37008f7 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + 
"vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 95b94ee..530d6e5 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 821f9ec..a017917 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,8 \n\t" - "vfmindb %%v17,%%v17,%%v25,8 \n\t" - "vfmindb %%v18,%%v18,%%v26,8 \n\t" - "vfmindb %%v19,%%v19,%%v27,8 \n\t" - "vfmindb %%v20,%%v20,%%v28,8 \n\t" - "vfmindb %%v21,%%v21,%%v29,8 \n\t" - "vfmindb %%v22,%%v22,%%v30,8 \n\t" - "vfmindb %%v23,%%v23,%%v31,8 \n\t" - - "vfmindb %%v16,%%v16,%%v20,8 \n\t" - "vfmindb %%v17,%%v17,%%v21,8 \n\t" - "vfmindb %%v18,%%v18,%%v22,8 \n\t" - "vfmindb %%v19,%%v19,%%v23,8 \n\t" - - "vfmindb %%v16,%%v16,%%v18,8 \n\t" - "vfmindb %%v17,%%v17,%%v19,8 \n\t" - - "vfmindb %%v16,%%v16,%%v17,8 \n\t" - - "vfmindb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb 
%%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 538690e..2172b6d 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
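/*
 * z13 lacks vfmindb, so the _z13 variant below composes the minimum from
 * a compare and a select: vfchdb sets an all-ones lane mask where its
 * first operand is greater, and vsel then routes the smaller value
 * through. One lane of that pair, sketched as a hypothetical C helper:
 */
static double min_step(double a, double b) {
  int mask = (b > a);  /* vfchdb %%vM,b,a    */
  return mask ? a : b; /* vsel %%vR,a,b,%%vM */
}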
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index fea431c..9f69a99 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabs + +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -32; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = dasum_kernel_32(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -32; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = dasum_kernel_32(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index e882374..179ef88 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepg %%v0,%3 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepg %%v0,%[alpha]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl 
%%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - daxpy_kernel_32(n1, x, y , &da); + if (n1) + daxpy_kernel_32(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 
0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index bb53256..f7cbf54 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - dcopy_kernel_32(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index ff4c347..f5f6017 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
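/*
 * dcopy_kernel_32 above needs no vector registers at all: one mvc moves
 * 256 bytes (32 doubles) per iteration and la bumps both pointers in
 * place. The C equivalent is a plain block copy (assumes <string.h>;
 * n is a multiple of 32, as the caller guarantees via n & -32):
 */
static void dcopy_ref(long n, const double *x, double *y) {
  for (long i = 0; i < n; i += 32)
    memcpy(y + i, x + i, 32 * sizeof(double));
}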
#include "common.h" -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) 
) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = ddot_kernel_16(n1, x, y); + if (n1) + dot = ddot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { - - dot += y[i] * x[i] ; - i++ ; - - } - return(dot); + i = n1; + while (i < n) { + dot += y[i] * x[i]; + i++; } + return (dot); - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; + } - BLASLONG n1 = n & -4; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = y[iy] * x[ix] ; - FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + while (i < n1) { - FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; - FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + FLOAT m1 = y[iy] * x[ix]; + FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; + FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; + FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; - temp1 += m1+m3; - temp2 += m2+m4; + ix += inc_x * 4; + iy += inc_y * 4; - i+=4 ; + temp1 += m1 + m3; + temp2 += m2 + m4; - } + i += 4; - while(i < n) - { + } - temp1 += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - dot = temp1 + temp2; - return(dot); - -} + temp1 += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + dot = temp1 + temp2; + return (dot); +} diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca4fd61..c93ff9b 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,663 +29,579 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
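/*
 * ddot_kernel_16 above spreads its fused multiply-adds over eight
 * accumulators (v0..v7) and folds them only after the loop, keeping the
 * vfmadb chains independent. Scalar sketch with two partial sums:
 */
static double ddot_ref(long n, const double *x, const double *y) {
  double s0 = 0.0, s1 = 0.0;
  long i;
  for (i = 0; i + 2 <= n; i += 2) {
    s0 += x[i] * y[i];         /* chain 0 */
    s1 += x[i + 1] * y[i + 1]; /* chain 1 */
  }
  for (; i < n; i++)
    s0 += x[i] * y[i];
  return s0 + s1;
}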
#define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%5) \n\t" - "vlrepg %%v1,8(%5) \n\t" - "vlrepg %%v2,16(%5) \n\t" - "vlrepg %%v3,24(%5) \n\t" - "vlrepg %%v4,%7 \n\t" - "vfmdb %%v0,%%v0,%%v4 \n\t" - "vfmdb %%v1,%%v1,%%v4 \n\t" - "vfmdb %%v2,%%v2,%%v4 \n\t" - "vfmdb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) 
\n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl 
%%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%3) \n\t" - "vlrepg %%v1,8(%3) \n\t" - "vlrepg %%v2,%5 \n\t" - "vfmdb %%v0,%%v0,%%v2 \n\t" - "vfmdb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmadb 
%%v2,%%v20,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst 
%%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%2) \n\t" - "vlrepg %%v1,%4 \n\t" - "vfmdb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg 
%%v0,0(%[x])\n\t" + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr 
+= 4; - } - - if ( n2 & 2 ) - { - dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 8); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] 
+= lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) 
{ + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 2d8fa0d..24680cf 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,795 +29,724 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2 
\n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "adbr %%f0,%%f4 \n\t" - "std %%f0,0(%6) \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "adbr %%f1,%%f4 \n\t" - "std %%f1,8(%6) \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "adbr %%f2,%%f4 \n\t" - "std %%f2,16(%6) \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "adbr %%f3,%%f4 \n\t" - "std %%f3,24(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl 
%%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl 
%%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "adbr %%f0,%%f2 \n\t" - "std %%f0,0(%4) \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "adbr %%f1,%%f2 \n\t" - "std %%f1,8(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + 
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "std %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmadb 
%%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepg %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void 
add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - yp = ytemp; + y_ptr = y; + a_ptr = a; + x_ptr = x; - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - if ( n2 & 2 ) - { + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + yp = ytemp; + for (i = 0; i < nb1; i++) { + dgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 1 ) - { - - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + } - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + yp = ytemp; - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - 
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - if ( inc_y == 1 ) - { + if (n2 & 2) { - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; + dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - for ( j=0; j< ( n & -4 ); j+=4 ) - { + } - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; - y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; - y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; - y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; - aj += lda4; - } + if (n2 & 1) { - for ( ; j< n ; j++ ) - { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; - aj += lda; - } + } + a += NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; - } - else - { + FLOAT *aj = a_ptr; + y_ptr = y; - for ( j=0; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83e7b02..87bccbe 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
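The dgemv_t kernels rewritten above change more than operand syntax: the old loops chained every vfmadb into a single accumulator per column, so each FMA stalled on the previous one, while the new loops spread partial sums across %%v0-%%v7 and only fold them with vfadb after the loop. A minimal scalar sketch of that split-accumulator idea in plain C (dot_split4 is a hypothetical helper for illustration, not part of this patch):

#include <stddef.h>

/* Dot product with four independent partial sums. Each chain depends
   only on itself, so the FPU can overlap the four multiply-adds per
   iteration instead of serializing them. */
static double dot_split4(size_t n, const double *a, const double *x) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  size_t i;
  for (i = 0; i + 4 <= n; i += 4) {
    s0 += a[i] * x[i];
    s1 += a[i + 1] * x[i + 1];
    s2 += a[i + 2] * x[i + 2];
    s3 += a[i + 3] * x[i + 3];
  }
  for (; i < n; i++)              /* scalar tail */
    s0 += a[i] * x[i];
  return (s0 + s1) + (s2 + s3);   /* single reduction at the end */
}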
#include "common.h" -static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT max; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return max; +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT max; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb 
%%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return max; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) return (maxf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 0732891..518cc26 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v17,%%v17,%%v25,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v19,%%v19,%%v27,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v21,%%v21,%%v29,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - "vfmindb %%v23,%%v23,%%v31,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v17,%%v17,%%v21,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfmindb %%v19,%%v19,%%v23,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfmindb %%v17,%%v17,%%v19,0 \n\t" - - "vfmindb %%v16,%%v16,%%v17,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; 
- BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index e64f90e..9156199 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
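The constraint change repeated through all of these hunks is the point named in the subject line: the old code passed pointers through the machine-specific "ZR"/"ZQ" constraints and then declared a blanket "memory" clobber, which forces GCC to spill and reload everything live around the asm. The new code instead passes the array itself as a sized "m" (or "+m"/"=m") operand via a cast like *(const FLOAT (*)[n]) x, alongside a named address-register operand, so the compiler knows exactly which bytes the asm reads and writes. A minimal sketch of the idiom (illustrative z/Architecture asm assuming n > 0; not a kernel from this patch):

/* Copy x[0..n-1] to y. The sized memory operands replace the "memory"
   clobber; [x]/[y] supply base addresses in address registers. */
static void copy_asm(long n, const double *x, double *y) {
  __asm__("xgr %%r1,%%r1\n\t"          /* r1 = 0, running byte offset */
          "0:\n\t"
          "ld %%f0,0(%%r1,%[x])\n\t"   /* f0 = x[i] */
          "std %%f0,0(%%r1,%[y])\n\t"  /* y[i] = f0 */
          "agfi %%r1,8\n\t"            /* advance one 8-byte element */
          "brctg %[n],0b"              /* --n, loop while n != 0 */
          : "=m"(*(double (*)[n]) y),[n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y), "m"(*(const double (*)[n]) x)
          : "cc", "r1", "f0");
}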
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb 
%%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index c91f958..8f0197f 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - drot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + drot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index ccc6dd9..c944990 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepg %%v0,%[da]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - - dscal_kernel_16_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - dscal_kernel_16(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + + if (da == 0.0) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + dscal_kernel_16_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + dscal_kernel_16(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -4; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -4; - x[i]=0.0; - x[i + inc_x]=0.0; - x[i + 2 * inc_x]=0.0; - x[i + 3 * inc_x]=0.0; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = 0.0; + x[i + inc_x] = 0.0; + x[i + 2 * inc_x] = 0.0; + x[i + 3 * inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 4; + j += 4; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -4; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -4; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; - x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; - x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; - } + i += inc_x * 4; + j += 4; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 72950c9..1ac02d4 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,144 +27,146 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
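dsdot below is more than a constraint fix: the old kernel chained all eight vfmadb results through the single accumulator v0, so every fused multiply-add had to wait for the previous one. The rewrite carries eight independent accumulators v0..v7 and only folds them together with vfadb after the loop. A scalar model of that change (reassociating the sum this way is what any vectorized dot product already does):

static double dsdot_ref(BLASLONG n, const float *x, const float *y) {
  double acc[8] = { 0.0 };             /* mirrors v0..v7 */
  double dot = 0.0;
  BLASLONG i;
  int k;

  for (i = 0; i + 8 <= n; i += 8)      /* eight independent FMA chains */
    for (k = 0; k < 8; k++)
      acc[k] += (double) x[i + k] * (double) y[i + k];

  for (k = 0; k < 8; k++)              /* the vfadb reduction tail */
    dot += acc[k];

  for (; i < n; i++)                   /* remainder, done in C there too */
    dot += (double) x[i] * (double) y[i];
  return dot;
}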
#include "common.h" -static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - double dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v16,4(%%r1,%2),2 \n\t" - "vlef %%v17,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),2 \n\t" - "vlef %%v18,16(%%r1,%2),0 \n\t" - "vlef %%v18,20(%%r1,%2),2 \n\t" - "vlef %%v19,24(%%r1,%2),0 \n\t" - "vlef %%v19,28(%%r1,%2),2 \n\t" - "vlef %%v20,32(%%r1,%2),0 \n\t" - "vlef %%v20,36(%%r1,%2),2 \n\t" - "vlef %%v21,40(%%r1,%2),0 \n\t" - "vlef %%v21,44(%%r1,%2),2 \n\t" - "vlef %%v22,48(%%r1,%2),0 \n\t" - "vlef %%v22,52(%%r1,%2),2 \n\t" - "vlef %%v23,56(%%r1,%2),0 \n\t" - "vlef %%v23,60(%%r1,%2),2 \n\t" - - "vflls %%v16,%%v16 \n\t" - "vflls %%v17,%%v17 \n\t" - "vflls %%v18,%%v18 \n\t" - "vflls %%v19,%%v19 \n\t" - "vflls %%v20,%%v20 \n\t" - "vflls %%v21,%%v21 \n\t" - "vflls %%v22,%%v22 \n\t" - "vflls %%v23,%%v23 \n\t" - - "vlef %%v24,0(%%r1,%3),0 \n\t" - "vlef %%v24,4(%%r1,%3),2 \n\t" - "vflls %%v24,%%v24 \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vlef %%v25,8(%%r1,%3),0 \n\t" - "vlef %%v25,12(%%r1,%3),2 \n\t" - "vflls %%v25,%%v25 \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vlef %%v26,16(%%r1,%3),0 \n\t" - "vlef %%v26,20(%%r1,%3),2 \n\t" - "vflls %%v26,%%v26 \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vlef %%v27,24(%%r1,%3),0 \n\t" - "vlef %%v27,28(%%r1,%3),2 \n\t" - "vflls %%v27,%%v27 \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vlef %%v28,32(%%r1,%3),0 \n\t" - "vlef %%v28,36(%%r1,%3),2 \n\t" - "vflls %%v28,%%v28 \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vlef %%v29,40(%%r1,%3),0 \n\t" - "vlef %%v29,44(%%r1,%3),2 \n\t" - "vflls %%v29,%%v29 \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vlef %%v30,48(%%r1,%3),0 \n\t" - "vlef %%v30,52(%%r1,%3),2 \n\t" - "vflls %%v30,%%v30 \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vlef %%v31,56(%%r1,%3),0 \n\t" - "vlef %%v31,60(%%r1,%3),2 \n\t" - "vflls %%v31,%%v31 \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + double dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef %%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls 
%%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - double dot = 0.0 ; + double dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = dsdot_kernel_16(n1,x,y); + if (n1) + dot = dsdot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += (double) y[i] * (double) x[i] ; - i++ ; + dot += (double) y[i] * (double) x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += (double) y[iy] * (double) x[ix]; - dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += (double) y[iy] * (double) x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += (double) y[iy] * (double) x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 8070ef4..60ba40b 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + 
"vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1 )) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - dswap_kernel_32(n1, x, y); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dswap_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - + + } + return (0); } diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 5129ca6..1e1040a 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamax; +} - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (max); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + max = icamax_kernel_32(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - return iamax; -} + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (max + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + max = 0; + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - max = icamax_kernel_32(n1, x, &maxf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); + ix += inc_x2 * 4; - } else { - - max = 0; - maxf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 05068b2..d1c0e32 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamin; +} - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (min); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + min = icamin_kernel_32(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - return iamin; -} + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (min + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + min = 0; + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - min = icamin_kernel_32(n1, x, &minf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); + ix += inc_x2 * 4; - } else { - - min = 0; - minf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a..8434c81 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel 
%%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + 
"vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = idamax_kernel_32(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = idamax_kernel_32(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } - max = 0; - maxf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + max = 0; + maxf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f728..80a37e6 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel 
%%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb 
%%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) return (min); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = idamin_kernel_32(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } - min = 0; - minf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + min = 0; + minf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { 
- if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c30407..18cdba4 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb 
%%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imax; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); - if (n <= 0 || inc_x <= 0) return (max); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + max = idmax_kernel_32(n1, x, &maxf); - max = idmax_kernel_32(n1, x, &maxf); + i = n1; + } else { + maxf = x[0]; + i++; + } - i = n1; - } - else - { - maxf = x[0]; - i++; - } + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + } else { - } else { + max = 0; + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - max = 0; - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a..02ca427 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + 
"nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imin; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); - if (n <= 0 || inc_x <= 0) return (min); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + min = idmin_kernel_32(n1, x, &minf); - min = idmin_kernel_32(n1, x, &minf); + i = n1; + } else { + minf = x[0]; + i++; + } - i = n1; - } - else - { - minf = x[0]; - i++; - } + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + } else { - } else { + min = 0; + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - min = 0; - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 6e0aaa1..bbb4012 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = isamax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = isamax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - max = 0; - maxf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 266c48f..e8b34b9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = isamin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = isamin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - min = 0; - minf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index c968ce6..a565df5 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf 
%%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag 
%%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; + return imax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = ismax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + max = ismax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - max = 0; - maxf = x[0]; + } else { 
- BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = x[0]; - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 0145b31..ff72b2c 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif 
%%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg 
%%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - 
"vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; + return imin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = ismin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + min = ismin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - min = 0; - minf = x[0]; + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = x[0]; - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2d1cc23..48afb82 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - 
"vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamax; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb 
%%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamax; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + max = izamax_kernel_16(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - max = izamax_kernel_16(n1, x, &maxf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (max + 1); + return (max + 1); + + } else { - } else { - max = 0; - maxf = CABS1(x,0); + maxf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 676fd7c..3edbe3d 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include <math.h> - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel
%%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamin; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg 
%%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamin; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + min = izamin_kernel_16(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - min = izamin_kernel_16(n1, x, &minf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (min + 1); + return (min + 1); + + } else { - } else { - min = 0; - minf = CABS1(x,0); + minf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index b629d64..efbc031 100644 --- a/kernel/zarch/samax.c +++ 
b/kernel/zarch/samax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + 
"vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = samax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = samax_kernel_64(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 7ce6ee6..138836c 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,8 \n\t" - "vfminsb %%v17,%%v17,%%v25,8 \n\t" - "vfminsb %%v18,%%v18,%%v26,8 \n\t" - "vfminsb %%v19,%%v19,%%v27,8 \n\t" - "vfminsb %%v20,%%v20,%%v28,8 \n\t" - "vfminsb %%v21,%%v21,%%v29,8 \n\t" - "vfminsb %%v22,%%v22,%%v30,8 \n\t" - "vfminsb %%v23,%%v23,%%v31,8 \n\t" - - "vfminsb %%v16,%%v16,%%v20,8 \n\t" - "vfminsb %%v17,%%v17,%%v21,8 \n\t" - "vfminsb %%v18,%%v18,%%v22,8 \n\t" - "vfminsb %%v19,%%v19,%%v23,8 \n\t" - - "vfminsb %%v16,%%v16,%%v18,8 \n\t" - "vfminsb %%v17,%%v17,%%v19,8 \n\t" - - "vfminsb %%v16,%%v16,%%v17,8 \n\t" - - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = samin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = samin_kernel_64(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 2c59ab2..0c3057a 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabsf + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl 
%%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -64; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = sasum_kernel_64(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -64; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = sasum_kernel_64(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index 26ead31..e41e87a 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
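The sasum kernel above is also restructured for instruction-level parallelism: instead of folding every vflpsb result into four accumulators, it keeps eight independent partial sums in v24 through v31 and only combines them after brctg falls through, so consecutive vfasb instructions never depend on each other. In portable C the same technique looks roughly like this (sketch only; assumes n is a multiple of 8):

/* Eight-way accumulator split, mirroring the v24..v31 usage in
   sasum_kernel_64 above (illustrative sketch). */
#include <math.h>

static float sasum_ref(long n, const float *x) {
  float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
  float s4 = 0.0f, s5 = 0.0f, s6 = 0.0f, s7 = 0.0f;
  for (long i = 0; i < n; i += 8) {   /* independent chains keep the FPU busy */
    s0 += fabsf(x[i]);     s1 += fabsf(x[i + 1]);
    s2 += fabsf(x[i + 2]); s3 += fabsf(x[i + 3]);
    s4 += fabsf(x[i + 4]); s5 += fabsf(x[i + 5]);
    s6 += fabsf(x[i + 6]); s7 += fabsf(x[i + 7]);
  }
  /* final reduction, analogous to the vfasb/veslg/vrepf epilogue */
  return ((s0 + s1) + (s2 + s3)) + ((s4 + s5) + (s6 + s7));
}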
#include "common.h" -static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepf %%v0,%3 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepf %%v0,%[alpha]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb 
%%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -64; - if ( n1 ) - saxpy_kernel_64(n1, x, y , &da); + if (n1) + saxpy_kernel_64(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index ff42275..44d27b0 100644 --- a/kernel/zarch/scopy.c +++ 
b/kernel/zarch/scopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,6 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - scopy_kernel_64(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index 5ddbc69..f659b0c 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
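One more note on scopy above: the rewritten kernel drops the lgr copies into r1/r2 and instead advances the incoming pointers with la, while keeping one mvc per iteration; mvc moves at most 256 bytes, which is exactly the 64 single-precision elements the wrapper's n & -64 blocking guarantees per pass. A rough portable equivalent (sketch, not part of the patch):

/* C sketch of the 256-byte block copy in scopy_kernel_64 above
   (illustration; the kernel issues one mvc per block). */
#include <string.h>

static void scopy_ref(long n, const float *x, float *y) {
  long n1 = n & -64;                /* whole 64-float (256-byte) blocks */
  for (long i = 0; i < n1; i += 64)
    memcpy(&y[i], &x[i], 256);      /* one mvc-sized move */
  for (long i = n1; i < n; i++)     /* leftover elements */
    y[i] = x[i];
}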
#include "common.h" -static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "vrepf %%v3,%%v0,3 \n\t" - "aebr %%f0,%%f1 \n\t" - "aebr %%f0,%%f2 \n\t" - "aebr %%f0,%%f3 \n\t" - "ler %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, 
FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - dot = sdot_kernel_32(n1,x,y); + if (n1) + dot = sdot_kernel_32(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += y[i] * x[i] ; - i++ ; + dot += y[i] * x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 01d8414..86ac249 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,640 +29,559 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%5) \n\t" - "vlrepf %%v1,4(%5) \n\t" - "vlrepf %%v2,8(%5) \n\t" - "vlrepf %%v3,12(%5) \n\t" - "vlrepf %%v4,%7 \n\t" - "vfmsb %%v0,%%v0,%%v4 \n\t" - "vfmsb %%v1,%%v1,%%v4 \n\t" - "vfmsb %%v2,%%v2,%%v4 \n\t" - "vfmsb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb 
%%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl 
%%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%3) \n\t" - "vlrepf %%v1,4(%3) \n\t" - "vlrepf %%v2,%5 \n\t" - "vfmsb %%v0,%%v0,%%v2 \n\t" - "vfmsb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr 
%%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl 
%%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%2) \n\t" - "vlrepf %%v1,%4 \n\t" - "vfmsb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" 
- "vst %%v1,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - 
ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; 
*/ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * 
x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index a313672..6ae9b6d 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,783 +29,717 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl 
%%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb 
%%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 
1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl 
%%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) 
\n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepf %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if 
(inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - } + y_ptr = y; + a_ptr = a; + x_ptr = x; + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - yp = ytemp; + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + + yp = ytemp; + for (i = 0; i < nb1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 2 ) - { + } - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + } - } + yp = ytemp; - if ( n2 & 1 ) - { + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + if (n1 > 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + if (n2 & 2) { + + sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + if (n2 & 1) { - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; 
j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = smax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = smax_kernel_64(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e7d8344..2e9c793 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
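The constraint rewrite applied to every kernel in this patch follows one pattern, visible in the smin kernel below: symbolic [name] operands replace the positional %0..%n references, each array is handed to the asm as a typed "m" operand next to an "a" (address-register) operand for its base pointer, and the blanket "memory" clobber is dropped because the compiler can now see exactly which bytes the asm reads or writes. Counters that the asm shifts in place, such as n, become "+&r" operands: read-write, and early-clobbered because they are overwritten while other inputs are still live. A minimal, hypothetical sketch of the same pattern (function and operand names are illustrative, not taken from this patch):

static float head_elem(long n, const float *x) {
  float r;
  /* The typed "m" operand tells the optimizer the asm may read all n
     elements of x; the "a" operand supplies the base address in a
     general register other than r0. */
  __asm__("le %[r],0(%[x])"     /* r = x[0] */
          : [r] "=f"(r)
          : "m"(*(const float (*)[n]) x),[x] "a"(x));
  return r;
}

Making the array visible through the cast is what lets unrelated values stay in registers across the asm; the old "memory" clobber forced the compiler to assume any memory could have changed.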
#include "common.h" -static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v17,%%v17,%%v25,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v19,%%v19,%%v27,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v21,%%v21,%%v29,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - "vfminsb %%v23,%%v23,%%v31,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v17,%%v17,%%v21,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfminsb %%v19,%%v19,%%v23,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfminsb %%v17,%%v17,%%v19,0 \n\t" - - "vfminsb %%v16,%%v16,%%v17,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); 
- if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = smin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = smin_kernel_64(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 763cc66..5b21a19 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
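srot_kernel_64 below processes 256 bytes of each vector per iteration, in 16-float sub-blocks; every sub-block is the vector form of the scalar plane-rotation step kept as the fallback at the end of this file. For reference, the per-element semantics the kernel must preserve (function name illustrative):

static void rot_ref(long n, float *x, float *y, float c, float s) {
  long i;
  for (i = 0; i < n; i++) {
    float t = c * x[i] + s * y[i]; /* new x, from old x and y */
    y[i] = c * y[i] - s * x[i];    /* new y must use the old x ... */
    x[i] = t;                      /* ... hence the temporary */
  }
}

The temporary is why the asm forms both products of a pair before either store: x*s first (the "yn=x*s" comments), folded into y*c afterwards ("yn=y*c-yn").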
#include "common.h" -static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - srot_kernel_64(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -64; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index c18a7e5..07e6845 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepf %%v0,%[da]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, 
FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - - sscal_kernel_32_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - sscal_kernel_32(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + if (da == 0.0) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + sscal_kernel_32_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + sscal_kernel_32(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -2; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -2; - x[i]=0.0; - x[i + inc_x]=0.0; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = 0.0; + x[i + inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 2; + j += 2; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -2; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -2; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; - } + i += inc_x * 2; + j += 2; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index d0c0dc3..dc71131 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,138 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
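+/*
+ * sswap_kernel_64 (below) exchanges x and y in blocks of 64
+ * single-precision elements (256 bytes) per loop iteration, staging x
+ * in v16-v31 while y passes through v0-v7; callers only hand it a
+ * positive multiple of 64 elements.
+ */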
#include "common.h" -static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 
32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - sswap_kernel_64(n1, x, y); - i=n1; - } + if (n <= 0) + return (0); - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + sswap_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index cc63471..531e47a 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
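+/*
+ * zamax_kernel_16 (below) returns max(|re| + |im|) over 16 complex
+ * double elements per iteration: vflpdb takes the absolute values,
+ * vfadb forms |re| + |im|, and the z14 vfmaxdb/wfmaxdb instructions
+ * reduce; callers only hand it a positive multiple of 16 elements.
+ */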
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - 
} - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index ae711c1..cac2da9 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg 
%%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb 
%%v17,%%v18,%%v19\n\t"
+ "vfadb %%v18,%%v20,%%v21\n\t"
+ "vfadb %%v19,%%v22,%%v23\n\t"
+ "vfchdb %%v24,%%v16,%%v17\n\t"
+ "vfchdb %%v25,%%v18,%%v19\n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+ "vfchdb %%v26,%%v24,%%v25\n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26\n\t"
+ "vfchdb %%v27,%%v26,%%v0\n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27\n\t"
+ "agfi %%r1, 256\n\t"
+ "brctg %[n], 0b\n\t"
+ "vrepg %%v16,%%v0,1\n\t"
+ "wfchdb %%v17,%%v0,%%v16\n\t"
+ "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+ "ldr %[amax],%%f0"
+ : [amax] "=f"(amax),[n] "+&r"(n)
+ : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
+ : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+ "v23", "v24", "v25", "v26", "v27");
+
+ return amax;
 }
- 
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
- BLASLONG i = 0;
- BLASLONG ix = 0;
- FLOAT maxf = 0.0;
- BLASLONG inc_x2;
-
- if (n <= 0 || inc_x <= 0) return (maxf);
-
- if (inc_x == 1) {
-
- BLASLONG n1 = n & -16;
- if (n1 > 0) {
-
- maxf = zamax_kernel_16(n1, x);
- ix = n1 * 2;
- i = n1;
- }
- else
- {
- maxf=CABS1(x,0);
- ix += 2;
- i++;
- }
-
- while (i < n) {
- if (CABS1(x,ix) > maxf) {
- maxf = CABS1(x,ix);
- }
- ix += 2;
- i++;
- }
- return (maxf);
+ BLASLONG i = 0;
+ BLASLONG ix = 0;
+ FLOAT maxf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0)
+ return (maxf);
+
+ if (inc_x == 1) {
+
+ BLASLONG n1 = n & -16;
+ if (n1 > 0) {
+ maxf = zamax_kernel_16(n1, x);
+ ix = n1 * 2;
+ i = n1;
 } else {
-
- maxf=CABS1(x,0);
- inc_x2 = 2 * inc_x;
-
- BLASLONG n1 = n & -4;
- while (i < n1) {
-
- if (CABS1(x,ix) > maxf) {
- maxf = CABS1(x,ix);
- }
- if (CABS1(x,ix+inc_x2) > maxf) {
- maxf = CABS1(x,ix+inc_x2);
- }
- if (CABS1(x,ix+inc_x2*2) > maxf) {
- maxf = CABS1(x,ix+inc_x2*2);
- }
- if (CABS1(x,ix+inc_x2*3) > maxf) {
- maxf = CABS1(x,ix+inc_x2*3);
- }
-
- ix += inc_x2 * 4;
-
- i += 4;
-
- }
-
-
- while (i < n) {
- if (CABS1(x,ix) > maxf) {
- maxf = CABS1(x,ix);
- }
- ix += inc_x2;
- i++;
- }
- return (maxf);
+ maxf = CABS1(x, 0);
+ ix += 2;
+ i++;
+ }
+
+ while (i < n) {
+ if (CABS1(x, ix) > maxf) {
+ maxf = CABS1(x, ix);
+ }
+ ix += 2;
+ i++;
+ }
+ return (maxf);
+
+ } else {
+
+ maxf = CABS1(x, 0);
+ inc_x2 = 2 * inc_x;
+
+ BLASLONG n1 = n & -4;
+ while (i < n1) {
+
+ if (CABS1(x, ix) > maxf) {
+ maxf = CABS1(x, ix);
+ }
+ if (CABS1(x, ix + inc_x2) > maxf) {
+ maxf = CABS1(x, ix + inc_x2);
+ }
+ if (CABS1(x, ix + inc_x2 * 2) > maxf) {
+ maxf = CABS1(x, ix + inc_x2 * 2);
+ }
+ if (CABS1(x, ix + inc_x2 * 3) > maxf) {
+ maxf = CABS1(x, ix + inc_x2 * 3);
+ }
+
+ ix += inc_x2 * 4;
+
+ i += 4;
+
+ }
+
+ while (i < n) {
+ if (CABS1(x, ix) > maxf) {
+ maxf = CABS1(x, ix);
+ }
+ ix += inc_x2;
+ i++;
 }
+ return (maxf);
+ }
 }
diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c
index 18610da..940d81d 100644
--- a/kernel/zarch/zamin.c
+++ b/kernel/zarch/zamin.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
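+/*
+ * zamin_kernel_16 (below) is the minimum counterpart of
+ * zamax_kernel_16: the same |re| + |im| evaluation, reduced with the
+ * z14 vfmindb/wfmindb instructions instead; again n must be a
+ * positive multiple of 16.
+ */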
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if 
(CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index f82c57e..7417e0b 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg 
%%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb 
%%v18,%%v20,%%v21\n\t"
+ "vfadb %%v19,%%v22,%%v23\n\t"
+ "vfchdb %%v24,%%v17,%%v16\n\t"
+ "vfchdb %%v25,%%v19,%%v18\n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+ "vfchdb %%v26,%%v25,%%v24\n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26\n\t"
+ "vfchdb %%v27,%%v0,%%v26\n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27\n\t"
+ "agfi %%r1, 256\n\t"
+ "brctg %[n], 0b\n\t"
+ "vrepg %%v16,%%v0,1\n\t"
+ "wfchdb %%v17,%%v16,%%v0\n\t"
+ "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+ "ldr %[amin],%%f0"
+ : [amin] "=f"(amin),[n] "+&r"(n)
+ : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
+ : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+ "v23", "v24", "v25", "v26", "v27");
+
+ return amin;
 }
- 
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
- BLASLONG i = 0;
- BLASLONG ix = 0;
- FLOAT minf = 0.0;
- BLASLONG inc_x2;
-
- if (n <= 0 || inc_x <= 0) return (minf);
-
- if (inc_x == 1) {
-
- BLASLONG n1 = n & -16;
- if (n1 > 0) {
-
- minf = zamin_kernel_16(n1, x);
- ix = n1 * 2;
- i = n1;
- }
- else
- {
- minf=CABS1(x,0);
- ix += 2;
- i++;
- }
-
- while (i < n) {
- if (CABS1(x,ix) < minf) {
- minf = CABS1(x,ix);
- }
- ix += 2;
- i++;
- }
- return (minf);
+ BLASLONG i = 0;
+ BLASLONG ix = 0;
+ FLOAT minf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0)
+ return (minf);
+
+ if (inc_x == 1) {
+
+ BLASLONG n1 = n & -16;
+ if (n1 > 0) {
+ minf = zamin_kernel_16(n1, x);
+ ix = n1 * 2;
+ i = n1;
 } else {
+ minf = CABS1(x, 0);
+ ix += 2;
+ i++;
+ }
+
+ while (i < n) {
+ if (CABS1(x, ix) < minf) {
+ minf = CABS1(x, ix);
+ }
+ ix += 2;
+ i++;
+ }
+ return (minf);
- minf=CABS1(x,0);
- inc_x2 = 2 * inc_x;
+ } else {
- BLASLONG n1 = n & -4;
- while (i < n1) {
+ minf = CABS1(x, 0);
+ inc_x2 = 2 * inc_x;
- if (CABS1(x,ix) < minf) {
- minf = CABS1(x,ix);
- }
- if (CABS1(x,ix+inc_x2) < minf) {
- minf = CABS1(x,ix+inc_x2);
- }
- if (CABS1(x,ix+inc_x2*2) < minf) {
- minf = CABS1(x,ix+inc_x2*2);
- }
- if (CABS1(x,ix+inc_x2*3) < minf) {
- minf = CABS1(x,ix+inc_x2*3);
- }
+ BLASLONG n1 = n & -4;
+ while (i < n1) {
- ix += inc_x2 * 4;
+ if (CABS1(x, ix) < minf) {
+ minf = CABS1(x, ix);
+ }
+ if (CABS1(x, ix + inc_x2) < minf) {
+ minf = CABS1(x, ix + inc_x2);
+ }
+ if (CABS1(x, ix + inc_x2 * 2) < minf) {
+ minf = CABS1(x, ix + inc_x2 * 2);
+ }
+ if (CABS1(x, ix + inc_x2 * 3) < minf) {
+ minf = CABS1(x, ix + inc_x2 * 3);
+ }
- i += 4;
+ ix += inc_x2 * 4;
- }
+ i += 4;
+ }
- while (i < n) {
- if (CABS1(x,ix) < minf) {
- minf = CABS1(x,ix);
- }
- ix += inc_x2;
- i++;
- }
- return (minf);
+ while (i < n) {
+ if (CABS1(x, ix) < minf) {
+ minf = CABS1(x, ix);
+ }
+ ix += inc_x2;
+ i++;
 }
+ return (minf);
+ }
 }
diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c
index 8faaf20..43ae8ff 100644
--- a/kernel/zarch/zasum.c
+++ b/kernel/zarch/zasum.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
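+/*
+ * zasum_kernel_16 (below) sums |re| + |im| over 16 complex double
+ * elements (256 bytes) per iteration, spreading the additions across
+ * eight accumulators v24-v31 to hide FP latency before folding them
+ * into one scalar; n must be a positive multiple of 16.
+ */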
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if (n <= 0 || inc_x <= 0) return(sumf); + if (n <= 0 || inc_x <= 0) + return (sumf); - if ( inc_x == 1 ) - { + if (inc_x == 1) { - n1 = n & -16; - if ( n1 > 0 ) - { + n1 = n & -16; + if (n1 > 0) { - sumf = zasum_kernel_16(n1, x); - i=n1; - ip=2*n1; - } - - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = zasum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index f0e993d..3154984 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%3) \n\t" - "vleg %%v1,8(%3),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%3),1 \n\t" -#else - "vleg %%v0,0(%3),1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,0(%3),0 \n\t" - "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" +#else + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + 
"vpdi %%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); - if (n <= 0) return (0); + if (n <= 0) + return (0); - if ((inc_x == 1) && (inc_y == 1)) { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -8; - if (n1) { - da[0] = da_r; - da[1] = da_i; - zaxpy_kernel_8(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; - - } - return (0); - + i++; + ix += 2; } + return (0); - inc_x *= 2; - inc_y *= 2; + } - while (i < n) { + inc_x *= 2; + inc_y *= 2; + + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } - - diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 8c940bb..2f80ced 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,4 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","r2" - ); +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n * 2]) x) + : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + if (n <= 0) + return (0); - if ( (inc_x == 1) && (inc_y == 1 )) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zcopy_kernel_16(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } - - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; - - } + BLASLONG n1 = n & -16; + if (n1 > 0) { + zcopy_kernel_16(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - - return(0); + + } + + return (0); } diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index aab18e2..7a67ef7 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
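/* Reference semantics for zcopy_kernel_16 above: the kernel moves 16
   complex doubles (256 bytes) per iteration with a single mvc; a
   portable sketch of the same copy (zcopy_ref is an illustrative
   name, not part of the patch): */
static void zcopy_ref(BLASLONG n, const FLOAT *x, FLOAT *y) {
  BLASLONG i;
  for (i = 0; i < 2 * n; i++) /* n complex elements = 2*n FLOATs */
    y[i] = x[i];
}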
#include "common.h" -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v24,%%v24,%%v26 \n\t" - "vfadb %%v24,%%v24,%%v28 \n\t" - "vfadb %%v24,%%v24,%%v30 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vfadb %%v25,%%v25,%%v29 \n\t" - "vfadb %%v25,%%v25,%%v31 \n\t" - "vsteg %%v24,0(%3),0 \n\t" - "vsteg %%v24,8(%3),1 \n\t" - "vsteg %%v25,16(%3),1 \n\t" - "vsteg %%v25,24(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 
112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -8; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - zdot_kernel_8(n1, x, y, dot); + BLASLONG n1 = n & -8; - i = n1; - BLASLONG j = i * 2; + if (n1) + zdot_kernel_8(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 9472b5d..7f21985 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ 
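/* Scalar reference for zdot_kernel_8 above: four partial sums are
   accumulated exactly as in the C fallback of its CNAME, and the
   final vsteg stores write them to d[0..3] in this order (zdot_ref
   is an illustrative name): */
static void zdot_ref(BLASLONG n, const FLOAT *x, const FLOAT *y, FLOAT *d) {
  BLASLONG i;
  d[0] = d[1] = d[2] = d[3] = 0.0;
  for (i = 0; i < 2 * n; i += 2) {
    d[0] += x[i] * y[i];         /* re(x) * re(y) */
    d[1] += x[i + 1] * y[i + 1]; /* im(x) * im(y) */
    d[2] += x[i] * y[i + 1];     /* re(x) * im(y) */
    d[3] += x[i + 1] * y[i];     /* im(x) * re(y) */
  }
}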
/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,691 +25,632 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" #define NBMAX 1024 -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%5) \n\t" - "vl %%v17,16(%5) \n\t" - "vl %%v18,32(%5) \n\t" - "vl %%v19,48(%5) \n\t" +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%5),0 \n\t" - "wflcdb %%v20,%%v20 \n\t" - "vleg %%v20,0(%5),1 \n\t" - "vleg %%v21,24(%5),0 \n\t" - "wflcdb %%v21,%%v21 \n\t" - "vleg %%v21,16(%5),1 \n\t" - "vleg %%v22,40(%5),0 \n\t" - "wflcdb %%v22,%%v22 \n\t" - "vleg %%v22,32(%5),1 \n\t" - "vleg %%v23,56(%5),0 \n\t" - "wflcdb %%v23,%%v23 \n\t" - "vleg %%v23,48(%5),1 \n\t" + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - "vleg %%v20,0(%5),1 \n\t" - "vflcdb %%v20,%%v20 \n\t" - "vleg %%v20,8(%5),0 \n\t" - "vleg %%v21,16(%5),1 \n\t" - "vflcdb %%v21,%%v21 \n\t" - "vleg %%v21,24(%5),0 \n\t" - "vleg %%v22,32(%5),1 \n\t" - "vflcdb %%v22,%%v22 \n\t" - "vleg %%v22,40(%5),0 \n\t" - "vleg %%v23,48(%5),1 \n\t" - "vflcdb %%v23,%%v23 \n\t" - "vleg %%v23,56(%5),0 \n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlrepg %%v24,0(%%r1,%1) \n\t" - "vlrepg %%v25,8(%%r1,%1) \n\t" - "vlrepg %%v26,0(%%r1,%2) \n\t" - "vlrepg %%v27,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,0(%%r1,%3) \n\t" - "vlrepg %%v29,8(%%r1,%3) \n\t" - "vlrepg %%v30,0(%%r1,%4) \n\t" - "vlrepg %%v31,8(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "vlrepg %%v24,16(%%r1,%1) \n\t" - "vlrepg %%v25,24(%%r1,%1) \n\t" - "vlrepg %%v26,16(%%r1,%2) \n\t" - "vlrepg %%v27,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%6) \n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmadb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,16(%%r1,%3) \n\t" - "vlrepg %%v29,24(%%r1,%3) \n\t" - "vlrepg %%v30,16(%%r1,%4) \n\t" - "vlrepg %%v31,24(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%3) \n\t" - "vl %%v17,16(%3) \n\t" +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%3),0 \n\t" - "wflcdb %%v18,%%v18 \n\t" - "vleg %%v18,0(%3),1 \n\t" - "vleg %%v19,24(%3),0 \n\t" - "wflcdb %%v19,%%v19 \n\t" - "vleg %%v19,16(%3),1 \n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - 
"vleg %%v18,0(%3),1 \n\t" - "vflcdb %%v18,%%v18 \n\t" - "vleg %%v18,8(%3),0 \n\t" - "vleg %%v19,16(%3),1 \n\t" - "vflcdb %%v19,%%v19 \n\t" - "vleg %%v19,24(%3),0 \n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlrepg %%v20,0(%%r1,%1) \n\t" - "vlrepg %%v21,8(%%r1,%1) \n\t" - "vlrepg %%v22,0(%%r1,%2) \n\t" - "vlrepg %%v23,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "vlrepg %%v20,16(%%r1,%1) \n\t" - "vlrepg %%v21,24(%%r1,%1) \n\t" - "vlrepg %%v22,16(%%r1,%2) \n\t" - "vlrepg %%v23,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%2) \n\t" +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%2),0 \n\t" - "wflcdb %%v17,%%v17 \n\t" - "vleg %%v17,0(%2),1 \n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%2),1 \n\t" - "vflcdb %%v17,%%v17 \n\t" - "vleg %%v17,8(%2),0 \n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlrepg %%v18,0(%%r1,%1) \n\t" - "vlrepg %%v19,8(%%r1,%1) \n\t" - - "vl 
%%v0,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "vlrepg %%v18,16(%%r1,%1) \n\t" - "vlrepg %%v19,24(%%r1,%1) \n\t" - - "vl %%v0,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepg %%v0,%3 \n\t" - "vleg %%v1,%4,0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,%4,1 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%3,1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,%3,0 \n\t" - "vlrepg %%v1,%4 \n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl 
%%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + 
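/* Blocking scheme: rows are processed in panels of at most NBMAX;
+     within a panel, n >> 2 groups of four columns go through
+     zgemv_kernel_4x4 and the n & 3 leftovers through the 4x2/4x1
+     kernels, while the final m & 3 rows fall through to the scalar
+     tails at the end of this function. */
+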
BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * 
temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - if ( m3 == 1 ) - { + if (m3 == 1) { - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = 
a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 75027a0..aa7f166 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 
\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + 
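/* Plane rotation per complex element: x' = c*x + s*y goes to
+           %%v28..%%v31 via vfmdb/vfmadb, and y' = y*c - x*s goes to
+           %%v20..%%v23 via vfmdb/vfmsdb; the same 64-byte block is
+           unrolled four times per iteration (16 complex doubles). */
+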
"pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 
128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - zrot_kernel_16(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + zrot_kernel_16(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + iy += 
inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4d8ee96..fbcc0c5 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,426 +27,396 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "vleg %%v1,8(%1),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - "vpdi %%v28,%%v20,%%v20,4 \n\t" - "vpdi %%v29,%%v21,%%v21,4 \n\t" - "vpdi %%v30,%%v22,%%v22,4 \n\t" - "vpdi %%v31,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vleg %%v0,8(%1),0 \n\t" - "wflcdb %%v0,%%v0 \n\t" - "vleg %%v0,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v16,%%v16,%%v16,4 \n\t" - "vpdi %%v17,%%v17,%%v17,4 \n\t" - "vpdi %%v18,%%v18,%%v18,4 \n\t" - "vpdi %%v19,%%v19,%%v19,4 \n\t" - "vpdi %%v20,%%v20,%%v20,4 \n\t" - "vpdi %%v21,%%v21,%%v21,4 \n\t" - "vpdi %%v22,%%v22,%%v22,4 \n\t" - "vpdi %%v23,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb 
%%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void 
+  __asm__("vleg %%v0,8(%[alpha]),0\n\t"
+          "wflcdb %%v0,%%v0\n\t"
+          "vleg %%v0,8(%[alpha]),1\n\t"
+          "srlg %[n],%[n],3\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vpdi %%v16,%%v16,%%v16,4\n\t"
+          "vpdi %%v17,%%v17,%%v17,4\n\t"
+          "vpdi %%v18,%%v18,%%v18,4\n\t"
+          "vpdi %%v19,%%v19,%%v19,4\n\t"
+          "vpdi %%v20,%%v20,%%v20,4\n\t"
+          "vpdi %%v21,%%v21,%%v21,4\n\t"
+          "vpdi %%v22,%%v22,%%v22,4\n\t"
+          "vpdi %%v23,%%v23,%%v23,4\n\t"
+          "vfmdb %%v16,%%v16,%%v0\n\t"
+          "vfmdb %%v17,%%v17,%%v0\n\t"
+          "vfmdb %%v18,%%v18,%%v0\n\t"
+          "vfmdb %%v19,%%v19,%%v0\n\t"
+          "vfmdb %%v20,%%v20,%%v0\n\t"
+          "vfmdb %%v21,%%v21,%%v0\n\t"
+          "vfmdb %%v22,%%v22,%%v0\n\t"
+          "vfmdb %%v23,%%v23,%%v0\n\t"
+          "vst %%v16,0(%%r1,%[x])\n\t"
+          "vst %%v17,16(%%r1,%[x])\n\t"
+          "vst %%v18,32(%%r1,%[x])\n\t"
+          "vst %%v19,48(%%r1,%[x])\n\t"
+          "vst %%v20,64(%%r1,%[x])\n\t"
+          "vst %%v21,80(%%r1,%[x])\n\t"
+          "vst %%v22,96(%%r1,%[x])\n\t"
+          "vst %%v23,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
+          : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+          "v23");
 }
 
-static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
-{
-    __asm__ volatile(
-        "vzero %%v24 \n\t"
-        "vzero %%v25 \n\t"
-        "vzero %%v26 \n\t"
-        "vzero %%v27 \n\t"
-        "srlg %%r0,%0,3 \n\t"
-        "xgr %%r1,%%r1 \n\t"
-        "0: \n\t"
-        "pfd 2, 1024(%%r1,%1) \n\t"
-
-        "vst %%v24,0(%%r1,%1) \n\t"
-        "vst %%v25,16(%%r1,%1) \n\t"
-        "vst %%v26,32(%%r1,%1) \n\t"
-        "vst %%v27,48(%%r1,%1) \n\t"
-        "vst %%v24,64(%%r1,%1) \n\t"
-        "vst %%v25,80(%%r1,%1) \n\t"
-        "vst %%v26,96(%%r1,%1) \n\t"
-        "vst %%v27,112(%%r1,%1) \n\t"
-
-        "agfi %%r1,128 \n\t"
-        "brctg %%r0,0b "
-        :
-        :"r"(n),"ZR"((FLOAT (*)[n * 2])x)
-        :"memory","cc","r0","r1","v24","v25","v26","v27"
-    );
+static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
+  __asm__("vlrepg %%v0,0(%[alpha])\n\t"
+          "srlg %[n],%[n],3\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vfmdb %%v16,%%v16,%%v0\n\t"
+          "vfmdb %%v17,%%v17,%%v0\n\t"
+          "vfmdb %%v18,%%v18,%%v0\n\t"
+          "vfmdb %%v19,%%v19,%%v0\n\t"
+          "vfmdb %%v20,%%v20,%%v0\n\t"
+          "vfmdb %%v21,%%v21,%%v0\n\t"
+          "vfmdb %%v22,%%v22,%%v0\n\t"
+          "vfmdb %%v23,%%v23,%%v0\n\t"
+          "vst %%v16,0(%%r1,%[x])\n\t"
+          "vst %%v17,16(%%r1,%[x])\n\t"
+          "vst %%v18,32(%%r1,%[x])\n\t"
+          "vst %%v19,48(%%r1,%[x])\n\t"
+          "vst %%v20,64(%%r1,%[x])\n\t"
+          "vst %%v21,80(%%r1,%[x])\n\t"
+          "vst %%v22,96(%%r1,%[x])\n\t"
+          "vst %%v23,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
+          : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+          "v23");
 }
 
-static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
-{
-    BLASLONG i;
-    BLASLONG inc_x2 = 2 * inc_x;
-    BLASLONG inc_x3 = inc_x2 + inc_x;
-    FLOAT t0, t1, t2, t3;
-    FLOAT da_r = alpha[0];
-    FLOAT da_i = alpha[1];
-
-    for (i = 0; i < n; i += 4)
-    {
-        t0 = da_r * x[0] - da_i * x[1];
-        t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
-        t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
-        t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];
-
-        x[1] = da_i * x[0] + da_r * x[1];
-        x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
-        x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
-        x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
-
-        x[0] = t0;
-        x[inc_x] = t1;
-        x[inc_x2] = t2;
-        x[inc_x3] = t3;
-
-        x += 4 * inc_x;
-    }
+static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
+  __asm__("vzero %%v0\n\t"
+          "srlg %[n],%[n],3\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vst %%v0,0(%%r1,%[x])\n\t"
+          "vst %%v0,16(%%r1,%[x])\n\t"
+          "vst %%v0,32(%%r1,%[x])\n\t"
+          "vst %%v0,48(%%r1,%[x])\n\t"
+          "vst %%v0,64(%%r1,%[x])\n\t"
+          "vst %%v0,80(%%r1,%[x])\n\t"
+          "vst %%v0,96(%%r1,%[x])\n\t"
+          "vst %%v0,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
+          : [x] "a"(x)
+          : "cc", "r1", "v0");
 }
 
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
-    BLASLONG i = 0, j = 0;
-    FLOAT temp0;
-    FLOAT temp1;
-    FLOAT alpha[2] __attribute__ ((aligned(16)));
-
-    if (inc_x != 1) {
-        inc_x <<= 1;
-
-        if (da_r == 0.0) {
-
-            BLASLONG n1 = n & -2;
-
-            if (da_i == 0.0) {
+static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
+                               BLASLONG inc_x) {
+  BLASLONG i;
+  BLASLONG inc_x2 = 2 * inc_x;
+  BLASLONG inc_x3 = inc_x2 + inc_x;
+  FLOAT t0, t1, t2, t3;
+  FLOAT da_r = alpha[0];
+  FLOAT da_i = alpha[1];
+
+  for (i = 0; i < n; i += 4) {
+    t0 = da_r * x[0] - da_i * x[1];
+    t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
+    t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
+    t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];
+
+    x[1] = da_i * x[0] + da_r * x[1];
+    x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
+    x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
+    x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
+
+    x[0] = t0;
+    x[inc_x] = t1;
+    x[inc_x2] = t2;
+    x[inc_x3] = t3;
+
+    x += 4 * inc_x;
+  }
+}
 
-                while (j < n1) {
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+          BLASLONG dummy2) {
+  BLASLONG i = 0, j = 0;
+  FLOAT temp0;
+  FLOAT temp1;
+  FLOAT alpha[2] __attribute__ ((aligned(16)));
 
-                    x[i] = 0.0;
-                    x[i + 1] = 0.0;
-                    x[i + inc_x] = 0.0;
-                    x[i + 1 + inc_x] = 0.0;
-                    i += 2 * inc_x;
-                    j += 2;
+  if (inc_x != 1) {
+    inc_x <<= 1;
 
-                }
+    if (da_r == 0.0) {
 
-                while (j < n) {
+      BLASLONG n1 = n & -2;
 
-                    x[i] = 0.0;
-                    x[i + 1] = 0.0;
-                    i += inc_x;
-                    j++;
+      if (da_i == 0.0) {
 
-                }
+        while (j < n1) {
 
-            } else {
+          x[i] = 0.0;
+          x[i + 1] = 0.0;
+          x[i + inc_x] = 0.0;
+          x[i + 1 + inc_x] = 0.0;
+          i += 2 * inc_x;
+          j += 2;
 
-                while (j < n1) {
+        }
 
-                    temp0 = -da_i * x[i + 1];
-                    x[i + 1] = da_i * x[i];
-                    x[i] = temp0;
-                    temp1 = -da_i * x[i + 1 + inc_x];
-                    x[i + 1 + inc_x] = da_i * x[i + inc_x];
-                    x[i + inc_x] = temp1;
-                    i += 2 * inc_x;
-                    j += 2;
+        while (j < n) {
 
-                }
+          x[i] = 0.0;
+          x[i + 1] = 0.0;
+          i += inc_x;
+          j++;
 
-                while (j < n) {
+        }
 
-                    temp0 = -da_i * x[i + 1];
-                    x[i + 1] = da_i * x[i];
-                    x[i] = temp0;
-                    i += inc_x;
-                    j++;
+      } else {
 
-                }
+        while (j < n1) {
+          temp0 = -da_i * x[i + 1];
+          x[i + 1] = da_i * x[i];
+          x[i] = temp0;
+          temp1 = -da_i * x[i + 1 + inc_x];
+          x[i + 1 + inc_x] = da_i * x[i + inc_x];
+          x[i + inc_x] = temp1;
+          i += 2 * inc_x;
+          j += 2;
+        }
 
-            }
+        while (j < n) {
 
-        } else {
+          temp0 = -da_i * x[i + 1];
+          x[i + 1] = da_i * x[i];
+          x[i] = temp0;
+          i += inc_x;
+          j++;
+        }
 
-            if (da_i == 0.0) {
-                BLASLONG n1 = n & -2;
+      }
 
-                while (j < n1) {
+    } else {
 
-                    temp0 = da_r * x[i];
-                    x[i + 1] = da_r * x[i + 1];
-                    x[i] = temp0;
-                    temp1 = da_r * x[i + inc_x];
-                    x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
-                    x[i + inc_x] = temp1;
-                    i += 2 * inc_x;
-                    j += 2;
+      if (da_i == 0.0) {
+        BLASLONG n1 = n & -2;
 
-                }
+        while (j < n1) {
 
-                while (j < n) {
+          temp0 = da_r * x[i];
+          x[i + 1] = da_r * x[i + 1];
+          x[i] = temp0;
+          temp1 = da_r * x[i + inc_x];
+          x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
+          x[i + inc_x] = temp1;
+          i += 2 * inc_x;
+          j += 2;
 
-                    temp0 = da_r * x[i];
-                    x[i + 1] = da_r * x[i + 1];
-                    x[i] = temp0;
-                    i += inc_x;
-                    j++;
+        }
 
-                }
+        while (j < n) {
 
-            } else {
+          temp0 = da_r * x[i];
+          x[i + 1] = da_r * x[i + 1];
+          x[i] = temp0;
+          i += inc_x;
+          j++;
 
-                BLASLONG n1 = n & -8;
-                if (n1 > 0) {
-                    alpha[0] = da_r;
-                    alpha[1] = da_i;
-                    zscal_kernel_inc_8(n1, alpha, x, inc_x);
-                    j = n1;
-                    i = n1 * inc_x;
-                }
+        }
 
-                while (j < n) {
+      } else {
 
-                    temp0 = da_r * x[i] - da_i * x[i + 1];
-                    x[i + 1] = da_r * x[i + 1] + da_i * x[i];
-                    x[i] = temp0;
-                    i += inc_x;
-                    j++;
+        BLASLONG n1 = n & -8;
+        if (n1 > 0) {
+          alpha[0] = da_r;
+          alpha[1] = da_i;
+          zscal_kernel_inc_8(n1, alpha, x, inc_x);
+          j = n1;
+          i = n1 * inc_x;
+        }
 
-                }
+        while (j < n) {
 
-            }
+          temp0 = da_r * x[i] - da_i * x[i + 1];
+          x[i + 1] = da_r * x[i + 1] + da_i * x[i];
+          x[i] = temp0;
+          i += inc_x;
+          j++;
         }
-        return (0);
-    }
+      }
+    }
 
-    BLASLONG n1 = n & -8;
-    if (n1 > 0) {
+    return (0);
+  }
 
-        alpha[0] = da_r;
-        alpha[1] = da_i;
+  BLASLONG n1 = n & -8;
+  if (n1 > 0) {
 
-        if (da_r == 0.0)
-            if (da_i == 0)
-                zscal_kernel_8_zero(n1, x);
-            else
-                zscal_kernel_8_zero_r(n1, alpha, x);
-        else
-            if (da_i == 0)
-                zscal_kernel_8_zero_i(n1, alpha, x);
-            else
-                zscal_kernel_8(n1, alpha, x);
+    alpha[0] = da_r;
+    alpha[1] = da_i;
 
-        i = n1 << 1;
-        j = n1;
-    }
+    if (da_r == 0.0)
+      if (da_i == 0)
+        zscal_kernel_8_zero(n1, x);
+      else
+        zscal_kernel_8_zero_r(n1, alpha, x);
+    else if (da_i == 0)
+      zscal_kernel_8_zero_i(n1, alpha, x);
+    else
+      zscal_kernel_8(n1, alpha, x);
+    i = n1 << 1;
+    j = n1;
+  }
 
-    if (da_r == 0.0) {
+  if (da_r == 0.0) {
 
-        if (da_i == 0.0) {
+    if (da_i == 0.0) {
 
-            while (j < n) {
+      while (j < n) {
 
-                x[i] = 0.0;
-                x[i + 1] = 0.0;
-                i += 2;
-                j++;
+        x[i] = 0.0;
+        x[i + 1] = 0.0;
+        i += 2;
+        j++;
 
-            }
+      }
 
-        } else {
+    } else {
 
-            while (j < n) {
+      while (j < n) {
 
-                temp0 = -da_i * x[i + 1];
-                x[i + 1] = da_i * x[i];
-                x[i] = temp0;
-                i += 2;
-                j++;
+        temp0 = -da_i * x[i + 1];
+        x[i + 1] = da_i * x[i];
+        x[i] = temp0;
+        i += 2;
+        j++;
 
-            }
+      }
 
-        }
+    }
 
-    } else {
+  } else {
 
-        if (da_i == 0.0) {
+    if (da_i == 0.0) {
 
-            while (j < n) {
+      while (j < n) {
 
-                temp0 = da_r * x[i];
-                x[i + 1] = da_r * x[i + 1];
-                x[i] = temp0;
-                i += 2;
-                j++;
+        temp0 = da_r * x[i];
+        x[i + 1] = da_r * x[i + 1];
+        x[i] = temp0;
+        i += 2;
+        j++;
 
-            }
+      }
 
-        } else {
+    } else {
 
-            while (j < n) {
+      while (j < n) {
 
-                temp0 = da_r * x[i] - da_i * x[i + 1];
-                x[i + 1] = da_r * x[i + 1] + da_i * x[i];
-                x[i] = temp0;
-                i += 2;
-                j++;
+        temp0 = da_r * x[i] - da_i * x[i + 1];
+        x[i + 1] = da_r * x[i + 1] + da_i * x[i];
+        x[i] = temp0;
+        i += 2;
+        j++;
 
-            }
-
-        }
+      }
     }
-    return (0);
+  }
+
+  return (0);
 }
diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c
index a16b87c..0f38103 100644
--- a/kernel/zarch/zswap.c
+++ b/kernel/zarch/zswap.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
-{
-    __asm__ volatile(
-        "srlg %%r0,%0,4 \n\t"
-        "xgr %%r1,%%r1 \n\t"
-        "0: \n\t"
-        "pfd 2, 1024(%%r1,%1) \n\t"
-        "pfd 2, 1024(%%r1,%2) \n\t"
-
-        "vl %%v16, 0(%%r1,%1) \n\t"
-        "vl %%v17, 16(%%r1,%1) \n\t"
-        "vl %%v18, 32(%%r1,%1) \n\t"
-        "vl %%v19, 48(%%r1,%1) \n\t"
-        "vl %%v20, 64(%%r1,%1) \n\t"
-        "vl %%v21, 80(%%r1,%1) \n\t"
-        "vl %%v22, 96(%%r1,%1) \n\t"
-        "vl %%v23, 112(%%r1,%1) \n\t"
-        "vl %%v24, 128(%%r1,%1) \n\t"
-        "vl %%v25, 144(%%r1,%1) \n\t"
-        "vl %%v26, 160(%%r1,%1) \n\t"
-        "vl %%v27, 176(%%r1,%1) \n\t"
-        "vl %%v28, 192(%%r1,%1) \n\t"
-        "vl %%v29, 208(%%r1,%1) \n\t"
-        "vl %%v30, 224(%%r1,%1) \n\t"
-        "vl %%v31, 240(%%r1,%1) \n\t"
-
-        "vl %%v0, 0(%%r1,%2) \n\t"
-        "vl %%v1, 16(%%r1,%2) \n\t"
-        "vl %%v2, 32(%%r1,%2) \n\t"
-        "vl %%v3, 48(%%r1,%2) \n\t"
-        "vl %%v4, 64(%%r1,%2) \n\t"
-        "vl %%v5, 80(%%r1,%2) \n\t"
-        "vl %%v6, 96(%%r1,%2) \n\t"
-        "vl %%v7, 112(%%r1,%2) \n\t"
-        "vst %%v0, 0(%%r1,%1) \n\t"
-        "vst %%v1, 16(%%r1,%1) \n\t"
-        "vst %%v2, 32(%%r1,%1) \n\t"
-        "vst %%v3, 48(%%r1,%1) \n\t"
-        "vst %%v4, 64(%%r1,%1) \n\t"
-        "vst %%v5, 80(%%r1,%1) \n\t"
-        "vst %%v6, 96(%%r1,%1) \n\t"
-        "vst %%v7, 112(%%r1,%1) \n\t"
-
-        "vl %%v0, 128(%%r1,%2) \n\t"
-        "vl %%v1, 144(%%r1,%2) \n\t"
-        "vl %%v2, 160(%%r1,%2) \n\t"
-        "vl %%v3, 176(%%r1,%2) \n\t"
-        "vl %%v4, 192(%%r1,%2) \n\t"
-        "vl %%v5, 208(%%r1,%2) \n\t"
-        "vl %%v6, 224(%%r1,%2) \n\t"
-        "vl %%v7, 240(%%r1,%2) \n\t"
-        "vst %%v0, 128(%%r1,%1) \n\t"
-        "vst %%v1, 144(%%r1,%1) \n\t"
-        "vst %%v2, 160(%%r1,%1) \n\t"
-        "vst %%v3, 176(%%r1,%1) \n\t"
-        "vst %%v4, 192(%%r1,%1) \n\t"
-        "vst %%v5, 208(%%r1,%1) \n\t"
-        "vst %%v6, 224(%%r1,%1) \n\t"
-        "vst %%v7, 240(%%r1,%1) \n\t"
-
-        "vst %%v16, 0(%%r1,%2) \n\t"
-        "vst %%v17, 16(%%r1,%2) \n\t"
-        "vst %%v18, 32(%%r1,%2) \n\t"
-        "vst %%v19, 48(%%r1,%2) \n\t"
-        "vst %%v20, 64(%%r1,%2) \n\t"
-        "vst %%v21, 80(%%r1,%2) \n\t"
-        "vst %%v22, 96(%%r1,%2) \n\t"
-        "vst %%v23, 112(%%r1,%2) \n\t"
-        "vst %%v24, 128(%%r1,%2) \n\t"
-        "vst %%v25, 144(%%r1,%2) \n\t"
-        "vst %%v26, 160(%%r1,%2) \n\t"
-        "vst %%v27, 176(%%r1,%2) \n\t"
-        "vst %%v28, 192(%%r1,%2) \n\t"
-        "vst %%v29, 208(%%r1,%2) \n\t"
-        "vst %%v30, 224(%%r1,%2) \n\t"
-        "vst %%v31, 240(%%r1,%2) \n\t"
-
-        "agfi %%r1,256 \n\t"
-        "brctg %%r0,0b "
-        :
-        :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
-        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
-    );
+static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
+  __asm__("srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "pfd 2, 1024(%%r1,%[y])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vl %%v24, 128(%%r1,%[x])\n\t"
+          "vl %%v25, 144(%%r1,%[x])\n\t"
+          "vl %%v26, 160(%%r1,%[x])\n\t"
+          "vl %%v27, 176(%%r1,%[x])\n\t"
+          "vl %%v28, 192(%%r1,%[x])\n\t"
192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zswap_kernel_16(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + zswap_kernel_16(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += 
-            i++ ;
+      temp[0] = x[ix];
+      temp[1] = x[ix + 1];
+      x[ix] = y[iy];
+      x[ix + 1] = y[iy + 1];
+      y[iy] = temp[0];
+      y[iy + 1] = temp[1];
 
-        }
+      ix += inc_x2;
+      iy += inc_y2;
+      i++;
     }
-    return(0);
-
-
-}
+  }
+  return (0);
+}
-- 
2.7.4
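
P.S. For readers unfamiliar with the operand style the rewritten kernels use
(named operands, array-typed "m" operands instead of a blanket "memory"
clobber, and "a" address-register inputs), here is a minimal standalone
sketch of the same idiom. It is not part of the patch: the function
copy_pairs is hypothetical, and it assumes an s390x target with vector
support (e.g. gcc -march=z13), n a positive multiple of 2, and
non-overlapping buffers.

  #include <stddef.h>

  /* Copy n doubles from src to dst, two at a time per 16-byte vector. */
  static void copy_pairs(size_t n, const double *src, double *dst) {
    __asm__("srlg  %[n],%[n],1\n\t"      /* n /= 2: one vector per pair  */
            "xgr   %%r1,%%r1\n\t"        /* r1 = 0: running byte offset  */
            "0:\n\t"
            "vl    %%v0,0(%%r1,%[src])\n\t"
            "vst   %%v0,0(%%r1,%[dst])\n\t"
            "agfi  %%r1,16\n\t"
            "brctg %[n],0b"              /* decrement n, loop while != 0 */
            /* Array-typed memory operands tell the compiler exactly which
               bytes are read and written, so no "memory" clobber is
               needed; n is consumed by brctg, hence "+&r". */
            : "=m" (*(double (*)[n]) dst), [n] "+&r" (n)
            : [src] "a" (src), [dst] "a" (dst),
              "m" (*(const double (*)[n]) src)
            : "cc", "r1", "v0");
  }

The same design choice shows up in every kernel above: the old "ZR"/"ZQ"
machine constraints and positional %0/%1/%2 operands are replaced by
portable named operands, which is what the commit subject's constraint
fixes refer to.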