[ZARCH] fix cgemv_n_4.c
author     maamountki <m.tk1@hotmail.com>
Fri, 4 Jan 2019 15:45:56 +0000 (17:45 +0200)
committer  GitHub <noreply@github.com>
Fri, 4 Jan 2019 15:45:56 +0000 (17:45 +0200)
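
The unrolled loops in cgemv_kernel_4x4, cgemv_kernel_4x2 and cgemv_kernel_4x1
computed their trip count with "srlg %%r0,%%r0,1", shifting r0 by itself even
though nothing in the asm block initializes r0; the patch derives the count
from operand %0 instead ("srlg %%r0,%0,1") and re-indents the inline-assembly
strings consistently.

A minimal sketch of the corrected pattern, assuming %0 is bound to the length
n as a register input (the operand lists and the loop tail are outside the
visible hunks, so the agfi/brctg lines and the helper name loop_count_sketch
below are illustrative only):

    /* Sketch only: the real kernels bind more operands and do vector
     * loads/FMAs inside the loop.  The point is that the trip count now
     * comes from the n operand (%0), not from an uninitialized r0.
     * Assumes n >= 2, as the unrolled kernels do. */
    static void loop_count_sketch(long n, const float *x)
    {
        __asm__ volatile (
            "xgr   %%r1,%%r1             \n\t" /* r1 = 0: running byte offset       */
            "srlg  %%r0,%0,1             \n\t" /* r0 = n >> 1: unrolled iterations  */
            "0:                          \n\t"
            "pfd   1,1024(%%r1,%1)       \n\t" /* prefetch ahead of the load stream */
            "agfi  %%r1,32               \n\t" /* advance one step (assumed stride) */
            "brctg %%r0,0b               \n\t" /* --r0; loop while non-zero         */
            :
            : "r"(n), "a"(x)
            : "cc", "r0", "r1");
    }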
kernel/zarch/cgemv_n_4.c

index 4c32537..c939aea 100644
@@ -34,107 +34,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 {
     __asm__ volatile (
-               "vlrepg     %%v16,0(%5)           \n\t"
-        "vlrepg     %%v17,8(%5)           \n\t"
-               "vlrepg     %%v18,16(%5)          \n\t"
-        "vlrepg     %%v19,24(%5)          \n\t"
+       "vlrepg     %%v16,0(%5)           \n\t"
+       "vlrepg     %%v17,8(%5)           \n\t"
+       "vlrepg     %%v18,16(%5)          \n\t"
+       "vlrepg     %%v19,24(%5)          \n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-        "vlef   %%v20,4(%5),0             \n\t"
-               "vlef   %%v20,4(%5),2            \n\t"
-        "vflcsb %%v20,%%v20               \n\t"
-        "vlef   %%v20,0(%5),1             \n\t"
-               "vlef   %%v20,0(%5),3             \n\t"
-
-               "vlef   %%v21,12(%5),0            \n\t"
-               "vlef   %%v21,12(%5),2            \n\t"
-        "vflcsb %%v21,%%v21               \n\t"
-        "vlef   %%v21,8(%5),1            \n\t"
-               "vlef   %%v21,8(%5),3            \n\t"
-
-               "vlef   %%v22,20(%5),0            \n\t"
-               "vlef   %%v22,20(%5),2            \n\t"
-        "vflcsb %%v22,%%v22               \n\t"
-        "vlef   %%v22,16(%5),1            \n\t"
-               "vlef   %%v22,16(%5),3            \n\t"
-
-               "vlef   %%v23,28(%5),0            \n\t"
-               "vlef   %%v23,28(%5),2            \n\t"
-        "vflcsb %%v23,%%v23               \n\t"
-        "vlef   %%v23,24(%5),1            \n\t"
-               "vlef   %%v23,24(%5),3            \n\t"
+       "vlef   %%v20,4(%5),0             \n\t"
+       "vlef   %%v20,4(%5),2             \n\t"
+       "vflcsb %%v20,%%v20               \n\t"
+       "vlef   %%v20,0(%5),1             \n\t"
+       "vlef   %%v20,0(%5),3             \n\t"
+
+       "vlef   %%v21,12(%5),0            \n\t"
+       "vlef   %%v21,12(%5),2            \n\t"
+       "vflcsb %%v21,%%v21               \n\t"
+       "vlef   %%v21,8(%5),1             \n\t"
+       "vlef   %%v21,8(%5),3             \n\t"
+
+       "vlef   %%v22,20(%5),0            \n\t"
+       "vlef   %%v22,20(%5),2            \n\t"
+       "vflcsb %%v22,%%v22               \n\t"
+       "vlef   %%v22,16(%5),1            \n\t"
+       "vlef   %%v22,16(%5),3            \n\t"
+
+       "vlef   %%v23,28(%5),0            \n\t"
+       "vlef   %%v23,28(%5),2            \n\t"
+       "vflcsb %%v23,%%v23               \n\t"
+       "vlef   %%v23,24(%5),1            \n\t"
+       "vlef   %%v23,24(%5),3            \n\t"
 #else
-        "vlef   %%v20,0(%5),1             \n\t"
-               "vlef   %%v20,0(%5),3             \n\t"
-        "vflcsb %%v20,%%v20               \n\t"
-        "vlef   %%v20,4(%5),0             \n\t"
-               "vlef   %%v20,4(%5),2            \n\t"
-
-               "vlef   %%v21,8(%5),1            \n\t"
-               "vlef   %%v21,8(%5),3            \n\t"
-        "vflcsb %%v21,%%v21               \n\t"
-        "vlef   %%v21,12(%5),0            \n\t"
-               "vlef   %%v21,12(%5),2            \n\t"
-
-               "vlef   %%v22,16(%5),1            \n\t"
-               "vlef   %%v22,16(%5),3            \n\t"
-        "vflcsb %%v22,%%v22               \n\t"
-        "vlef   %%v22,20(%5),0            \n\t"
-               "vlef   %%v22,20(%5),2            \n\t"
-
-               "vlef   %%v23,24(%5),1            \n\t"
-               "vlef   %%v23,24(%5),3            \n\t"
-        "vflcsb %%v23,%%v23               \n\t"
-        "vlef   %%v23,28(%5),0            \n\t"
-               "vlef   %%v23,28(%5),2            \n\t"
+       "vlef   %%v20,0(%5),1             \n\t"
+       "vlef   %%v20,0(%5),3             \n\t"
+       "vflcsb %%v20,%%v20               \n\t"
+       "vlef   %%v20,4(%5),0             \n\t"
+       "vlef   %%v20,4(%5),2             \n\t"
+
+       "vlef   %%v21,8(%5),1             \n\t"
+       "vlef   %%v21,8(%5),3             \n\t"
+       "vflcsb %%v21,%%v21               \n\t"
+       "vlef   %%v21,12(%5),0            \n\t"
+       "vlef   %%v21,12(%5),2            \n\t"
+
+       "vlef   %%v22,16(%5),1            \n\t"
+       "vlef   %%v22,16(%5),3            \n\t"
+       "vflcsb %%v22,%%v22               \n\t"
+       "vlef   %%v22,20(%5),0            \n\t"
+       "vlef   %%v22,20(%5),2            \n\t"
+
+       "vlef   %%v23,24(%5),1            \n\t"
+       "vlef   %%v23,24(%5),3            \n\t"
+       "vflcsb %%v23,%%v23               \n\t"
+       "vlef   %%v23,28(%5),0            \n\t"
+       "vlef   %%v23,28(%5),2            \n\t"
 #endif
-        "xgr   %%r1,%%r1                  \n\t"
-        "srlg  %%r0,%%r0,1                \n\t"
-        "0:                               \n\t"
-        "pfd 1,1024(%%r1,%1)              \n\t"
-        "pfd 1,1024(%%r1,%2)              \n\t"
-        "pfd 1,1024(%%r1,%3)              \n\t"
-        "pfd 1,1024(%%r1,%4)              \n\t"
-        "pfd 2,1024(%%r1,%6)              \n\t"
-
-               "vlef   %%v24,0(%%r1,%1),0        \n\t"
-               "vlef   %%v24,0(%%r1,%1),1        \n\t"
-               "vlef   %%v24,8(%%r1,%1),2        \n\t"
-               "vlef   %%v24,8(%%r1,%1),3        \n\t"
-               "vlef   %%v25,4(%%r1,%1),0        \n\t"
-               "vlef   %%v25,4(%%r1,%1),1        \n\t"
-               "vlef   %%v25,12(%%r1,%1),2       \n\t"
-               "vlef   %%v25,12(%%r1,%1),3       \n\t"
-               "vlef   %%v26,0(%%r1,%2),0        \n\t"
-               "vlef   %%v26,0(%%r1,%2),1        \n\t"
-               "vlef   %%v26,8(%%r1,%2),2        \n\t"
-               "vlef   %%v26,8(%%r1,%2),3        \n\t"
-               "vlef   %%v27,4(%%r1,%2),0        \n\t"
-               "vlef   %%v27,4(%%r1,%2),1        \n\t"
-               "vlef   %%v27,12(%%r1,%2),2       \n\t"
-               "vlef   %%v27,12(%%r1,%2),3       \n\t"
-
-        "vl  %%v0,0(%%r1,%6)              \n\t"
-        "vfmasb   %%v0,%%v24,%%v16,%%v0   \n\t"
-        "vfmasb   %%v0,%%v25,%%v20,%%v0   \n\t"
-        "vfmasb   %%v0,%%v26,%%v17,%%v0   \n\t"
-        "vfmasb   %%v0,%%v27,%%v21,%%v0   \n\t"
-
-               "vlef   %%v28,0(%%r1,%1),0        \n\t"
-               "vlef   %%v28,0(%%r1,%1),1        \n\t"
-               "vlef   %%v28,8(%%r1,%1),2        \n\t"
-               "vlef   %%v28,8(%%r1,%1),3        \n\t"
-               "vlef   %%v29,4(%%r1,%1),0        \n\t"
-               "vlef   %%v29,4(%%r1,%1),1        \n\t"
-               "vlef   %%v29,12(%%r1,%1),2       \n\t"
-               "vlef   %%v29,12(%%r1,%1),3       \n\t"
-               "vlef   %%v30,0(%%r1,%2),0        \n\t"
-               "vlef   %%v30,0(%%r1,%2),1        \n\t"
-               "vlef   %%v30,8(%%r1,%2),2        \n\t"
-               "vlef   %%v30,8(%%r1,%2),3        \n\t"
-               "vlef   %%v31,4(%%r1,%2),0        \n\t"
-               "vlef   %%v31,4(%%r1,%2),1        \n\t"
-               "vlef   %%v31,12(%%r1,%2),2       \n\t"
-               "vlef   %%v31,12(%%r1,%2),3       \n\t"
+       "xgr   %%r1,%%r1                  \n\t"
+       "srlg  %%r0,%0,1                  \n\t"
+       "0:                               \n\t"
+       "pfd 1,1024(%%r1,%1)              \n\t"
+       "pfd 1,1024(%%r1,%2)              \n\t"
+       "pfd 1,1024(%%r1,%3)              \n\t"
+       "pfd 1,1024(%%r1,%4)              \n\t"
+       "pfd 2,1024(%%r1,%6)              \n\t"
+
+       "vlef   %%v24,0(%%r1,%1),0        \n\t"
+       "vlef   %%v24,0(%%r1,%1),1        \n\t"
+       "vlef   %%v24,8(%%r1,%1),2        \n\t"
+       "vlef   %%v24,8(%%r1,%1),3        \n\t"
+       "vlef   %%v25,4(%%r1,%1),0        \n\t"
+       "vlef   %%v25,4(%%r1,%1),1        \n\t"
+       "vlef   %%v25,12(%%r1,%1),2       \n\t"
+       "vlef   %%v25,12(%%r1,%1),3       \n\t"
+       "vlef   %%v26,0(%%r1,%2),0        \n\t"
+       "vlef   %%v26,0(%%r1,%2),1        \n\t"
+       "vlef   %%v26,8(%%r1,%2),2        \n\t"
+       "vlef   %%v26,8(%%r1,%2),3        \n\t"
+       "vlef   %%v27,4(%%r1,%2),0        \n\t"
+       "vlef   %%v27,4(%%r1,%2),1        \n\t"
+       "vlef   %%v27,12(%%r1,%2),2       \n\t"
+       "vlef   %%v27,12(%%r1,%2),3       \n\t"
+
+       "vl  %%v0,0(%%r1,%6)              \n\t"
+       "vfmasb   %%v0,%%v24,%%v16,%%v0   \n\t"
+       "vfmasb   %%v0,%%v25,%%v20,%%v0   \n\t"
+       "vfmasb   %%v0,%%v26,%%v17,%%v0   \n\t"
+       "vfmasb   %%v0,%%v27,%%v21,%%v0   \n\t"
+
+       "vlef   %%v28,0(%%r1,%1),0        \n\t"
+       "vlef   %%v28,0(%%r1,%1),1        \n\t"
+       "vlef   %%v28,8(%%r1,%1),2        \n\t"
+       "vlef   %%v28,8(%%r1,%1),3        \n\t"
+       "vlef   %%v29,4(%%r1,%1),0        \n\t"
+       "vlef   %%v29,4(%%r1,%1),1        \n\t"
+       "vlef   %%v29,12(%%r1,%1),2       \n\t"
+       "vlef   %%v29,12(%%r1,%1),3       \n\t"
+       "vlef   %%v30,0(%%r1,%2),0        \n\t"
+       "vlef   %%v30,0(%%r1,%2),1        \n\t"
+       "vlef   %%v30,8(%%r1,%2),2        \n\t"
+       "vlef   %%v30,8(%%r1,%2),3        \n\t"
+       "vlef   %%v31,4(%%r1,%2),0        \n\t"
+       "vlef   %%v31,4(%%r1,%2),1        \n\t"
+       "vlef   %%v31,12(%%r1,%2),2       \n\t"
+       "vlef   %%v31,12(%%r1,%2),3       \n\t"
 
         "vfmasb   %%v0,%%v28,%%v18,%%v0   \n\t"
         "vfmasb   %%v0,%%v29,%%v22,%%v0   \n\t"
@@ -153,56 +153,56 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 {
     __asm__ volatile (
-               "vlrepg     %%v16,0(%3)           \n\t"
-        "vlrepg     %%v17,8(%3)           \n\t"
+       "vlrepg     %%v16,0(%3)           \n\t"
+       "vlrepg     %%v17,8(%3)           \n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-        "vlef   %%v18,4(%3),0             \n\t"
-               "vlef   %%v18,4(%3),2             \n\t"
-        "vflcsb %%v18,%%v18               \n\t"
-        "vlef   %%v18,0(%3),1             \n\t"
-               "vlef   %%v18,0(%3),3             \n\t"
-
-               "vlef   %%v19,12(%3),0            \n\t"
-               "vlef   %%v19,12(%3),2            \n\t"
-        "vflcsb %%v19,%%v19               \n\t"
-        "vlef   %%v19,8(%3),1             \n\t"
-               "vlef   %%v19,8(%3),3             \n\t"
+       "vlef   %%v18,4(%3),0             \n\t"
+       "vlef   %%v18,4(%3),2             \n\t"
+       "vflcsb %%v18,%%v18               \n\t"
+       "vlef   %%v18,0(%3),1             \n\t"
+       "vlef   %%v18,0(%3),3             \n\t"
+
+       "vlef   %%v19,12(%3),0            \n\t"
+       "vlef   %%v19,12(%3),2            \n\t"
+       "vflcsb %%v19,%%v19               \n\t"
+       "vlef   %%v19,8(%3),1             \n\t"
+       "vlef   %%v19,8(%3),3             \n\t"
 #else
-        "vlef   %%v18,0(%3),1             \n\t"
-               "vlef   %%v18,0(%3),3             \n\t"
-        "vflcsb %%v18,%%v18               \n\t"
-        "vlef   %%v18,4(%3),0             \n\t"
-               "vlef   %%v18,4(%3),2             \n\t"
-
-               "vlef   %%v19,8(%3),1             \n\t"
-               "vlef   %%v19,8(%3),3             \n\t"
-        "vflcsb %%v19,%%v19               \n\t"
-        "vlef   %%v19,12(%3),0            \n\t"
-               "vlef   %%v19,12(%3),2            \n\t"
+       "vlef   %%v18,0(%3),1             \n\t"
+       "vlef   %%v18,0(%3),3             \n\t"
+       "vflcsb %%v18,%%v18               \n\t"
+       "vlef   %%v18,4(%3),0             \n\t"
+       "vlef   %%v18,4(%3),2             \n\t"
+
+       "vlef   %%v19,8(%3),1             \n\t"
+       "vlef   %%v19,8(%3),3             \n\t"
+       "vflcsb %%v19,%%v19               \n\t"
+       "vlef   %%v19,12(%3),0            \n\t"
+       "vlef   %%v19,12(%3),2            \n\t"
 #endif
-        "xgr   %%r1,%%r1                  \n\t"
-        "srlg  %%r0,%%r0,1                \n\t"
-        "0:                               \n\t"
-        "pfd 1,1024(%%r1,%1)              \n\t"
-        "pfd 1,1024(%%r1,%2)              \n\t"
-        "pfd 2,1024(%%r1,%4)              \n\t"
-
-               "vlef   %%v20,0(%%r1,%1),0        \n\t"
-               "vlef   %%v20,0(%%r1,%1),1        \n\t"
-               "vlef   %%v20,8(%%r1,%1),2        \n\t"
-               "vlef   %%v20,8(%%r1,%1),3        \n\t"
-               "vlef   %%v21,4(%%r1,%1),0        \n\t"
-               "vlef   %%v21,4(%%r1,%1),1        \n\t"
-               "vlef   %%v21,12(%%r1,%1),2       \n\t"
-               "vlef   %%v21,12(%%r1,%1),3       \n\t"
-               "vlef   %%v22,0(%%r1,%2),0        \n\t"
-               "vlef   %%v22,0(%%r1,%2),1        \n\t"
-               "vlef   %%v22,8(%%r1,%2),2        \n\t"
-               "vlef   %%v22,8(%%r1,%2),3        \n\t"
-               "vlef   %%v23,4(%%r1,%2),0        \n\t"
-               "vlef   %%v23,4(%%r1,%2),1        \n\t"
-               "vlef   %%v23,12(%%r1,%2),2       \n\t"
-               "vlef   %%v23,12(%%r1,%2),3       \n\t"
+       "xgr   %%r1,%%r1                  \n\t"
+       "srlg  %%r0,%0,1                  \n\t"
+       "0:                               \n\t"
+       "pfd 1,1024(%%r1,%1)              \n\t"
+       "pfd 1,1024(%%r1,%2)              \n\t"
+       "pfd 2,1024(%%r1,%4)              \n\t"
+
+       "vlef   %%v20,0(%%r1,%1),0        \n\t"
+       "vlef   %%v20,0(%%r1,%1),1        \n\t"
+       "vlef   %%v20,8(%%r1,%1),2        \n\t"
+       "vlef   %%v20,8(%%r1,%1),3        \n\t"
+       "vlef   %%v21,4(%%r1,%1),0        \n\t"
+       "vlef   %%v21,4(%%r1,%1),1        \n\t"
+       "vlef   %%v21,12(%%r1,%1),2       \n\t"
+       "vlef   %%v21,12(%%r1,%1),3       \n\t"
+       "vlef   %%v22,0(%%r1,%2),0        \n\t"
+       "vlef   %%v22,0(%%r1,%2),1        \n\t"
+       "vlef   %%v22,8(%%r1,%2),2        \n\t"
+       "vlef   %%v22,8(%%r1,%2),3        \n\t"
+       "vlef   %%v23,4(%%r1,%2),0        \n\t"
+       "vlef   %%v23,4(%%r1,%2),1        \n\t"
+       "vlef   %%v23,12(%%r1,%2),2       \n\t"
+       "vlef   %%v23,12(%%r1,%2),3       \n\t"
 
         "vl  %%v0,0(%%r1,%4)              \n\t"
         "vfmasb   %%v0,%%v20,%%v16,%%v0   \n\t"
@@ -222,34 +222,34 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 {
     __asm__ volatile (
-               "vlrepg     %%v16,0(%2)           \n\t"
+       "vlrepg     %%v16,0(%2)           \n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
         "vlef   %%v17,4(%2),0             \n\t"
-               "vlef   %%v17,4(%2),2             \n\t"
+       "vlef   %%v17,4(%2),2             \n\t"
         "vflcsb %%v17,%%v17               \n\t"
         "vlef   %%v17,0(%2),1             \n\t"
-               "vlef   %%v17,0(%2),3             \n\t"
+       "vlef   %%v17,0(%2),3             \n\t"
 #else
         "vlef   %%v17,0(%2),1             \n\t"
-               "vlef   %%v17,0(%2),3             \n\t"
+       "vlef   %%v17,0(%2),3             \n\t"
         "vflcsb %%v17,%%v17               \n\t"
         "vlef   %%v17,4(%2),0             \n\t"
-               "vlef   %%v17,4(%2),2             \n\t"
+       "vlef   %%v17,4(%2),2             \n\t"
 #endif
         "xgr   %%r1,%%r1                  \n\t"
-        "srlg  %%r0,%%r0,1                \n\t"
+        "srlg  %%r0,%0,1                  \n\t"
         "0:                               \n\t"
         "pfd 1,1024(%%r1,%1)              \n\t"
         "pfd 2,1024(%%r1,%3)              \n\t"
 
-               "vlef   %%v18,0(%%r1,%1),0        \n\t"
-               "vlef   %%v18,0(%%r1,%1),1        \n\t"
-               "vlef   %%v18,8(%%r1,%1),2        \n\t"
-               "vlef   %%v18,8(%%r1,%1),3        \n\t"
-               "vlef   %%v19,4(%%r1,%1),0        \n\t"
-               "vlef   %%v19,4(%%r1,%1),1        \n\t"
-               "vlef   %%v19,12(%%r1,%1),2       \n\t"
-               "vlef   %%v19,12(%%r1,%1),3       \n\t"
+       "vlef   %%v18,0(%%r1,%1),0        \n\t"
+       "vlef   %%v18,0(%%r1,%1),1        \n\t"
+       "vlef   %%v18,8(%%r1,%1),2        \n\t"
+       "vlef   %%v18,8(%%r1,%1),3        \n\t"
+       "vlef   %%v19,4(%%r1,%1),0        \n\t"
+       "vlef   %%v19,4(%%r1,%1),1        \n\t"
+       "vlef   %%v19,12(%%r1,%1),2       \n\t"
+       "vlef   %%v19,12(%%r1,%1),3       \n\t"
 
         "vl  %%v0,0(%%r1,%3)              \n\t"
         "vfmasb   %%v0,%%v18,%%v16,%%v0   \n\t"
@@ -268,18 +268,18 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al
 {
     __asm__ volatile (
 #if !defined(XCONJ) 
-               "vlrepf %%v0,%3                 \n\t"
-               "vlef   %%v1,%4,0               \n\t"
-               "vlef   %%v1,%4,2               \n\t"
+       "vlrepf %%v0,%3                 \n\t"
+       "vlef   %%v1,%4,0               \n\t"
+       "vlef   %%v1,%4,2               \n\t"
         "vflcsb %%v1,%%v1               \n\t"
-               "vlef   %%v1,%4,1               \n\t"
+       "vlef   %%v1,%4,1               \n\t"
         "vlef   %%v1,%4,3               \n\t"
 #else
         "vlef   %%v0,%3,1               \n\t"
-               "vlef   %%v0,%3,3               \n\t"
+       "vlef   %%v0,%3,3               \n\t"
         "vflcsb %%v0,%%v0               \n\t"
         "vlef   %%v0,%3,0               \n\t"
-               "vlef   %%v0,%3,2               \n\t"
+       "vlef   %%v0,%3,2               \n\t"
         "vlrepf %%v1,%4                 \n\t"
 #endif
         "xgr   %%r1,%%r1                \n\t"
@@ -292,7 +292,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al
         "vl   %%v17,16(%%r1,%1)         \n\t"
         "vl   %%v18,0(%%r1,%2)          \n\t"
         "vl   %%v19,16(%%r1,%2)         \n\t"
-               "verllg   %%v20,%%v16,32        \n\t"
+       "verllg   %%v20,%%v16,32        \n\t"
         "verllg   %%v21,%%v17,32        \n\t"
 
         "vfmasb %%v22,%%v16,%%v0,%%v18  \n\t"