[ZARCH] Improve loading performance for camax/icamax
author maamountki <m.tk1@hotmail.com>
Thu, 31 Jan 2019 16:52:11 +0000 (18:52 +0200)
committer GitHub <noreply@github.com>
Thu, 31 Jan 2019 16:52:11 +0000 (18:52 +0200)
kernel/zarch/camax.c
kernel/zarch/camin.c
kernel/zarch/icamax.c
kernel/zarch/icamin.c
kernel/zarch/zamax.c
kernel/zarch/zamin.c

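diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c
index f6fa772..2e96486 100644
--- a/kernel/zarch/camax.c
+++ b/kernel/zarch/camax.c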
@@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
         "vflpsb %%v0,%%v0                 \n\t"
         "vflpsb %%v16,%%v16               \n\t"
         "vfasb  %%v0,%%v0,%%v16           \n\t"
+        "vleib  %%v1,0,0                  \n\t"
+        "vleib  %%v1,1,1                  \n\t"
+        "vleib  %%v1,2,2                  \n\t"
+        "vleib  %%v1,3,3                  \n\t"
+        "vleib  %%v1,8,4                  \n\t"
+        "vleib  %%v1,9,5                  \n\t"
+        "vleib  %%v1,10,6                 \n\t"
+        "vleib  %%v1,11,7                 \n\t"
+        "vleib  %%v1,16,8                 \n\t"
+        "vleib  %%v1,17,9                 \n\t"
+        "vleib  %%v1,18,10                \n\t"
+        "vleib  %%v1,19,11                \n\t"
+        "vleib  %%v1,24,12                \n\t"
+        "vleib  %%v1,25,13                \n\t"
+        "vleib  %%v1,26,14                \n\t"
+        "vleib  %%v1,27,15                \n\t"
         "srlg  %%r0,%1,5                  \n\t"
         "xgr %%r1,%%r1                    \n\t"
         "0:                               \n\t"
         "pfd 1, 1024(%%r1,%2)             \n\t"
 
-        "vlef  %%v16,0(%%r1,%2),0         \n\t"
-        "vlef  %%v17,4(%%r1,%2),0         \n\t"
-        "vlef  %%v16,8(%%r1,%2),1         \n\t"
-        "vlef  %%v17,12(%%r1,%2),1        \n\t"
-        "vlef  %%v16,16(%%r1,%2),2        \n\t"
-        "vlef  %%v17,20(%%r1,%2),2        \n\t"
-        "vlef  %%v16,24(%%r1,%2),3        \n\t"
-        "vlef  %%v17,28(%%r1,%2),3        \n\t"
-
-        "vlef  %%v18,32(%%r1,%2),0        \n\t"
-        "vlef  %%v19,36(%%r1,%2),0        \n\t"
-        "vlef  %%v18,40(%%r1,%2),1        \n\t"
-        "vlef  %%v19,44(%%r1,%2),1        \n\t"
-        "vlef  %%v18,48(%%r1,%2),2        \n\t"
-        "vlef  %%v19,52(%%r1,%2),2        \n\t"
-        "vlef  %%v18,56(%%r1,%2),3        \n\t"
-        "vlef  %%v19,30(%%r1,%2),3        \n\t"
-
-        "vlef  %%v20,64(%%r1,%2),0        \n\t"
-        "vlef  %%v21,68(%%r1,%2),0        \n\t"
-        "vlef  %%v20,72(%%r1,%2),1        \n\t"
-        "vlef  %%v21,76(%%r1,%2),1        \n\t"
-        "vlef  %%v20,80(%%r1,%2),2        \n\t"
-        "vlef  %%v21,84(%%r1,%2),2        \n\t"
-        "vlef  %%v20,88(%%r1,%2),3        \n\t"
-        "vlef  %%v21,92(%%r1,%2),3        \n\t"
-
-        "vlef  %%v22,96(%%r1,%2),0        \n\t"
-        "vlef  %%v23,100(%%r1,%2),0       \n\t"
-        "vlef  %%v22,104(%%r1,%2),1       \n\t"
-        "vlef  %%v23,108(%%r1,%2),1       \n\t"
-        "vlef  %%v22,112(%%r1,%2),2       \n\t"
-        "vlef  %%v23,116(%%r1,%2),2       \n\t"
-        "vlef  %%v22,120(%%r1,%2),3       \n\t"
-        "vlef  %%v23,124(%%r1,%2),3       \n\t"
-
-        "vlef  %%v24,128(%%r1,%2),0       \n\t"
-        "vlef  %%v25,132(%%r1,%2),0       \n\t"
-        "vlef  %%v24,136(%%r1,%2),1       \n\t"
-        "vlef  %%v25,140(%%r1,%2),1       \n\t"
-        "vlef  %%v24,144(%%r1,%2),2       \n\t"
-        "vlef  %%v25,148(%%r1,%2),2       \n\t"
-        "vlef  %%v24,152(%%r1,%2),3       \n\t"
-        "vlef  %%v25,156(%%r1,%2),3       \n\t"
-
-        "vlef  %%v26,160(%%r1,%2),0       \n\t"
-        "vlef  %%v27,164(%%r1,%2),0       \n\t"
-        "vlef  %%v26,168(%%r1,%2),1       \n\t"
-        "vlef  %%v27,172(%%r1,%2),1       \n\t"
-        "vlef  %%v26,176(%%r1,%2),2       \n\t"
-        "vlef  %%v27,180(%%r1,%2),2       \n\t"
-        "vlef  %%v26,184(%%r1,%2),3       \n\t"
-        "vlef  %%v27,188(%%r1,%2),3       \n\t"
-
-        "vlef  %%v28,192(%%r1,%2),0       \n\t"
-        "vlef  %%v29,196(%%r1,%2),0       \n\t"
-        "vlef  %%v28,200(%%r1,%2),1       \n\t"
-        "vlef  %%v29,204(%%r1,%2),1       \n\t"
-        "vlef  %%v28,208(%%r1,%2),2       \n\t"
-        "vlef  %%v29,212(%%r1,%2),2       \n\t"
-        "vlef  %%v28,216(%%r1,%2),3       \n\t"
-        "vlef  %%v29,220(%%r1,%2),3       \n\t"
-
-        "vlef  %%v30,224(%%r1,%2),0       \n\t"
-        "vlef  %%v31,228(%%r1,%2),0       \n\t"
-        "vlef  %%v30,232(%%r1,%2),1       \n\t"
-        "vlef  %%v31,236(%%r1,%2),1       \n\t"
-        "vlef  %%v30,240(%%r1,%2),2       \n\t"
-        "vlef  %%v31,244(%%r1,%2),2       \n\t"
-        "vlef  %%v30,248(%%r1,%2),3       \n\t"
-        "vlef  %%v31,252(%%r1,%2),3       \n\t"
+        "vl    %%v16,0(%%r1,%2)           \n\t"
+        "vl    %%v2,16(%%r1,%2)           \n\t"
+        "vpkg  %%v17,%%v16,%%v2           \n\t"
+        "vperm %%v16,%%v16,%%v2,%%v1      \n\t"
+
+        "vl    %%v18,32(%%r1,%2)          \n\t"
+        "vl    %%v2,48(%%r1,%2)           \n\t"
+        "vpkg  %%v19,%%v18,%%v2           \n\t"
+        "vperm %%v18,%%v18,%%v2,%%v1      \n\t"
+
+        "vl    %%v20,64(%%r1,%2)          \n\t"
+        "vl    %%v2,80(%%r1,%2)           \n\t"
+        "vpkg  %%v21,%%v20,%%v2           \n\t"
+        "vperm %%v20,%%v20,%%v2,%%v1      \n\t"
+
+        "vl    %%v22,96(%%r1,%2)          \n\t"
+        "vl    %%v2,112(%%r1,%2)          \n\t"
+        "vpkg  %%v23,%%v22,%%v2           \n\t"
+        "vperm %%v22,%%v22,%%v2,%%v1      \n\t"
+
+        "vl    %%v24,128(%%r1,%2)         \n\t"
+        "vl    %%v2,144(%%r1,%2)          \n\t"
+        "vpkg  %%v25,%%v24,%%v2           \n\t"
+        "vperm %%v24,%%v24,%%v2,%%v1      \n\t"
+
+        "vl    %%v26,160(%%r1,%2)         \n\t"
+        "vl    %%v2,176(%%r1,%2)          \n\t"
+        "vpkg  %%v27,%%v26,%%v2           \n\t"
+        "vperm %%v26,%%v26,%%v2,%%v1      \n\t"
+
+        "vl    %%v28,192(%%r1,%2)         \n\t"
+        "vl    %%v2,208(%%r1,%2)          \n\t"
+        "vpkg  %%v29,%%v28,%%v2           \n\t"
+        "vperm %%v28,%%v28,%%v2,%%v1      \n\t"
+
+        "vl    %%v30,224(%%r1,%2)         \n\t"
+        "vl    %%v2,240(%%r1,%2)          \n\t"
+        "vpkg  %%v31,%%v30,%%v2           \n\t"
+        "vperm %%v30,%%v30,%%v2,%%v1      \n\t"
 
         "vflpsb  %%v16,%%v16              \n\t"
         "vflpsb  %%v17,%%v17              \n\t"
@@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
         "ler    %0,%%f0                       "
         :"=f"(amax)
         :"r"(n),"ZR"((const FLOAT (*)[n])x)
-        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+        :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
     );
 
     return amax;
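diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c
index 4bd6ca1..aec5905 100644
--- a/kernel/zarch/camin.c
+++ b/kernel/zarch/camin.c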
@@ -52,82 +52,66 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
         "vflpsb %%v0,%%v0                 \n\t"
         "vflpsb %%v16,%%v16               \n\t"
         "vfasb  %%v0,%%v0,%%v16           \n\t"
+        "vleib  %%v1,0,0                  \n\t"
+        "vleib  %%v1,1,1                  \n\t"
+        "vleib  %%v1,2,2                  \n\t"
+        "vleib  %%v1,3,3                  \n\t"
+        "vleib  %%v1,8,4                  \n\t"
+        "vleib  %%v1,9,5                  \n\t"
+        "vleib  %%v1,10,6                 \n\t"
+        "vleib  %%v1,11,7                 \n\t"
+        "vleib  %%v1,16,8                 \n\t"
+        "vleib  %%v1,17,9                 \n\t"
+        "vleib  %%v1,18,10                \n\t"
+        "vleib  %%v1,19,11                \n\t"
+        "vleib  %%v1,24,12                \n\t"
+        "vleib  %%v1,25,13                \n\t"
+        "vleib  %%v1,26,14                \n\t"
+        "vleib  %%v1,27,15                \n\t"
         "srlg  %%r0,%1,5                  \n\t"
         "xgr %%r1,%%r1                    \n\t"
         "0:                               \n\t"
         "pfd 1, 1024(%%r1,%2)             \n\t"
 
-        "vlef  %%v16,0(%%r1,%2),0         \n\t"
-        "vlef  %%v17,4(%%r1,%2),0         \n\t"
-        "vlef  %%v16,8(%%r1,%2),1         \n\t"
-        "vlef  %%v17,12(%%r1,%2),1        \n\t"
-        "vlef  %%v16,16(%%r1,%2),2        \n\t"
-        "vlef  %%v17,20(%%r1,%2),2        \n\t"
-        "vlef  %%v16,24(%%r1,%2),3        \n\t"
-        "vlef  %%v17,28(%%r1,%2),3        \n\t"
-
-        "vlef  %%v18,32(%%r1,%2),0        \n\t"
-        "vlef  %%v19,36(%%r1,%2),0        \n\t"
-        "vlef  %%v18,40(%%r1,%2),1        \n\t"
-        "vlef  %%v19,44(%%r1,%2),1        \n\t"
-        "vlef  %%v18,48(%%r1,%2),2        \n\t"
-        "vlef  %%v19,52(%%r1,%2),2        \n\t"
-        "vlef  %%v18,56(%%r1,%2),3        \n\t"
-        "vlef  %%v19,30(%%r1,%2),3        \n\t"
-
-        "vlef  %%v20,64(%%r1,%2),0        \n\t"
-        "vlef  %%v21,68(%%r1,%2),0        \n\t"
-        "vlef  %%v20,72(%%r1,%2),1        \n\t"
-        "vlef  %%v21,76(%%r1,%2),1        \n\t"
-        "vlef  %%v20,80(%%r1,%2),2        \n\t"
-        "vlef  %%v21,84(%%r1,%2),2        \n\t"
-        "vlef  %%v20,88(%%r1,%2),3        \n\t"
-        "vlef  %%v21,92(%%r1,%2),3        \n\t"
-
-        "vlef  %%v22,96(%%r1,%2),0        \n\t"
-        "vlef  %%v23,100(%%r1,%2),0       \n\t"
-        "vlef  %%v22,104(%%r1,%2),1       \n\t"
-        "vlef  %%v23,108(%%r1,%2),1       \n\t"
-        "vlef  %%v22,112(%%r1,%2),2       \n\t"
-        "vlef  %%v23,116(%%r1,%2),2       \n\t"
-        "vlef  %%v22,120(%%r1,%2),3       \n\t"
-        "vlef  %%v23,124(%%r1,%2),3       \n\t"
-
-        "vlef  %%v24,128(%%r1,%2),0       \n\t"
-        "vlef  %%v25,132(%%r1,%2),0       \n\t"
-        "vlef  %%v24,136(%%r1,%2),1       \n\t"
-        "vlef  %%v25,140(%%r1,%2),1       \n\t"
-        "vlef  %%v24,144(%%r1,%2),2       \n\t"
-        "vlef  %%v25,148(%%r1,%2),2       \n\t"
-        "vlef  %%v24,152(%%r1,%2),3       \n\t"
-        "vlef  %%v25,156(%%r1,%2),3       \n\t"
-
-        "vlef  %%v26,160(%%r1,%2),0       \n\t"
-        "vlef  %%v27,164(%%r1,%2),0       \n\t"
-        "vlef  %%v26,168(%%r1,%2),1       \n\t"
-        "vlef  %%v27,172(%%r1,%2),1       \n\t"
-        "vlef  %%v26,176(%%r1,%2),2       \n\t"
-        "vlef  %%v27,180(%%r1,%2),2       \n\t"
-        "vlef  %%v26,184(%%r1,%2),3       \n\t"
-        "vlef  %%v27,188(%%r1,%2),3       \n\t"
-
-        "vlef  %%v28,192(%%r1,%2),0       \n\t"
-        "vlef  %%v29,196(%%r1,%2),0       \n\t"
-        "vlef  %%v28,200(%%r1,%2),1       \n\t"
-        "vlef  %%v29,204(%%r1,%2),1       \n\t"
-        "vlef  %%v28,208(%%r1,%2),2       \n\t"
-        "vlef  %%v29,212(%%r1,%2),2       \n\t"
-        "vlef  %%v28,216(%%r1,%2),3       \n\t"
-        "vlef  %%v29,220(%%r1,%2),3       \n\t"
-
-        "vlef  %%v30,224(%%r1,%2),0       \n\t"
-        "vlef  %%v31,228(%%r1,%2),0       \n\t"
-        "vlef  %%v30,232(%%r1,%2),1       \n\t"
-        "vlef  %%v31,236(%%r1,%2),1       \n\t"
-        "vlef  %%v30,240(%%r1,%2),2       \n\t"
-        "vlef  %%v31,244(%%r1,%2),2       \n\t"
-        "vlef  %%v30,248(%%r1,%2),3       \n\t"
-        "vlef  %%v31,252(%%r1,%2),3       \n\t"
+        "vl    %%v16,0(%%r1,%2)           \n\t"
+        "vl    %%v2,16(%%r1,%2)           \n\t"
+        "vpkg  %%v17,%%v16,%%v2           \n\t"
+        "vperm %%v16,%%v16,%%v2,%%v1      \n\t"
+
+        "vl    %%v18,32(%%r1,%2)          \n\t"
+        "vl    %%v2,48(%%r1,%2)           \n\t"
+        "vpkg  %%v19,%%v18,%%v2           \n\t"
+        "vperm %%v18,%%v18,%%v2,%%v1      \n\t"
+
+        "vl    %%v20,64(%%r1,%2)          \n\t"
+        "vl    %%v2,80(%%r1,%2)           \n\t"
+        "vpkg  %%v21,%%v20,%%v2           \n\t"
+        "vperm %%v20,%%v20,%%v2,%%v1      \n\t"
+
+        "vl    %%v22,96(%%r1,%2)          \n\t"
+        "vl    %%v2,112(%%r1,%2)          \n\t"
+        "vpkg  %%v23,%%v22,%%v2           \n\t"
+        "vperm %%v22,%%v22,%%v2,%%v1      \n\t"
+
+        "vl    %%v24,128(%%r1,%2)         \n\t"
+        "vl    %%v2,144(%%r1,%2)          \n\t"
+        "vpkg  %%v25,%%v24,%%v2           \n\t"
+        "vperm %%v24,%%v24,%%v2,%%v1      \n\t"
+
+        "vl    %%v26,160(%%r1,%2)         \n\t"
+        "vl    %%v2,176(%%r1,%2)          \n\t"
+        "vpkg  %%v27,%%v26,%%v2           \n\t"
+        "vperm %%v26,%%v26,%%v2,%%v1      \n\t"
+
+        "vl    %%v28,192(%%r1,%2)         \n\t"
+        "vl    %%v2,208(%%r1,%2)          \n\t"
+        "vpkg  %%v29,%%v28,%%v2           \n\t"
+        "vperm %%v28,%%v28,%%v2,%%v1      \n\t"
+
+        "vl    %%v30,224(%%r1,%2)         \n\t"
+        "vl    %%v2,240(%%r1,%2)          \n\t"
+        "vpkg  %%v31,%%v30,%%v2           \n\t"
+        "vperm %%v30,%%v30,%%v2,%%v1      \n\t"
 
         "vflpsb  %%v16,%%v16              \n\t"
         "vflpsb  %%v17,%%v17              \n\t"
@@ -178,7 +162,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
         "ler    %0,%%f0                       "
         :"=f"(amin)
         :"r"(n),"ZR"((const FLOAT (*)[n])x)
-        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+        :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
     );
 
     return amin;
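diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c
index a9e7f91..5129ca6 100644
--- a/kernel/zarch/icamax.c
+++ b/kernel/zarch/icamax.c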
@@ -57,6 +57,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
         "vleig  %%v2,3,1                 \n\t"
         "vrepig %%v3,16                  \n\t"
         "vzero  %%v4                     \n\t"
+        "vleib  %%v9,0,0                 \n\t"
+        "vleib  %%v9,1,1                 \n\t"
+        "vleib  %%v9,2,2                 \n\t"
+        "vleib  %%v9,3,3                 \n\t"
+        "vleib  %%v9,8,4                 \n\t"
+        "vleib  %%v9,9,5                 \n\t"
+        "vleib  %%v9,10,6                \n\t"
+        "vleib  %%v9,11,7                \n\t"
+        "vleib  %%v9,16,8                \n\t"
+        "vleib  %%v9,17,9                \n\t"
+        "vleib  %%v9,18,10               \n\t"
+        "vleib  %%v9,19,11               \n\t"
+        "vleib  %%v9,24,12               \n\t"
+        "vleib  %%v9,25,13               \n\t"
+        "vleib  %%v9,26,14               \n\t"
+        "vleib  %%v9,27,15               \n\t"
         "vleif  %%v24,0,0                \n\t"
         "vleif  %%v24,1,1                \n\t"
         "vleif  %%v24,2,2                \n\t"
@@ -78,41 +94,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
         "0:                              \n\t"
         "pfd 1, 1024(%%r1,%3)            \n\t"
 
-        "vlef  %%v16,0(%%r1,%3),0        \n\t"
-        "vlef  %%v17,4(%%r1,%3),0        \n\t"
-        "vlef  %%v16,8(%%r1,%3),1        \n\t"
-        "vlef  %%v17,12(%%r1,%3),1       \n\t"
-        "vlef  %%v16,16(%%r1,%3),2       \n\t"
-        "vlef  %%v17,20(%%r1,%3),2       \n\t"
-        "vlef  %%v16,24(%%r1,%3),3       \n\t"
-        "vlef  %%v17,28(%%r1,%3),3       \n\t"
+        "vl    %%v16,0(%%r1,%3)          \n\t"
+        "vl    %%v28,16(%%r1,%3)         \n\t"
+        "vpkg  %%v17,%%v16,%%v28         \n\t"
+        "vperm %%v16,%%v16,%%v28,%%v9    \n\t"
 
-        "vlef  %%v18,32(%%r1,%3),0       \n\t"
-        "vlef  %%v19,36(%%r1,%3),0       \n\t"
-        "vlef  %%v18,40(%%r1,%3),1       \n\t"
-        "vlef  %%v19,44(%%r1,%3),1       \n\t"
-        "vlef  %%v18,48(%%r1,%3),2       \n\t"
-        "vlef  %%v19,52(%%r1,%3),2       \n\t"
-        "vlef  %%v18,56(%%r1,%3),3       \n\t"
-        "vlef  %%v19,60(%%r1,%3),3       \n\t"
+        "vl    %%v18,32(%%r1,%3)         \n\t"
+        "vl    %%v29,48(%%r1,%3)         \n\t"
+        "vpkg  %%v19,%%v18,%%v29         \n\t"
+        "vperm %%v18,%%v18,%%v29,%%v9    \n\t"
 
-        "vlef  %%v20,64(%%r1,%3),0       \n\t"
-        "vlef  %%v21,68(%%r1,%3),0       \n\t"
-        "vlef  %%v20,72(%%r1,%3),1       \n\t"
-        "vlef  %%v21,76(%%r1,%3),1       \n\t"
-        "vlef  %%v20,80(%%r1,%3),2       \n\t"
-        "vlef  %%v21,84(%%r1,%3),2       \n\t"
-        "vlef  %%v20,88(%%r1,%3),3       \n\t"
-        "vlef  %%v21,92(%%r1,%3),3       \n\t"
+        "vl    %%v20,64(%%r1,%3)         \n\t"
+        "vl    %%v30,80(%%r1,%3)         \n\t"
+        "vpkg  %%v21,%%v20,%%v30         \n\t"
+        "vperm %%v20,%%v20,%%v30,%%v9    \n\t"
 
-        "vlef  %%v22,96(%%r1,%3),0       \n\t"
-        "vlef  %%v23,100(%%r1,%3),0      \n\t"
-        "vlef  %%v22,104(%%r1,%3),1      \n\t"
-        "vlef  %%v23,108(%%r1,%3),1      \n\t"
-        "vlef  %%v22,112(%%r1,%3),2      \n\t"
-        "vlef  %%v23,116(%%r1,%3),2      \n\t"
-        "vlef  %%v22,120(%%r1,%3),3      \n\t"
-        "vlef  %%v23,124(%%r1,%3),3      \n\t"
+        "vl    %%v22,96(%%r1,%3)         \n\t"
+        "vl    %%v31,112(%%r1,%3)        \n\t"
+        "vpkg  %%v23,%%v22,%%v31         \n\t"
+        "vperm %%v22,%%v22,%%v31,%%v9    \n\t"
 
         "vflpsb  %%v16, %%v16            \n\t"
         "vflpsb  %%v17, %%v17            \n\t"
@@ -151,41 +151,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
         "vsel    %%v2,%%v2,%%v6,%%v8     \n\t"
         "vag     %%v4,%%v4,%%v3          \n\t"
 
-        "vlef  %%v16,128(%%r1,%3),0      \n\t"
-        "vlef  %%v17,132(%%r1,%3),0      \n\t"
-        "vlef  %%v16,136(%%r1,%3),1      \n\t"
-        "vlef  %%v17,140(%%r1,%3),1      \n\t"
-        "vlef  %%v16,144(%%r1,%3),2      \n\t"
-        "vlef  %%v17,148(%%r1,%3),2      \n\t"
-        "vlef  %%v16,152(%%r1,%3),3      \n\t"
-        "vlef  %%v17,156(%%r1,%3),3      \n\t"
+        "vl    %%v16,128(%%r1,%3)        \n\t"
+        "vl    %%v28,144(%%r1,%3)        \n\t"
+        "vpkg  %%v17,%%v16,%%v28         \n\t"
+        "vperm %%v16,%%v16,%%v28,%%v9    \n\t"
 
-        "vlef  %%v18,160(%%r1,%3),0      \n\t"
-        "vlef  %%v19,164(%%r1,%3),0      \n\t"
-        "vlef  %%v18,168(%%r1,%3),1      \n\t"
-        "vlef  %%v19,172(%%r1,%3),1      \n\t"
-        "vlef  %%v18,176(%%r1,%3),2      \n\t"
-        "vlef  %%v19,180(%%r1,%3),2      \n\t"
-        "vlef  %%v18,184(%%r1,%3),3      \n\t"
-        "vlef  %%v19,188(%%r1,%3),3      \n\t"
+        "vl    %%v18,160(%%r1,%3)        \n\t"
+        "vl    %%v29,176(%%r1,%3)        \n\t"
+        "vpkg  %%v19,%%v18,%%v29         \n\t"
+        "vperm %%v18,%%v18,%%v29,%%v9    \n\t"
 
-        "vlef  %%v20,192(%%r1,%3),0      \n\t"
-        "vlef  %%v21,196(%%r1,%3),0      \n\t"
-        "vlef  %%v20,200(%%r1,%3),1      \n\t"
-        "vlef  %%v21,204(%%r1,%3),1      \n\t"
-        "vlef  %%v20,208(%%r1,%3),2      \n\t"
-        "vlef  %%v21,212(%%r1,%3),2      \n\t"
-        "vlef  %%v20,216(%%r1,%3),3      \n\t"
-        "vlef  %%v21,220(%%r1,%3),3      \n\t"
+        "vl    %%v20,192(%%r1,%3)        \n\t"
+        "vl    %%v30,208(%%r1,%3)        \n\t"
+        "vpkg  %%v21,%%v20,%%v30         \n\t"
+        "vperm %%v20,%%v20,%%v30,%%v9    \n\t"
 
-        "vlef  %%v22,224(%%r1,%3),0      \n\t"
-        "vlef  %%v23,228(%%r1,%3),0      \n\t"
-        "vlef  %%v22,232(%%r1,%3),1      \n\t"
-        "vlef  %%v23,236(%%r1,%3),1      \n\t"
-        "vlef  %%v22,240(%%r1,%3),2      \n\t"
-        "vlef  %%v23,244(%%r1,%3),2      \n\t"
-        "vlef  %%v22,248(%%r1,%3),3      \n\t"
-        "vlef  %%v23,252(%%r1,%3),3      \n\t"
+        "vl    %%v22,224(%%r1,%3)        \n\t"
+        "vl    %%v31,240(%%r1,%3)        \n\t"
+        "vpkg  %%v23,%%v22,%%v31         \n\t"
+        "vperm %%v22,%%v22,%%v31,%%v9    \n\t"
 
         "vflpsb  %%v16, %%v16            \n\t"
         "vflpsb  %%v17, %%v17            \n\t"
@@ -258,7 +242,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
         "nop                                 "
         :"=r"(iamax),"=m"(*amax)
         :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
-        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
     );
 
     return iamax;
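diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c
index faf5f9c..05068b2 100644
--- a/kernel/zarch/icamin.c
+++ b/kernel/zarch/icamin.c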
@@ -57,6 +57,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
         "vleig  %%v2,3,1                 \n\t"
         "vrepig %%v3,16                  \n\t"
         "vzero  %%v4                     \n\t"
+        "vleib  %%v9,0,0                 \n\t"
+        "vleib  %%v9,1,1                 \n\t"
+        "vleib  %%v9,2,2                 \n\t"
+        "vleib  %%v9,3,3                 \n\t"
+        "vleib  %%v9,8,4                 \n\t"
+        "vleib  %%v9,9,5                 \n\t"
+        "vleib  %%v9,10,6                \n\t"
+        "vleib  %%v9,11,7                \n\t"
+        "vleib  %%v9,16,8                \n\t"
+        "vleib  %%v9,17,9                \n\t"
+        "vleib  %%v9,18,10               \n\t"
+        "vleib  %%v9,19,11               \n\t"
+        "vleib  %%v9,24,12               \n\t"
+        "vleib  %%v9,25,13               \n\t"
+        "vleib  %%v9,26,14               \n\t"
+        "vleib  %%v9,27,15               \n\t"
         "vleif  %%v24,0,0                \n\t"
         "vleif  %%v24,1,1                \n\t"
         "vleif  %%v24,2,2                \n\t"
@@ -78,41 +94,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
         "0:                              \n\t"
         "pfd 1, 1024(%%r1,%3)            \n\t"
 
-        "vlef  %%v16,0(%%r1,%3),0        \n\t"
-        "vlef  %%v17,4(%%r1,%3),0        \n\t"
-        "vlef  %%v16,8(%%r1,%3),1        \n\t"
-        "vlef  %%v17,12(%%r1,%3),1       \n\t"
-        "vlef  %%v16,16(%%r1,%3),2       \n\t"
-        "vlef  %%v17,20(%%r1,%3),2       \n\t"
-        "vlef  %%v16,24(%%r1,%3),3       \n\t"
-        "vlef  %%v17,28(%%r1,%3),3       \n\t"
+        "vl    %%v16,0(%%r1,%3)          \n\t"
+        "vl    %%v28,16(%%r1,%3)         \n\t"
+        "vpkg  %%v17,%%v16,%%v28         \n\t"
+        "vperm %%v16,%%v16,%%v28,%%v9    \n\t"
 
-        "vlef  %%v18,32(%%r1,%3),0       \n\t"
-        "vlef  %%v19,36(%%r1,%3),0       \n\t"
-        "vlef  %%v18,40(%%r1,%3),1       \n\t"
-        "vlef  %%v19,44(%%r1,%3),1       \n\t"
-        "vlef  %%v18,48(%%r1,%3),2       \n\t"
-        "vlef  %%v19,52(%%r1,%3),2       \n\t"
-        "vlef  %%v18,56(%%r1,%3),3       \n\t"
-        "vlef  %%v19,60(%%r1,%3),3       \n\t"
+        "vl    %%v18,32(%%r1,%3)         \n\t"
+        "vl    %%v29,48(%%r1,%3)         \n\t"
+        "vpkg  %%v19,%%v18,%%v29         \n\t"
+        "vperm %%v18,%%v18,%%v29,%%v9    \n\t"
 
-        "vlef  %%v20,64(%%r1,%3),0       \n\t"
-        "vlef  %%v21,68(%%r1,%3),0       \n\t"
-        "vlef  %%v20,72(%%r1,%3),1       \n\t"
-        "vlef  %%v21,76(%%r1,%3),1       \n\t"
-        "vlef  %%v20,80(%%r1,%3),2       \n\t"
-        "vlef  %%v21,84(%%r1,%3),2       \n\t"
-        "vlef  %%v20,88(%%r1,%3),3       \n\t"
-        "vlef  %%v21,92(%%r1,%3),3       \n\t"
+        "vl    %%v20,64(%%r1,%3)         \n\t"
+        "vl    %%v30,80(%%r1,%3)         \n\t"
+        "vpkg  %%v21,%%v20,%%v30         \n\t"
+        "vperm %%v20,%%v20,%%v30,%%v9    \n\t"
 
-        "vlef  %%v22,96(%%r1,%3),0       \n\t"
-        "vlef  %%v23,100(%%r1,%3),0      \n\t"
-        "vlef  %%v22,104(%%r1,%3),1      \n\t"
-        "vlef  %%v23,108(%%r1,%3),1      \n\t"
-        "vlef  %%v22,112(%%r1,%3),2      \n\t"
-        "vlef  %%v23,116(%%r1,%3),2      \n\t"
-        "vlef  %%v22,120(%%r1,%3),3      \n\t"
-        "vlef  %%v23,124(%%r1,%3),3      \n\t"
+        "vl    %%v22,96(%%r1,%3)         \n\t"
+        "vl    %%v31,112(%%r1,%3)        \n\t"
+        "vpkg  %%v23,%%v22,%%v31         \n\t"
+        "vperm %%v22,%%v22,%%v31,%%v9    \n\t"
 
         "vflpsb  %%v16, %%v16            \n\t"
         "vflpsb  %%v17, %%v17            \n\t"
@@ -151,41 +151,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
         "vsel    %%v2,%%v2,%%v6,%%v8     \n\t"
         "vag     %%v4,%%v4,%%v3          \n\t"
 
-        "vlef  %%v16,128(%%r1,%3),0      \n\t"
-        "vlef  %%v17,132(%%r1,%3),0      \n\t"
-        "vlef  %%v16,136(%%r1,%3),1      \n\t"
-        "vlef  %%v17,140(%%r1,%3),1      \n\t"
-        "vlef  %%v16,144(%%r1,%3),2      \n\t"
-        "vlef  %%v17,148(%%r1,%3),2      \n\t"
-        "vlef  %%v16,152(%%r1,%3),3      \n\t"
-        "vlef  %%v17,156(%%r1,%3),3      \n\t"
+        "vl    %%v16,128(%%r1,%3)        \n\t"
+        "vl    %%v28,144(%%r1,%3)        \n\t"
+        "vpkg  %%v17,%%v16,%%v28         \n\t"
+        "vperm %%v16,%%v16,%%v28,%%v9    \n\t"
 
-        "vlef  %%v18,160(%%r1,%3),0      \n\t"
-        "vlef  %%v19,164(%%r1,%3),0      \n\t"
-        "vlef  %%v18,168(%%r1,%3),1      \n\t"
-        "vlef  %%v19,172(%%r1,%3),1      \n\t"
-        "vlef  %%v18,176(%%r1,%3),2      \n\t"
-        "vlef  %%v19,180(%%r1,%3),2      \n\t"
-        "vlef  %%v18,184(%%r1,%3),3      \n\t"
-        "vlef  %%v19,188(%%r1,%3),3      \n\t"
+        "vl    %%v18,160(%%r1,%3)        \n\t"
+        "vl    %%v29,176(%%r1,%3)        \n\t"
+        "vpkg  %%v19,%%v18,%%v29         \n\t"
+        "vperm %%v18,%%v18,%%v29,%%v9    \n\t"
 
-        "vlef  %%v20,192(%%r1,%3),0      \n\t"
-        "vlef  %%v21,196(%%r1,%3),0      \n\t"
-        "vlef  %%v20,200(%%r1,%3),1      \n\t"
-        "vlef  %%v21,204(%%r1,%3),1      \n\t"
-        "vlef  %%v20,208(%%r1,%3),2      \n\t"
-        "vlef  %%v21,212(%%r1,%3),2      \n\t"
-        "vlef  %%v20,216(%%r1,%3),3      \n\t"
-        "vlef  %%v21,220(%%r1,%3),3      \n\t"
+        "vl    %%v20,192(%%r1,%3)        \n\t"
+        "vl    %%v30,208(%%r1,%3)        \n\t"
+        "vpkg  %%v21,%%v20,%%v30         \n\t"
+        "vperm %%v20,%%v20,%%v30,%%v9    \n\t"
 
-        "vlef  %%v22,224(%%r1,%3),0      \n\t"
-        "vlef  %%v23,228(%%r1,%3),0      \n\t"
-        "vlef  %%v22,232(%%r1,%3),1      \n\t"
-        "vlef  %%v23,236(%%r1,%3),1      \n\t"
-        "vlef  %%v22,240(%%r1,%3),2      \n\t"
-        "vlef  %%v23,244(%%r1,%3),2      \n\t"
-        "vlef  %%v22,248(%%r1,%3),3      \n\t"
-        "vlef  %%v23,252(%%r1,%3),3      \n\t"
+        "vl    %%v22,224(%%r1,%3)        \n\t"
+        "vl    %%v31,240(%%r1,%3)        \n\t"
+        "vpkg  %%v23,%%v22,%%v31         \n\t"
+        "vperm %%v22,%%v22,%%v31,%%v9    \n\t"
 
         "vflpsb  %%v16, %%v16            \n\t"
         "vflpsb  %%v17, %%v17            \n\t"
@@ -258,7 +242,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
         "nop                                 "
         :"=r"(iamin),"=m"(*amin)
         :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
-        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
     );
 
     return iamin;
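diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c
index b721478..cc63471 100644
--- a/kernel/zarch/zamax.c
+++ b/kernel/zarch/zamax.c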
@@ -132,7 +132,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
         "ldr    %0,%%f0                       "
         :"=f"(amax)
         :"r"(n),"ZR"((const FLOAT (*)[n])x)
-        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
     );
 
     return amax;
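diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c
index d53fdb6..18610da 100644
--- a/kernel/zarch/zamin.c
+++ b/kernel/zarch/zamin.c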
@@ -132,7 +132,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
         "ldr    %0,%%f0                       "
         :"=f"(amin)
         :"r"(n),"ZR"((const FLOAT (*)[n])x)
-        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
     );
 
     return amin;