Add basic support for the Fujitsu A64FX (#3415)
author Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Mon, 18 Oct 2021 13:00:19 +0000 (15:00 +0200)
committer GitHub <noreply@github.com>
Mon, 18 Oct 2021 13:00:19 +0000 (15:00 +0200)
* Add initial support for Fujitsu A64FX as generic ARMV8

Makefile.arm64
cpuid_arm64.c
getarch.c
kernel/arm64/KERNEL.A64FX [new file with mode: 0644]

index 3e3466d..e9ae233 100644 (file)
@@ -153,6 +153,15 @@ endif
 endif
 endif
 
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
+ifeq ($(CORE), A64FX)
+CCOMMON_OPT += -march=armv8.2-a -mtune=a64fx
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a -mtune=a64fx
+endif
+endif
+endif
+
 endif
 
 endif
\ No newline at end of file
index 73a82d1..958e94a 100644 (file)
@@ -55,6 +55,8 @@ size_t length64=sizeof(value64);
 #define CPU_EMAG8180    10
 // Apple
 #define CPU_VORTEX       13
+// Fujitsu
+#define CPU_A64FX       15
 
 static char *cpuname[] = {
   "UNKNOWN",
@@ -71,7 +73,8 @@ static char *cpuname[] = {
   "NEOVERSEN1",
   "THUNDERX3T110",
   "VORTEX",
-  "CORTEXA55"
+  "CORTEXA55",
+  "A64FX"
 };
 
 static char *cpuname_lower[] = {
@@ -89,7 +92,8 @@ static char *cpuname_lower[] = {
   "neoversen1",
   "thunderx3t110",
   "vortex",
-  "cortexa55"
+  "cortexa55",
+  "a64fx"
 };
 
 int get_feature(char *search)
@@ -185,6 +189,9 @@ int detect(void)
     // Ampere
     else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
                         return CPU_EMAG8180;
+    // Fujitsu
+    else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
+                        return CPU_A64FX;
        }
 
        p = (char *) NULL ;
@@ -287,156 +294,166 @@ void get_cpuconfig(void)
        switch (d)
        {
 
-    case CPU_CORTEXA53:
-    case CPU_CORTEXA55:                        
-      printf("#define %s\n", cpuname[d]);
-      // Fall-through
-               case CPU_ARMV8:
-      // Minimum parameters for ARMv8 (based on A53)
-       printf("#define L1_DATA_SIZE 32768\n");
-       printf("#define L1_DATA_LINESIZE 64\n");
-       printf("#define L2_SIZE 262144\n");
-       printf("#define L2_LINESIZE 64\n");
-       printf("#define DTB_DEFAULT_ENTRIES 64\n");
-       printf("#define DTB_SIZE 4096\n");
-       printf("#define L2_ASSOCIATIVE 4\n");
+           case CPU_CORTEXA53:
+           case CPU_CORTEXA55:
+               printf("#define %s\n", cpuname[d]);
+             // Fall-through
+           case CPU_ARMV8:
+             // Minimum parameters for ARMv8 (based on A53)
+               printf("#define L1_DATA_SIZE 32768\n");
+               printf("#define L1_DATA_LINESIZE 64\n");
+               printf("#define L2_SIZE 262144\n");
+               printf("#define L2_LINESIZE 64\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               printf("#define L2_ASSOCIATIVE 4\n");
                        break;
 
-               case CPU_CORTEXA57:
-               case CPU_CORTEXA72:
-               case CPU_CORTEXA73:
+           case CPU_CORTEXA57:
+           case CPU_CORTEXA72:
+           case CPU_CORTEXA73:
       // Common minimum settings for these Arm cores
       // Can change a lot, but we need to be conservative
       // TODO: detect info from /sys if possible
-      printf("#define %s\n", cpuname[d]);
-                       printf("#define L1_CODE_SIZE 49152\n");
-                       printf("#define L1_CODE_LINESIZE 64\n");
-                       printf("#define L1_CODE_ASSOCIATIVE 3\n");
-                       printf("#define L1_DATA_SIZE 32768\n");
-                       printf("#define L1_DATA_LINESIZE 64\n");
-                       printf("#define L1_DATA_ASSOCIATIVE 2\n");
-      printf("#define L2_SIZE 524288\n");
-                       printf("#define L2_LINESIZE 64\n");
-                       printf("#define L2_ASSOCIATIVE 16\n");
-                       printf("#define DTB_DEFAULT_ENTRIES 64\n");
-                       printf("#define DTB_SIZE 4096\n");
-                       break;
-               case CPU_NEOVERSEN1:
-                       printf("#define %s\n", cpuname[d]);
-                       printf("#define L1_CODE_SIZE 65536\n");
-                       printf("#define L1_CODE_LINESIZE 64\n");
-                       printf("#define L1_CODE_ASSOCIATIVE 4\n");
-                       printf("#define L1_DATA_SIZE 65536\n");
-                       printf("#define L1_DATA_LINESIZE 64\n");
-                       printf("#define L1_DATA_ASSOCIATIVE 4\n");
-                       printf("#define L2_SIZE 1048576\n");
-                       printf("#define L2_LINESIZE 64\n");
-                       printf("#define L2_ASSOCIATIVE 16\n");
-                       printf("#define DTB_DEFAULT_ENTRIES 64\n");
-                       printf("#define DTB_SIZE 4096\n");
-                       break;
-
-    case CPU_FALKOR:
-      printf("#define FALKOR\n");
-      printf("#define L1_CODE_SIZE 65536\n");
-      printf("#define L1_CODE_LINESIZE 64\n");
-      printf("#define L1_DATA_SIZE 32768\n");
-      printf("#define L1_DATA_LINESIZE 128\n");
-      printf("#define L2_SIZE 524288\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-      break;
-
-               case CPU_THUNDERX:
-                       printf("#define THUNDERX\n");
-                       printf("#define L1_DATA_SIZE 32768\n");
-                       printf("#define L1_DATA_LINESIZE 128\n");
-                       printf("#define L2_SIZE 16777216\n");
-                       printf("#define L2_LINESIZE 128\n");
-                       printf("#define DTB_DEFAULT_ENTRIES 64\n");
-                       printf("#define DTB_SIZE 4096\n");
-                       printf("#define L2_ASSOCIATIVE 16\n");
-                       break;
-
-               case CPU_THUNDERX2T99:
-                       printf("#define THUNDERX2T99                  \n");
-                       printf("#define L1_CODE_SIZE         32768    \n");
-                       printf("#define L1_CODE_LINESIZE     64       \n");
-                       printf("#define L1_CODE_ASSOCIATIVE  8        \n");
-                       printf("#define L1_DATA_SIZE         32768    \n");
-                       printf("#define L1_DATA_LINESIZE     64       \n");
-                       printf("#define L1_DATA_ASSOCIATIVE  8        \n");
-                       printf("#define L2_SIZE              262144   \n");
-                       printf("#define L2_LINESIZE          64       \n");
-                       printf("#define L2_ASSOCIATIVE       8        \n");
-                       printf("#define L3_SIZE              33554432 \n");
-                       printf("#define L3_LINESIZE          64       \n");
-                       printf("#define L3_ASSOCIATIVE       32       \n");
-                       printf("#define DTB_DEFAULT_ENTRIES  64       \n");
-                       printf("#define DTB_SIZE             4096     \n");
-                       break;
+               printf("#define %s\n", cpuname[d]);
+               printf("#define L1_CODE_SIZE 49152\n");
+               printf("#define L1_CODE_LINESIZE 64\n");
+               printf("#define L1_CODE_ASSOCIATIVE 3\n");
+               printf("#define L1_DATA_SIZE 32768\n");
+               printf("#define L1_DATA_LINESIZE 64\n");
+               printf("#define L1_DATA_ASSOCIATIVE 2\n");
+               printf("#define L2_SIZE 524288\n");
+               printf("#define L2_LINESIZE 64\n");
+               printf("#define L2_ASSOCIATIVE 16\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               break;
+           case CPU_NEOVERSEN1:
+               printf("#define %s\n", cpuname[d]);
+               printf("#define L1_CODE_SIZE 65536\n");
+               printf("#define L1_CODE_LINESIZE 64\n");
+               printf("#define L1_CODE_ASSOCIATIVE 4\n");
+               printf("#define L1_DATA_SIZE 65536\n");
+               printf("#define L1_DATA_LINESIZE 64\n");
+               printf("#define L1_DATA_ASSOCIATIVE 4\n");
+               printf("#define L2_SIZE 1048576\n");
+               printf("#define L2_LINESIZE 64\n");
+               printf("#define L2_ASSOCIATIVE 16\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               break;
+
+           case CPU_FALKOR:
+               printf("#define FALKOR\n");
+               printf("#define L1_CODE_SIZE 65536\n");
+               printf("#define L1_CODE_LINESIZE 64\n");
+               printf("#define L1_DATA_SIZE 32768\n");
+               printf("#define L1_DATA_LINESIZE 128\n");
+               printf("#define L2_SIZE 524288\n");
+               printf("#define L2_LINESIZE 64\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               printf("#define L2_ASSOCIATIVE 16\n");
+               break;
+
+           case CPU_THUNDERX:
+               printf("#define THUNDERX\n");
+               printf("#define L1_DATA_SIZE 32768\n");
+               printf("#define L1_DATA_LINESIZE 128\n");
+               printf("#define L2_SIZE 16777216\n");
+               printf("#define L2_LINESIZE 128\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               printf("#define L2_ASSOCIATIVE 16\n");
+               break;
+
+           case CPU_THUNDERX2T99:
+               printf("#define THUNDERX2T99                  \n");
+               printf("#define L1_CODE_SIZE         32768    \n");
+               printf("#define L1_CODE_LINESIZE     64       \n");
+               printf("#define L1_CODE_ASSOCIATIVE  8        \n");
+               printf("#define L1_DATA_SIZE         32768    \n");
+               printf("#define L1_DATA_LINESIZE     64       \n");
+               printf("#define L1_DATA_ASSOCIATIVE  8        \n");
+               printf("#define L2_SIZE              262144   \n");
+               printf("#define L2_LINESIZE          64       \n");
+               printf("#define L2_ASSOCIATIVE       8        \n");
+               printf("#define L3_SIZE              33554432 \n");
+               printf("#define L3_LINESIZE          64       \n");
+               printf("#define L3_ASSOCIATIVE       32       \n");
+               printf("#define DTB_DEFAULT_ENTRIES  64       \n");
+               printf("#define DTB_SIZE             4096     \n");
+               break;
                        
-               case CPU_TSV110:
-                       printf("#define TSV110                        \n");
-                       printf("#define L1_CODE_SIZE         65536    \n");
-                       printf("#define L1_CODE_LINESIZE     64       \n");
-                       printf("#define L1_CODE_ASSOCIATIVE  4        \n");
-                       printf("#define L1_DATA_SIZE         65536    \n");
-                       printf("#define L1_DATA_LINESIZE     64       \n");
-                       printf("#define L1_DATA_ASSOCIATIVE  4        \n");
-                       printf("#define L2_SIZE              524228   \n");
-                       printf("#define L2_LINESIZE          64       \n");
-                       printf("#define L2_ASSOCIATIVE       8        \n");
-                       printf("#define DTB_DEFAULT_ENTRIES  64       \n");
-                       printf("#define DTB_SIZE             4096     \n");
-                       break;  
-
-               case CPU_EMAG8180:
-      // Minimum parameters for ARMv8 (based on A53)
-       printf("#define EMAG8180\n");
-       printf("#define L1_CODE_SIZE 32768\n");
-       printf("#define L1_DATA_SIZE 32768\n");
-       printf("#define L1_DATA_LINESIZE 64\n");
-       printf("#define L2_SIZE 262144\n");
-       printf("#define L2_LINESIZE 64\n");
-       printf("#define DTB_DEFAULT_ENTRIES 64\n");
-       printf("#define DTB_SIZE 4096\n");
-                       break;
-
-               case CPU_THUNDERX3T110:
-                       printf("#define THUNDERX3T110                 \n");
-                       printf("#define L1_CODE_SIZE         65536    \n");
-                       printf("#define L1_CODE_LINESIZE     64       \n");
-                       printf("#define L1_CODE_ASSOCIATIVE  8        \n");
-                       printf("#define L1_DATA_SIZE         32768    \n");
-                       printf("#define L1_DATA_LINESIZE     64       \n");
-                       printf("#define L1_DATA_ASSOCIATIVE  8        \n");
-                       printf("#define L2_SIZE              524288   \n");
-                       printf("#define L2_LINESIZE          64       \n");
-                       printf("#define L2_ASSOCIATIVE       8        \n");
-                       printf("#define L3_SIZE              94371840 \n");
-                       printf("#define L3_LINESIZE          64       \n");
-                       printf("#define L3_ASSOCIATIVE       32       \n");
-                       printf("#define DTB_DEFAULT_ENTRIES  64       \n");
-                       printf("#define DTB_SIZE             4096     \n");
-                       break;
+           case CPU_TSV110: // emit the fixed cache/TLB parameters for the TSV110 core
+               printf("#define TSV110                        \n");
+               printf("#define L1_CODE_SIZE         65536    \n");
+               printf("#define L1_CODE_LINESIZE     64       \n");
+               printf("#define L1_CODE_ASSOCIATIVE  4        \n");
+               printf("#define L1_DATA_SIZE         65536    \n");
+               printf("#define L1_DATA_LINESIZE     64       \n");
+               printf("#define L1_DATA_ASSOCIATIVE  4        \n");
+               printf("#define L2_SIZE              524288   \n"); // 512 KiB = 512 * 1024
+               printf("#define L2_LINESIZE          64       \n");
+               printf("#define L2_ASSOCIATIVE       8        \n");
+               printf("#define DTB_DEFAULT_ENTRIES  64       \n");
+               printf("#define DTB_SIZE             4096     \n");
+               break;
+
+           case CPU_EMAG8180:
+                // Minimum parameters for ARMv8 (based on A53)
+               printf("#define EMAG8180\n");
+               printf("#define L1_CODE_SIZE 32768\n");
+               printf("#define L1_DATA_SIZE 32768\n");
+               printf("#define L1_DATA_LINESIZE 64\n");
+               printf("#define L2_SIZE 262144\n");
+               printf("#define L2_LINESIZE 64\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               break;
+
+           case CPU_THUNDERX3T110:
+               printf("#define THUNDERX3T110                 \n");
+               printf("#define L1_CODE_SIZE         65536    \n");
+               printf("#define L1_CODE_LINESIZE     64       \n");
+               printf("#define L1_CODE_ASSOCIATIVE  8        \n");
+               printf("#define L1_DATA_SIZE         32768    \n");
+               printf("#define L1_DATA_LINESIZE     64       \n");
+               printf("#define L1_DATA_ASSOCIATIVE  8        \n");
+               printf("#define L2_SIZE              524288   \n");
+               printf("#define L2_LINESIZE          64       \n");
+               printf("#define L2_ASSOCIATIVE       8        \n");
+               printf("#define L3_SIZE              94371840 \n");
+               printf("#define L3_LINESIZE          64       \n");
+               printf("#define L3_ASSOCIATIVE       32       \n");
+               printf("#define DTB_DEFAULT_ENTRIES  64       \n");
+               printf("#define DTB_SIZE             4096     \n");
+               break;
 #ifdef __APPLE__
-               case CPU_VORTEX:
-                       printf("#define VORTEX                        \n");
-                       sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
-                       printf("#define L1_CODE_SIZE         %lld       \n",value64);
-                       sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
-                       printf("#define L1_CODE_LINESIZE     %lld       \n",value64);
-                       sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
-                       printf("#define L1_DATA_SIZE         %lld       \n",value64);
-                       sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
-                       printf("#define L2_SIZE      %lld       \n",value64);
-                       printf("#define DTB_DEFAULT_ENTRIES  64       \n");
-                       printf("#define DTB_SIZE             4096     \n");
-                       break;
+           case CPU_VORTEX:
+               printf("#define VORTEX                        \n");
+               sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
+               printf("#define L1_CODE_SIZE         %lld       \n",value64);
+               sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
+               printf("#define L1_CODE_LINESIZE     %lld       \n",value64);
+               sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
+               printf("#define L1_DATA_SIZE         %lld       \n",value64);
+               sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
+               printf("#define L2_SIZE      %lld       \n",value64);
+               printf("#define DTB_DEFAULT_ENTRIES  64       \n");
+               printf("#define DTB_SIZE             4096     \n");
+               break;
 #endif                 
+           case CPU_A64FX: // Fujitsu A64FX: emit its cache/TLB parameters as #define lines
+               printf("#define A64FX\n");
+               printf("#define L1_CODE_SIZE 65536\n"); // 64 KiB = 64 * 1024
+               printf("#define L1_DATA_SIZE 65536\n"); // 64 KiB = 64 * 1024
+               printf("#define L1_DATA_LINESIZE 256\n");
+               printf("#define L2_SIZE 8388608\n"); // 8 MiB = 8 * 1024 * 1024
+               printf("#define L2_LINESIZE 256\n");
+               printf("#define DTB_DEFAULT_ENTRIES 64\n");
+               printf("#define DTB_SIZE 4096\n");
+               break;
        }
        get_cpucount();
 }
index d095472..60bfe05 100644 (file)
--- a/getarch.c
+++ b/getarch.c
@@ -1424,6 +1424,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "VORTEX"
 #endif
 
+#ifdef FORCE_A64FX /* Forced build target: Fujitsu A64FX, configured on top of the ARMV8 settings */
+#define ARMV8
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "A64FX"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DA64FX " \
+       "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \
+       "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \
+       "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "a64fx"
+#define CORENAME  "A64FX" /* NOTE(review): the *_ASSOCIATIVE values in ARCHCONFIG are unconfirmed - verify against the A64FX manual */
+#else /* nothing is defined in this branch when FORCE_A64FX is not set */
+#endif
+
 #ifdef FORCE_ZARCH_GENERIC
 #define FORCE
 #define ARCHITECTURE    "ZARCH"
diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
new file mode 100644 (file)
index 0000000..c8a53c8
--- /dev/null
@@ -0,0 +1,198 @@
+SAMINKERNEL  = ../arm/amin.c
+DAMINKERNEL  = ../arm/amin.c
+CAMINKERNEL  = ../arm/zamin.c
+ZAMINKERNEL  = ../arm/zamin.c
+# The amin entries above and the max/min/iamin/imax/imin entries below all point at C sources in ../arm.
+SMAXKERNEL   = ../arm/max.c
+DMAXKERNEL   = ../arm/max.c
+
+SMINKERNEL   = ../arm/min.c
+DMINKERNEL   = ../arm/min.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL  = ../arm/imax.c
+IDMAXKERNEL  = ../arm/imax.c
+
+ISMINKERNEL  = ../arm/imin.c
+IDMINKERNEL  = ../arm/imin.c
+# TRSM kernels: all four precisions (S, D, C, Z) use the same C sources in ../generic.
+STRSMKERNEL_LN =  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT =  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN =  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT =  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+# Kernels named by a bare .S file (no ../ prefix): amax, axpy, rot, scal, gemv_n, gemv_t.
+SAMAXKERNEL  = amax.S
+DAMAXKERNEL  = amax.S
+CAMAXKERNEL  = zamax.S
+ZAMAXKERNEL  = zamax.S
+
+SAXPYKERNEL  = axpy.S
+DAXPYKERNEL  = axpy.S
+CAXPYKERNEL  = zaxpy.S
+ZAXPYKERNEL  = zaxpy.S
+
+SROTKERNEL   = rot.S
+DROTKERNEL   = rot.S
+CROTKERNEL   = zrot.S
+ZROTKERNEL   = zrot.S
+
+SSCALKERNEL  = scal.S
+DSCALKERNEL  = scal.S
+CSCALKERNEL  = zscal.S
+ZSCALKERNEL  = zscal.S
+
+SGEMVNKERNEL = gemv_n.S
+DGEMVNKERNEL = gemv_n.S
+CGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n.S
+
+SGEMVTKERNEL = gemv_t.S
+DGEMVTKERNEL = gemv_t.S
+CGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t.S
+
+# More bare .S kernels: asum, copy, swap, iamax, nrm2 (copy.S and swap.S are shared by all four precisions).
+SASUMKERNEL    = asum.S
+DASUMKERNEL    = asum.S
+CASUMKERNEL    = casum.S
+ZASUMKERNEL    = zasum.S
+
+SCOPYKERNEL    = copy.S
+DCOPYKERNEL    = copy.S
+CCOPYKERNEL    = copy.S
+ZCOPYKERNEL    = copy.S
+
+SSWAPKERNEL    = swap.S
+DSWAPKERNEL    = swap.S
+CSWAPKERNEL    = swap.S
+ZSWAPKERNEL    = swap.S
+
+ISAMAXKERNEL   = iamax.S
+IDAMAXKERNEL   = iamax.S
+ICAMAXKERNEL   = izamax.S
+IZAMAXKERNEL   = izamax.S
+
+SNRM2KERNEL    = nrm2.S
+DNRM2KERNEL    = nrm2.S
+CNRM2KERNEL    = znrm2.S
+ZNRM2KERNEL    = znrm2.S
+# Dot products: DDOT and DSDOT always use dot.S; SDOT, CDOT and ZDOT switch source when C_COMPILER is PGI.
+DDOTKERNEL     = dot.S
+ifneq ($(C_COMPILER), PGI)
+SDOTKERNEL     = ../generic/dot.c
+else
+SDOTKERNEL = dot.S
+endif
+ifneq ($(C_COMPILER), PGI)
+CDOTKERNEL     = zdot.S
+ZDOTKERNEL     = zdot.S
+else
+CDOTKERNEL = ../arm/zdot.c
+ZDOTKERNEL = ../arm/zdot.c
+endif
+DSDOTKERNEL    = dot.S
+# GEMM beta kernels, then SGEMM/STRMM kernel and copy-routine selection keyed on SGEMM_UNROLL_M and SGEMM_UNROLL_N.
+DGEMM_BETA     = dgemm_beta.S
+SGEMM_BETA     = sgemm_beta.S
+
+SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# DGEMM/DTRMM: inner copies use .S only when DGEMM_UNROLL_M is 8, outer copies only when DGEMM_UNROLL_N is 4; otherwise ../generic C.
+DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# CGEMM/CTRMM: the kernel is a .S file; every copy routine comes from the ../generic zgemm_ncopy/zgemm_tcopy C sources.
+CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# ZGEMM/ZTRMM: same layout as the CGEMM entries, keyed on ZGEMM_UNROLL_M and ZGEMM_UNROLL_N.
+ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)