#define CPU_EMAG8180 10
// Apple
#define CPU_VORTEX 13
+// Fujitsu
+#define CPU_A64FX 15
static char *cpuname[] = {
"UNKNOWN",
"NEOVERSEN1",
"THUNDERX3T110",
"VORTEX",
- "CORTEXA55"
+ "CORTEXA55",
+ "A64FX"
};
static char *cpuname_lower[] = {
"neoversen1",
"thunderx3t110",
"vortex",
- "cortexa55"
+ "cortexa55",
+ "a64fx"
};
int get_feature(char *search)
// Ampere
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
return CPU_EMAG8180;
+ // Fujitsu
+ else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
+ return CPU_A64FX;
}
p = (char *) NULL ;
switch (d)
{
- case CPU_CORTEXA53:
- case CPU_CORTEXA55:
- printf("#define %s\n", cpuname[d]);
- // Fall-through
- case CPU_ARMV8:
- // Minimum parameters for ARMv8 (based on A53)
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 64\n");
- printf("#define L2_SIZE 262144\n");
- printf("#define L2_LINESIZE 64\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- printf("#define L2_ASSOCIATIVE 4\n");
+ case CPU_CORTEXA53:
+ case CPU_CORTEXA55:
+ printf("#define %s\n", cpuname[d]);
+ // Fall-through
+ case CPU_ARMV8:
+ // Minimum parameters for ARMv8 (based on A53)
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L2_SIZE 262144\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 4\n");
break;
- case CPU_CORTEXA57:
- case CPU_CORTEXA72:
- case CPU_CORTEXA73:
+ case CPU_CORTEXA57:
+ case CPU_CORTEXA72:
+ case CPU_CORTEXA73:
// Common minimum settings for these Arm cores
// Can change a lot, but we need to be conservative
// TODO: detect info from /sys if possible
- printf("#define %s\n", cpuname[d]);
- printf("#define L1_CODE_SIZE 49152\n");
- printf("#define L1_CODE_LINESIZE 64\n");
- printf("#define L1_CODE_ASSOCIATIVE 3\n");
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 64\n");
- printf("#define L1_DATA_ASSOCIATIVE 2\n");
- printf("#define L2_SIZE 524288\n");
- printf("#define L2_LINESIZE 64\n");
- printf("#define L2_ASSOCIATIVE 16\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- break;
- case CPU_NEOVERSEN1:
- printf("#define %s\n", cpuname[d]);
- printf("#define L1_CODE_SIZE 65536\n");
- printf("#define L1_CODE_LINESIZE 64\n");
- printf("#define L1_CODE_ASSOCIATIVE 4\n");
- printf("#define L1_DATA_SIZE 65536\n");
- printf("#define L1_DATA_LINESIZE 64\n");
- printf("#define L1_DATA_ASSOCIATIVE 4\n");
- printf("#define L2_SIZE 1048576\n");
- printf("#define L2_LINESIZE 64\n");
- printf("#define L2_ASSOCIATIVE 16\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- break;
-
- case CPU_FALKOR:
- printf("#define FALKOR\n");
- printf("#define L1_CODE_SIZE 65536\n");
- printf("#define L1_CODE_LINESIZE 64\n");
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 128\n");
- printf("#define L2_SIZE 524288\n");
- printf("#define L2_LINESIZE 64\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- printf("#define L2_ASSOCIATIVE 16\n");
- break;
-
- case CPU_THUNDERX:
- printf("#define THUNDERX\n");
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 128\n");
- printf("#define L2_SIZE 16777216\n");
- printf("#define L2_LINESIZE 128\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- printf("#define L2_ASSOCIATIVE 16\n");
- break;
-
- case CPU_THUNDERX2T99:
- printf("#define THUNDERX2T99 \n");
- printf("#define L1_CODE_SIZE 32768 \n");
- printf("#define L1_CODE_LINESIZE 64 \n");
- printf("#define L1_CODE_ASSOCIATIVE 8 \n");
- printf("#define L1_DATA_SIZE 32768 \n");
- printf("#define L1_DATA_LINESIZE 64 \n");
- printf("#define L1_DATA_ASSOCIATIVE 8 \n");
- printf("#define L2_SIZE 262144 \n");
- printf("#define L2_LINESIZE 64 \n");
- printf("#define L2_ASSOCIATIVE 8 \n");
- printf("#define L3_SIZE 33554432 \n");
- printf("#define L3_LINESIZE 64 \n");
- printf("#define L3_ASSOCIATIVE 32 \n");
- printf("#define DTB_DEFAULT_ENTRIES 64 \n");
- printf("#define DTB_SIZE 4096 \n");
- break;
+ printf("#define %s\n", cpuname[d]);
+ printf("#define L1_CODE_SIZE 49152\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 3\n");
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 2\n");
+ printf("#define L2_SIZE 524288\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+ case CPU_NEOVERSEN1:
+ printf("#define %s\n", cpuname[d]);
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 4\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 4\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+
+ case CPU_FALKOR:
+ printf("#define FALKOR\n");
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 128\n");
+ printf("#define L2_SIZE 524288\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ break;
+
+ case CPU_THUNDERX:
+ printf("#define THUNDERX\n");
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 128\n");
+ printf("#define L2_SIZE 16777216\n");
+ printf("#define L2_LINESIZE 128\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ break;
+
+ case CPU_THUNDERX2T99:
+ printf("#define THUNDERX2T99 \n");
+ printf("#define L1_CODE_SIZE 32768 \n");
+ printf("#define L1_CODE_LINESIZE 64 \n");
+ printf("#define L1_CODE_ASSOCIATIVE 8 \n");
+ printf("#define L1_DATA_SIZE 32768 \n");
+ printf("#define L1_DATA_LINESIZE 64 \n");
+ printf("#define L1_DATA_ASSOCIATIVE 8 \n");
+ printf("#define L2_SIZE 262144 \n");
+ printf("#define L2_LINESIZE 64 \n");
+ printf("#define L2_ASSOCIATIVE 8 \n");
+ printf("#define L3_SIZE 33554432 \n");
+ printf("#define L3_LINESIZE 64 \n");
+ printf("#define L3_ASSOCIATIVE 32 \n");
+ printf("#define DTB_DEFAULT_ENTRIES 64 \n");
+ printf("#define DTB_SIZE 4096 \n");
+ break;
- case CPU_TSV110:
- printf("#define TSV110 \n");
- printf("#define L1_CODE_SIZE 65536 \n");
- printf("#define L1_CODE_LINESIZE 64 \n");
- printf("#define L1_CODE_ASSOCIATIVE 4 \n");
- printf("#define L1_DATA_SIZE 65536 \n");
- printf("#define L1_DATA_LINESIZE 64 \n");
- printf("#define L1_DATA_ASSOCIATIVE 4 \n");
- printf("#define L2_SIZE 524228 \n");
- printf("#define L2_LINESIZE 64 \n");
- printf("#define L2_ASSOCIATIVE 8 \n");
- printf("#define DTB_DEFAULT_ENTRIES 64 \n");
- printf("#define DTB_SIZE 4096 \n");
- break;
-
- case CPU_EMAG8180:
- // Minimum parameters for ARMv8 (based on A53)
- printf("#define EMAG8180\n");
- printf("#define L1_CODE_SIZE 32768\n");
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 64\n");
- printf("#define L2_SIZE 262144\n");
- printf("#define L2_LINESIZE 64\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- break;
-
- case CPU_THUNDERX3T110:
- printf("#define THUNDERX3T110 \n");
- printf("#define L1_CODE_SIZE 65536 \n");
- printf("#define L1_CODE_LINESIZE 64 \n");
- printf("#define L1_CODE_ASSOCIATIVE 8 \n");
- printf("#define L1_DATA_SIZE 32768 \n");
- printf("#define L1_DATA_LINESIZE 64 \n");
- printf("#define L1_DATA_ASSOCIATIVE 8 \n");
- printf("#define L2_SIZE 524288 \n");
- printf("#define L2_LINESIZE 64 \n");
- printf("#define L2_ASSOCIATIVE 8 \n");
- printf("#define L3_SIZE 94371840 \n");
- printf("#define L3_LINESIZE 64 \n");
- printf("#define L3_ASSOCIATIVE 32 \n");
- printf("#define DTB_DEFAULT_ENTRIES 64 \n");
- printf("#define DTB_SIZE 4096 \n");
- break;
+ case CPU_TSV110:
+ // Emit the cache/TLB configuration macros for the TSV110 core.
+ printf("#define TSV110 \n");
+ printf("#define L1_CODE_SIZE 65536 \n");
+ printf("#define L1_CODE_LINESIZE 64 \n");
+ printf("#define L1_CODE_ASSOCIATIVE 4 \n");
+ printf("#define L1_DATA_SIZE 65536 \n");
+ printf("#define L1_DATA_LINESIZE 64 \n");
+ printf("#define L1_DATA_ASSOCIATIVE 4 \n");
+ // 512 KiB = 524288 bytes. The earlier value 524228 was a digit typo and
+ // is not even a multiple of the 64-byte L2 line size emitted below.
+ printf("#define L2_SIZE 524288 \n");
+ printf("#define L2_LINESIZE 64 \n");
+ printf("#define L2_ASSOCIATIVE 8 \n");
+ printf("#define DTB_DEFAULT_ENTRIES 64 \n");
+ printf("#define DTB_SIZE 4096 \n");
+ break;
+
+ case CPU_EMAG8180:
+ // Minimum parameters for ARMv8 (based on A53)
+ printf("#define EMAG8180\n");
+ printf("#define L1_CODE_SIZE 32768\n");
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L2_SIZE 262144\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+
+ case CPU_THUNDERX3T110:
+ printf("#define THUNDERX3T110 \n");
+ printf("#define L1_CODE_SIZE 65536 \n");
+ printf("#define L1_CODE_LINESIZE 64 \n");
+ printf("#define L1_CODE_ASSOCIATIVE 8 \n");
+ printf("#define L1_DATA_SIZE 32768 \n");
+ printf("#define L1_DATA_LINESIZE 64 \n");
+ printf("#define L1_DATA_ASSOCIATIVE 8 \n");
+ printf("#define L2_SIZE 524288 \n");
+ printf("#define L2_LINESIZE 64 \n");
+ printf("#define L2_ASSOCIATIVE 8 \n");
+ printf("#define L3_SIZE 94371840 \n");
+ printf("#define L3_LINESIZE 64 \n");
+ printf("#define L3_ASSOCIATIVE 32 \n");
+ printf("#define DTB_DEFAULT_ENTRIES 64 \n");
+ printf("#define DTB_SIZE 4096 \n");
+ break;
#ifdef __APPLE__
- case CPU_VORTEX:
- printf("#define VORTEX \n");
- sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
- printf("#define L1_CODE_SIZE %lld \n",value64);
- sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
- printf("#define L1_CODE_LINESIZE %lld \n",value64);
- sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
- printf("#define L1_DATA_SIZE %lld \n",value64);
- sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
- printf("#define L2_SIZE %lld \n",value64);
- printf("#define DTB_DEFAULT_ENTRIES 64 \n");
- printf("#define DTB_SIZE 4096 \n");
- break;
+ case CPU_VORTEX:
+ printf("#define VORTEX \n");
+ sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
+ printf("#define L1_CODE_SIZE %lld \n",value64);
+ sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
+ printf("#define L1_CODE_LINESIZE %lld \n",value64);
+ sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
+ printf("#define L1_DATA_SIZE %lld \n",value64);
+ sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
+ printf("#define L2_SIZE %lld \n",value64);
+ printf("#define DTB_DEFAULT_ENTRIES 64 \n");
+ printf("#define DTB_SIZE 4096 \n");
+ break;
#endif
+ case CPU_A64FX:
+ // Emit the cache/TLB configuration macros for the Fujitsu A64FX core.
+ // L1 sizes are 64 KiB = 65536 bytes; 65535 would be an off-by-one that is
+ // not a whole number of the 256-byte cache lines declared below.
+ printf("#define A64FX\n");
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 256\n");
+ printf("#define L2_SIZE 8388608\n");
+ printf("#define L2_LINESIZE 256\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
}
get_cpucount();
}
--- /dev/null
+SAMINKERNEL = ../arm/amin.c
+DAMINKERNEL = ../arm/amin.c
+CAMINKERNEL = ../arm/zamin.c
+ZAMINKERNEL = ../arm/zamin.c
+
+SMAXKERNEL = ../arm/max.c
+DMAXKERNEL = ../arm/max.c
+
+SMINKERNEL = ../arm/min.c
+DMINKERNEL = ../arm/min.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL = ../arm/imax.c
+IDMAXKERNEL = ../arm/imax.c
+
+ISMINKERNEL = ../arm/imin.c
+IDMINKERNEL = ../arm/imin.c
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+SAMAXKERNEL = amax.S
+DAMAXKERNEL = amax.S
+CAMAXKERNEL = zamax.S
+ZAMAXKERNEL = zamax.S
+
+SAXPYKERNEL = axpy.S
+DAXPYKERNEL = axpy.S
+CAXPYKERNEL = zaxpy.S
+ZAXPYKERNEL = zaxpy.S
+
+SROTKERNEL = rot.S
+DROTKERNEL = rot.S
+CROTKERNEL = zrot.S
+ZROTKERNEL = zrot.S
+
+SSCALKERNEL = scal.S
+DSCALKERNEL = scal.S
+CSCALKERNEL = zscal.S
+ZSCALKERNEL = zscal.S
+
+SGEMVNKERNEL = gemv_n.S
+DGEMVNKERNEL = gemv_n.S
+CGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n.S
+
+SGEMVTKERNEL = gemv_t.S
+DGEMVTKERNEL = gemv_t.S
+CGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t.S
+
+
+SASUMKERNEL = asum.S
+DASUMKERNEL = asum.S
+CASUMKERNEL = casum.S
+ZASUMKERNEL = zasum.S
+
+SCOPYKERNEL = copy.S
+DCOPYKERNEL = copy.S
+CCOPYKERNEL = copy.S
+ZCOPYKERNEL = copy.S
+
+SSWAPKERNEL = swap.S
+DSWAPKERNEL = swap.S
+CSWAPKERNEL = swap.S
+ZSWAPKERNEL = swap.S
+
+ISAMAXKERNEL = iamax.S
+IDAMAXKERNEL = iamax.S
+ICAMAXKERNEL = izamax.S
+IZAMAXKERNEL = izamax.S
+
+SNRM2KERNEL = nrm2.S
+DNRM2KERNEL = nrm2.S
+CNRM2KERNEL = znrm2.S
+ZNRM2KERNEL = znrm2.S
+
+DDOTKERNEL = dot.S
+ifneq ($(C_COMPILER), PGI)
+SDOTKERNEL = ../generic/dot.c
+else
+SDOTKERNEL = dot.S
+endif
+ifneq ($(C_COMPILER), PGI)
+CDOTKERNEL = zdot.S
+ZDOTKERNEL = zdot.S
+else
+CDOTKERNEL = ../arm/zdot.c
+ZDOTKERNEL = ../arm/zdot.c
+endif
+DSDOTKERNEL = dot.S
+
+DGEMM_BETA = dgemm_beta.S
+SGEMM_BETA = sgemm_beta.S
+
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)