#else
#define SGEMM_DEFAULT_UNROLL_M 16
-#ifndef DYNAMIC_ARCH
#define DGEMM_DEFAULT_UNROLL_M 16
-#else
-#define DGEMM_DEFAULT_UNROLL_M 4
-#endif
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 4
-#ifndef DYNAMIC_ARCH
#define DGEMM_DEFAULT_UNROLL_N 2
-#else
-#define DGEMM_DEFAULT_UNROLL_N 8
-#endif
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
+#if defined(HAVE_GAS) && (HAVE_GAS == 1)
+#define DGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_N 4
+#else
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
+#endif
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_N 8
-#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1
#define SGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_M 16
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_P sgemm_p
-#define DGEMM_DEFAULT_P dgemm_p
+#define DGEMM_DEFAULT_P 32
#define QGEMM_DEFAULT_P qgemm_p
#define CGEMM_DEFAULT_P cgemm_p
#define ZGEMM_DEFAULT_P zgemm_p
#define XGEMM_DEFAULT_P xgemm_p
#define SGEMM_DEFAULT_R sgemm_r
-#define DGEMM_DEFAULT_R dgemm_r
+#define DGEMM_DEFAULT_R 858
#define QGEMM_DEFAULT_R qgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r
#define SGEMM_DEFAULT_Q 128
-#define DGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 152
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 128
#define SYMV_P 16
-#if defined(CORTEXA57) || \
+#if defined(CORTEXA57) || defined(CORTEXX1) || \
defined(CORTEXA72) || defined(CORTEXA73) || \
- defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)
+ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000)
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
/*FIXME: this should be using the cache size, but there is currently no easy way to
query that on ARM. So if getarch counted more than 8 cores we simply assume the host
is a big desktop or server with abundant cache rather than a phone or embedded device */
-#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)
+#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1)
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
-#elif defined(ARMV8SVE) || defined(A64FX)
+#elif defined(NEOVERSEV1)
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#elif defined(NEOVERSEN2)
+
+#undef SBGEMM_DEFAULT_UNROLL_M
+#undef SBGEMM_DEFAULT_UNROLL_N
+#define SBGEMM_DEFAULT_UNROLL_M 8
+#define SBGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
+
+/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
+Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 8
+/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
+ * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro.
+ * If SVE size is ever more than 1024, this should be increased also. */
+#define SGEMM_DEFAULT_UNROLL_MN 32
+
/* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 8
-#define CGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_MN 32
+
+#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_MN 16
-#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_M 4
-#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4