From e115c97e05889fc2e8edf041cdfd92d00d63a884 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Tue, 11 Aug 2020 12:55:59 +0200
Subject: [PATCH] s390x/SGEMM: adjust default P and Q to multiples of M

We recently changed the register blocking for SGEMM on s390x to 16x4.
However, we did not adjust Q to a multiple of 16 and thus fell back to
the 8x4 kernel at each block's margin, without need.

Adjust P and Q to multiples of 16 to employ the faster 16x4 kernel for
complete full-sized blocks.

Signed-off-by: Marius Hillenbrand
---
 param.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/param.h b/param.h
index 476f237..3e539a2 100644
--- a/param.h
+++ b/param.h
@@ -3092,12 +3092,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define ZGEMM_DEFAULT_UNROLL_M 4
 #define ZGEMM_DEFAULT_UNROLL_N 4
 
-#define SGEMM_DEFAULT_P 456
+#define SGEMM_DEFAULT_P 480
 #define DGEMM_DEFAULT_P 320
 #define CGEMM_DEFAULT_P 480
 #define ZGEMM_DEFAULT_P 224
 
-#define SGEMM_DEFAULT_Q 488
+#define SGEMM_DEFAULT_Q 512
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 128
 #define ZGEMM_DEFAULT_Q 352
-- 
2.7.4