From 07c334e7be2f30a07263f0f827cb92fd257704dc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:53 +0200 Subject: [PATCH] s390x: Factor out small block sizes for SGEMM/DGEMM on z14 For small register blockings that are too small to fill up vector registers with column vectors, we currently use a generic code block. Replace that with instantiations of the generic code as individual functions, so that the compiler can optimize each one separately. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 78 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index eae2e4d..741c094 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -265,12 +265,58 @@ VECTOR_BLOCK(4, 4) VECTOR_BLOCK(4, 2) VECTOR_BLOCK(4, 1) +/** + * Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations. + * Simple implementation for smaller block sizes + * + * @param[in] A Pointer current block of input matrix A. + * @param[in] k Number of columns in A. + * @param[in] B Pointer current block of input matrix B. + * @param[inout] C Pointer current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +#define SCALAR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \ + \ + /* \ + * Peel off first iteration (i.e., column of A) for \ + * initializing Caux \ + */ \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \ + \ + for (BLASLONG kk = 1; kk < k; kk++) \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + if (trmm) { \ + C[i + j * ldc] = alpha * Caux[i][j]; \ + } else { \ + C[i + j * ldc] += alpha * Caux[i][j]; \ + } \ + } + #ifdef DOUBLE VECTOR_BLOCK(2, 4) VECTOR_BLOCK(2, 2) VECTOR_BLOCK(2, 1) +#else +SCALAR_BLOCK(2, 4) +SCALAR_BLOCK(2, 2) +SCALAR_BLOCK(2, 1) #endif +SCALAR_BLOCK(1, 4) +SCALAR_BLOCK(1, 2) +SCALAR_BLOCK(1, 1) + /** * Calculate a row-block that fits 4x4 vector registers using a loop @@ -526,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, } } + /* Dispatch into the implementation for each block size: */ + #define BLOCK(bm, bn) \ if (m == bm && n == bn) { \ GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ @@ -541,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); - #ifdef DOUBLE - BLOCK(2, 4); - BLOCK(2, 2); - #endif - -#undef BLOCK + BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1); - /* simple implementation for smaller block sizes: */ - FLOAT Caux[m][n] __attribute__ ((aligned (16))); + BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1); - /* - * Peel off first iteration (i.e., column of A) for initializing Caux - */ - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - Caux[i][j] = A[i] * B[j]; - - for (BLASLONG kk = 1; kk < k; kk++) - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - Caux[i][j] += A[i + kk * m] * B[j + kk * n]; - - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - if (trmm) { - C[i + j * ldc] = alpha * Caux[i][j]; - } else { - C[i + j * ldc] += alpha * Caux[i][j]; - } +#undef BLOCK } /** -- 2.7.4