From 71b6eaf459e55e7b5fe5047052c39c49f16c3680 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand <mhillen@linux.ibm.com>
Date: Tue, 12 May 2020 14:40:30 +0200
Subject: [PATCH] s390x: Use new sgemm kernel also for strmm on Z14 and newer

Employ the newly added GEMM kernel also for STRMM on Z14. The
implementation in C with vector intrinsics exploits FP32 SIMD operations
and thereby gains performance over the existing assembly code. Extend
the implementation to handle triangular matrix multiplication
accordingly. As an added benefit, the more flexible C code enables us to
adjust register blocking in the subsequent commit.

Tested via make -C test / ctest / utest and by a couple of additional
unit tests that exercise blocking.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
---
 kernel/zarch/KERNEL.Z14 |   8 +---
 kernel/zarch/gemm_vec.c | 104 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 98 insertions(+), 14 deletions(-)

diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14
index bd3a966..49fa281 100644
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -86,7 +86,7 @@ DGEMVTKERNEL = dgemv_t_4.c
 CGEMVTKERNEL = cgemv_t_4.c
 ZGEMVTKERNEL = zgemv_t_4.c
 
-STRMMKERNEL	= strmm8x4V.S
+STRMMKERNEL	= gemm_vec.c
 DTRMMKERNEL	= trmm8x4V.S
 CTRMMKERNEL	= ctrmm4x4V.S
 ZTRMMKERNEL	= ztrmm4x4V.S
@@ -101,8 +101,6 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-
-
 DGEMMKERNEL    =  gemm8x4V.S
 DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
@@ -145,7 +143,3 @@ ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
-
-
-
-
diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index e6d613c..a9531c7 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -51,6 +51,29 @@
 static const size_t unroll_m = UNROLL_M;
 static const size_t unroll_n = UNROLL_N;
 
+/* Handling of triangular matrices */
+#ifdef TRMMKERNEL
+static const bool trmm = true;
+static const bool left =
+#ifdef LEFT
+	true;
+#else
+	false;
+#endif
+
+static const bool backwards =
+#if defined(LEFT) != defined(TRANSA)
+	true;
+#else
+	false;
+#endif
+
+#else
+static const bool trmm = false;
+static const bool left = false;
+static const bool backwards = false;
+#endif /* TRMMKERNEL */
+
 /*
  * Background:
  *
@@ -111,6 +134,17 @@ static const size_t unroll_n = UNROLL_N;
  *      vectorization for varying block sizes)
  *    - add alpha * row block of C_aux back into C_j.
  *
+ * Note that there are additional mechanics for handling triangular matrices,
+ * calculating B := alpha (A * B) where either of the matrices A or B can be
+ * triangular. In case of A, the macro "LEFT" is defined. In addition, A can
+ * optionally be transposed.
+ * The code effectively skips an "offset" number of columns in A and rows of B
+ * in each block, to save unnecessary work by exploiting the triangular nature.
+ * To handle all cases, the code discerns (1) a "left" mode when A is triangular
+ * and (2) "forward" / "backwards" modes: in "forward" mode only the leading
+ * columns/rows of A/B (running offset plus the block size) are used, whereas
+ * in "backwards" mode the first "offset" columns/rows are skipped.
+ *
  * Reference:
  *
  * The summary above is based on staring at various kernel implementations and:
@@ -176,7 +210,11 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 				vector_float *C_ij =                          \
 				    (vector_float *)(C + i * VLEN_FLOATS +    \
 						     j * ldc);                \
-				*C_ij += alpha * Caux[i][j];                  \
+				if (trmm) {                                   \
+					*C_ij = alpha * Caux[i][j];           \
+				} else {                                      \
+					*C_ij += alpha * Caux[i][j];          \
+				}                                             \
 			}                                                     \
 		}                                                             \
 	}
@@ -209,17 +247,37 @@ VECTOR_BLOCK(2, 2)
  * @param[inout] C	Pointer to current column block (panel) of output matrix C.
  * @param[in]	ldc	Offset between elements in adjacent columns in C.
  * @param[in]	alpha	Scalar factor.
+ * @param[in]	offset  Number of columns of A and rows of B to skip (for triangular matrices).
+ * @param[in]	off	Running offset for handling triangular matrices.
  */
 static inline void GEBP_block(BLASLONG m, BLASLONG n,
 		       BLASLONG first_row,
 		       const FLOAT * restrict A, BLASLONG k,
 		       const FLOAT * restrict B,
 		       FLOAT *restrict C, BLASLONG ldc,
-		       FLOAT alpha)
+		       FLOAT alpha,
+		       BLASLONG offset, BLASLONG off)
 {
+	if (trmm && left)
+		off = offset + first_row;
+
 	A += first_row * k;
 	C += first_row;
 
+	if (trmm) {
+		if (backwards) {
+			A += off * m;
+			B += off * n;
+			k -= off;
+		} else {
+			if (left) {
+				k = off + m;
+			} else {
+				k = off + n;
+			}
+		}
+	}
+
 #define BLOCK(bm, bn)                                           \
 	if (m == bm && n == bn) {                               \
 		GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
@@ -253,7 +311,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
 
 	for (BLASLONG i = 0; i < m; i++)
 		for (BLASLONG j = 0; j < n; j++)
-			C[i + j * ldc] += alpha * Caux[i][j];
+			if (trmm) {
+				C[i + j * ldc] = alpha * Caux[i][j];
+			} else {
+				C[i + j * ldc] += alpha * Caux[i][j];
+			}
 }
 
 /**
@@ -268,12 +330,15 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
  * @param[inout] C	Pointer to output matrix C (note: all of it).
  * @param[in]	ldc	Offset between elements in adjacent columns in C.
  * @param[in]	alpha	Scalar factor.
+ * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
  */
 static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
 			const FLOAT *restrict A, BLASLONG bk,
 			const FLOAT *restrict B, BLASLONG bm,
 			FLOAT *restrict C, BLASLONG ldc,
-			FLOAT alpha) {
+			FLOAT alpha,
+			BLASLONG const offset) {
+
 	FLOAT *restrict C_i = C + first_col * ldc;
 	/*
 	 * B is in column-order with n_r packed row elements, which does
@@ -282,6 +347,15 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
 	 */
 	const FLOAT *restrict B_i = B + first_col * bk;
 
+	BLASLONG off = 0;
+	if (trmm) {
+		if (left) {
+			off = offset;
+		} else {
+			off = -offset + first_col;
+		}
+	}
+
 	/*
 	 * Calculate C_aux := A * B_j
 	 * then unpack C_i += alpha * C_aux.
@@ -293,7 +367,7 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
 	for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
 		for (; bm - row >= block_size; row += block_size)
 			GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
-				   ldc, alpha);
+				   ldc, alpha, offset, off);
 }
 
 /**
@@ -301,6 +375,9 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
  * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
  * C are pointers to submatrices of the actual matrices.
  *
+ * For triangular matrix multiplication, calculate B := alpha (A * B) where A
+ * or B can be triangular (in case of A, the macro LEFT will be defined).
+ *
  * @param[in]	bm	Number of rows in C and A.
  * @param[in]	bn	Number of columns in C and B.
  * @param[in]	bk	Number of columns in A and rows in B.
@@ -309,11 +386,16 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
  * @param[in]	bb	Pointer to input matrix B.
  * @param[inout] C	Pointer to output matrix C.
  * @param[in]	ldc	Offset between elements in adjacent columns in C.
+ * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
  * @returns 0 on success.
  */
 int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
 	  FLOAT *restrict ba, FLOAT *restrict bb,
-	  FLOAT *restrict C, BLASLONG ldc)
+	  FLOAT *restrict C, BLASLONG ldc
+#ifdef TRMMKERNEL
+	  , BLASLONG offset
+#endif
+	  )
 {
 	if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
 		return 0;
@@ -327,6 +409,14 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
 	bb = __builtin_assume_aligned(bb, 16);
 
 	/*
+	 * Define offset as zero when compiled as SGEMMKERNEL, so that offset and
+	 * off are passed unconditionally and helper signatures stay uniform.
+	 */
+#ifndef TRMMKERNEL
+	BLASLONG const offset = 0;
+#endif
+
+	/*
 	 * Partition B and C into blocks of n_r (unroll_n) columns, called B_i
 	 * and C_i. For each partition, calculate C_i += alpha * (A * B_j).
 	 *
@@ -336,7 +426,7 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
 	BLASLONG col = 0;
 	for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
 		for (; bn - col >= block_size; col += block_size)
-			GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha);
+			GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset);
 
    return 0;
 }
-- 
2.7.4