const unsigned int ldc) {
#if (defined USE__FP16 && USE_NEON)
- nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA == CblasTrans,
- TransB == CblasTrans);
+ nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta,
+ TransA == CblasTrans, TransB == CblasTrans);
#else
float *A_ = new float[M * K];
float *B_ = new float[N * K];
#include <hgemm_padding.h>
#include <hgemm_transA.h>
#include <hgemm_transAB.h>
-#include <limits>
#include <hgemm_transB.h>
#include <hgemm_util.h>
+#include <limits>
void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta, bool TransA,
hgemm_ensure_divisibility(A, B, C32, M, N, K, alpha, beta, TransA, TransB);
- unsigned int L = M * N;
- unsigned int L8 = (L >> 3) << 3;
-
- for (unsigned int idx = 0; idx < L8; idx += 8) {
+ for (unsigned int idx = 0; idx < size8; idx += 8) {
float32x4_t x1 = vld1q_f32(&C32[idx]);
float32x4_t x2 = vld1q_f32(&C32[idx + 4]);
vst1q_f16(&C[idx], y1);
}
- for (unsigned int idx = L8; idx < L; ++idx) {
+ for (unsigned int idx = size8; idx < size; ++idx) {
C[idx] = static_cast<__fp16>(C32[idx]);
}
/// @note Padding standard : 8x16 is the only KERNEL that outperforms single
/// precision GEMM 'so far'. Padding will forcibly make every GEMM cases to
/// use it. Note that padding is not the optimal way here, but just an option
- /// that is easier to implement. Fine-grained packing should be supported on
- /// the future for optimal performance.
+ /// that is easier to implement. Fine-grained packing, blocking, and
+ /// corresponding kernels should be supported in the future for optimal
+ /// performance.
__fp16 *A_ = (__fp16 *)A, *B_ = (__fp16 *)B;
unsigned int M_ = M, N_ = N, K_ = K;
bool pad_A = false, pad_B = false;
- // Case 2 : smaller than 8, 16 | padding would be redundant?
+ // Case 2 : smaller than 8, 16 | padding would be redundant
if (M < 8 && K < 16 && N < 16)
return hgemm_classify(A_, B_, C32, M_, N_, K_, alpha, beta, TransA, TransB);
bool TransB);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * @brief Checking function for whether matrix A or B needs padding for
+ * optimal performance of fixed blocking-kernel sequence
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
bool TransA = false, bool TransB = false);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * @brief Classifying function for GEMM computation case for noTrans,
+ * transA, transB, transAB
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
* @param[in] C __fp16 * for Matrix C
float alpha = 1.F, float beta = 0.F, bool TransA = false,
bool TransB = false);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm computation when K = 1. Transpose is mathematically of no use
+ * here, and partial accumulation is also not needed.
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
* @param[in] C __fp16 * for Matrix C
*
*/
-#define A(i, j) a[(i)*lda + (j)]
-#define B(i, j) b[(i)*ldb + (j)]
-#define C(i, j) c[(i)*ldc + (j)]
-
#define N_BLOCKING (768)
#define K_BLOCKING (256)
#define M_BLOCKING (4096)
*
*/
-#include <stdlib.h>
#include <arm_neon.h>
#include <assert.h>
#include <hgemm_kernel.h>
+#include <stdlib.h>
-/**
- * @brief hgemm 1x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 1x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
a++; \
} while (0)
-/**
- * @brief hgemm 1x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 1x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); \
} while (0)
-/**
- * @brief hgemm 4x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 4x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
vcvt_f32_f16(vget_high_f16(v9)))); \
} while (0)
-/**
- * @brief hgemm 4x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 4x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
#include <arm_neon.h>
#include <assert.h>
-#include <iostream>
#include <hgemm_kernel.h>
#include <stdlib.h>
vcvt_f32_f16(vget_high_f16(v120_127)))); \
} while (0)
-/**
- * @brief hgemm 8x16 kernel sc = sa * sb
- *
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 8x16 kernel sc = sa * sb
- *
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
float16x8_t v80_87, v88_95;
float16x8_t v96_103, v104_111;
float16x8_t v112_119, v120_127;
- float16x8_t vb1, vb2;
float16x8_t va0;
+ float16x8_t vb1, vb2;
l = 0;
for (; l < K16;) {
INIT_KERNEL_8X16();
vcvt_f32_f16(vget_high_f16(v31)))); \
} while (0)
-/**
- * @brief hgemm 8x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 8x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
float alpha = 1.F, float beta = 0.F);
/**
- * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * @brief hgemm fallback with NEON : Y = alpha*op(A)*op(B) + beta*C,
* @param M length of the row of matrix A
* @param N length of the col of matrix B
* @param K length of the col of matrix A
/**
* @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * where M, N, K are divisible by at least 4
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
* @param[in] C __fp16 * for Matrix C
/**
* @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * where M, N, K are divisible by at least 4
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
/**
* Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
*
- * @file hgemm_kernel_pack.cpp
+ * @file hgemm_pack.cpp
* @date 02 July 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Sungsik Kong <ss.kong@samsung.com>
#include <hgemm_util.h>
#include <matrix_transpose_neon.h>
+/// @note Matrix packing strategy is quite similar in terms of normal-tangential
+/// coordinate's point of view. This hint might lead us to re-implement all
+/// packing functions into a single generic function!
+
void packing_A1(unsigned int m, unsigned int k, const __fp16 *from,
unsigned int lda, const __fp16 *to) {
void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src,
unsigned int ldb, const __fp16 *dst) {
- /// @note ldb = K for here
- assert(K != 0 && N != 0 && N % 16 == 0);
+ /// @note K8 will be intentionally computed for generic K
+ /// implementation in the future
+ assert(K != 0 && K % 8 == 0 && N != 0 && N % 16 == 0);
unsigned int K8 = (K >> 3) << 3;
const __fp16 *src_off = (__fp16 *)src;
__fp16 *tile_T = alignedMalloc(8 * ld_tile_T);
// 1. Do something like 8x16 transpose kernel
- // 2. Save linearized transposed output tile to dst
+ // 2. Linearize transposed output tile to dst
for (unsigned int n = 0; n < N; n += 16) {
const __fp16 *src_off1 = src_off;
__fp16 *dst_off1 = dst_off;
src_off += 16 * ldb;
- dst_off += (K8 * 16 + (K - K8)); // ?
+ dst_off += (K8 * 16 + (K - K8));
for (unsigned int k = 0; k < K8; k += 8) {
// 16x8 tile -> 8x16
transpose_neon<__fp16>(16, 8, src_off1, ldb, tile_T, ld_tile_T);
dst_off1 += 16 * 8;
src_off1 += 8;
}
-
- // Do the equivalent of one by one for the rest
- for (unsigned int k = K8; k < K; ++k) {
- for (unsigned int _n = 0; _n < 16; ++_n) {
- dst_off1[_n] = src_off1[k];
- }
- }
}
+
+ free(tile_T);
}
/**
* Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
*
- * @file hgemm_kernel_pack.h
+ * @file hgemm_pack.h
* @date 01 April 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Sungsik Kong <ss.kong@samsung.com>
* @author Sungsik Kong <ss.kong@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is a header file for including both padding matrix A and B
+ * @note Padding function for matrix A and B will be fused into a single
+ * function in this file in the future
*
*/
*/
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm computation with neon : Y = alpha*A_T*B + beta*C,
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
*/
void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta);
-
*/
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm computation with neon : Y = alpha*A_T*B_T + beta*C,
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
*/
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C,
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta);
+/**
+ * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C,
+ * @param[in] A __fp16 * for Matrix A
+ * @param[in] B __fp16 * for Matrix B
+ * @param[in] C float * for Matrix C
+ * @param[in] M number of op(A)'s and C's row
+ * @param[in] N number of op(B)'s and C's columns
+ * @param[in] K number of op(A)'s and columns and op(B)'s rows
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
void hgemm_transB_fallback(const __fp16 *A, const __fp16 *B, float *C,
unsigned int M, unsigned int N, unsigned int K,
float alpha, float beta);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm transB computation with kernel 8x16
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
'hgemm_util.h',
'hgemm_pack.h',
'hgemm_common.h',
- 'hgemm_padding.h',
]
subdir('hgemm_kernel')
hgemm_sources = [
'hgemm.cpp',
- 'hgemm_padding_a.cpp',
- 'hgemm_padding_b.cpp',
'hgemm_pack.cpp',
'hgemm_noTrans.cpp',
'hgemm_transA.cpp',