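- From a memory-layout perspective, when K = 1 the transpose flags have no effect on the GEMM computation: an M x 1 matrix A and a 1 x N matrix B occupy the same contiguous memory whether or not they are transposed.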
- Remove the separate K1 noTrans / transA / transB / transAB functions and unify them into a single function.
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: skykongkong8 <ss.kong@samsung.com>
#include <hgemm_padding.h>
#include <hgemm_transA.h>
#include <hgemm_transAB.h>
+#include <limits>
#include <hgemm_transB.h>
#include <hgemm_util.h>
bool TransA, bool TransB) {
unsigned int lda = (TransA) ? M : K;
unsigned int ldb = (TransB) ? K : N;
-
- return hgemm_K1_noTrans(M, N, K, A, lda, B, ldb, C, N, alpha, beta);
-
- if (!TransA && TransB) {
- hgemm_K1_transB(M, N, K, A, lda, B, ldb, C, N, alpha, beta);
- } else if (TransA && !TransB) {
- hgemm_K1_transA(M, N, K, A, lda, B, ldb, C, N, alpha, beta);
- } else if (!TransA && !TransB) {
- hgemm_K1_noTrans(M, N, K, A, lda, B, ldb, C, N, alpha, beta);
- } else { // TransA && TransB
- hgemm_K1_transAB(M, N, K, A, lda, B, ldb, C, N, alpha, beta);
+ unsigned int ldc = N;
+
+ const float eps = std::numeric_limits<float>::epsilon();
+ float16x8_t a_vec;
+ unsigned int N8 = (N >> 3) << 3;
+ for (unsigned int m = 0; m < M; ++m) {
+ a_vec = vmovq_n_f16(alpha * A[m]);
+ if (std::fpclassify(beta) != FP_ZERO) {
+ for (unsigned int n = 0; n < N8; n += 8) {
+ vst1q_f16(&C[m * ldc + n],
+ vaddq_f16(vmulq_f16(a_vec, vld1q_f16(&B[n])),
+ vmulq_n_f16(vld1q_f16(&C[m * ldc + n]), beta)));
+ }
+ } else {
+ for (unsigned int n = 0; n < N8; n += 8) {
+ vst1q_f16(&C[m * ldc + n], vmulq_f16(a_vec, vld1q_f16(&B[n])));
+ }
+ }
+ for (unsigned int n = N8; n < N; ++n) {
+ C[m * ldc + n] = alpha * A[m] * B[n] + beta * C[m * ldc + n];
+ }
}
}
}
}
}
-
-void hgemm_K1_noTrans(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc,
- float alpha, float beta) {
- const float eps = std::numeric_limits<float>::epsilon();
- float16x8_t a_vec;
- unsigned int N8 = (N >> 3) << 3;
- for (unsigned int m = 0; m < M; ++m) {
- a_vec = vmovq_n_f16(alpha * A[m]);
- if (std::fpclassify(beta) != FP_ZERO) {
- for (unsigned int n = 0; n < N8; n += 8) {
- vst1q_f16(&C[m * ldc + n],
- vaddq_f16(vmulq_f16(a_vec, vld1q_f16(&B[n])),
- vmulq_n_f16(vld1q_f16(&C[m * ldc + n]), beta)));
- }
- } else {
- for (unsigned int n = 0; n < N8; n += 8) {
- vst1q_f16(&C[m * ldc + n], vmulq_f16(a_vec, vld1q_f16(&B[n])));
- }
- }
- for (unsigned int n = N8; n < N; ++n) {
- C[m * ldc + n] = alpha * A[m] * B[n] + beta * C[m * ldc + n];
- }
- }
-}
unsigned int ldb, float *C, unsigned int ldc,
float alpha = 1.F, float beta = 0.F);
-/**
- * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param A input matrix A
- * @param lda length of the col of matrix A
- * @param B input matrix B
- * @param ldb length of the col of matrix B
- * @param C output matrix C
- * @param ldc length of the col of matrix C
- * @param[in] alpha float number
- * @param[in] beta float number
- */
-void hgemm_K1_noTrans(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc,
- float alpha = 1.F, float beta = 0.F);
/**
* @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
* @param M length of the row of matrix A
free(A_T);
}
-
-void hgemm_K1_transA(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc, float alpha,
- float beta) {
- __fp16 *A_T = alignedMalloc(M * K);
-
- transpose_neon<__fp16>(K, M, A, M, A_T, K);
-
- hgemm_K1_noTrans(M, N, K, A_T, lda, B, ldb, C, ldc, alpha, beta);
-
- free(A_T);
-}
*/
void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta);
-/**
- * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param A input matrix A
- * @param lda length of the col of matrix A
- * @param B input matrix B
- * @param ldb length of the col of matrix B
- * @param C output matrix C
- * @param ldc length of the col of matrix C
- * @param[in] alpha float number
- * @param[in] beta float number
- */
-void hgemm_K1_transA(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc,
- float alpha = 1.F, float beta = 0.F);
+
free(A_T);
free(B_T);
}
-
-void hgemm_K1_transAB(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc,
- float alpha, float beta) {
- __fp16 *A_T = alignedMalloc(M * K);
- __fp16 *B_T = alignedMalloc(K * N);
-
- transpose_neon<__fp16>(K, M, A, M, A_T, K);
- transpose_neon<__fp16>(N, K, B, K, B_T, N);
-
- hgemm_K1_noTrans(M, N, K, A_T, lda, B_T, ldb, C, ldc, alpha, beta);
-
- free(A_T);
- free(B_T);
-}
*/
void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta);
-/**
- * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param A input matrix A
- * @param lda length of the col of matrix A
- * @param B input matrix B
- * @param ldb length of the col of matrix B
- * @param C output matrix C
- * @param ldc length of the col of matrix C
- * @param[in] alpha float number
- * @param[in] beta float number
- */
-void hgemm_K1_transAB(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc,
- float alpha = 1.F, float beta = 0.F);
\ No newline at end of file
free(B_T);
}
-
-void hgemm_K1_transB(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc, float alpha,
- float beta) {
- __fp16 *B_T = alignedMalloc(K * N);
-
- transpose_neon<__fp16>(N, K, B, K, B_T, N);
-
- hgemm_K1_noTrans(M, N, K, A, lda, B_T, ldb, C, ldc, alpha, beta);
-
- free(B_T);
-}
const __fp16 *A, unsigned int lda, const __fp16 *B,
unsigned int ldb, float *C, unsigned int ldc,
float alpha = 1.F, float beta = 0.F);
-/**
- * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param A input matrix A
- * @param lda length of the col of matrix A
- * @param B input matrix B
- * @param ldb length of the col of matrix B
- * @param C output matrix C
- * @param ldc length of the col of matrix C
- * @param[in] alpha float number
- * @param[in] beta float number
- */
-void hgemm_K1_transB(unsigned int M, unsigned int N, unsigned int K,
- const __fp16 *A, unsigned int lda, const __fp16 *B,
- unsigned int ldb, __fp16 *C, unsigned int ldc,
- float alpha = 1.F, float beta = 0.F);