From: skykongkong8 Date: Wed, 10 Jul 2024 10:07:44 +0000 (+0900) Subject: [ trivial ] Fix typo and add missing doxygen tags X-Git-Tag: accepted/tizen/7.0/unified/20240830.164841~41 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5f1b41eb9bcf29f8bcf79cd3d39b5a097b799f32;p=platform%2Fcore%2Fml%2Fnntrainer.git [ trivial ] Fix typo and add missing doxygen tags - Fix typo and add missing doxygen tags - Add more exact explanation for doxygen tag briefs **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 08f31b34..d4d21d4f 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -326,8 +326,8 @@ static void sgemm_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int ldc) { #if (defined USE__FP16 && USE_NEON) - nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA == CblasTrans, - TransB == CblasTrans); + nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, + TransA == CblasTrans, TransB == CblasTrans); #else float *A_ = new float[M * K]; float *B_ = new float[N * K]; diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp index 48db72a0..e2d584a9 100644 --- a/nntrainer/tensor/hgemm/hgemm.cpp +++ b/nntrainer/tensor/hgemm/hgemm.cpp @@ -20,9 +20,9 @@ #include #include #include -#include #include #include +#include void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta, bool TransA, @@ -70,10 +70,7 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, hgemm_ensure_divisibility(A, B, C32, M, N, K, alpha, beta, TransA, TransB); - unsigned int L = M * N; - unsigned int L8 = (L >> 3) << 3; - - for (unsigned int idx = 0; idx < L8; idx += 8) { + for (unsigned int idx = 0; idx < size8; idx += 8) { float32x4_t x1 = vld1q_f32(&C32[idx]); float32x4_t x2 = vld1q_f32(&C32[idx + 4]); @@ -81,7 +78,7 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, vst1q_f16(&C[idx], y1); } - for (unsigned int idx = L8; idx < L; ++idx) { + for (unsigned int idx = size8; idx < size; ++idx) { C[idx] = static_cast<__fp16>(C32[idx]); } @@ -95,14 +92,15 @@ void hgemm_ensure_divisibility(const __fp16 *A, const __fp16 *B, float *C32, /// @note Padding standard : 8x16 is the only KERNEL that outperforms single /// precision GEMM 'so far'. Padding will forcibly make every GEMM cases to /// use it. Note that padding is not the optimal way here, but just an option - /// that is easier to implement. Fine-grained packing should be supported on - /// the future for optimal performance. + /// that is easier to implement. Fine-grained packing, blocking, and + /// corresponding kernels should be supported on the future for optimal + /// performance. __fp16 *A_ = (__fp16 *)A, *B_ = (__fp16 *)B; unsigned int M_ = M, N_ = N, K_ = K; bool pad_A = false, pad_B = false; - // Case 2 : smaller than 8, 16 | padding would be redundant? 
+ // Case 2 : smaller than 8, 16 | padding would be redundant if (M < 8 && K < 16 && N < 16) return hgemm_classify(A_, B_, C32, M_, N_, K_, alpha, beta, TransA, TransB); diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/hgemm/hgemm.h index a0c7b6f9..333b1b73 100644 --- a/nntrainer/tensor/hgemm/hgemm.h +++ b/nntrainer/tensor/hgemm/hgemm.h @@ -29,10 +29,11 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, bool TransB); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * @brief Checking function for whether matrix A or B needs padding for + * optimal performance of fixed blocking-kernel sequence * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -45,7 +46,8 @@ void hgemm_ensure_divisibility(const __fp16 *A, const __fp16 *B, float *C32, bool TransA = false, bool TransB = false); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * @brief Classifying function for GEMM computation case for noTrans, + * transA, transB, transAB * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B * @param[in] C __fp16 * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -60,8 +62,8 @@ void hgemm_classify(const __fp16 *A, const __fp16 *B, float *C32, float alpha = 1.F, float beta = 0.F, bool TransA = false, bool TransB = false); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm computation when K = 1. Transpose is mathematically of no + * use here, and partial accumulation is also not needed. 
* @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B * @param[in] C __fp16 * for Matrix C diff --git a/nntrainer/tensor/hgemm/hgemm_common.h b/nntrainer/tensor/hgemm/hgemm_common.h index a041e431..0330bd43 100644 --- a/nntrainer/tensor/hgemm/hgemm_common.h +++ b/nntrainer/tensor/hgemm/hgemm_common.h @@ -11,10 +11,6 @@ * */ -#define A(i, j) a[(i)*lda + (j)] -#define B(i, j) b[(i)*ldb + (j)] -#define C(i, j) c[(i)*ldc + (j)] - #define N_BLOCKING (768) #define K_BLOCKING (256) #define M_BLOCKING (4096) diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp index 2c301e59..7ee7ea8d 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp @@ -11,22 +11,11 @@ * */ -#include #include #include #include +#include -/** - * @brief hgemm 1x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -81,17 +70,6 @@ void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 1x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp index 35927e55..87939aef 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp @@ -83,17 +83,6 @@ a++; \ } while (0) -/** - * @brief hgemm 1x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -130,17 +119,6 @@ void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 1x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp 
b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp index 40ab4eae..21d81147 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp @@ -214,17 +214,6 @@ vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); \ } while (0) -/** - * @brief hgemm 4x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -303,17 +292,6 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 4x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp index 3cebee45..cf607647 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp @@ -258,17 +258,6 @@ vcvt_f32_f16(vget_high_f16(v9)))); \ } while (0) -/** - * @brief hgemm 4x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -306,17 +295,6 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 4x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp index f8d6b56c..6c8e8ee4 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp @@ -13,7 +13,6 @@ #include #include -#include #include #include @@ -726,17 +725,6 @@ vcvt_f32_f16(vget_high_f16(v120_127)))); \ } while (0) -/** - * @brief hgemm 8x16 kernel sc = sa * sb - * - * @param M length of the row of matrix A - * @param N length of the col of matrix B - * @param K length of the col of matrix A - * @param sa 
sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -795,17 +783,6 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 8x16 kernel sc = sa * sb - * - * @param M length of the row of matrix A - * @param N length of the col of matrix B - * @param K length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -829,8 +806,8 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, float16x8_t v80_87, v88_95; float16x8_t v96_103, v104_111; float16x8_t v112_119, v120_127; - float16x8_t vb1, vb2; float16x8_t va0; + float16x8_t vb1, vb2; l = 0; for (; l < K16;) { INIT_KERNEL_8X16(); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp index f799e527..9a46b3a9 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp @@ -400,17 +400,6 @@ vcvt_f32_f16(vget_high_f16(v31)))); \ } while (0) -/** - * @brief hgemm 8x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -449,17 +438,6 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 8x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_noTrans.h b/nntrainer/tensor/hgemm/hgemm_noTrans.h index 06b1f102..69ef912b 100644 --- a/nntrainer/tensor/hgemm/hgemm_noTrans.h +++ b/nntrainer/tensor/hgemm/hgemm_noTrans.h @@ -252,7 +252,7 @@ void hgemm_noTrans_8x16(unsigned int M, unsigned int N, unsigned int K, float alpha = 1.F, float beta = 0.F); /** - * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C, + * @brief hgemm fallback with NEON : Y = alpha*op(A)*op(B) + beta*C, * @param M length of the row of matrix A * @param N length of the col of matrix B * @param K length of the col of matrix A @@ -287,6 +287,7 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, /** * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * where M, N, K are divisible by at least 4 
* @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B * @param[in] C __fp16 * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -302,9 +303,10 @@ void hgemm_noTrans_strict(const __fp16 *A, const __fp16 *B, __fp16 *C, /** * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * where M, N, K are divisible by at least 4 * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows diff --git a/nntrainer/tensor/hgemm/hgemm_pack.cpp b/nntrainer/tensor/hgemm/hgemm_pack.cpp index 63d1b702..c19fde6e 100644 --- a/nntrainer/tensor/hgemm/hgemm_pack.cpp +++ b/nntrainer/tensor/hgemm/hgemm_pack.cpp @@ -2,7 +2,7 @@ /** * Copyright (C) 2024 Sungsik Kong * - * @file hgemm_kernel_pack.cpp + * @file hgemm_pack.cpp * @date 02 July 2024 * @see https://github.com/nnstreamer/nntrainer * @author Sungsik Kong @@ -18,6 +18,10 @@ #include #include +/// @note Matrix packing strategy is quite similar in terms of normal-tangential +/// coordinate's point of view. This hint might lead us to re-implement all +/// packing functions into a single generic function! + void packing_A1(unsigned int m, unsigned int k, const __fp16 *from, unsigned int lda, const __fp16 *to) { @@ -397,8 +401,9 @@ void packing_B16(unsigned int K, unsigned int N, const __fp16 *src, void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src, unsigned int ldb, const __fp16 *dst) { - /// @note ldb = K for here - assert(K != 0 && N != 0 && N % 16 == 0); + /// @note K8 will be intentionally computed for generic K + /// implementation in the future + assert(K != 0 && K % 8 == 0 && N != 0 && N % 16 == 0); unsigned int K8 = (K >> 3) << 3; const __fp16 *src_off = (__fp16 *)src; @@ -408,12 +413,12 @@ void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src, __fp16 *tile_T = alignedMalloc(8 * ld_tile_T); // 1. Do something like 8x16 transpose kernel - // 2. Save linearized transposed output tile to dst + // 2. Linearize transposed output tile to dst for (unsigned int n = 0; n < N; n += 16) { const __fp16 *src_off1 = src_off; __fp16 *dst_off1 = dst_off; src_off += 16 * ldb; - dst_off += (K8 * 16 + (K - K8)); // ? 
+ dst_off += (K8 * 16 + (K - K8)); for (unsigned int k = 0; k < K8; k += 8) { // 16x8 tile -> 8x16 transpose_neon<__fp16>(16, 8, src_off1, ldb, tile_T, ld_tile_T); @@ -439,12 +444,7 @@ void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src, dst_off1 += 16 * 8; src_off1 += 8; } - - // Do the equivalent of one by one for the rest - for (unsigned int k = K8; k < K; ++k) { - for (unsigned int _n = 0; _n < 16; ++_n) { - dst_off1[_n] = src_off1[k]; - } - } } + + free(tile_T); } diff --git a/nntrainer/tensor/hgemm/hgemm_pack.h b/nntrainer/tensor/hgemm/hgemm_pack.h index b134ee12..12f0770c 100644 --- a/nntrainer/tensor/hgemm/hgemm_pack.h +++ b/nntrainer/tensor/hgemm/hgemm_pack.h @@ -2,7 +2,7 @@ /** * Copyright (C) 2024 Sungsik Kong * - * @file hgemm_kernel_pack.h + * @file hgemm_pack.h * @date 01 April 2024 * @see https://github.com/nnstreamer/nntrainer * @author Sungsik Kong diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h b/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h index f62143a4..5353fe0e 100644 --- a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h +++ b/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h @@ -8,6 +8,8 @@ * @author Sungsik Kong * @bug No known bugs except for NYI items * @brief This is a header file for including both padding matrix A and B + * @note Padding function for matrix A and B will be fused into single + * function in this file in the future * */ diff --git a/nntrainer/tensor/hgemm/hgemm_transA.h b/nntrainer/tensor/hgemm/hgemm_transA.h index 8de44c84..e6c1cceb 100644 --- a/nntrainer/tensor/hgemm/hgemm_transA.h +++ b/nntrainer/tensor/hgemm/hgemm_transA.h @@ -12,11 +12,10 @@ */ /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm computation with neon : Y = alpha*A_T*B + beta*C, * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -25,4 +24,3 @@ */ void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta); - diff --git a/nntrainer/tensor/hgemm/hgemm_transAB.h b/nntrainer/tensor/hgemm/hgemm_transAB.h index 2c228031..b97ce033 100644 --- a/nntrainer/tensor/hgemm/hgemm_transAB.h +++ b/nntrainer/tensor/hgemm/hgemm_transAB.h @@ -12,11 +12,10 @@ */ /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm computation with neon : Y = alpha*A_T*B_T + beta*C, * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows diff --git a/nntrainer/tensor/hgemm/hgemm_transB.h b/nntrainer/tensor/hgemm/hgemm_transB.h index 0c47d23d..6fe3ec0e 100644 --- a/nntrainer/tensor/hgemm/hgemm_transB.h +++ b/nntrainer/tensor/hgemm/hgemm_transB.h @@ -12,11 +12,10 @@ */ /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C, * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C 
__fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -26,16 +25,26 @@ void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta); +/** + * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C, + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ void hgemm_transB_fallback(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm transB computation with kernel 8x16 * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows diff --git a/nntrainer/tensor/hgemm/meson.build b/nntrainer/tensor/hgemm/meson.build index e100f63a..f6781e93 100644 --- a/nntrainer/tensor/hgemm/meson.build +++ b/nntrainer/tensor/hgemm/meson.build @@ -3,7 +3,6 @@ hgemm_headers = [ 'hgemm_util.h', 'hgemm_pack.h', 'hgemm_common.h', - 'hgemm_padding.h', ] subdir('hgemm_kernel') @@ -16,8 +15,6 @@ nntrainer_inc_abs += meson.current_source_dir() / 'hgemm_padding' hgemm_sources = [ 'hgemm.cpp', - 'hgemm_padding_a.cpp', - 'hgemm_padding_b.cpp', 'hgemm_pack.cpp', 'hgemm_noTrans.cpp', 'hgemm_transA.cpp',