From: skykongkong8 Date: Wed, 10 Jul 2024 10:07:44 +0000 (+0900) Subject: [ trivial ] Fix typo and add missing doxygen tags X-Git-Tag: accepted/tizen/7.0/unified/20240830.164841~41 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5f1b41eb9bcf29f8bcf79cd3d39b5a097b799f32;p=platform%2Fcore%2Fml%2Fnntrainer.git [ trivial ] Fix typo and add missing doxygen tags - Fix typo and add missing doxygen tags - Add more exact explanation for doxygen tag briefs **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 08f31b34..d4d21d4f 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -326,8 +326,8 @@ static void sgemm_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int ldc) { #if (defined USE__FP16 && USE_NEON) - nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA == CblasTrans, - TransB == CblasTrans); + nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, + TransA == CblasTrans, TransB == CblasTrans); #else float *A_ = new float[M * K]; float *B_ = new float[N * K]; diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp index 48db72a0..e2d584a9 100644 --- a/nntrainer/tensor/hgemm/hgemm.cpp +++ b/nntrainer/tensor/hgemm/hgemm.cpp @@ -20,9 +20,9 @@ #include #include #include -#include #include #include +#include void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta, bool TransA, @@ -70,10 +70,7 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, hgemm_ensure_divisibility(A, B, C32, M, N, K, alpha, beta, TransA, TransB); - unsigned int L = M * N; - unsigned int L8 = (L >> 3) << 3; - - for (unsigned int idx = 0; idx < L8; idx += 8) { + for (unsigned int idx = 0; idx < size8; idx += 8) { float32x4_t x1 = vld1q_f32(&C32[idx]); float32x4_t x2 = vld1q_f32(&C32[idx + 4]); @@ -81,7 +78,7 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, vst1q_f16(&C[idx], y1); } - for (unsigned int idx = L8; idx < L; ++idx) { + for (unsigned int idx = size8; idx < size; ++idx) { C[idx] = static_cast<__fp16>(C32[idx]); } @@ -95,14 +92,15 @@ void hgemm_ensure_divisibility(const __fp16 *A, const __fp16 *B, float *C32, /// @note Padding standard : 8x16 is the only KERNEL that outperforms single /// precision GEMM 'so far'. Padding will forcibly make every GEMM cases to /// use it. Note that padding is not the optimal way here, but just an option - /// that is easier to implement. Fine-grained packing should be supported on - /// the future for optimal performance. + /// that is easier to implement. Fine-grained packing, blocking, and + /// corresponding kernels should be supported on the future for optimal + /// performance. __fp16 *A_ = (__fp16 *)A, *B_ = (__fp16 *)B; unsigned int M_ = M, N_ = N, K_ = K; bool pad_A = false, pad_B = false; - // Case 2 : smaller than 8, 16 | padding would be redundant? 
+ // Case 2 : smaller than 8, 16 | padding would be redundant if (M < 8 && K < 16 && N < 16) return hgemm_classify(A_, B_, C32, M_, N_, K_, alpha, beta, TransA, TransB); diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/hgemm/hgemm.h index a0c7b6f9..333b1b73 100644 --- a/nntrainer/tensor/hgemm/hgemm.h +++ b/nntrainer/tensor/hgemm/hgemm.h @@ -29,10 +29,11 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, bool TransB); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * @brief Checking function for whether matrix A or B needs padding for + * optimal performance of fixed blocking-kernel sequence * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -45,7 +46,8 @@ void hgemm_ensure_divisibility(const __fp16 *A, const __fp16 *B, float *C32, bool TransA = false, bool TransB = false); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * @brief Classifying function for GEMM computation case for noTrans, + * transA, transB, transAB * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B * @param[in] C __fp16 * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -60,8 +62,8 @@ void hgemm_classify(const __fp16 *A, const __fp16 *B, float *C32, float alpha = 1.F, float beta = 0.F, bool TransA = false, bool TransB = false); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm computation when K = 1. Transpose is mathematically of no + * use here, and partial accumulation is also not needed. 
* @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B * @param[in] C __fp16 * for Matrix C diff --git a/nntrainer/tensor/hgemm/hgemm_common.h b/nntrainer/tensor/hgemm/hgemm_common.h index a041e431..0330bd43 100644 --- a/nntrainer/tensor/hgemm/hgemm_common.h +++ b/nntrainer/tensor/hgemm/hgemm_common.h @@ -11,10 +11,6 @@ * */ -#define A(i, j) a[(i)*lda + (j)] -#define B(i, j) b[(i)*ldb + (j)] -#define C(i, j) c[(i)*ldc + (j)] - #define N_BLOCKING (768) #define K_BLOCKING (256) #define M_BLOCKING (4096) diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp index 2c301e59..7ee7ea8d 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp @@ -11,22 +11,11 @@ * */ -#include #include #include #include +#include -/** - * @brief hgemm 1x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -81,17 +70,6 @@ void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 1x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp index 35927e55..87939aef 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp @@ -83,17 +83,6 @@ a++; \ } while (0) -/** - * @brief hgemm 1x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -130,17 +119,6 @@ void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 1x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp 
b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp index 40ab4eae..21d81147 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp @@ -214,17 +214,6 @@ vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); \ } while (0) -/** - * @brief hgemm 4x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -303,17 +292,6 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 4x4 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading dimension of matrix C - */ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp index 3cebee45..cf607647 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp @@ -258,17 +258,6 @@ vcvt_f32_f16(vget_high_f16(v9)))); \ } while (0) -/** - * @brief hgemm 4x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -306,17 +295,6 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 4x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp index f8d6b56c..6c8e8ee4 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp @@ -13,7 +13,6 @@ #include #include -#include #include #include @@ -726,17 +725,6 @@ vcvt_f32_f16(vget_high_f16(v120_127)))); \ } while (0) -/** - * @brief hgemm 8x16 kernel sc = sa * sb - * - * @param M length of the row of matrix A - * @param N length of the col of matrix B - * @param K length of the col of matrix A - * @param sa 
sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -795,17 +783,6 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 8x16 kernel sc = sa * sb - * - * @param M length of the row of matrix A - * @param N length of the col of matrix B - * @param K length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -829,8 +806,8 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, float16x8_t v80_87, v88_95; float16x8_t v96_103, v104_111; float16x8_t v112_119, v120_127; - float16x8_t vb1, vb2; float16x8_t va0; + float16x8_t vb1, vb2; l = 0; for (; l < K16;) { INIT_KERNEL_8X16(); diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp index f799e527..9a46b3a9 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp +++ b/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp @@ -400,17 +400,6 @@ vcvt_f32_f16(vget_high_f16(v31)))); \ } while (0) -/** - * @brief hgemm 8x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); @@ -449,17 +438,6 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, } } -/** - * @brief hgemm 8x8 kernel sc = sa * sb - * - * @param m length of the row of matrix A - * @param n length of the col of matrix B - * @param k length of the col of matrix A - * @param sa sub-matrix of input matrix A - * @param sb sub-matrix of input matrix B - * @param sc sub-matrix of output matrix C - * @param ldc leading-dimension of matrix C - */ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { assert(M > 0 && N > 0 && K > 0); diff --git a/nntrainer/tensor/hgemm/hgemm_noTrans.h b/nntrainer/tensor/hgemm/hgemm_noTrans.h index 06b1f102..69ef912b 100644 --- a/nntrainer/tensor/hgemm/hgemm_noTrans.h +++ b/nntrainer/tensor/hgemm/hgemm_noTrans.h @@ -252,7 +252,7 @@ void hgemm_noTrans_8x16(unsigned int M, unsigned int N, unsigned int K, float alpha = 1.F, float beta = 0.F); /** - * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C, + * @brief hgemm fallback with NEON : Y = alpha*op(A)*op(B) + beta*C, * @param M length of the row of matrix A * @param N length of the col of matrix B * @param K length of the col of matrix A @@ -287,6 +287,7 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, /** * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * where M, N, K are divisible by at least 4 
* @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B * @param[in] C __fp16 * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -302,9 +303,10 @@ void hgemm_noTrans_strict(const __fp16 *A, const __fp16 *B, __fp16 *C, /** * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, + * where M, N, K are divisible by at least 4 * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows diff --git a/nntrainer/tensor/hgemm/hgemm_pack.cpp b/nntrainer/tensor/hgemm/hgemm_pack.cpp index 63d1b702..c19fde6e 100644 --- a/nntrainer/tensor/hgemm/hgemm_pack.cpp +++ b/nntrainer/tensor/hgemm/hgemm_pack.cpp @@ -2,7 +2,7 @@ /** * Copyright (C) 2024 Sungsik Kong * - * @file hgemm_kernel_pack.cpp + * @file hgemm_pack.cpp * @date 02 July 2024 * @see https://github.com/nnstreamer/nntrainer * @author Sungsik Kong @@ -18,6 +18,10 @@ #include #include +/// @note Matrix packing strategy is quite similar in terms of normal-tangential +/// coordinate's point of view. This hint might lead us to re-implement all +/// packing functions into a single generic function! + void packing_A1(unsigned int m, unsigned int k, const __fp16 *from, unsigned int lda, const __fp16 *to) { @@ -397,8 +401,9 @@ void packing_B16(unsigned int K, unsigned int N, const __fp16 *src, void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src, unsigned int ldb, const __fp16 *dst) { - /// @note ldb = K for here - assert(K != 0 && N != 0 && N % 16 == 0); + /// @note K8 will be intentionally computed for generic K + /// implementation in the future + assert(K != 0 && K % 8 == 0 && N != 0 && N % 16 == 0); unsigned int K8 = (K >> 3) << 3; const __fp16 *src_off = (__fp16 *)src; @@ -408,12 +413,12 @@ void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src, __fp16 *tile_T = alignedMalloc(8 * ld_tile_T); // 1. Do something like 8x16 transpose kernel - // 2. Save linearized transposed output tile to dst + // 2. Linearize transposed output tile to dst for (unsigned int n = 0; n < N; n += 16) { const __fp16 *src_off1 = src_off; __fp16 *dst_off1 = dst_off; src_off += 16 * ldb; - dst_off += (K8 * 16 + (K - K8)); // ? 
+ dst_off += (K8 * 16 + (K - K8)); for (unsigned int k = 0; k < K8; k += 8) { // 16x8 tile -> 8x16 transpose_neon<__fp16>(16, 8, src_off1, ldb, tile_T, ld_tile_T); @@ -439,12 +444,7 @@ void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src, dst_off1 += 16 * 8; src_off1 += 8; } - - // Do the equivalent of one by one for the rest - for (unsigned int k = K8; k < K; ++k) { - for (unsigned int _n = 0; _n < 16; ++_n) { - dst_off1[_n] = src_off1[k]; - } - } } + + free(tile_T); } diff --git a/nntrainer/tensor/hgemm/hgemm_pack.h b/nntrainer/tensor/hgemm/hgemm_pack.h index b134ee12..12f0770c 100644 --- a/nntrainer/tensor/hgemm/hgemm_pack.h +++ b/nntrainer/tensor/hgemm/hgemm_pack.h @@ -2,7 +2,7 @@ /** * Copyright (C) 2024 Sungsik Kong * - * @file hgemm_kernel_pack.h + * @file hgemm_pack.h * @date 01 April 2024 * @see https://github.com/nnstreamer/nntrainer * @author Sungsik Kong diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h b/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h index f62143a4..5353fe0e 100644 --- a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h +++ b/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h @@ -8,6 +8,8 @@ * @author Sungsik Kong * @bug No known bugs except for NYI items * @brief This is a header file for including both padding matrix A and B + * @note Padding function for matrix A and B will be fused into single + * function in this file in the future * */ diff --git a/nntrainer/tensor/hgemm/hgemm_transA.h b/nntrainer/tensor/hgemm/hgemm_transA.h index 8de44c84..e6c1cceb 100644 --- a/nntrainer/tensor/hgemm/hgemm_transA.h +++ b/nntrainer/tensor/hgemm/hgemm_transA.h @@ -12,11 +12,10 @@ */ /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm computation with neon : Y = alpha*A_T*B + beta*C, * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -25,4 +24,3 @@ */ void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta); - diff --git a/nntrainer/tensor/hgemm/hgemm_transAB.h b/nntrainer/tensor/hgemm/hgemm_transAB.h index 2c228031..b97ce033 100644 --- a/nntrainer/tensor/hgemm/hgemm_transAB.h +++ b/nntrainer/tensor/hgemm/hgemm_transAB.h @@ -12,11 +12,10 @@ */ /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm computation with neon : Y = alpha*A_T*B_T + beta*C, * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows diff --git a/nntrainer/tensor/hgemm/hgemm_transB.h b/nntrainer/tensor/hgemm/hgemm_transB.h index 0c47d23d..6fe3ec0e 100644 --- a/nntrainer/tensor/hgemm/hgemm_transB.h +++ b/nntrainer/tensor/hgemm/hgemm_transB.h @@ -12,11 +12,10 @@ */ /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C, * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C 
__fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows @@ -26,16 +25,26 @@ void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta); +/** + * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C, + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ void hgemm_transB_fallback(const __fp16 *A, const __fp16 *B, float *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta); /** - * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T + * @brief hgemm transB computation with kernel 8x16 * @param[in] A __fp16 * for Matrix A * @param[in] B __fp16 * for Matrix B - * @param[in] C __fp16 * for Matrix C + * @param[in] C float * for Matrix C * @param[in] M number of op(A)'s and C's row * @param[in] N number of op(B)'s and C's columns * @param[in] K number of op(A)'s and columns and op(B)'s rows diff --git a/nntrainer/tensor/hgemm/meson.build b/nntrainer/tensor/hgemm/meson.build index e100f63a..f6781e93 100644 --- a/nntrainer/tensor/hgemm/meson.build +++ b/nntrainer/tensor/hgemm/meson.build @@ -3,7 +3,6 @@ hgemm_headers = [ 'hgemm_util.h', 'hgemm_pack.h', 'hgemm_common.h', - 'hgemm_padding.h', ] subdir('hgemm_kernel') @@ -16,8 +15,6 @@ nntrainer_inc_abs += meson.current_source_dir() / 'hgemm_padding' hgemm_sources = [ 'hgemm.cpp', - 'hgemm_padding_a.cpp', - 'hgemm_padding_b.cpp', 'hgemm_pack.cpp', 'hgemm_noTrans.cpp', 'hgemm_transA.cpp',