const unsigned int ldc) {
#if (defined USE__FP16 && USE_NEON)
- nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA == CblasTrans,
- TransB == CblasTrans);
+ nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta,
+ TransA == CblasTrans, TransB == CblasTrans);
#else
float *A_ = new float[M * K];
float *B_ = new float[N * K];
#include <hgemm_padding.h>
#include <hgemm_transA.h>
#include <hgemm_transAB.h>
-#include <limits>
#include <hgemm_transB.h>
#include <hgemm_util.h>
+#include <limits>
void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta, bool TransA,
hgemm_ensure_divisibility(A, B, C32, M, N, K, alpha, beta, TransA, TransB);
- unsigned int L = M * N;
- unsigned int L8 = (L >> 3) << 3;
-
- for (unsigned int idx = 0; idx < L8; idx += 8) {
+ for (unsigned int idx = 0; idx < size8; idx += 8) {
float32x4_t x1 = vld1q_f32(&C32[idx]);
float32x4_t x2 = vld1q_f32(&C32[idx + 4]);
vst1q_f16(&C[idx], y1);
}
- for (unsigned int idx = L8; idx < L; ++idx) {
+ for (unsigned int idx = size8; idx < size; ++idx) {
C[idx] = static_cast<__fp16>(C32[idx]);
}
/// @note Padding standard : 8x16 is the only KERNEL that outperforms single
/// precision GEMM 'so far'. Padding will forcibly make every GEMM cases to
/// use it. Note that padding is not the optimal way here, but just an option
- /// that is easier to implement. Fine-grained packing should be supported on
- /// the future for optimal performance.
+ /// that is easier to implement. Fine-grained packing, blocking, and
+ /// corresponding kernels should be supported in the future for optimal
+ /// performance.
__fp16 *A_ = (__fp16 *)A, *B_ = (__fp16 *)B;
unsigned int M_ = M, N_ = N, K_ = K;
bool pad_A = false, pad_B = false;
- // Case 2 : smaller than 8, 16 | padding would be redundant?
+ // Case 2 : smaller than 8, 16 | padding would be redundant
if (M < 8 && K < 16 && N < 16)
return hgemm_classify(A_, B_, C32, M_, N_, K_, alpha, beta, TransA, TransB);
bool TransB);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * @brief Checking function for whether matrix A or B needs padding for
+ * optimal performance of fixed blocking-kernel sequence
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
bool TransA = false, bool TransB = false);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * @brief Classifying function for GEMM computation case for noTrans,
+ * transA, transB, transAB
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
* @param[in] C __fp16 * for Matrix C
float alpha = 1.F, float beta = 0.F, bool TransA = false,
bool TransB = false);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm computation when K = 1. Transpose is mathematically of no use
+ * here, and partial accumulation is also not needed.
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
* @param[in] C __fp16 * for Matrix C
*
*/
-#define A(i, j) a[(i)*lda + (j)]
-#define B(i, j) b[(i)*ldb + (j)]
-#define C(i, j) c[(i)*ldc + (j)]
-
#define N_BLOCKING (768)
#define K_BLOCKING (256)
#define M_BLOCKING (4096)
*
*/
-#include <stdlib.h>
#include <arm_neon.h>
#include <assert.h>
#include <hgemm_kernel.h>
+#include <stdlib.h>
-/**
- * @brief hgemm 1x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 1x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
a++; \
} while (0)
-/**
- * @brief hgemm 1x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 1x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_1x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); \
} while (0)
-/**
- * @brief hgemm 4x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 4x4 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading dimension of matrix C
- */
void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
vcvt_f32_f16(vget_high_f16(v9)))); \
} while (0)
-/**
- * @brief hgemm 4x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 4x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
#include <arm_neon.h>
#include <assert.h>
-#include <iostream>
#include <hgemm_kernel.h>
#include <stdlib.h>
vcvt_f32_f16(vget_high_f16(v120_127)))); \
} while (0)
-/**
- * @brief hgemm 8x16 kernel sc = sa * sb
- *
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 8x16 kernel sc = sa * sb
- *
- * @param M length of the row of matrix A
- * @param N length of the col of matrix B
- * @param K length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
float16x8_t v80_87, v88_95;
float16x8_t v96_103, v104_111;
float16x8_t v112_119, v120_127;
- float16x8_t vb1, vb2;
float16x8_t va0;
+ float16x8_t vb1, vb2;
l = 0;
for (; l < K16;) {
INIT_KERNEL_8X16();
vcvt_f32_f16(vget_high_f16(v31)))); \
} while (0)
-/**
- * @brief hgemm 8x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
}
}
-/**
- * @brief hgemm 8x8 kernel sc = sa * sb
- *
- * @param m length of the row of matrix A
- * @param n length of the col of matrix B
- * @param k length of the col of matrix A
- * @param sa sub-matrix of input matrix A
- * @param sb sub-matrix of input matrix B
- * @param sc sub-matrix of output matrix C
- * @param ldc leading-dimension of matrix C
- */
void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
float alpha = 1.F, float beta = 0.F);
/**
- * @brief hgemm fallback with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * @brief hgemm fallback with NEON : Y = alpha*op(A)*op(B) + beta*C,
* @param M length of the row of matrix A
* @param N length of the col of matrix B
* @param K length of the col of matrix A
/**
* @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * where M, N, K are divisible by at least 4
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
* @param[in] C __fp16 * for Matrix C
/**
* @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
+ * where M, N, K are divisible by at least 4
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
/**
* Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
*
- * @file hgemm_kernel_pack.cpp
+ * @file hgemm_pack.cpp
* @date 02 July 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Sungsik Kong <ss.kong@samsung.com>
#include <hgemm_util.h>
#include <matrix_transpose_neon.h>
+/// @note Matrix packing strategy is quite similar in terms of normal-tangential
+/// coordinate's point of view. This hint might lead us to re-implement all
+/// packing functions into a single generic function!
+
void packing_A1(unsigned int m, unsigned int k, const __fp16 *from,
unsigned int lda, const __fp16 *to) {
void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src,
unsigned int ldb, const __fp16 *dst) {
- /// @note ldb = K for here
- assert(K != 0 && N != 0 && N % 16 == 0);
+ /// @note K8 will be intentionally computed for generic K
+ /// implementation in the future
+ assert(K != 0 && K % 8 == 0 && N != 0 && N % 16 == 0);
unsigned int K8 = (K >> 3) << 3;
const __fp16 *src_off = (__fp16 *)src;
__fp16 *tile_T = alignedMalloc(8 * ld_tile_T);
// 1. Do something like 8x16 transpose kernel
- // 2. Save linearized transposed output tile to dst
+ // 2. Linearize transposed output tile to dst
for (unsigned int n = 0; n < N; n += 16) {
const __fp16 *src_off1 = src_off;
__fp16 *dst_off1 = dst_off;
src_off += 16 * ldb;
- dst_off += (K8 * 16 + (K - K8)); // ?
+ dst_off += (K8 * 16 + (K - K8));
for (unsigned int k = 0; k < K8; k += 8) {
// 16x8 tile -> 8x16
transpose_neon<__fp16>(16, 8, src_off1, ldb, tile_T, ld_tile_T);
dst_off1 += 16 * 8;
src_off1 += 8;
}
-
- // Do the equivalent of one by one for the rest
- for (unsigned int k = K8; k < K; ++k) {
- for (unsigned int _n = 0; _n < 16; ++_n) {
- dst_off1[_n] = src_off1[k];
- }
- }
}
+
+ free(tile_T);
}
/**
* Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
*
- * @file hgemm_kernel_pack.h
+ * @file hgemm_pack.h
* @date 01 April 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Sungsik Kong <ss.kong@samsung.com>
* @author Sungsik Kong <ss.kong@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is a header file for including both padding matrix A and B
+ * @note Padding function for matrix A and B will be fused into a single
+ * function in this file in the future
*
*/
*/
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm computation with neon : Y = alpha*A_T*B + beta*C,
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
*/
void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta);
-
*/
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm computation with neon : Y = alpha*A_T*B_T + beta*C,
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
*/
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C,
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta);
+/**
+ * @brief hgemm transB computation : Y = alpha*A*B_T + beta*C,
+ * @param[in] A __fp16 * for Matrix A
+ * @param[in] B __fp16 * for Matrix B
+ * @param[in] C float * for Matrix C
+ * @param[in] M number of op(A)'s and C's row
+ * @param[in] N number of op(B)'s and C's columns
+ * @param[in] K number of op(A)'s and columns and op(B)'s rows
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
void hgemm_transB_fallback(const __fp16 *A, const __fp16 *B, float *C,
unsigned int M, unsigned int N, unsigned int K,
float alpha, float beta);
/**
- * @brief hgemm computation with neon : Y = alpha*op(A)*op(B) + beta*C,
- * where op(X) is one of X or X**T
+ * @brief hgemm transB computation with kernel 8x16
* @param[in] A __fp16 * for Matrix A
* @param[in] B __fp16 * for Matrix B
- * @param[in] C __fp16 * for Matrix C
+ * @param[in] C float * for Matrix C
* @param[in] M number of op(A)'s and C's row
* @param[in] N number of op(B)'s and C's columns
* @param[in] K number of op(A)'s and columns and op(B)'s rows
'hgemm_util.h',
'hgemm_pack.h',
'hgemm_common.h',
- 'hgemm_padding.h',
]
subdir('hgemm_kernel')
hgemm_sources = [
'hgemm.cpp',
- 'hgemm_padding_a.cpp',
- 'hgemm_padding_b.cpp',
'hgemm_pack.cpp',
'hgemm_noTrans.cpp',
'hgemm_transA.cpp',