- Adding padding to matrices is not an optimal solution to approach, but yet can be one sub-optimal option.
- Final goal of this directory would be deleting this directory itself.
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: skykongkong8 <ss.kong@samsung.com>
#include <assert.h>
#include <hgemm_common.h>
#include <hgemm_pack.h>
+#include <hgemm_util.h>
#include <matrix_transpose_neon.h>
void packing_A1(unsigned int m, unsigned int k, const __fp16 *from,
__fp16 *dst_off = (__fp16 *)dst;
const unsigned int ld_tile_T = 16;
- __fp16 *tile_T = new __fp16[8 * ld_tile_T];
- // __fp16 *tile_T = alignedMalloc(8 * ld_tile_T);
+ __fp16 *tile_T = alignedMalloc(8 * ld_tile_T);
// 1. Do something like 8x16 transpose kernel
// 2. Save linearized transposed output tile to dst
void packing_B16(unsigned int K, unsigned int N, const __fp16 *src,
unsigned int ldb, const __fp16 *dst);
/**
- * @brief
+ * @brief packing function of input matrix B_T
*
- * @param K
- * @param N
- * @param src
- * @param ldb
- * @param dst
+ * @param M length of the row of the matrix
+ * @param K length of the col of the matrix
+ * @param src input of original source of the matrix
+ * @param ldb leading dimension of the matrix
+ * @param dst output of packed data of the matrix
*/
void packing_transB16(unsigned int K, unsigned int N, const __fp16 *src,
unsigned int ldb, const __fp16 *dst);
+++ /dev/null
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
- *
- * @file hgemm_padding.h
- * @date 05 July 2024
- * @see https://github.com/nnstreamer/nntrainer
- * @author Sungsik Kong <ss.kong@samsung.com>
- * @bug No known bugs except for NYI items
- * @brief This is a header file for including both padding matrix A and B
- *
- */
-
-#include <hgemm_padding_a.h>
-#include <hgemm_padding_b.h>
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding.h
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a header file for including both padding matrix A and B
+ *
+ */
+
+#include <hgemm_padding_a.h>
+#include <hgemm_padding_b.h>
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_a.cpp
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a source file for padding function used in hgemm
+ *
+ */
+
+#include <arm_neon.h>
+#include <hgemm_padding_a.h>
+#include <hgemm_util.h>
+#include <iostream>
+
+void hgemm_padding_A(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8,
+ bool transA) {
+ if (transA)
+ hgemm_padding_A_Trans(A, Ap, M, K, M8, K8);
+ else
+ hgemm_padding_A_noTrans(A, Ap, M, K, M8, K8);
+}
+
+void hgemm_padding_A_noTrans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8) {
+ if (M != M8 && K != K8) {
+ hgemm_padding_A_noTrans_wrt_MK(A, Ap, M, K, M8, K8);
+ } else if (M != M8) {
+ hgemm_padding_A_noTrans_wrt_M(A, Ap, M, K, M8, K8);
+ } else if (K != K8) {
+ hgemm_padding_A_noTrans_wrt_K(A, Ap, M, K, M8, K8);
+ } else {
+ std::cerr << "Error : No room for matrix A padding!\n";
+ }
+}
+
+void hgemm_padding_A_Trans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8) {
+ if (M != M8 && K != K8) {
+ hgemm_padding_A_Trans_wrt_MK(A, Ap, M, K, M8, K8);
+ } else if (M != M8) {
+ hgemm_padding_A_Trans_wrt_M(A, Ap, M, K, M8, K8);
+ } else if (K != K8) {
+ hgemm_padding_A_Trans_wrt_K(A, Ap, M, K, M8, K8);
+ } else {
+ std::cerr << "Error : No room for matrix A padding!\n";
+ }
+}
+
+void hgemm_padding_A_noTrans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
+ for (unsigned int m = 0; m < M; ++m) {
+ for (unsigned int k = 0; k < K; k += 8) {
+ vst1q_f16(&Ap[m * K + k], vld1q_f16(&A[m * K + k]));
+ }
+ }
+ for (unsigned int m = M; m < M8; ++m) {
+ for (unsigned int k = 0; k < K; k += 8) {
+ vst1q_f16(&Ap[m * K + k], ZEROS);
+ }
+ }
+}
+
+void hgemm_padding_A_noTrans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ const unsigned int K8_low = (K >> 3) << 3;
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
+ for (unsigned int m = 0; m < M; ++m) {
+ for (unsigned int k = 0; k < K8_low; k += 8) {
+ vst1q_f16(&Ap[m * K8 + k], vld1q_f16(&A[m * K + k]));
+ }
+ for (unsigned int k = K8_low; k < K; ++k) {
+ Ap[m * K8 + k] = A[m * K + k];
+ }
+ for (unsigned int k = K; k < K8; ++k) {
+ Ap[m * K8 + k] = 0.F;
+ }
+ }
+}
+
+void hgemm_padding_A_noTrans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ std::cerr << "Error : hgemm_padding_A_noTrans_wrt_MK NYI!\n";
+}
+
+void hgemm_padding_A_Trans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ const unsigned int M8_low = (M >> 3) << 3;
+
+ for (unsigned int k = 0; k < K; ++k) {
+ for (unsigned int m = 0; m < M8_low; m += 8) {
+ vst1q_f16(&Ap[k * M + m], vld1q_f16(&A[k * M + m]));
+ }
+ for (unsigned int m = M8_low; m < M; ++m) {
+ Ap[k * M + m] = A[k * M + m];
+ }
+ for (unsigned int m = M; m < M8; ++m) {
+ Ap[k * M + m] = 0.F;
+ }
+ }
+}
+
+void hgemm_padding_A_Trans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ std::cerr << "Error : hgemm_padding_A_Trans_wrt_K NYI!\n";
+}
+
+void hgemm_padding_A_Trans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ std::cerr << "Error : hgemm_padding_A_Trans_wrt_MK NYI!\n";
+}
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_a.h
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a header file for padding function used in hgemm
+ *
+ */
+
+/**
+ * @brief Padding function for matrix A in HGEMM
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param transA Whether the matrix A is transposed or not
+ */
+void hgemm_padding_A(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8,
+ bool transA);
+
+/**
+ * @brief Padding function for non-transposed matrix A in HGEMM
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_noTrans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8);
+
+/**
+ * @brief Padding function for non-transposed matrix A in HGEMM w.r.t. M
+ * direction
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_noTrans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8);
+/**
+ * @brief Padding function for non-transposed matrix A in HGEMM w.r.t. K
+ * direction
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_noTrans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8);
+
+/**
+ * @brief Padding function for non-transposed matrix A in HGEMM w.r.t. M and K
+ * direction
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_noTrans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8);
+/**
+ * @brief Padding function for transposed matrix A in HGEMM
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_Trans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8);
+/**
+ * @brief Padding function for transposed matrix A in HGEMM w.r.t. M direction
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_Trans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8);
+/**
+ * @brief Padding function for transposed matrix A in HGEMM w.r.t. K direction
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_Trans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8);
+/**
+ * @brief Padding function for transposed matrix A in HGEMM w.r.t. M and K
+ * direction
+ *
+ * @param A src matrix to pad
+ * @param Ap dst matrix after padding
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 Least multiple of 8 that is bigger than or equal to M
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ */
+void hgemm_padding_A_Trans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8);
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_b.cpp
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a source file for padding function used in hgemm
+ *
+ */
+
+#include <arm_neon.h>
+#include <hgemm_padding_b.h>
+#include <hgemm_util.h>
+#include <iostream>
+
+void hgemm_padding_B(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16,
+ bool transB) {
+ if (transB) {
+ hgemm_padding_B_Trans(B, Bp, K, N, K8, N16);
+ } else {
+ hgemm_padding_B_noTrans(B, Bp, K, N, K8, N16);
+ }
+}
+
+void hgemm_padding_B_noTrans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ if (K != K8 && N != N16) {
+ hgemm_padding_B_noTrans_wrt_KN(B, Bp, K, N, K8, N16);
+ } else if (K != K8) {
+ hgemm_padding_B_noTrans_wrt_K(B, Bp, K, N, K8, N16);
+ } else if (N != N16) {
+ hgemm_padding_B_noTrans_wrt_N(B, Bp, K, N, K8, N16);
+ } else {
+ std::cerr << "Error : No room for matrix B padding\n";
+ }
+}
+
+void hgemm_padding_B_Trans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16) {
+ if (K != K8 && N != N16) {
+ hgemm_padding_B_Trans_wrt_KN(B, Bp, K, N, K8, N16);
+ } else if (K != K8) {
+ hgemm_padding_B_Trans_wrt_K(B, Bp, K, N, K8, N16);
+ } else if (N != N16) {
+ hgemm_padding_B_Trans_wrt_N(B, Bp, K, N, K8, N16);
+ } else {
+ std::cerr << "Error : No room for matrix B padding\n";
+ }
+}
+
+void hgemm_padding_B_noTrans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "Error : hgemm_padding_B_noTrans_wrt_N NYI!\n";
+}
+
+void hgemm_padding_B_noTrans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
+ for (unsigned int k = 0; k < K; ++k) {
+ for (unsigned int n = 0; n < N; n += 8) {
+ vst1q_f16(&Bp[k * N + n], vld1q_f16(&B[k * N + n]));
+ }
+ }
+ for (unsigned int k = K; k < K8; ++k) {
+ for (unsigned int n = 0; n < N; n += 8) {
+ vst1q_f16(&Bp[k * N + n], ZEROS);
+ }
+ }
+}
+
+void hgemm_padding_B_noTrans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "Error : hgemm_padding_B_noTrans_wrt_KN NYI!\n";
+}
+
+void hgemm_padding_B_Trans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "Error : hgemm_padding_B_Trans_wrt_N NYI!\n";
+}
+
+void hgemm_padding_B_Trans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ const unsigned int K8_low = (K >> 3) << 3;
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
+ for (unsigned int n = 0; n < N; ++n) {
+ for (unsigned int k = 0; k < K8_low; k += 8) {
+ vst1q_f16(&Bp[n * K8 + k], vld1q_f16(&B[n * K + k]));
+ }
+ for (unsigned int k = K8_low; k < K; ++k) {
+ Bp[n * K8 + k] = B[n * K + k];
+ }
+ for (unsigned int k = K; k < K8; ++k) {
+ Bp[n * K8 + k] = 0.F;
+ }
+ }
+}
+
+void hgemm_padding_B_Trans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "Error : hgemm_padding_B_Trans_wrt_KN NYI!\n";
+}
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_b.h
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a header file for padding function used in hgemm
+ *
+ */
+
+/**
+ * @brief Padding function for matrix B in HGEMM
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ * @param transB Whether the matrix B is transposed or not
+ */
+void hgemm_padding_B(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16,
+ bool transB);
+
+/**
+ * @brief Padding function for non-transposed matrix B in HGEMM
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_noTrans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16);
+/**
+ * @brief Padding function for non-transposed matrix B in HGEMM w.r.t. N
+ * direction
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_noTrans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16);
+/**
+ * @brief Padding function for non-transposed matrix B in HGEMM w.r.t. K
+ * direction
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_noTrans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16);
+/**
+ * @brief Padding function for non-transposed matrix B in HGEMM w.r.t. N and K
+ * direction
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_noTrans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16);
+/**
+ * @brief Padding function for transposed matrix B in HGEMM
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_Trans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16);
+/**
+ * @brief Padding function for transposed matrix B in HGEMM w.r.t. N direction
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_Trans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16);
+/**
+ * @brief Padding function for transposed matrix B in HGEMM w.r.t. K direction
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_Trans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16);
+
+/**
+ * @brief Padding function for transposed matrix B in HGEMM w.r.t. K and N
+ * direction
+ *
+ * @param B src matrix to pad
+ * @param Bp dst matrix after padding
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 Least multiple of 8 that is bigger than or equal to K
+ * @param N16 Least multiple of 16 that is bigger than or equal to N
+ */
+void hgemm_padding_B_Trans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16);
--- /dev/null
+hgemm_padding_headers = [
+ 'hgemm_padding.h',
+ 'hgemm_padding_a.h',
+ 'hgemm_padding_b.h',
+]
+
+
+hgemm_padding_sources = [
+ 'hgemm_padding_a.cpp',
+ 'hgemm_padding_b.cpp',
+]
+
+foreach s : hgemm_padding_sources
+ nntrainer_sources += meson.current_source_dir() / s
+endforeach
+
+foreach h : hgemm_padding_headers
+ nntrainer_headers += meson.current_source_dir() / h
+endforeach
+
+++ /dev/null
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
- *
- * @file hgemm_padding_a.cpp
- * @date 05 July 2024
- * @see https://github.com/nnstreamer/nntrainer
- * @author Sungsik Kong <ss.kong@samsung.com>
- * @bug No known bugs except for NYI items
- * @brief This is a source file for padding function used in hgemm
- *
- */
-
-#include <arm_neon.h>
-#include <hgemm_padding_a.h>
-#include <hgemm_util.h>
-#include <iostream>
-
-void hgemm_padding_A(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8, unsigned int K8,
- bool transA) {
- if (transA)
- hgemm_padding_A_Trans(A, Ap, M, K, M8, K8);
- else
- hgemm_padding_A_noTrans(A, Ap, M, K, M8, K8);
-}
-
-void hgemm_padding_A_noTrans(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8, unsigned int K8) {
- if (M != M8 && K != K8) {
- hgemm_padding_A_noTrans_wrt_MK(A, Ap, M, K, M8, K8);
- } else if (M != M8) {
- hgemm_padding_A_noTrans_wrt_M(A, Ap, M, K, M8, K8);
- } else if (K != K8) {
- hgemm_padding_A_noTrans_wrt_K(A, Ap, M, K, M8, K8);
- } else {
- std::cerr << "Error : No room for matrix A padding!\n";
- }
-}
-
-void hgemm_padding_A_Trans(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8, unsigned int K8) {
- if (M != M8 && K != K8) {
- hgemm_padding_A_Trans_wrt_MK(A, Ap, M, K, M8, K8);
- } else if (M != M8) {
- hgemm_padding_A_Trans_wrt_M(A, Ap, M, K, M8, K8);
- } else if (K != K8) {
- hgemm_padding_A_Trans_wrt_K(A, Ap, M, K, M8, K8);
- } else {
- std::cerr << "Error : No room for matrix A padding!\n";
- }
-}
-
-void hgemm_padding_A_noTrans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8) {
- float16x8_t ZEROS = vmovq_n_f16(0.F);
-
- for (unsigned int m = 0; m < M; ++m) {
- for (unsigned int k = 0; k < K; k += 8) {
- vst1q_f16(&Ap[m * K + k], vld1q_f16(&A[m * K + k]));
- }
- }
- for (unsigned int m = M; m < M8; ++m) {
- for (unsigned int k = 0; k < K; k += 8) {
- vst1q_f16(&Ap[m * K + k], ZEROS);
- }
- }
-}
-
-void hgemm_padding_A_noTrans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8) {
- const unsigned int K8_low = (K >> 3) << 3;
- float16x8_t ZEROS = vmovq_n_f16(0.F);
-
- for (unsigned int m = 0; m < M; ++m) {
- for (unsigned int k = 0; k < K8_low; k += 8) {
- vst1q_f16(&Ap[m * K8 + k], vld1q_f16(&A[m * K + k]));
- }
- for (unsigned int k = K8_low; k < K; ++k) {
- Ap[m * K8 + k] = A[m * K + k];
- }
- for (unsigned int k = K; k < K8; ++k) {
- Ap[m * K8 + k] = 0.F;
- }
- }
-}
-
-void hgemm_padding_A_noTrans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8) {
- std::cerr << "Error : hgemm_padding_A_noTrans_wrt_MK NYI!\n";
-}
-
-void hgemm_padding_A_Trans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8) {
- const unsigned int M8_low = (M >> 3) << 3;
-
- for (unsigned int k = 0; k < K; ++k) {
- for (unsigned int m = 0; m < M8_low; m += 8) {
- vst1q_f16(&Ap[k * M + m], vld1q_f16(&A[k * M + m]));
- }
- for (unsigned int m = M8_low; m < M; ++m) {
- Ap[k * M + m] = A[k * M + m];
- }
- for (unsigned int m = M; m < M8; ++m) {
- Ap[k * M + m] = 0.F;
- }
- }
-}
-
-void hgemm_padding_A_Trans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8) {
- std::cerr << "Error : hgemm_padding_A_Trans_wrt_K NYI!\n";
-}
-
-void hgemm_padding_A_Trans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8) {
- std::cerr << "Error : hgemm_padding_A_Trans_wrt_MK NYI!\n";
-}
+++ /dev/null
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
- *
- * @file hgemm_padding_a.h
- * @date 05 July 2024
- * @see https://github.com/nnstreamer/nntrainer
- * @author Sungsik Kong <ss.kong@samsung.com>
- * @bug No known bugs except for NYI items
- * @brief This is a header file for padding function used in hgemm
- *
- */
-
-/**
- * @brief Padding function for matrix A in HGEMM
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param transA Whether the matrix A is transposed or not
- */
-void hgemm_padding_A(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8, unsigned int K8,
- bool transA);
-
-/**
- * @brief Padding function for non-transposed matrix A in HGEMM
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_noTrans(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8, unsigned int K8);
-
-/**
- * @brief Padding function for non-transposed matrix A in HGEMM w.r.t. M
- * direction
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_noTrans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8);
-/**
- * @brief Padding function for non-transposed matrix A in HGEMM w.r.t. K
- * direction
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_noTrans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8);
-
-/**
- * @brief Padding function for non-transposed matrix A in HGEMM w.r.t. M and K
- * direction
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_noTrans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8);
-/**
- * @brief Padding function for transposed matrix A in HGEMM
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_Trans(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8, unsigned int K8);
-/**
- * @brief Padding function for transposed matrix A in HGEMM w.r.t. M direction
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_Trans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8);
-/**
- * @brief Padding function for transposed matrix A in HGEMM w.r.t. K direction
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_Trans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8);
-/**
- * @brief Padding function for transposed matrix A in HGEMM w.r.t. M and K
- * direction
- *
- * @param A src matrix to pad
- * @param Ap dst matrix after padding
- * @param M row length of matrix A
- * @param K col length of matrix A
- * @param M8 Least multiple of 8 that is bigger than or equal to M
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- */
-void hgemm_padding_A_Trans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
- unsigned int K, unsigned int M8,
- unsigned int K8);
+++ /dev/null
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
- *
- * @file hgemm_padding_b.cpp
- * @date 05 July 2024
- * @see https://github.com/nnstreamer/nntrainer
- * @author Sungsik Kong <ss.kong@samsung.com>
- * @bug No known bugs except for NYI items
- * @brief This is a source file for padding function used in hgemm
- *
- */
-
-#include <arm_neon.h>
-#include <hgemm_padding_b.h>
-#include <hgemm_util.h>
-#include <iostream>
-
-void hgemm_padding_B(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8, unsigned int N16,
- bool transB) {
- if (transB) {
- hgemm_padding_B_Trans(B, Bp, K, N, K8, N16);
- } else {
- hgemm_padding_B_noTrans(B, Bp, K, N, K8, N16);
- }
-}
-
-void hgemm_padding_B_noTrans(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- if (K != K8 && N != N16) {
- hgemm_padding_B_noTrans_wrt_KN(B, Bp, K, N, K8, N16);
- } else if (K != K8) {
- hgemm_padding_B_noTrans_wrt_K(B, Bp, K, N, K8, N16);
- } else if (N != N16) {
- hgemm_padding_B_noTrans_wrt_N(B, Bp, K, N, K8, N16);
- } else {
- std::cerr << "Error : No room for matrix B padding\n";
- }
-}
-
-void hgemm_padding_B_Trans(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8, unsigned int N16) {
- if (K != K8 && N != N16) {
- hgemm_padding_B_Trans_wrt_KN(B, Bp, K, N, K8, N16);
- } else if (K != K8) {
- hgemm_padding_B_Trans_wrt_K(B, Bp, K, N, K8, N16);
- } else if (N != N16) {
- hgemm_padding_B_Trans_wrt_N(B, Bp, K, N, K8, N16);
- } else {
- std::cerr << "Error : No room for matrix B padding\n";
- }
-}
-
-void hgemm_padding_B_noTrans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- std::cerr << "Error : hgemm_padding_B_noTrans_wrt_N NYI!\n";
-}
-
-void hgemm_padding_B_noTrans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- float16x8_t ZEROS = vmovq_n_f16(0.F);
-
- for (unsigned int k = 0; k < K; ++k) {
- for (unsigned int n = 0; n < N; n += 8) {
- vst1q_f16(&Bp[k * N + n], vld1q_f16(&B[k * N + n]));
- }
- }
- for (unsigned int k = K; k < K8; ++k) {
- for (unsigned int n = 0; n < N; n += 8) {
- vst1q_f16(&Bp[k * N + n], ZEROS);
- }
- }
-}
-
-void hgemm_padding_B_noTrans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- std::cerr << "Error : hgemm_padding_B_noTrans_wrt_KN NYI!\n";
-}
-
-void hgemm_padding_B_Trans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- std::cerr << "Error : hgemm_padding_B_Trans_wrt_N NYI!\n";
-}
-
-void hgemm_padding_B_Trans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- const unsigned int K8_low = (K >> 3) << 3;
- float16x8_t ZEROS = vmovq_n_f16(0.F);
-
- for (unsigned int n = 0; n < N; ++n) {
- for (unsigned int k = 0; k < K8_low; k += 8) {
- vst1q_f16(&Bp[n * K8 + k], vld1q_f16(&B[n * K + k]));
- }
- for (unsigned int k = K8_low; k < K; ++k) {
- Bp[n * K8 + k] = B[n * K + k];
- }
- for (unsigned int k = K; k < K8; ++k) {
- Bp[n * K8 + k] = 0.F;
- }
- }
-}
-
-void hgemm_padding_B_Trans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16) {
- std::cerr << "Error : hgemm_padding_B_Trans_wrt_KN NYI!\n";
-}
+++ /dev/null
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
- *
- * @file hgemm_padding_b.h
- * @date 05 July 2024
- * @see https://github.com/nnstreamer/nntrainer
- * @author Sungsik Kong <ss.kong@samsung.com>
- * @bug No known bugs except for NYI items
- * @brief This is a header file for padding function used in hgemm
- *
- */
-
-/**
- * @brief Padding function for matrix B in HGEMM
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- * @param transB Whether the matrix B is transposed or not
- */
-void hgemm_padding_B(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8, unsigned int N16,
- bool transB);
-
-/**
- * @brief Padding function for non-transposed matrix B in HGEMM
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_noTrans(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8, unsigned int N16);
-/**
- * @brief Padding function for non-transposed matrix B in HGEMM w.r.t. N
- * direction
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_noTrans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16);
-/**
- * @brief Padding function for non-transposed matrix B in HGEMM w.r.t. K
- * direction
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_noTrans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16);
-/**
- * @brief Padding function for non-transposed matrix B in HGEMM w.r.t. N and K
- * direction
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_noTrans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16);
-/**
- * @brief Padding function for transposed matrix B in HGEMM
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_Trans(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8, unsigned int N16);
-/**
- * @brief Padding function for transposed matrix B in HGEMM w.r.t. N direction
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_Trans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16);
-/**
- * @brief Padding function for transposed matrix B in HGEMM w.r.t. K direction
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_Trans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16);
-
-/**
- * @brief Padding function for transposed matrix B in HGEMM w.r.t. K and N
- * direction
- *
- * @param B src matrix to pad
- * @param Bp dst matrix after padding
- * @param K row length of matrix B
- * @param N col length of matrix B
- * @param K8 Least multiple of 8 that is bigger than or equal to K
- * @param N16 Least multiple of 16 that is bigger than or equal to N
- */
-void hgemm_padding_B_Trans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
- unsigned int N, unsigned int K8,
- unsigned int N16);
nntrainer_inc += include_directories('hgemm_kernel')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm_kernel'
+subdir('hgemm_padding')
+nntrainer_inc += include_directories('hgemm_padding')
+nntrainer_inc_abs += meson.current_source_dir() / 'hgemm_padding'
+
hgemm_sources = [
'hgemm.cpp',
'hgemm_padding_a.cpp',