- Since the current kernel / blocking functions support fixed shapes only, implement padding functions as a temporary solution (a caller-side usage sketch is given below).
- Note that a flexible kernel / blocking implementation should be added later for optimal performance.
- The current implementation uses separate padding functions for matrix A and matrix B, but they will eventually be unified into a single function.
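
The sketch below is illustrative only and not part of this patch. It assumes M8 / K8 / N16 are the original dimensions rounded up to multiples of 8 (16 for N); the helper `round_up` and wrapper `run_padded_hgemm` are hypothetical names. Only the paths where a single dimension needs padding are implemented in this patch; the combined MK / KN cases are still NYI.

```cpp
#include <hgemm_padding.h>
#include <vector>

// Hypothetical helper: round x up to the next multiple of `mult`.
static unsigned int round_up(unsigned int x, unsigned int mult) {
  return ((x + mult - 1) / mult) * mult; // e.g. round_up(10, 8) == 16
}

// Hypothetical wrapper: pad A (M x K) and B (K x N) to the fixed shapes the
// kernels expect; the padded buffers would then be fed to the hgemm kernel.
void run_padded_hgemm(const __fp16 *A, const __fp16 *B, unsigned int M,
                      unsigned int K, unsigned int N, bool transA,
                      bool transB) {
  const unsigned int M8 = round_up(M, 8);
  const unsigned int K8 = round_up(K, 8);
  const unsigned int N16 = round_up(N, 16);

  // The padding entry points expect at least one dimension to need padding;
  // fully aligned inputs would be used as-is.
  const bool padA = (M != M8) || (K != K8);
  const bool padB = (K != K8) || (N != N16);

  std::vector<__fp16> Ap, Bp;
  if (padA) {
    Ap.resize(M8 * K8);
    hgemm_padding_A(A, Ap.data(), M, K, M8, K8, transA);
  }
  if (padB) {
    Bp.resize(K8 * N16);
    hgemm_padding_B(B, Bp.data(), K, N, K8, N16, transB);
  }

  const __fp16 *A_eff = padA ? Ap.data() : A;
  const __fp16 *B_eff = padB ? Bp.data() : B;
  // ... run the fixed-shape hgemm kernel on A_eff / B_eff (M8 x K8 x N16) and
  //     crop the M8 x N16 result back to M x N ...
  (void)A_eff;
  (void)B_eff;
}
```
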
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: skykongkong8 <ss.kong@samsung.com>
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding.h
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a header file that includes the padding functions for both matrix A and matrix B
+ *
+ */
+
+#include <hgemm_padding_a.h>
+#include <hgemm_padding_b.h>
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_a.cpp
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a source file for the matrix A padding functions used in hgemm
+ *
+ */
+
+#include <arm_neon.h>
+#include <hgemm_padding_a.h>
+#include <hgemm_util.h>
+#include <iostream>
+
+void hgemm_padding_A(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8,
+ bool transA) {
+ if (transA)
+ hgemm_padding_A_Trans(A, Ap, M, K, M8, K8);
+ else
+ hgemm_padding_A_noTrans(A, Ap, M, K, M8, K8);
+}
+
+void hgemm_padding_A_noTrans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8) {
+ if (M != M8 && K != K8) {
+ hgemm_padding_A_noTrans_wrt_MK(A, Ap, M, K, M8, K8);
+ } else if (M != M8) {
+ hgemm_padding_A_noTrans_wrt_M(A, Ap, M, K, M8, K8);
+ } else if (K != K8) {
+ hgemm_padding_A_noTrans_wrt_K(A, Ap, M, K, M8, K8);
+ } else {
+ std::cerr << "Error : No room for matrix A padding!\n";
+ }
+}
+
+void hgemm_padding_A_Trans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8) {
+ if (M != M8 && K != K8) {
+ hgemm_padding_A_Trans_wrt_MK(A, Ap, M, K, M8, K8);
+ } else if (M != M8) {
+ hgemm_padding_A_Trans_wrt_M(A, Ap, M, K, M8, K8);
+ } else if (K != K8) {
+ hgemm_padding_A_Trans_wrt_K(A, Ap, M, K, M8, K8);
+ } else {
+ std::cerr << "Error : No room for matrix A padding!\n";
+ }
+}
+
+void hgemm_padding_A_noTrans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
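+  // Only M needs padding on this path (K == K8), so full rows are copied in
+  // 8-wide NEON chunks and the row stride stays K.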
+ for (unsigned int m = 0; m < M; ++m) {
+ for (unsigned int k = 0; k < K; k += 8) {
+ vst1q_f16(&Ap[m * K + k], vld1q_f16(&A[m * K + k]));
+ }
+ }
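+  // Zero-fill the extra rows M .. M8 - 1.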
+ for (unsigned int m = M; m < M8; ++m) {
+ for (unsigned int k = 0; k < K; k += 8) {
+ vst1q_f16(&Ap[m * K + k], ZEROS);
+ }
+ }
+}
+
+void hgemm_padding_A_noTrans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ const unsigned int K8_low = (K >> 3) << 3;
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
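+  // For each of the M rows: copy full 8-wide chunks with NEON, copy the
+  // scalar remainder, then zero-fill columns K .. K8 - 1 (padded row stride K8).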
+ for (unsigned int m = 0; m < M; ++m) {
+ for (unsigned int k = 0; k < K8_low; k += 8) {
+ vst1q_f16(&Ap[m * K8 + k], vld1q_f16(&A[m * K + k]));
+ }
+ for (unsigned int k = K8_low; k < K; ++k) {
+ Ap[m * K8 + k] = A[m * K + k];
+ }
+ for (unsigned int k = K; k < K8; ++k) {
+ Ap[m * K8 + k] = 0.F;
+ }
+ }
+}
+
+void hgemm_padding_A_noTrans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ std::cerr << "Error : hgemm_padding_A_noTrans_wrt_MK NYI!\n";
+}
+
+void hgemm_padding_A_Trans_wrt_M(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ const unsigned int M8_low = (M >> 3) << 3;
+
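+  // A is transposed (K x M): copy each of the K rows (NEON for full chunks,
+  // scalar for the remainder) and zero-fill columns M .. M8 - 1.
+  // The padded row stride is M8.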
+  for (unsigned int k = 0; k < K; ++k) {
+    for (unsigned int m = 0; m < M8_low; m += 8) {
+      vst1q_f16(&Ap[k * M8 + m], vld1q_f16(&A[k * M + m]));
+    }
+    for (unsigned int m = M8_low; m < M; ++m) {
+      Ap[k * M8 + m] = A[k * M + m];
+    }
+    for (unsigned int m = M; m < M8; ++m) {
+      Ap[k * M8 + m] = 0.F;
+    }
+  }
+}
+
+void hgemm_padding_A_Trans_wrt_K(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ std::cerr << "Error : hgemm_padding_A_Trans_wrt_K NYI!\n";
+}
+
+void hgemm_padding_A_Trans_wrt_MK(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8,
+ unsigned int K8) {
+ std::cerr << "Error : hgemm_padding_A_Trans_wrt_MK NYI!\n";
+}
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_a.h
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a header file for the matrix A padding functions used in hgemm
+ *
+ */
+
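+/**
+ * @brief Padding function for matrix A in hgemm
+ *
+ * @param A input matrix to pad
+ * @param Ap padded output matrix
+ * @param M row length of matrix A
+ * @param K col length of matrix A
+ * @param M8 M padded up to a multiple of 8
+ * @param K8 K padded up to a multiple of 8
+ * @param transA whether matrix A is transposed
+ */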
+void hgemm_padding_A(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8,
+ bool transA);
+void hgemm_padding_A_noTrans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8);
+void hgemm_padding_A_noTrans_wrt_M(const __fp16 *A, __fp16 *Ap,
+ unsigned int M, unsigned int K,
+ unsigned int M8, unsigned int K8);
+void hgemm_padding_A_noTrans_wrt_K(const __fp16 *A, __fp16 *Ap,
+ unsigned int M, unsigned int K,
+ unsigned int M8, unsigned int K8);
+void hgemm_padding_A_noTrans_wrt_MK(const __fp16 *A, __fp16 *Ap,
+ unsigned int M, unsigned int K,
+ unsigned int M8, unsigned int K8);
+void hgemm_padding_A_Trans(const __fp16 *A, __fp16 *Ap, unsigned int M,
+ unsigned int K, unsigned int M8, unsigned int K8);
+void hgemm_padding_A_Trans_wrt_M(const __fp16 *A, __fp16 *Ap,
+ unsigned int M, unsigned int K,
+ unsigned int M8, unsigned int K8);
+void hgemm_padding_A_Trans_wrt_K(const __fp16 *A, __fp16 *Ap,
+ unsigned int M, unsigned int K,
+ unsigned int M8, unsigned int K8);
+void hgemm_padding_A_Trans_wrt_MK(const __fp16 *A, __fp16 *Ap,
+ unsigned int M, unsigned int K,
+ unsigned int M8, unsigned int K8);
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_b.cpp
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a source file for the matrix B padding functions used in hgemm
+ *
+ */
+
+#include <arm_neon.h>
+#include <hgemm_padding_b.h>
+#include <hgemm_util.h>
+#include <iostream>
+
+void hgemm_padding_B(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16,
+ bool transB) {
+ if (transB) {
+ hgemm_padding_B_Trans(B, Bp, K, N, K8, N16);
+ } else {
+ hgemm_padding_B_noTrans(B, Bp, K, N, K8, N16);
+ }
+}
+
+void hgemm_padding_B_noTrans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ if (K != K8 && N != N16) {
+ hgemm_padding_B_noTrans_wrt_KN(B, Bp, K, N, K8, N16);
+ } else if (K != K8) {
+ hgemm_padding_B_noTrans_wrt_K(B, Bp, K, N, K8, N16);
+ } else if (N != N16) {
+ hgemm_padding_B_noTrans_wrt_N(B, Bp, K, N, K8, N16);
+ } else {
+ std::cerr << "Error : No room for matrix B padding\n";
+ }
+}
+
+void hgemm_padding_B_Trans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16) {
+ if (K != K8 && N != N16) {
+ hgemm_padding_B_Trans_wrt_KN(B, Bp, K, N, K8, N16);
+ } else if (K != K8) {
+ hgemm_padding_B_Trans_wrt_K(B, Bp, K, N, K8, N16);
+ } else if (N != N16) {
+ hgemm_padding_B_Trans_wrt_N(B, Bp, K, N, K8, N16);
+ } else {
+ std::cerr << "Error : No room for matrix B padding\n";
+ }
+}
+
+void hgemm_padding_B_noTrans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "NYI : hgemm_padding_B_noTrans_wrt_N\n";
+}
+
+void hgemm_padding_B_noTrans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
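+  // Only K needs padding on this path (N == N16), so full rows are copied in
+  // 8-wide NEON chunks and the row stride stays N.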
+ for (unsigned int k = 0; k < K; ++k) {
+ for (unsigned int n = 0; n < N; n += 8) {
+ vst1q_f16(&Bp[k * N + n], vld1q_f16(&B[k * N + n]));
+ }
+ }
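+  // Zero-fill the extra rows K .. K8 - 1.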
+ for (unsigned int k = K; k < K8; ++k) {
+ for (unsigned int n = 0; n < N; n += 8) {
+ vst1q_f16(&Bp[k * N + n], ZEROS);
+ }
+ }
+}
+
+void hgemm_padding_B_noTrans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "NYI : hgemm_padding_B_noTrans_wrt_KN\n";
+}
+
+void hgemm_padding_B_Trans_wrt_N(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "NYI : hgemm_padding_B_Trans_wrt_N\n";
+}
+
+void hgemm_padding_B_Trans_wrt_K(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ const unsigned int K8_low = (K >> 3) << 3;
+ float16x8_t ZEROS = vmovq_n_f16(0.F);
+
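+  // B is transposed (N x K): for each of the N rows, copy full 8-wide chunks
+  // with NEON, copy the scalar remainder, then zero-fill columns K .. K8 - 1.
+  // The padded row stride is K8.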
+ for (unsigned int n = 0; n < N; ++n) {
+ for (unsigned int k = 0; k < K8_low; k += 8) {
+ vst1q_f16(&Bp[n * K8 + k], vld1q_f16(&B[n * K + k]));
+ }
+ for (unsigned int k = K8_low; k < K; ++k) {
+ Bp[n * K8 + k] = B[n * K + k];
+ }
+ for (unsigned int k = K; k < K8; ++k) {
+ Bp[n * K8 + k] = 0.F;
+ }
+ }
+}
+
+void hgemm_padding_B_Trans_wrt_KN(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8,
+ unsigned int N16) {
+ std::cerr << "NYI : hgemm_padding_B_Trans_wrt_KN\n";
+}
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
+ *
+ * @file hgemm_padding_b.h
+ * @date 05 July 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong <ss.kong@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is a header file for the matrix B padding functions used in hgemm
+ *
+ */
+
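+/**
+ * @brief Padding function for matrix B in hgemm
+ *
+ * @param B input matrix to pad
+ * @param Bp padded output matrix
+ * @param K row length of matrix B
+ * @param N col length of matrix B
+ * @param K8 K padded up to a multiple of 8
+ * @param N16 N padded up to a multiple of 16
+ * @param transB whether matrix B is transposed
+ */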
+void hgemm_padding_B(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16,
+ bool transB);
+
+void hgemm_padding_B_noTrans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_noTrans_wrt_N(const __fp16 *B, __fp16 *Bp,
+ unsigned int K, unsigned int N,
+ unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_noTrans_wrt_K(const __fp16 *B, __fp16 *Bp,
+ unsigned int K, unsigned int N,
+ unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_noTrans_wrt_KN(const __fp16 *B, __fp16 *Bp,
+ unsigned int K, unsigned int N,
+ unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_Trans(const __fp16 *B, __fp16 *Bp, unsigned int K,
+ unsigned int N, unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_Trans_wrt_N(const __fp16 *B, __fp16 *Bp,
+ unsigned int K, unsigned int N,
+ unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_Trans_wrt_K(const __fp16 *B, __fp16 *Bp,
+ unsigned int K, unsigned int N,
+ unsigned int K8, unsigned int N16);
+
+void hgemm_padding_B_Trans_wrt_KN(const __fp16 *B, __fp16 *Bp,
+ unsigned int K, unsigned int N,
+ unsigned int K8, unsigned int N16);
]
hgemm_sources = [
- 'hgemm.cpp'
+ 'hgemm.cpp',
+ 'hgemm_padding_a.cpp',
+ 'hgemm_padding_b.cpp',
]
foreach s : hgemm_sources