[ hgemm ] Support scaling factor beta in kernel-based hgemm

author skykongkong8 <ss.kong@samsung.com>

Tue, 14 May 2024 07:29:53 +0000 (16:29 +0900)

committer MyungJoo Ham <myungjoo.ham@samsung.com>

Mon, 10 Jun 2024 09:43:30 +0000 (18:43 +0900)
author skykongkong8 <ss.kong@samsung.com>
Tue, 14 May 2024 07:29:53 +0000 (16:29 +0900)
committer MyungJoo Ham <myungjoo.ham@samsung.com>
Mon, 10 Jun 2024 09:43:30 +0000 (18:43 +0900)
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp

index 32c8eb9349f9b4a54a70df80f4b7d90872129369..c8249fb4b8b9103f28038819294127da31bce179 100644 (file)
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -1595,22 +1595,33 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N,
    // performing beta*C
    unsigned int idx = 0;
    unsigned int size = M * N;
-  for (; idx < (size - idx) && (size - idx) >= 8; idx += 8) {
-    float16x8_t c = vmulq_n_f16(vld1q_f16(&C[idx]), static_cast<__fp16>(beta));
+  if (beta != 0.F) {
+    for (; idx < (size - idx) && (size - idx) >= 8; idx += 8) {
+      float16x8_t c =
+        vmulq_n_f16(vld1q_f16(&C[idx]), static_cast<__fp16>(beta));
  
-    vst1q_f32(&C32[idx], vcvt_f32_f16(vget_low_f16(c)));
-    vst1q_f32(&C32[idx + 4], vcvt_f32_f16(vget_high_f16(c)));
-  }
-  // remaining 4
-  for (; idx < (size - idx) && (size - idx) >= 4; idx += 4) {
-    float16x4_t c = vmul_n_f16(vld1_f16(&C[idx]), static_cast<__fp16>(beta));
+      vst1q_f32(&C32[idx], vcvt_f32_f16(vget_low_f16(c)));
+      vst1q_f32(&C32[idx + 4], vcvt_f32_f16(vget_high_f16(c)));
+    }
+    // remaining 4
+    for (; idx < (size - idx) && (size - idx) >= 4; idx += 4) {
+      float16x4_t c = vmul_n_f16(vld1_f16(&C[idx]), static_cast<__fp16>(beta));
  
-    vst1q_f32(&C32[idx], vcvt_f32_f16(c));
-  }
+      vst1q_f32(&C32[idx], vcvt_f32_f16(c));
+    }
  
-  // remaining values if dimensions not a multiple of 8
-  for (; idx < size; idx++) {
-    C32[idx] = C[idx] * beta;
+    // remaining values if dimensions not a multiple of 8
+    for (; idx < size; idx++) {
+      C32[idx] = C[idx] * beta;
+    }
+  } else {
+    float32x4_t zeros = vmovq_n_f32(0.F);
+    for (; idx < (size - idx) && (size - idx) >= 4; idx += 4) {
+      vst1q_f32(&C32[idx], zeros);
+    }
+    for (; idx < size; idx++) {
+      C32[idx] = 0.F;
+    }
    }
  
    if (!TransA && TransB) {
diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp

index be61cd5c91a46f03e1bee995761e221064ee1b2e..dd1c173a99e45e1fed4f5a175e50ae4ed3216137 100644 (file)
--- a/nntrainer/tensor/hgemm/hgemm.cpp
+++ b/nntrainer/tensor/hgemm/hgemm.cpp
@@ -31,7 +31,7 @@
  
  void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M,
                     unsigned int N, unsigned int K, float alpha, float beta) {
-  if (alpha == 1.F && beta == 0.F && N > 4) {
+  if (alpha == 1.F) {
      // used bitwise operator instead of modulo for performance
      // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M
      if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) {
@@ -53,7 +53,7 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M,
  
  void hgemm_noTrans(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
                     unsigned int N, unsigned int K, float alpha, float beta) {
-  if (alpha == 1.F && beta == 0.F) {
+  if (alpha == 1.F) {
      // used bitwise operator instead of modulo for performance
      // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M
      if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) {
author	skykongkong8 <ss.kong@samsung.com>
	Tue, 14 May 2024 07:29:53 +0000 (16:29 +0900)
committer	MyungJoo Ham <myungjoo.ham@samsung.com>
	Mon, 10 Jun 2024 09:43:30 +0000 (18:43 +0900)
nntrainer/tensor/blas_neon.cpp		patch | blob | history
nntrainer/tensor/hgemm/hgemm.cpp		patch | blob | history