[ neon/trivial ] Compare float scaling factors more precisely

author skykongkong8 <ss.kong@samsung.com>

Mon, 3 Jun 2024 10:55:31 +0000 (19:55 +0900)

committer MyungJoo Ham <myungjoo.ham@samsung.com>

Mon, 10 Jun 2024 09:43:30 +0000 (18:43 +0900)
author skykongkong8 <ss.kong@samsung.com>
Mon, 3 Jun 2024 10:55:31 +0000 (19:55 +0900)
committer MyungJoo Ham <myungjoo.ham@samsung.com>
Mon, 10 Jun 2024 09:43:30 +0000 (18:43 +0900)
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp

index fb9b14386f4ebe891b24c143470fad4111ca302a..4a200311954f6258527b86724d2a38f110ceb923 100644 (file)
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -1597,7 +1597,7 @@ void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N,
    unsigned int size = M * N;
    unsigned int size8 = (size >> 3) << 3;
    unsigned int size4 = (size >> 2) << 2;
-  if (beta != 0.F) {
+  if (std::fpclassify(beta) != FP_ZERO) {
      for (; idx < size8; idx += 8) {
        float16x8_t c =
          vmulq_n_f16(vld1q_f16(&C[idx]), static_cast<__fp16>(beta));
diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp

index dd1c173a99e45e1fed4f5a175e50ae4ed3216137..f7788335e91b64718f6ce6a960616208a3eda32b 100644 (file)
--- a/nntrainer/tensor/hgemm/hgemm.cpp
+++ b/nntrainer/tensor/hgemm/hgemm.cpp
@@ -12,6 +12,7 @@
   *
   */
  
+#include <cmath>
  #include <hgemm.h>
  #include <hgemm_kernel_1x4.h>
  #include <hgemm_kernel_1x8.h>
@@ -21,6 +22,7 @@
  #include <hgemm_kernel_8x8.h>
  #include <hgemm_kernel_pack.h>
  #include <hgemm_util.h>
+#include <limits>
  
  #define HGEMM_KERNEL_1x4 hgemm_kernel_1x4
  #define HGEMM_KERNEL_4x4 hgemm_kernel_4x4
@@ -31,7 +33,8 @@
  
  void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M,
                     unsigned int N, unsigned int K, float alpha, float beta) {
-  if (alpha == 1.F) {
+  const float eps = std::numeric_limits<float>::epsilon();
+  if (std::abs(alpha - 1.F) < eps) {
      // used bitwise operator instead of modulo for performance
      // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M
      if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) {
author	skykongkong8 <ss.kong@samsung.com>
	Mon, 3 Jun 2024 10:55:31 +0000 (19:55 +0900)
committer	MyungJoo Ham <myungjoo.ham@samsung.com>
	Mon, 10 Jun 2024 09:43:30 +0000 (18:43 +0900)
nntrainer/tensor/blas_neon.cpp		patch | blob | history
nntrainer/tensor/hgemm/hgemm.cpp		patch | blob | history