[blas/neon] Optimization on SGEMV fp16 implementation

author Debadri Samaddar <s.debadri@samsung.com>

Tue, 29 Aug 2023 08:37:19 +0000 (14:07 +0530)

committer Jijoong Moon <jijoong.moon@samsung.com>

Tue, 29 Aug 2023 12:12:32 +0000 (21:12 +0900)
author Debadri Samaddar <s.debadri@samsung.com>
Tue, 29 Aug 2023 08:37:19 +0000 (14:07 +0530)
committer Jijoong Moon <jijoong.moon@samsung.com>
Tue, 29 Aug 2023 12:12:32 +0000 (21:12 +0900)
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp

index a0b51089f921e48deafde404e443c4f05084120e..3d9cf2ee50632f5f7de3f53b2628ec1f2d33e01d 100644 (file)
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -277,8 +277,12 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
                       uint32_t cols, float alpha, float beta) {
    const __fp16 *__restrict x;
  
-  for (unsigned int i = 0; i < rows; ++i) {
-    Y[i] = Y[i] * beta;
+  float16x8_t v_beta = vmovq_n_f16(beta);
+
+  for (unsigned int i = 0; i < rows; i += 8) {
+    float16x8_t y = vld1q_f16(&Y[i]);
+    y = vmulq_f16(v_beta, y);
+    vst1q_f16(&Y[i], y);
    }
  
    float16x8_t v_alpha = vmovq_n_f16(alpha);
@@ -302,12 +306,14 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
        const __fp16 *__restrict w;
  
        float16x8_t y0;
+      __fp16 r[4];
  
+      float16x4_t y0_high;
+      float16x4_t y0_low;
        for (unsigned int j = 0; j < rows; ++j) {
          w = &A[j * cols + i];
          y0 = vmovq_n_f16(0);
  
-        __fp16 r[8];
          wvec0_7 = vld1q_f16(&w[0]);
          wvec8_15 = vld1q_f16(&w[8]);
          wvec16_23 = vld1q_f16(&w[16]);
@@ -318,10 +324,13 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
          y0 = vfmaq_f16(y0, wvec16_23, x16_23);
          y0 = vfmaq_f16(y0, wvec24_31, x24_31);
  
-        vst1q_f16(r, y0);
-        for (unsigned int k = 0; k < 8; ++k) {
-          Y[j] += r[k];
-        }
+        y0_high = vget_high_f16(y0);
+        y0_low = vget_low_f16(y0);
+
+        y0_low = vadd_f16(y0_high, y0_low);
+        vst1_f16(r, y0_low);
+
+        Y[j] += r[0] + r[1] + r[2] + r[3];
        }
      }
  
@@ -341,22 +350,27 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
        const __fp16 *__restrict w;
  
        float16x8_t y0;
+      __fp16 r[4];
  
+      float16x4_t y0_high;
+      float16x4_t y0_low;
        for (unsigned int j = 0; j < rows; ++j) {
          w = &A[j * cols + i];
          y0 = vmovq_n_f16(0);
  
-        __fp16 r[8];
          wvec0_7 = vld1q_f16(&w[0]);
          wvec8_15 = vld1q_f16(&w[8]);
  
          y0 = vfmaq_f16(y0, wvec0_7, x0_7);
          y0 = vfmaq_f16(y0, wvec8_15, x8_15);
  
-        vst1q_f16(r, y0);
-        for (unsigned int k = 0; k < 8; ++k) {
-          Y[j] += r[k];
-        }
+        y0_high = vget_high_f16(y0);
+        y0_low = vget_low_f16(y0);
+
+        y0_low = vadd_f16(y0_high, y0_low);
+        vst1_f16(r, y0_low);
+
+        Y[j] += r[0] + r[1] + r[2] + r[3];
        }
      }
    } else if (cols % 8 == 0) {
@@ -369,23 +383,24 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
  
        float16x8_t wvec0_7;
  
-      const __fp16 *__restrict w;
-
        float16x8_t y0;
+      __fp16 r[4];
  
+      float16x4_t y0_high;
+      float16x4_t y0_low;
        for (unsigned int j = 0; j < rows; ++j) {
-        w = &A[j * cols + i];
-        y0 = vmovq_n_f16(0);
  
-        __fp16 r[8];
-        wvec0_7 = vld1q_f16(&w[0]);
+        wvec0_7 = vld1q_f16(&A[j * cols + i]);
  
-        y0 = vfmaq_f16(y0, wvec0_7, x0_7);
+        y0 = vmulq_f16(wvec0_7, x0_7);
  
-        vst1q_f16(r, y0);
-        for (unsigned int k = 0; k < 8; ++k) {
-          Y[j] += r[k];
-        }
+        y0_high = vget_high_f16(y0);
+        y0_low = vget_low_f16(y0);
+
+        y0_low = vadd_f16(y0_high, y0_low);
+        vst1_f16(r, y0_low);
+
+        Y[j] += r[0] + r[1] + r[2] + r[3];
        }
      }
    }
@@ -394,23 +409,20 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
  void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
                                 uint32_t rows, uint32_t cols, float alpha,
                                 float beta) {
-  const __fp16 *__restrict x;
  
    const float16x8_t v_beta = vmovq_n_f16(beta);
    const float16x8_t v_alpha = vmovq_n_f16(alpha);
  
    if (cols % 32 == 0) {
-    unsigned int n = cols / 32;
-    bool *initialized = (bool *)malloc(sizeof(bool) * n);
  
-    unsigned int step;
-    for (unsigned int i = 0; i < cols / 32; ++i) {
-      initialized[i] = false;
+    for (unsigned int j = 0; j < cols; j += 8) {
+      float16x8_t y0_7 = vld1q_f16(&Y[j]);
+      y0_7 = vmulq_f16(y0_7, v_beta);
+      vst1q_f16(&Y[j], y0_7);
      }
  
      for (unsigned int i = 0; i < rows; ++i) {
-      float16x8_t x = vld1q_dup_f16(&X[i]);
-      x = vmulq_f16(x, v_alpha);
+      __fp16 x = alpha * X[i];
  
        for (unsigned int j = 0; j < cols; j += 32) {
          __fp16 *__restrict y = &Y[j];
@@ -420,15 +432,6 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
          float16x8_t y16_23 = vld1q_f16(&y[16]);
          float16x8_t y24_31 = vld1q_f16(&y[24]);
  
-        step = j / 32;
-        if (!initialized[step]) {
-          y0_7 = vmulq_f16(y0_7, v_beta);
-          y8_15 = vmulq_f16(y8_15, v_beta);
-          y16_23 = vmulq_f16(y16_23, v_beta);
-          y24_31 = vmulq_f16(y24_31, v_beta);
-          initialized[step] = true;
-        }
-
          float16x8_t wvec0_7, wvec8_15, wvec16_23, wvec24_31;
          const __fp16 *__restrict w;
  
@@ -439,10 +442,10 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
          wvec16_23 = vld1q_f16(&w[16]);
          wvec24_31 = vld1q_f16(&w[24]);
  
-        y0_7 = vfmaq_f16(y0_7, wvec0_7, x);
-        y8_15 = vfmaq_f16(y8_15, wvec8_15, x);
-        y16_23 = vfmaq_f16(y16_23, wvec16_23, x);
-        y24_31 = vfmaq_f16(y24_31, wvec24_31, x);
+        y0_7 = vfmaq_n_f16(y0_7, wvec0_7, x);
+        y8_15 = vfmaq_n_f16(y8_15, wvec8_15, x);
+        y16_23 = vfmaq_n_f16(y16_23, wvec16_23, x);
+        y24_31 = vfmaq_n_f16(y24_31, wvec24_31, x);
  
          vst1q_f16(&y[0], y0_7);
          vst1q_f16(&y[8], y8_15);
@@ -450,20 +453,17 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
          vst1q_f16(&y[24], y24_31);
        }
      }
-    free(initialized);
      return;
    } else if (cols % 16 == 0) {
-    unsigned int n = cols / 16;
-    bool *initialized = (bool *)malloc(sizeof(bool) * n);
  
-    unsigned int step;
-    for (unsigned int i = 0; i < cols / 16; ++i) {
-      initialized[i] = false;
+    for (unsigned int j = 0; j < cols; j += 8) {
+      float16x8_t y0_7 = vld1q_f16(&Y[j]);
+      y0_7 = vmulq_f16(y0_7, v_beta);
+      vst1q_f16(&Y[j], y0_7);
      }
  
      for (unsigned int i = 0; i < rows; ++i) {
-      float16x8_t x = vld1q_dup_f16(&X[i]);
-      x = vmulq_f16(x, v_alpha);
+      __fp16 x = alpha * X[i];
  
        for (unsigned int j = 0; j < cols; j += 16) {
          __fp16 *__restrict y = &Y[j];
@@ -471,13 +471,6 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
          float16x8_t y0_7 = vld1q_f16(&y[0]);
          float16x8_t y8_15 = vld1q_f16(&y[8]);
  
-        step = j / 16;
-        if (!initialized[step]) {
-          y0_7 = vmulq_f16(y0_7, v_beta);
-          y8_15 = vmulq_f16(y8_15, v_beta);
-          initialized[step] = true;
-        }
-
          float16x8_t wvec0_7, wvec8_15;
          const __fp16 *__restrict w;
  
@@ -486,53 +479,36 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
          wvec0_7 = vld1q_f16(&w[0]);
          wvec8_15 = vld1q_f16(&w[8]);
  
-        y0_7 = vfmaq_f16(y0_7, wvec0_7, x);
-        y8_15 = vfmaq_f16(y8_15, wvec8_15, x);
+        y0_7 = vfmaq_n_f16(y0_7, wvec0_7, x);
+        y8_15 = vfmaq_n_f16(y8_15, wvec8_15, x);
  
          vst1q_f16(&y[0], y0_7);
          vst1q_f16(&y[8], y8_15);
        }
      }
-    free(initialized);
      return;
    } else if (cols % 8 == 0) {
-    unsigned int n = cols / 8;
-    bool *initialized = (bool *)malloc(sizeof(bool) * n);
  
-    unsigned int step;
-    for (unsigned int i = 0; i < cols / 8; ++i) {
-      initialized[i] = false;
+    for (unsigned int j = 0; j < cols; j += 8) {
+      float16x8_t y0_7 = vld1q_f16(&Y[j]);
+      y0_7 = vmulq_f16(y0_7, v_beta);
+      vst1q_f16(&Y[j], y0_7);
      }
  
-    __fp16 temp[8];
      for (unsigned int i = 0; i < rows; ++i) {
-      float16x8_t x = vld1q_dup_f16(&X[i]);
-      x = vmulq_f16(x, v_alpha);
  
-      for (unsigned int j = 0; j < cols; j += 8) {
-        __fp16 *__restrict y = &Y[j];
+      __fp16 x = alpha * X[i];
  
-        float16x8_t y0_7 = vld1q_f16(&y[0]);
-
-        step = j / 8;
-        if (!initialized[step]) {
-          y0_7 = vmulq_f16(y0_7, v_beta);
-          initialized[step] = true;
-        }
-
-        float16x8_t wvec0_7;
-        const __fp16 *__restrict w;
+      for (unsigned int j = 0; j < cols; j += 8) {
  
-        w = &A[i * cols + j];
+        float16x8_t y0_7 = vld1q_f16(&Y[j]);
+        float16x8_t wvec0_7 = vld1q_f16(&A[i * cols + j]);
  
-        wvec0_7 = vld1q_f16(&w[0]);
+        y0_7 = vfmaq_n_f16(y0_7, wvec0_7, x);
  
-        y0_7 = vfmaq_f16(y0_7, wvec0_7, x);
-
-        vst1q_f16(&y[0], y0_7);
+        vst1q_f16(&Y[j], y0_7);
        }
      }
-    free(initialized);
      return;
    }
  }
diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp

index 8e253ab880d3b606216b0c5ce456b1e8e2c5bfe0..973c1f83a5d8d68a50cebc6344bc9e3fd7dff1c5 100644 (file)
--- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
+++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
@@ -282,6 +282,92 @@ TEST(nntrainer_Tensor, max_abs) {
    EXPECT_NEAR(result_neon, result_fp32, epsilon);
  }
  
+TEST(nntrainer_Tensor, sum_sgemv_transpose) {
+  int batch = 3;
+  int channel = 2;
+  int height = 2;
+  int width = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+  nntrainer::Tensor input(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor input_copy(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor input_fp32(batch, channel, height, width, t_type_nchw_fp32);
+
+  const float alpha = 1e-5;
+
+  GEN_TEST_INPUT(input, i * (batch * height * channel) * alpha +
+                          j * (batch * height) * alpha + k * (width)*alpha + l +
+                          1);
+  GEN_TEST_INPUT(input_copy, i * (batch * height * channel) * alpha +
+                               j * (batch * height) * alpha +
+                               k * (width)*alpha + l + 1);
+  GEN_TEST_INPUT(input_fp32, i * (batch * height * channel) * alpha +
+                               j * (batch * height) * alpha +
+                               k * (width)*alpha + l + 1);
+
+  nntrainer::Tensor result0 = input.sum(0);
+  nntrainer::Tensor result0_fp32 = input_fp32.sum(0);
+
+  float mseErrorNeon = mse<__fp16>(
+    result0.getData<__fp16>(), result0_fp32.getData<float>(), result0.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    result0.getData<__fp16>(), result0_fp32.getData<float>(), result0.size());
+
+  const float epsilon = 1e-4;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
+TEST(nntrainer_Tensor, sum_sgemv) {
+  int batch = 3;
+  int channel = 2;
+  int height = 2;
+  int width = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+  nntrainer::Tensor input(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor input_copy(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor input_fp32(batch, channel, height, width, t_type_nchw_fp32);
+
+  const float alpha = 1e-5;
+
+  GEN_TEST_INPUT(input, i * (batch * height * channel) * alpha +
+                          j * (batch * height) * alpha + k * (width)*alpha + l +
+                          1);
+  GEN_TEST_INPUT(input_copy, i * (batch * height * channel) * alpha +
+                               j * (batch * height) * alpha +
+                               k * (width)*alpha + l + 1);
+  GEN_TEST_INPUT(input_fp32, i * (batch * height * channel) * alpha +
+                               j * (batch * height) * alpha +
+                               k * (width)*alpha + l + 1);
+
+  nntrainer::Tensor result0 = input.sum_by_batch();
+  nntrainer::Tensor result0_fp32 = input_fp32.sum_by_batch();
+
+  float mseErrorNeon = mse<__fp16>(
+    result0.getData<__fp16>(), result0_fp32.getData<float>(), result0.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    result0.getData<__fp16>(), result0_fp32.getData<float>(), result0.size());
+
+  const float epsilon = 1e-4;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
  GTEST_API_ int main(int argc, char **argv) {
    int result = -1;
author	Debadri Samaddar <s.debadri@samsung.com>
	Tue, 29 Aug 2023 08:37:19 +0000 (14:07 +0530)
committer	Jijoong Moon <jijoong.moon@samsung.com>
	Tue, 29 Aug 2023 12:12:32 +0000 (21:12 +0900)
nntrainer/tensor/blas_neon.cpp		patch \| blob \| history
test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp		patch \| blob \| history