From c30bb4145dbfbe2fc109703dcef3455c8014341a Mon Sep 17 00:00:00 2001
From: Debadri Samaddar
Date: Tue, 22 Aug 2023 21:27:34 +0530
Subject: [PATCH] [ blas/neon ] Add NEON fp16 function for scopy

Enable NEON scopy function for Android (ARM) fp16 computation.
Add unit test for the fp16 scopy function on Android (ARM).

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Debadri Samaddar
---
 nntrainer/tensor/blas_interface.cpp         |  9 ++++
 nntrainer/tensor/blas_neon.cpp              | 23 ++++++++-
 nntrainer/tensor/blas_neon.h                | 10 +++-
 .../unittest_nntrainer_tensor_neon_fp16.cpp | 54 ++++++++++++++++++++--
 4 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 49078ec..6f4950c 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -134,8 +134,17 @@ static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX,
   unsigned int incy = abs(incY);
   unsigned int incx = abs(incX);

+#ifdef USE__FP16
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::scopy_neon_fp16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = X[i * incx];
+  }
+#else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = X[i * incx];
+#endif
 }

 void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index a268760..101882a 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -617,7 +617,6 @@ __fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y) {

   return ret;
 }
-#endif

 __fp16 snrm2_neon_fp16(const unsigned int N, const __fp16 *X) {

@@ -694,4 +693,26 @@ void sscal_neon_fp16(const unsigned int N, __fp16 *X, const float alpha) {
     X[idx] = alpha * X[idx];
 }

+void scopy_neon_fp16(const unsigned int N, const __fp16 *X, __fp16 *Y) {
+
+  unsigned int idx = 0;
+
+  // processing batch of 8
+  for (; (N - idx) >= 8; idx += 8) {
+    float16x8_t batch = vld1q_f16(&X[idx]);
+    vst1q_f16(&Y[idx], batch);
+  }
+
+  // processing remaining batch of 4
+  for (; (N - idx) >= 4; idx += 4) {
+    float16x4_t batch = vld1_f16(&X[idx]);
+    vst1_f16(&Y[idx], batch);
+  }
+
+  // processing remaining values
+  for (; idx < N; idx++)
+    Y[idx] = X[idx];
+}
+#endif
+
 } // namespace nntrainer::neon
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 6034eda..e833673 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -94,7 +94,6 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
  * @param[in] Y __fp16 * for Vector Y
  */
 __fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y);
-#endif

 /**
  * @brief snrm2 computation with neon: Euclidean norm
@@ -111,6 +110,15 @@ __fp16 snrm2_neon_fp16(const unsigned int N, const __fp16 *X);
  */
 void sscal_neon_fp16(const unsigned int N, __fp16 *X, const float alpha);

+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy_neon_fp16(const unsigned int N, const __fp16 *X, __fp16 *Y);
+#endif
+
 } // namespace nntrainer::neon

 #endif /* __cplusplus */
diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
index 7d99237..90d8dc3 100644
--- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
+++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
@@ -137,13 +137,13 @@ TEST(nntrainer_Tensor, l2norm) {

   size_t width = 23;

-  __fp16 a_data[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
-                     2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
+  __fp16 a_data[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
+                     2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
   nntrainer::Tensor input(
     nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16), a_data);

-  float a_data_fp32[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
-                         2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
+  float a_data_fp32[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
+                         2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
   nntrainer::Tensor input_fp32(
     nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32), a_data_fp32);

@@ -162,7 +162,7 @@ TEST(nntrainer_Tensor, l2norm) {
   EXPECT_NEAR(result_neon, result_fp32, epsilon);
 }

-TEST(nntrainer_Tensor, sscal) {
+TEST(nntrainer_Tensor, multiply_i) {
   int batch = 1;
   int channel = 1;
   int height = 2;
@@ -204,6 +204,50 @@
   EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
 }

+TEST(nntrainer_Tensor, copy) {
+  int batch = 1;
+  int channel = 1;
+  int height = 2;
+  int width = 11;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+  nntrainer::Tensor input(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor input_fp32(batch, channel, height, width, t_type_nchw_fp32);
+
+  const float alpha = 1e-5;
+  const float epsilon = 1e-4;
+
+  GEN_TEST_INPUT(input, i * (batch * height * channel) * alpha +
+                          j * (batch * height) * alpha + k * (width)*alpha + l +
+                          1);
+  GEN_TEST_INPUT(input_fp32, i * (batch * height * channel) * alpha +
+                               j * (batch * height) * alpha +
+                               k * (width)*alpha + l + 1);
+
+  nntrainer::Tensor output;
+  nntrainer::Tensor output_fp32;
+
+  // NEON fp16
+  output.copy(input);
+
+  // fp32
+  output_fp32.copy(input_fp32);
+
+  float mseErrorNeon = mse<__fp16>(output.getData<__fp16>(),
+                                   output_fp32.getData(), output.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    output.getData<__fp16>(), output_fp32.getData(), output.size());
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
+}
+
 GTEST_API_ int main(int argc, char **argv) {
   int result = -1;

-- 
2.7.4
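
P.S. for reviewers: below is a minimal standalone sketch of the 8/4/1-wide tiling that scopy_neon_fp16 uses, assuming an AArch64 toolchain with <arm_neon.h> and native __fp16 support (e.g. clang with -march=armv8.2-a+fp16). The copy_fp16 wrapper and the main() harness are illustrative stand-ins, not part of this patch.

// Sketch of the 8/4/1 tiling pattern (illustrative names, not the patch API).
#include <arm_neon.h>
#include <cstdio>

static void copy_fp16(unsigned int n, const __fp16 *x, __fp16 *y) {
  unsigned int idx = 0;
  // 8 halfwords at a time: one 128-bit q-register load/store per iteration
  for (; (n - idx) >= 8; idx += 8)
    vst1q_f16(&y[idx], vld1q_f16(&x[idx]));
  // remaining batch of 4: one 64-bit d-register load/store
  for (; (n - idx) >= 4; idx += 4)
    vst1_f16(&y[idx], vld1_f16(&x[idx]));
  // scalar tail for the last 0-3 elements
  for (; idx < n; ++idx)
    y[idx] = x[idx];
}

int main() {
  __fp16 src[11], dst[11] = {};
  for (int i = 0; i < 11; ++i)
    src[i] = (__fp16)(i * 0.5f);
  copy_fp16(11, src, dst); // 11 elements: one 8-wide batch, then 3 scalar copies
  for (int i = 0; i < 11; ++i)
    std::printf("%.1f ", (float)dst[i]);
  std::printf("\n");
  return 0;
}

With N = 11 (the width used by the new copy unit test) the 8-wide loop runs once, the 4-wide loop is skipped since only 3 elements remain, and the scalar tail finishes the last three copies.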
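A note on the incX == 1 && incY == 1 gate in scopy_FP16: vld1q_f16/vst1q_f16 move contiguous lanes only, so the NEON path is valid just for unit strides; any other stride keeps the scalar loop. A sketch of that fallback logic follows, with scopy_strided as a hypothetical name rather than the patch's API:

#include <cstdlib>

// Element-by-element copy for arbitrary strides, mirroring the scalar branch.
static void scopy_strided(unsigned int n, const __fp16 *x, int incx,
                          __fp16 *y, int incy) {
  unsigned int sx = std::abs(incx), sy = std::abs(incy);
  for (unsigned int i = 0; i < n; ++i)
    y[i * sy] = x[i * sx]; // no contiguous vector load is possible here
}

For example, scopy_strided(4, src, 2, dst, 1) gathers src[0], src[2], src[4], src[6] into dst[0..3].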