From c30bb4145dbfbe2fc109703dcef3455c8014341a Mon Sep 17 00:00:00 2001
From: Debadri Samaddar
Date: Tue, 22 Aug 2023 21:27:34 +0530
Subject: [PATCH] [ blas/neon ] Add NEON fp16 function for scopy

Enable NEON scopy function for Android (ARM) fp16 computation.
Add unit test for the fp16 scopy function on Android (ARM).

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Debadri Samaddar
---
 nntrainer/tensor/blas_interface.cpp         |  9 ++++
 nntrainer/tensor/blas_neon.cpp              | 23 ++++++++-
 nntrainer/tensor/blas_neon.h                | 10 +++-
 .../unittest_nntrainer_tensor_neon_fp16.cpp | 54 ++++++++++++++++++++--
 4 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 49078ec..6f4950c 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -134,8 +134,17 @@ static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX,
   unsigned int incy = abs(incY);
   unsigned int incx = abs(incX);

+#ifdef USE__FP16
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::scopy_neon_fp16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = X[i * incx];
+  }
+#else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = X[i * incx];
+#endif
 }

 void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index a268760..101882a 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -617,7 +617,6 @@ __fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y) {

   return ret;
 }
-#endif

 __fp16 snrm2_neon_fp16(const unsigned int N, const __fp16 *X) {

@@ -694,4 +693,26 @@ void sscal_neon_fp16(const unsigned int N, __fp16 *X, const float alpha) {
     X[idx] = alpha * X[idx];
 }

+void scopy_neon_fp16(const unsigned int N, const __fp16 *X, __fp16 *Y) {
+
+  unsigned int idx = 0;
+
+  // processing batch of 8
+  for (; (N - idx) >= 8; idx += 8) {
+    float16x8_t batch = vld1q_f16(&X[idx]);
+    vst1q_f16(&Y[idx], batch);
+  }
+
+  // processing remaining batch of 4
+  for (; (N - idx) >= 4; idx += 4) {
+    float16x4_t batch = vld1_f16(&X[idx]);
+    vst1_f16(&Y[idx], batch);
+  }
+
+  // processing remaining values
+  for (; idx < N; idx++)
+    Y[idx] = X[idx];
+}
+#endif
+
 } // namespace nntrainer::neon
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 6034eda..e833673 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -94,7 +94,6 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
  * @param[in] Y __fp16 * for Vector Y
  */
 __fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y);
-#endif

 /**
  * @brief snrm2 computation with neon: Euclidean norm
@@ -111,6 +110,15 @@ __fp16 snrm2_neon_fp16(const unsigned int N, const __fp16 *X);
  */
 void sscal_neon_fp16(const unsigned int N, __fp16 *X, const float alpha);

+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy_neon_fp16(const unsigned int N, const __fp16 *X, __fp16 *Y);
+#endif
+
 } // namespace nntrainer::neon

 #endif /* __cplusplus */
diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
index 7d99237..90d8dc3 100644
--- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
+++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
@@ -137,13 +137,13 @@ TEST(nntrainer_Tensor, l2norm) {

   size_t width = 23;

-  __fp16 a_data[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
-                     2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
+  __fp16 a_data[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
+                     2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
   nntrainer::Tensor input(
     nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16), a_data);

-  float a_data_fp32[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
-                         2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
+  float a_data_fp32[] = {0, 1.2, 2, 3.4, 4.1, 5.3, 2.9, 2.1, 1.4, 1.6, 0, 2.7,
+                         2.3, 1, 2, 1.1, 3.1, 1.1, 2.8, 3.2, 2, 3.6, 1};
   nntrainer::Tensor input_fp32(
     nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32), a_data_fp32);

@@ -162,7 +162,7 @@ TEST(nntrainer_Tensor, l2norm) {
   EXPECT_NEAR(result_neon, result_fp32, epsilon);
 }

-TEST(nntrainer_Tensor, sscal) {
+TEST(nntrainer_Tensor, multiply_i) {
   int batch = 1;
   int channel = 1;
   int height = 2;
@@ -204,6 +204,50 @@
   EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
 }

+TEST(nntrainer_Tensor, copy) {
+  int batch = 1;
+  int channel = 1;
+  int height = 2;
+  int width = 11;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+  nntrainer::Tensor input(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor input_fp32(batch, channel, height, width, t_type_nchw_fp32);
+
+  const float alpha = 1e-5;
+  const float epsilon = 1e-4;
+
+  GEN_TEST_INPUT(input, i * (batch * height * channel) * alpha +
+                          j * (batch * height) * alpha + k * (width)*alpha + l +
+                          1);
+  GEN_TEST_INPUT(input_fp32, i * (batch * height * channel) * alpha +
+                               j * (batch * height) * alpha +
+                               k * (width)*alpha + l + 1);
+
+  nntrainer::Tensor output;
+  nntrainer::Tensor output_fp32;
+
+  // NEON fp16
+  output.copy(input);
+
+  // fp32
+  output_fp32.copy(input_fp32);
+
+  float mseErrorNeon = mse<__fp16>(output.getData<__fp16>(),
+                                   output_fp32.getData(), output.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    output.getData<__fp16>(), output_fp32.getData(), output.size());
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
+}
+
 GTEST_API_ int main(int argc, char **argv) {
   int result = -1;

-- 
2.7.4
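
P.S. for reviewers: below is a minimal standalone sketch of the 8/4/1-wide tiling that scopy_neon_fp16 uses, assuming an AArch64 toolchain with <arm_neon.h> and native __fp16 support (e.g. clang with -march=armv8.2-a+fp16). The copy_fp16 wrapper and the main() harness are illustrative stand-ins, not part of this patch.

// Sketch of the 8/4/1 tiling pattern (illustrative names, not the patch API).
#include <arm_neon.h>
#include <cstdio>

static void copy_fp16(unsigned int n, const __fp16 *x, __fp16 *y) {
  unsigned int idx = 0;
  // 8 halfwords at a time: one 128-bit q-register load/store per iteration
  for (; (n - idx) >= 8; idx += 8)
    vst1q_f16(&y[idx], vld1q_f16(&x[idx]));
  // remaining batch of 4: one 64-bit d-register load/store
  for (; (n - idx) >= 4; idx += 4)
    vst1_f16(&y[idx], vld1_f16(&x[idx]));
  // scalar tail for the last 0-3 elements
  for (; idx < n; ++idx)
    y[idx] = x[idx];
}

int main() {
  __fp16 src[11], dst[11] = {};
  for (int i = 0; i < 11; ++i)
    src[i] = (__fp16)(i * 0.5f);
  copy_fp16(11, src, dst); // 11 elements: one 8-wide batch, then 3 scalar copies
  for (int i = 0; i < 11; ++i)
    std::printf("%.1f ", (float)dst[i]);
  std::printf("\n");
  return 0;
}

With N = 11 (the width used by the new copy unit test) the 8-wide loop runs once, the 4-wide loop is skipped since only 3 elements remain, and the scalar tail finishes the last three copies.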
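A note on the incX == 1 && incY == 1 gate in scopy_FP16: vld1q_f16/vst1q_f16 move contiguous lanes only, so the NEON path is valid just for unit strides; any other stride keeps the scalar loop. A sketch of that fallback logic follows, with scopy_strided as a hypothetical name rather than the patch's API:

#include <cstdlib>

// Element-by-element copy for arbitrary strides, mirroring the scalar branch.
static void scopy_strided(unsigned int n, const __fp16 *x, int incx,
                          __fp16 *y, int incy) {
  unsigned int sx = std::abs(incx), sy = std::abs(incy);
  for (unsigned int i = 0; i < n; ++i)
    y[i * sy] = x[i * sx]; // no contiguous vector load is possible here
}

For example, scopy_strided(4, src, 2, dst, 1) gathers src[0], src[2], src[4], src[6] into dst[0..3].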