Enable neon sscal function for Android (ARM) fp16 computation.
Add unit test for fp16 sscal function in Android (ARM).
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: Debadri Samaddar <s.debadri@samsung.com>
// BLAS-style scal for fp16: scales vector X in place, X = alpha * X, over N
// elements read with stride |incX|.
void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
// NOTE(review): incX == 0 would rescale X[0] N times; assumed nonzero — confirm callers.
unsigned int incx = abs(incX);
+#ifdef USE__FP16
+ // The NEON kernel only handles contiguous data, so it is taken for the
+ // stride-1 case; any other stride falls back to the scalar loop below.
+ if (incX == 1) {
+ nntrainer::neon::sscal_neon_fp16(N, X, alpha);
+ } else {
+ for (unsigned int i = 0; i < N; ++i)
+ X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx];
+ }
+#else
// Portable scalar path used when fp16 NEON support is not compiled in.
for (unsigned int i = 0; i < N; ++i)
X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx];
+#endif
}
static _FP16 snrm2_FP16(const unsigned int N, const _FP16 *X, const int incX) {
return ret;
}
+/**
+ * @brief NEON fp16 scal: X = alpha * X, in place, contiguous (stride 1) only.
+ * @param[in] N number of elements in X
+ * @param[in,out] X __fp16 vector scaled in place
+ * @param[in] alpha scale factor (narrowed to __fp16 for the vector lanes)
+ */
+void sscal_neon_fp16(const unsigned int N, __fp16 *X, const float alpha) {
+ // Broadcast alpha into an 8-lane and a 4-lane fp16 register up front.
+ const float16x8_t v_alphaX8 = vmovq_n_f16(alpha);
+ const float16x4_t v_alphaX4 = vmov_n_f16(alpha);
+
+ unsigned int idx = 0;
+
+ // processing batch of 8
+ for (; (N - idx) >= 8; idx += 8) {
+ float16x8_t x = vld1q_f16(&X[idx]);
+
+ // alpha*X -> X
+ float16x8_t mulacc = vmulq_f16(v_alphaX8, x);
+ vst1q_f16(&X[idx], mulacc);
+ }
+
+ // processing remaining batch of 4
+ for (; (N - idx) >= 4; idx += 4) {
+ float16x4_t x = vld1_f16(&X[idx]);
+
+ // alpha*X -> X
+ float16x4_t mulacc = vmul_f16(v_alphaX4, x);
+ vst1_f16(&X[idx], mulacc);
+ }
+
+ // processing remaining values (scalar tail, fewer than 4 elements)
+ for (; idx < N; idx++)
+ X[idx] = alpha * X[idx];
+}
+
} // namespace nntrainer::neon
#endif
/**
- * @brief sdot computation with neon: sum of all X * Y
- * @param[in] N number of elements in Y
+ * @brief snrm2 computation with neon: Euclidean norm
+ * @param[in] N number of elements in X
 * @param[in] X __fp16 * for Vector X
- * @param[in] Y __fp16 * for Vector Y
+ * @return Euclidean norm of X
 */
__fp16 snrm2_neon_fp16(const unsigned int N, const __fp16 *X);
+/**
+ * @brief sscal computation with neon: X = alpha * X
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] alpha float number
+ */
+void sscal_neon_fp16(const unsigned int N, __fp16 *X, const float alpha);
+
} // namespace nntrainer::neon
#endif /* __cplusplus */
EXPECT_NEAR(result_neon, result_fp32, epsilon);
}
+/**
+ * @brief In-place scalar multiply on an fp16 tensor compared against an fp32
+ * reference tensor built from the same generated inputs.
+ */
+TEST(nntrainer_Tensor, sscal) {
+ int batch = 1;
+ int channel = 1;
+ // width 11 is deliberately not a multiple of 8 or 4, so with height 2 the
+ // NEON kernel's 8-wide, 4-wide, and scalar-tail paths are all exercised.
+ int height = 2;
+ int width = 11;
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+ nntrainer::Tensor input(batch, channel, height, width, t_type_nchw_fp16);
+ // NOTE(review): input_copy is constructed but never used in this test.
+ nntrainer::Tensor input_copy(batch, channel, height, width, t_type_nchw_fp16);
+ nntrainer::Tensor input_fp32(batch, channel, height, width, t_type_nchw_fp32);
+
+ // alpha keeps the generated values small; epsilon bounds the acceptable
+ // fp16-vs-fp32 mean squared error below.
+ const float alpha = 1e-5;
+ const float epsilon = 1e-4;
+
+ GEN_TEST_INPUT(input, i * (batch * height * channel) * alpha +
+ j * (batch * height) * alpha + k * (width)*alpha + l +
+ 1);
+ GEN_TEST_INPUT(input_fp32, i * (batch * height * channel) * alpha +
+ j * (batch * height) * alpha +
+ k * (width)*alpha + l + 1);
+
+ // NEON fp16
+ // multiply_i presumably dispatches to the fp16 sscal path — TODO confirm.
+ int result = input.multiply_i(0.1);
+
+ // fp32
+ // NOTE(review): neither return code is checked; consider EXPECT_EQ on result.
+ result = input_fp32.multiply_i(0.1);
+
+ float mseErrorNeon = mse<__fp16>(input.getData<__fp16>(),
+ input_fp32.getData<float>(), input.size());
+
+ double cosSimNeon = cosine_similarity<__fp16>(
+ input.getData<__fp16>(), input_fp32.getData<float>(), input.size());
+
+ // fp16 result must be numerically close (MSE near 0) and directionally
+ // aligned (cosine similarity near 1) with the fp32 reference.
+ EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+ EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
+}
+
+
GTEST_API_ int main(int argc, char **argv) {
int result = -1;