}
}
// NOTE(review): this is a unified-diff hunk with unchanged lines elided; the
// NEON multiply-accumulate and store statements of the loops below are
// presumably in the omitted context lines -- confirm against the full file.
// Computes Y := alpha * X + Y over N fp16 elements (8-wide, then 4-wide,
// then scalar tail).
-void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, __fp16 *Y) {
+void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
+ __fp16 *Y) {
const float16x8_t v_alphaX8 = vmovq_n_f16(alpha);
const float16x4_t v_alphaX4 = vmov_n_f16(alpha);
unsigned int idx = 0;
// processing batch of 8
- for(; (N - idx) >= 8 ; idx += 8){
+ for (; (N - idx) >= 8; idx += 8) {
float16x8_t x = vld1q_f16(&X[idx]);
float16x8_t y = vld1q_f16(&Y[idx]);
}
// processing remaining batch of 4
- for(; (N - idx) >= 4 ; idx += 4){
+ for (; (N - idx) >= 4; idx += 4) {
float16x4_t x = vld1_f16(&X[idx]);
float16x4_t y = vld1_f16(&Y[idx]);
Y[idx] = Y[idx] + alpha * X[idx];
}
+/**
+ * @brief  fp16 dot product using NEON intrinsics.
+ * @param  N number of elements in X and Y
+ * @param  X fp16 input vector
+ * @param  Y fp16 input vector
+ * @return sum over i of X[i] * Y[i] as __fp16
+ */
+__fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y) {
+
+  float16x8_t accX8 = vmovq_n_f16(0);
+  float16x4_t accX4 = vmov_n_f16(0);
+
+  unsigned int idx = 0;
+  __fp16 ret = 0;
+
+  // processing batch of 8
+  for (; (N - idx) >= 8; idx += 8) {
+    float16x8_t x = vld1q_f16(&X[idx]);
+    float16x8_t y = vld1q_f16(&Y[idx]);
+
+    // x*y + accX8 -> accX8
+    accX8 = vfmaq_f16(accX8, x, y);
+  }
+
+  // check at least one batch of 8 is processed.
+  // BUGFIX: the original test was (N - 8 >= 0); N is unsigned, so the
+  // expression wraps around for N < 8 and is always true. N >= 8 is the
+  // intended condition.
+  if (N >= 8) {
+    __fp16 result[8];
+    vst1q_f16(result, accX8);
+    for (unsigned int i = 0; i < 8; i++)
+      ret += result[i];
+  }
+
+  // processing remaining batch of 4
+  for (; (N - idx) >= 4; idx += 4) {
+    float16x4_t x = vld1_f16(&X[idx]);
+    float16x4_t y = vld1_f16(&Y[idx]);
+
+    // x*y + accX4 -> accX4
+    accX4 = vfma_f16(accX4, x, y);
+  }
+
+  // check at least one batch of 4 is processed
+  // (after the 8-wide loop, N % 8 elements remain)
+  if (N % 8 >= 4) {
+    __fp16 result[4];
+    vst1_f16(result, accX4);
+    ret += result[0] + result[1] + result[2] + result[3];
+  }
+
+  // processing remaining values (scalar tail, fewer than 4 elements)
+  for (; idx < N; idx++)
+    ret += X[idx] * Y[idx];
+
+  return ret;
+}
+
+
} // namespace nntrainer::neon
double cosSimNeon = cosine_similarity<__fp16>(
input.getData<__fp16>(), input_fp32.getData<float>(), input.size());
-
+
EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
}
+/**
+ * @brief compares the fp16 NEON dot-product path against the fp32 reference
+ */
+TEST(nntrainer_Tensor, dot) {
+
+  // conditions for fp16 sdot call:
+  // this->(batch * channel * height) = arg->(width) = 1;
+  size_t width = 23;
+
+  nntrainer::TensorDim::TensorType fp16_type = {nntrainer::Tformat::NCHW,
+                                                nntrainer::Tdatatype::FP16};
+  nntrainer::TensorDim::TensorType fp32_type = {nntrainer::Tformat::NCHW,
+                                                nntrainer::Tdatatype::FP32};
+
+  // identical payloads: a row vector (1x1x1xW) dotted with a column vector
+  // (1x1xWx1), in both fp16 and fp32
+  __fp16 row_fp16[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                       12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  __fp16 col_fp16[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                       12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  float row_fp32[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                      12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  float col_fp32[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                      12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  nntrainer::Tensor a_fp16(nntrainer::TensorDim(1, 1, 1, width, fp16_type),
+                           row_fp16);
+  nntrainer::Tensor b_fp16(nntrainer::TensorDim(1, 1, width, 1, fp16_type),
+                           col_fp16);
+  nntrainer::Tensor a_fp32(nntrainer::TensorDim(1, 1, 1, width, fp32_type),
+                           row_fp32);
+  nntrainer::Tensor b_fp32(nntrainer::TensorDim(1, 1, width, 1, fp32_type),
+                           col_fp32);
+
+  // NEON fp16 path
+  nntrainer::Tensor result_neon = a_fp16.dot(b_fp16, false, false);
+  // fp32 reference
+  nntrainer::Tensor result_fp32 = a_fp32.dot(b_fp32, false, false);
+
+  float mseErrorNeon = mse<__fp16>(
+    result_neon.getData<__fp16>(), result_fp32.getData<float>(),
+    result_neon.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    result_neon.getData<__fp16>(), result_fp32.getData<float>(),
+    result_neon.size());
+
+  const float epsilon = 1e-4;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
+
GTEST_API_ int main(int argc, char **argv) {
int result = -1;