- The previous hnrm2 accumulated entirely in fp16.
- Since this is another dimension-shrinking (reduction) computation, it should accumulate in intermediate fp32 values to preserve precision (see the sketch below).
- This went undetected because the unit tests only used small-dimension Tensors. Add a higher-dimension test case accordingly.
- Note that this function backs Tensor::l2norm(), which is frequently used for MSE loss computation.
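
A minimal standalone sketch of the failure mode, not part of this patch. It assumes an AArch64 toolchain where `__fp16` is available; the element count and values are illustrative only.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const unsigned int N = 768 * 768; // same order of magnitude as the new test
  __fp16 sum16 = 0;                 // fp16 accumulator (previous behavior)
  float sum32 = 0.F;                // fp32 accumulator (patched behavior)

  for (unsigned int i = 0; i < N; ++i) {
    const __fp16 x = static_cast<__fp16>(0.5F);
    // each += rounds back to fp16, so increments vanish once the sum is large
    sum16 += x * x;
    sum32 += static_cast<float>(x) * static_cast<float>(x);
  }

  // fp16 accumulation stalls once 0.25 falls below half an ulp of the sum,
  // so the fp16 norm ends up far below the fp32 norm of 384.
  std::printf("fp16 accumulation: %f\n", std::sqrt(static_cast<float>(sum16)));
  std::printf("fp32 accumulation: %f\n", std::sqrt(sum32));
  return 0;
}
```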
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: skykongkong8 <ss.kong@samsung.com>
static _FP16 snrm2_FP16(const unsigned int N, const _FP16 *X, const int incX) {
unsigned int incx = abs(incX);
- _FP16 sum = 0;
+ _FP16 sum;
_FP16 tmp;
#if (defined USE__FP16 && USE_NEON)
if (incX == 1) {
sum = nntrainer::neon::hnrm2(N, X);
} else {
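+ // accumulate squares in fp32 to avoid fp16 rounding error in the reduction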
+ float sum32 = 0;
for (unsigned int i = 0; i < N; i++) {
tmp = X[i * incx];
- sum += tmp * tmp;
+ sum32 += tmp * tmp;
}
+ sum = static_cast<_FP16>(sqrt(sum32));
}
#else
+ float sum32 = 0;
for (unsigned int i = 0; i < N; i++) {
tmp = X[i * incx];
- sum += tmp * tmp;
+ sum32 += tmp * tmp;
}
+ sum = static_cast<_FP16>(sqrt(sum32));
#endif
- return static_cast<_FP16>(sqrt(sum));
+ return sum;
}
static void sgemm_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
}
__fp16 hnrm2(const unsigned int N, const __fp16 *X) {
-
- float16x8_t accX8 = vmovq_n_f16(0);
- float16x4_t accX4 = vmov_n_f16(0);
+ float32x4_t accX0_3 = vmovq_n_f32(0.F);
+ float32x4_t accX4_7 = vmovq_n_f32(0.F);
unsigned int idx = 0;
- __fp16 ret = 0;
-
- // processing batch of 8
- for (; (N - idx) >= 8; idx += 8) {
- float16x8_t x = vld1q_f16(&X[idx]);
-
- // x*x + accX8 -> accX8
- accX8 = vfmaq_f16(accX8, x, x);
- }
-
- // check at least one batch of 8 is processed
- if (N - 8 >= 0) {
- __fp16 result[8];
- vst1q_f16(result, accX8);
- for (unsigned int i = 0; i < 8; i++)
- ret += result[i];
- }
+ unsigned int N8 = (N >> 3) << 3;
+ float ret = 0;
- // processing remaining batch of 4
- for (; (N - idx) >= 4; idx += 4) {
- float16x4_t x = vld1_f16(&X[idx]);
+ // Main loop: process batches of 8 elements at a time
+ for (; idx < N8; idx += 8) {
+ float16x8_t x0_7 = vld1q_f16(&X[idx]);
- // x*x + accX4 -> accX4
- accX4 = vfma_f16(accX4, x, x);
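+ // square in fp16, then widen each half to fp32 before accumulating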
+ x0_7 = vmulq_f16(x0_7, x0_7);
+ accX0_3 = vaddq_f32(accX0_3, vcvt_f32_f16(vget_low_f16(x0_7)));
+ accX4_7 = vaddq_f32(accX4_7, vcvt_f32_f16(vget_high_f16(x0_7)));
}
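+ // horizontally reduce both fp32 accumulators into the scalar sum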
+ ret += vaddvq_f32(accX0_3) + vaddvq_f32(accX4_7);
- // check at least one batch of 4 is processed
- if (N % 8 >= 4) {
- __fp16 result[4];
- vst1_f16(result, accX4);
- ret += result[0] + result[1] + result[2] + result[3];
- }
-
- // pocessing remaining values
- for (; idx < N; idx++)
+ // Loop for remaining indices
+ for (; idx < N; idx++) {
ret += X[idx] * X[idx];
+ }
- return ret;
+ return static_cast<__fp16>(sqrt(ret));
}
void hscal(const unsigned int N, __fp16 *X, const float alpha) {
result_fp32 = input_fp32.l2norm();
// absolute error
- const float epsilon = 1e-2;
+ const float epsilon = 1e-3 * width;
EXPECT_NEAR(result_neon, result_fp32, epsilon);
}
+TEST(nntrainer_Tensor, l2norm_big768) {
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+ size_t batch = 1;
+ size_t channel = 1;
+ size_t height = 768;
+ size_t width = 768;
+
+ nntrainer::Tensor input(
+ nntrainer::TensorDim(1, 1, height, width, t_type_nchw_fp16));
+
+ nntrainer::Tensor input_fp32(
+ nntrainer::TensorDim(1, 1, height, width, t_type_nchw_fp32));
+
+ const float alpha = 1e-1;
+ const int MOD = 10;
+
+ GEN_TEST_INPUT(input, ((i * j * (batch * height * channel) +
+ j * (batch * height) + k * (width) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT(input_fp32, ((i * j * (batch * height * channel) +
+ j * (batch * height) + k * (width) + l + 1) %
+ MOD) *
+ alpha);
+
+ __fp16 result_neon;
+ float result_fp32;
+
+ result_neon = input.l2norm();
+ result_fp32 = input_fp32.l2norm();
+
+ float ErrorNeon = std::fabs(result_neon - result_fp32);
+
+ const float epsilon = 1e-3 * width;
+ EXPECT_IN_RANGE(ErrorNeon, 0, epsilon);
+}
+
TEST(nntrainer_Tensor, multiply_i) {
int batch = 1;
int channel = 1;