avoid using fmadd in RNG

author Tomoaki Teshima <tomoaki.teshima@gmail.com>

Tue, 25 Oct 2016 09:04:11 +0000 (18:04 +0900)

committer Tomoaki Teshima <tomoaki.teshima@gmail.com>

Thu, 3 Nov 2016 14:11:52 +0000 (23:11 +0900)
author Tomoaki Teshima <tomoaki.teshima@gmail.com>
Tue, 25 Oct 2016 09:04:11 +0000 (18:04 +0900)
committer Tomoaki Teshima <tomoaki.teshima@gmail.com>
Thu, 3 Nov 2016 14:11:52 +0000 (23:11 +0900)
diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp

index 5247af8..094cde9 100644 (file)
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -239,6 +239,17 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
          __m128 p1 = _mm_unpackhi_ps(q01l, q01h);
  
          _mm_storeu_ps(arr + i, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(f), p0), p1));
+#elif defined __ARM_NEON && defined __aarch64__
+        // Handwritten NEON is required here not for performance but for numerical stability:
+        // 64-bit gcc tends to emit a fused fmadd instead of a separate multiply and add.
+        // The intermediate product is held in a volatile variable to keep the multiply and add separate.
+        float32x4x2_t q = vld2q_f32((const float*)(p + i));
+
+        float32x4_t p0 = q.val[0];
+        float32x4_t p1 = q.val[1];
+
+        volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
+        vst1q_f32(arr+i, vaddq_f32(v0, p1));
  #else
          arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
          arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
@@ -255,6 +266,11 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
                  _mm_mul_ss(_mm_set_ss((float)(int)temp), _mm_set_ss(p[i][0])),
                  _mm_set_ss(p[i][1]))
                  );
+#elif defined __ARM_NEON && defined __aarch64__
+        float32x2_t t = vadd_f32(vmul_f32(
+                vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])),
+                vdup_n_f32(p[i][1]));
+        arr[i] = vget_lane_f32(t, 0);
  #else
          arr[i] = (int)temp*p[i][0] + p[i][1];
  #endif
author	Tomoaki Teshima <tomoaki.teshima@gmail.com>
	Tue, 25 Oct 2016 09:04:11 +0000 (18:04 +0900)
committer	Tomoaki Teshima <tomoaki.teshima@gmail.com>
	Thu, 3 Nov 2016 14:11:52 +0000 (23:11 +0900)