arm_compute v17.05

[platform/upstream/armcl.git] / arm_compute / core / NEON / NEMath.h
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h

index bf27f11..d40e0c3 100644 (file)
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -64,7 +64,7 @@ const std::array<float32x4_t, 8> log_tab =
   *
   * @return The calculated inverse square root.
   */
-inline float32x4_t vinvsqrt_f32(float32x4_t x)
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
  {
      float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
      sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
@@ -79,7 +79,7 @@ inline float32x4_t vinvsqrt_f32(float32x4_t x)
   *
   * @return The calculated reciprocal.
   */
-inline float32x4_t vinv_f32(const float32x4_t &x)
+inline float32x4_t vinvq_f32(const float32x4_t &x)
  {
      float32x4_t recip = vrecpeq_f32(x);
      recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
@@ -94,7 +94,7 @@ inline float32x4_t vinv_f32(const float32x4_t &x)
   *
   * @return The calculated approximation.
   */
-inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
+inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
  {
      float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
      float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
@@ -112,7 +112,7 @@ inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float
   *
   * @return The calculated exponent.
   */
-inline float32x4_t vexp_f32(const float32x4_t &x)
+inline float32x4_t vexpq_f32(const float32x4_t &x)
  {
      static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
      static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
@@ -122,7 +122,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
      float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
  
      // Polynomial Approximation
-    float32x4_t poly = vtaylor_poly_f32(val, exp_tab);
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
  
      // Reconstruct
      poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
@@ -136,7 +136,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
   *
   * @return The calculated logarithm.
   */
-inline float32x4_t vlog_f32(const float32x4_t &x)
+inline float32x4_t vlogq_f32(const float32x4_t &x)
  {
      static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
      static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
@@ -146,7 +146,7 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
      float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
  
      // Polynomial Approximation
-    float32x4_t poly = vtaylor_poly_f32(val, log_tab);
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
  
      // Reconstruct
      poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
@@ -158,19 +158,24 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
   *
   * tanh(x) = (e^2x - 1)/(e^2x + 1)
   *
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
   * @param val Input vector value in F32 format.
   *
   * @return The calculated Hyperbolic Tangent.
   */
-inline float32x4_t vtanh_f32(const float32x4_t &val)
+inline float32x4_t vtanhq_f32(const float32x4_t &val)
  {
-    static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
-    static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);  // 1.f
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);  // 2.f
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f);  // 5.f
  
-    float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
      float32x4_t num   = vsubq_f32(exp2x, CONST_1);
      float32x4_t den   = vaddq_f32(exp2x, CONST_1);
-    float32x4_t tanh  = vmulq_f32(num, vinv_f32(den));
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
      return tanh;
  }
  
@@ -185,7 +190,7 @@ inline float32x4_t vtanh_f32(const float32x4_t &val)
   */
  inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
  {
-    return vexp_f32(vmulq_f32(n, vlog_f32(val)));
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
  }
  }