// Include guard for this header, followed by the declaration of exp_tab:
// a constant std::array of eight float32x4_t vectors. Each entry is built
// with vdupq_n_f32, i.e. one scalar coefficient broadcast to all four lanes.
// Presumably this is the coefficient set for the exponential approximation
// (consumed through vtaylor_poly_f32) -- the call site is not visible in
// this listing, so confirm against the full header.
// NOTE(review): the array is declared with 8 elements but only 7
// initialisers are visible here (the embedded listing numbers jump from 32
// to 36), and the enclosing braces are missing as well. The leading entry
// appears to have been lost in extraction -- TODO confirm its value upstream
// before relying on the positions of the entries below.
24 #ifndef __ARM_COMPUTE_NEMATH_H__ 25 #define __ARM_COMPUTE_NEMATH_H__ 32 const std::array<float32x4_t, 8>
exp_tab =
36 vdupq_n_f32(0.0416598916054f),
37 vdupq_n_f32(0.500000596046f),
38 vdupq_n_f32(0.0014122662833f),
39 vdupq_n_f32(1.00000011921f),
40 vdupq_n_f32(0.00833693705499f),
41 vdupq_n_f32(0.166665703058f),
42 vdupq_n_f32(0.000195780929062f),
// log_tab: a constant std::array of eight float32x4_t vectors, each one a
// scalar coefficient broadcast to all four lanes with vdupq_n_f32. All eight
// initialisers are visible (listing lines 50-57).
// Presumably the coefficient set for the logarithm approximation, consumed
// through vtaylor_poly_f32 -- the call site is not visible in this listing.
// If so, note that vtaylor_poly_f32 does not read the table in ascending
// power order: indices 0,4,2,6,1,5,3,7 multiply x^0..x^7 respectively.
// NOTE(review): the opening/closing braces of the initialiser were dropped
// by the extraction.
47 const std::array<float32x4_t, 8>
log_tab =
50 vdupq_n_f32(-2.29561495781f),
51 vdupq_n_f32(-2.47071170807f),
52 vdupq_n_f32(-5.68692588806f),
53 vdupq_n_f32(-0.165253549814f),
54 vdupq_n_f32(5.17591238022f),
55 vdupq_n_f32(0.844007015228f),
56 vdupq_n_f32(4.58445882797f),
57 vdupq_n_f32(0.0141278216615f),
// Body of the per-lane reciprocal square root, 1/sqrt(x). The signature and
// braces are not visible in this listing; the symbol index further down
// lists it as `float32x4_t vinvsqrt_f32(float32x4_t x)`.
// Start from the hardware's low-precision estimate of 1/sqrt(x).
69 float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
// Two identical Newton-Raphson refinement steps: vrsqrtsq_f32(a, b) yields
// (3 - a*b) / 2, so each line computes r = r * (3 - x*r*r) / 2.
70 sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
71 sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
// The refined estimate is the result.
73 return sqrt_reciprocal;
// vinv_f32: per-lane reciprocal, 1/x, of a four-lane float vector.
//   x - input vector, taken by const reference.
// NOTE(review): the braces and the return statement are not visible in this
// listing; presumably the function returns `recip` -- confirm upstream.
82 inline float32x4_t
vinv_f32(
const float32x4_t &x)
// Start from the hardware's low-precision estimate of 1/x.
84 float32x4_t recip = vrecpeq_f32(x);
// Two identical Newton-Raphson refinement steps: vrecpsq_f32(a, b) yields
// 2 - a*b, so each line computes recip = recip * (2 - x*recip).
85 recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
86 recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
// vtaylor_poly_f32: evaluates a degree-7 polynomial per lane, pairing terms
// so that the four first-level multiply-accumulates are independent of each
// other (the symbol index describes this as Estrin's method).
//   x      - point at which to evaluate, one value per lane.
//   coeffs - eight coefficient vectors. They are NOT stored in ascending
//            power order: as read below, the polynomial is
//            c[0] + c[4]*x + c[2]*x^2 + c[6]*x^3
//                 + c[1]*x^4 + c[5]*x^5 + c[3]*x^6 + c[7]*x^7.
// vmlaq_f32(a, b, c) computes a + b*c.
// NOTE(review): the braces and the return statement are not visible in this
// listing; presumably the function returns `res` -- confirm upstream.
97 inline float32x4_t
vtaylor_poly_f32(
const float32x4_t &x,
const std::array<float32x4_t, 8> &coeffs)
// A = c[0] + c[4]*x   (x^0 and x^1 terms)
99 float32x4_t
A = vmlaq_f32(coeffs[0], coeffs[4], x);
// B = c[2] + c[6]*x   (becomes the x^2 and x^3 terms once scaled by x2)
100 float32x4_t
B = vmlaq_f32(coeffs[2], coeffs[6], x);
// C = c[1] + c[5]*x   (becomes the x^4 and x^5 terms once scaled by x4)
101 float32x4_t C = vmlaq_f32(coeffs[1], coeffs[5], x);
// D = c[3] + c[7]*x   (becomes the x^6 and x^7 terms once scaled by x2*x4)
102 float32x4_t D = vmlaq_f32(coeffs[3], coeffs[7], x);
// Powers of x needed to combine the four partial sums.
103 float32x4_t x2 = vmulq_f32(x, x);
104 float32x4_t x4 = vmulq_f32(x2, x2);
// res = (A + B*x^2) + (C + D*x^2) * x^4
105 float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
// Body fragment of the per-lane exponential. The signature is not visible
// in this listing; the symbol index further down lists it as
// `float32x4_t vexp_f32(const float32x4_t &x)`.
// Broadcast constants: ln(2) and 1/ln(2).
117 static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);
118 static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f);
// Range reduction: m = x / ln(2) converted to integer (vcvtq_s32_f32
// truncates toward zero), then val = x - m*ln(2) via vmlsq_f32(a, b, c),
// which computes a - b*c. Hence exp(x) = 2^m * exp(val).
121 int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
122 float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
// NOTE(review): the definition of `poly` (listing lines 123-127) was lost in
// extraction; presumably it is the polynomial approximation of exp(val) --
// confirm upstream.
// Reconstruction: add m, shifted into bit position 23 (the IEEE-754
// single-precision exponent field), to the raw bits of poly. This scales
// poly by 2^m without a floating-point multiply.
128 poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
// Body fragment of the per-lane natural logarithm. The signature is not
// visible in this listing; the symbol index further down lists it as
// `float32x4_t vlog_f32(const float32x4_t &x)`.
// Broadcast constants: the IEEE-754 single-precision exponent bias (127)
// and ln(2).
141 static const int32x4_t CONST_127 = vdupq_n_s32(127);
142 static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);
// Split x into exponent and mantissa by bit manipulation.
// m: raw bits shifted right by 23, minus the bias. The logical shift keeps
// the sign bit together with the exponent field, so this is the unbiased
// exponent only for non-negative x -- assumes x > 0, TODO confirm how
// callers guarantee that.
145 int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
// val: x with m removed from its exponent field, i.e. x / 2^m, which leaves
// the mantissa scaled into [1, 2) for positive normal inputs.
146 float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
// NOTE(review): the definition of `poly` (listing lines 147-151) was lost in
// extraction; presumably it is the polynomial approximation of log(val) --
// confirm upstream.
// Reconstruction: log(x) = log(val) + m*ln(2); vmlaq_f32(a, b, c) is a + b*c.
152 poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
// Body fragment of the per-lane hyperbolic tangent. The signature is not
// visible in this listing; the symbol index further down lists it as
// `float32x4_t vtanh_f32(const float32x4_t &val)`.
// Uses the identity tanh(v) = (exp(2v) - 1) / (exp(2v) + 1).
// Broadcast constants 1 and 2.
167 static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
168 static const float32x4_t CONST_2 = vdupq_n_f32(2.f);
// exp(2 * val), computed once and shared by numerator and denominator.
170 float32x4_t exp2x =
vexp_f32(vmulq_f32(CONST_2, val));
171 float32x4_t num = vsubq_f32(exp2x, CONST_1);
172 float32x4_t den = vaddq_f32(exp2x, CONST_1);
// The division is performed as a multiplication by vinv_f32(den).
// NOTE(review): the return statement is not visible in this listing;
// presumably the function returns `tanh` -- confirm upstream.
173 float32x4_t tanh = vmulq_f32(num,
vinv_f32(den));
// vpowq_f32: takes two four-lane float vectors by const reference, `val` and
// `n`, and returns a four-lane float vector. The symbol index further down
// describes it as calculating a power, so presumably `val` is the base and
// `n` the exponent, lane by lane.
// NOTE(review): the function body is not visible in this listing, so the
// exact formula and its valid input range cannot be confirmed here -- check
// the upstream header.
186 inline float32x4_t
vpowq_f32(
const float32x4_t &val,
const float32x4_t &n)
float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array< float32x4_t, 8 > &coeffs)
Perform a 7th degree polynomial approximation using Estrin's method.
float32x4_t vinv_f32(const float32x4_t &x)
Calculate reciprocal.
float32x4_t vtanh_f32(const float32x4_t &val)
Calculate hyperbolic tangent.
const std::array< float32x4_t, 8 > exp_tab
float32x4_t vinvsqrt_f32(float32x4_t x)
Calculate inverse square root.
float32x4_t vlog_f32(const float32x4_t &x)
Calculate logarithm.
float32x4_t vexp_f32(const float32x4_t &x)
Calculate exponential.
float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
Calculate the n-th power of a number.
const std::array< float32x4_t, 8 > log_tab