*
* @return The calculated inverse square root.
*/
-inline float32x4_t vinvsqrt_f32(float32x4_t x)
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
*
* @return The calculated reciprocal.
*/
-inline float32x4_t vinv_f32(const float32x4_t &x)
+inline float32x4_t vinvq_f32(const float32x4_t &x)
{
float32x4_t recip = vrecpeq_f32(x);
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
*
* @return The calculated approximation.
*/
-inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
+inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
{
float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x);
float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x);
*
* @return The calculated exponent.
*/
-inline float32x4_t vexp_f32(const float32x4_t &x)
+inline float32x4_t vexpq_f32(const float32x4_t &x)
{
static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
// Polynomial Approximation
- float32x4_t poly = vtaylor_poly_f32(val, exp_tab);
+ float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
// Reconstruct
poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
*
* @return The calculated logarithm.
*/
-inline float32x4_t vlog_f32(const float32x4_t &x)
+inline float32x4_t vlogq_f32(const float32x4_t &x)
{
static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
// Polynomial Approximation
- float32x4_t poly = vtaylor_poly_f32(val, log_tab);
+ float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
// Reconstruct
poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
*
* tanh(x) = (e^2x - 1)/(e^2x + 1)
*
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
* @param val Input vector value in F32 format.
*
* @return The calculated Hyperbolic Tangent.
*/
-inline float32x4_t vtanh_f32(const float32x4_t &val)
+inline float32x4_t vtanhq_f32(const float32x4_t &val)
{
- static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
- static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
+ static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
+ static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f
+ static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f); // 5.f
- float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));
+ float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+ float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
float32x4_t num = vsubq_f32(exp2x, CONST_1);
float32x4_t den = vaddq_f32(exp2x, CONST_1);
- float32x4_t tanh = vmulq_f32(num, vinv_f32(den));
+ float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));
return tanh;
}
*/
inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
{
- return vexp_f32(vmulq_f32(n, vlog_f32(val)));
+ return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
}