From: skykongkong8 Date: Mon, 15 Jan 2024 05:35:21 +0000 (+0900) Subject: [ BLAS ] Refactor neon_mathfun X-Git-Tag: accepted/tizen/7.0/unified/20240830.164841~374 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f9df2b0eeaae18e1e547cbb3ba8e89803ece931a;p=platform%2Fcore%2Fml%2Fnntrainer.git [ BLAS ] Refactor neon_mathfun - For easier usage of neon_mathfun, refactor to avoid duplicated symbol error **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 2ca280e0..8bf24c7c 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -13,7 +13,6 @@ */ #include -#include #include namespace nntrainer::neon { diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 14ed097d..f9bd22d5 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -17,6 +17,7 @@ #include #include +#include namespace nntrainer::neon { diff --git a/nntrainer/tensor/neon_mathfun.h b/nntrainer/tensor/neon_mathfun.h new file mode 100644 index 00000000..602d0484 --- /dev/null +++ b/nntrainer/tensor/neon_mathfun.h @@ -0,0 +1,80 @@ +/* NEON implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library +*/ + +/* Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +/** + * @file neon_mathfun.h + * @date 15 Jan 2024 + * @brief This is collection of sin, cos, exp, log function with NEON SIMD + * @see https://github.com/nnstreamer/nntrainer + * @author Julien Pommier + * @bug No known bugs except for NYI items + * + */ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#ifndef NEON_MATHFUN_H_ +#define NEON_MATHFUN_H_ + +#include + +typedef float32x4_t v4sf; // vector of 4 float + +// prototypes +/** + * @brief log function with neon x = log(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf log_ps(v4sf x); + +/** + * @brief exp function with neon x = exp(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf exp_ps(v4sf x); + +/** + * @brief sin_ps function with neon x = sin(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf sin_ps(v4sf x); + +/** + * @brief cos_ps function with neon x = cos(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf cos_ps(v4sf x); + +/** + * @brief sincos_ps function with neon x = sin(x) or cos(x) + * @param[in] x register variable (float32x4_t) + */ +inline void sincos_ps(v4sf x, v4sf *s, v4sf *c); + +#include "neon_mathfun.hxx" + +#endif +#endif diff --git a/nntrainer/tensor/neon_mathfun.hxx b/nntrainer/tensor/neon_mathfun.hxx new file mode 100644 index 00000000..ed889b83 --- /dev/null +++ b/nntrainer/tensor/neon_mathfun.hxx @@ -0,0 +1,311 @@ +/* NEON implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library +*/ + +/* Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +/** + * @file neon_mathfun.hxx + * @date 15 Jan 2024 + * @brief This is collection of sin, cos, exp, log function with NEON SIMD + * @see https://github.com/nnstreamer/nntrainer + * @author Julien Pommier + * @bug No known bugs except for NYI items + * + */ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +typedef uint32x4_t v4su; // vector of 4 uint32 +typedef int32x4_t v4si; // vector of 4 uint32 + +#define c_inv_mant_mask ~0x7f800000u +#define c_cephes_SQRTHF 0.707106781186547524 +#define c_cephes_log_p0 7.0376836292E-2 +#define c_cephes_log_p1 -1.1514610310E-1 +#define c_cephes_log_p2 1.1676998740E-1 +#define c_cephes_log_p3 -1.2420140846E-1 +#define c_cephes_log_p4 +1.4249322787E-1 +#define c_cephes_log_p5 -1.6668057665E-1 +#define c_cephes_log_p6 +2.0000714765E-1 +#define c_cephes_log_p7 -2.4999993993E-1 +#define c_cephes_log_p8 +3.3333331174E-1 +#define c_cephes_log_q1 -2.12194440e-4 +#define c_cephes_log_q2 0.693359375 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +v4sf log_ps(v4sf x) { + v4sf one = vdupq_n_f32(1); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + v4si ux = vreinterpretq_s32_f32(x); + + v4si emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); + v4sf e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); + v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, one); + e = vsubq_f32( + e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); + x = vaddq_f32(x, tmp); + + v4sf z = vmulq_f32(x, x); + + v4sf y = vdupq_n_f32(c_cephes_log_p0); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); + y = vaddq_f32(y, tmp); + + tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32(vorrq_u32( + vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + +#define c_exp_hi 88.3762626647949f +#define c_exp_lo -88.3762626647949f + +#define c_cephes_LOG2EF 1.44269504088896341 +#define c_cephes_exp_C1 0.693359375 +#define c_cephes_exp_C2 -2.12194440e-4 + +#define c_cephes_exp_p0 1.9875691500E-4 +#define c_cephes_exp_p1 1.3981999507E-3 +#define c_cephes_exp_p2 8.3334519073E-3 +#define c_cephes_exp_p3 4.1665795894E-2 +#define c_cephes_exp_p4 1.6666665459E-1 +#define c_cephes_exp_p5 5.0000001201E-1 + +/* exp() computed for 4 float at once */ +v4sf exp_ps(v4sf x) { + v4sf tmp, fx; + + v4sf one = vdupq_n_f32(1); + x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); + x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + v4su mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); + v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, + c_cephes_exp_p2, c_cephes_exp_p3, + c_cephes_exp_p4, c_cephes_exp_p5}; + v4sf y = vld1q_dup_f32(cephes_exp_p + 0); + v4sf c1 = vld1q_dup_f32(cephes_exp_p + 1); + v4sf c2 = vld1q_dup_f32(cephes_exp_p + 2); + v4sf c3 = vld1q_dup_f32(cephes_exp_p + 3); + v4sf c4 = vld1q_dup_f32(cephes_exp_p + 4); + v4sf c5 = vld1q_dup_f32(cephes_exp_p + 5); + + y = vmulq_f32(y, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, c1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, one); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); + mm = vshlq_n_s32(mm, 23); + v4sf pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +#define c_minus_cephes_DP1 -0.78515625 +#define c_minus_cephes_DP2 -2.4187564849853515625e-4 +#define c_minus_cephes_DP3 -3.77489497744594108e-8 +#define c_sincof_p0 -1.9515295891E-4 +#define c_sincof_p1 8.3321608736E-3 +#define c_sincof_p2 -1.6666654611E-1 +#define c_coscof_p0 2.443315711809948E-005 +#define c_coscof_p1 -1.388731625493765E-003 +#define c_coscof_p2 4.166664568298827E-002 +#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI + +/* evaluation of 4 sines & cosines at once. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Note also that when you compute sin(x), cos(x) is available at + almost no extra price so both sin_ps and cos_ps make use of + sincos_ps.. + */ +void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x + v4sf xmm1, xmm2, xmm3, y; + + v4su emm2; + + v4su sign_mask_sin, sign_mask_cos; + sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); + x = vabsq_f32(x); + + /* scale by 4/Pi */ + y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); + + /* store the integer part of y in mm0 */ + emm2 = vcvtq_u32_f32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); + emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); + y = vcvtq_f32_u32(emm2); + + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 - -typedef float32x4_t v4sf; // vector of 4 float -typedef uint32x4_t v4su; // vector of 4 uint32 -typedef int32x4_t v4si; // vector of 4 uint32 - -/** - * @def c_inv_mant_mask extract the mantissa of a float by performing a - * bitwise negation on the binary representation of 0x7f800000. - * @def c_cephes_SQRTHF value of sqrt(0.5). - * @def c_cephes_log_p0 coefficients used in logarithm calculations - * @def c_cephes_log_p1 coefficients used in logarithm calculations - * @def c_cephes_log_p2 coefficients used in logarithm calculations - * @def c_cephes_log_p3 coefficients used in logarithm calculations - * @def c_cephes_log_p4 coefficients used in logarithm calculations - * @def c_cephes_log_p5 coefficients used in logarithm calculations - * @def c_cephes_log_p6 coefficients used in logarithm calculations - * @def c_cephes_log_p7 coefficients used in logarithm calculations - * @def c_cephes_log_p8 coefficients used in logarithm calculations - * @def c_cephes_log_q1 constant used in logarithm calculations. - * @def c_cephes_log_q2 natural logarithm of 2. - */ -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 -1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 -1.2420140846E-1 -#define c_cephes_log_p4 +1.4249322787E-1 -#define c_cephes_log_p5 -1.6668057665E-1 -#define c_cephes_log_p6 +2.0000714765E-1 -#define c_cephes_log_p7 -2.4999993993E-1 -#define c_cephes_log_p8 +3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -/** natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 -*/ -/** - * @brief log function with neon x = log(x) - * @param[in] x register variable (float32x4_t) - */ -v4sf log_ps(v4sf x) { - v4sf one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ - v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); - - v4si ux = vreinterpretq_s32_f32(x); - - v4si emm0 = vshrq_n_s32(ux, 23); - - /* keep only the fractional part */ - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - v4sf e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - /** part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32( - e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - v4sf z = vmulq_f32(x, x); - - v4sf y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} -/** - * @def c_exp_hi constant used in exponent calculations - * @def c_exp_lo constant used in exponent calculations - * @def c_cephes_LOG2EF base-e logarithm of 2 - * @def c_cephes_exp_C1 constant used in exponent calculations - * @def c_cephes_exp_C2 constant used in exponent calculations - * @def c_cephes_exp_p0 coefficient used in exponent calculations - * @def c_cephes_exp_p1 coefficient used in exponent calculations - * @def c_cephes_exp_p2 coefficient used in exponent calculations - * @def c_cephes_exp_p3 coefficient used in exponent calculations - * @def c_cephes_exp_p4 coefficient used in exponent calculations - * @def c_cephes_exp_p5 coefficient used in exponent calculations - */ -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -/* exp() computed for 4 float at once */ -/** - * @brief exp function with neon x = exp(x) - * @param[in] x register variable (float32x4_t) - */ -v4sf exp_ps(v4sf x) { - v4sf tmp, fx; - - v4sf one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - v4su mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, - c_cephes_exp_p2, c_cephes_exp_p3, - c_cephes_exp_p4, c_cephes_exp_p5}; - v4sf y = vld1q_dup_f32(cephes_exp_p + 0); - v4sf c1 = vld1q_dup_f32(cephes_exp_p + 1); - v4sf c2 = vld1q_dup_f32(cephes_exp_p + 2); - v4sf c3 = vld1q_dup_f32(cephes_exp_p + 3); - v4sf c4 = vld1q_dup_f32(cephes_exp_p + 4); - v4sf c5 = vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - v4sf pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -/** - * @def c_minus_cephes_DP1 constant used in sine and cosine calculations - * @def c_minus_cephes_DP2 constant used in sine and cosine calculations - * @def c_minus_cephes_DP3 constant used in sine and cosine calculations - * @def c_sincof_p0 coefficient used in sine calculations - * @def c_sincof_p1 coefficient used in sine calculations - * @def c_sincof_p2 coefficient used in sine calculations - * @def c_coscof_p0 coefficient used in cosine calculations - * @def c_coscof_p1 coefficient used in cosine calculations - * @def c_coscof_p2 coefficient used in cosine calculations - * @def c_cephes_FOPI approximation of 4 / pi - */ -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI - -/** evaluation of 4 sines & cosines at once. - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. - - Note also that when you compute sin(x), cos(x) is available at - almost no extra price so both sin_ps and cos_ps make use of - sincos_ps.. - */ -/** - * @brief sincos_ps function with neon x = sin(x) or cos(x) - * @param[in] x register variable (float32x4_t) - */ -void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x - v4sf xmm1, xmm2, xmm3, y; - - v4su emm2; - - v4su sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - /* scale by 4/Pi */ - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - /* store the integer part of y in mm0 */ - emm2 = vcvtq_u32_f32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - /** get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4