From 41e8926aa4b7f17bc95984737ee82a254ad0911c Mon Sep 17 00:00:00 2001 From: Adhemerval Zanella Date: Mon, 25 Nov 2013 06:12:30 -0600 Subject: [PATCH] PowerPC: Set/restore rounding mode only when needed This patch helps some math functions performance by adding the libc_fexxx variant of inline functions to handle both FPU round and exception set/restore and by using them on the libc_fexxx_ctx functions. It is based on already coded fexxx family functions for PPC with fpu. Here is the summary of performance improvements due this patch (measured on a POWER7 machine): Before: cos(): ITERS:9.5895e+07: TOTAL:5116.03Mcy, MAX:77.6cy, MIN:49.792cy, 18744 calls/Mcy exp(): ITERS:2.827e+07: TOTAL:5187.15Mcy, MAX:494.018cy, MIN:38.422cy, 5450.01 calls/Mcy pow(): ITERS:6.1705e+07: TOTAL:5144.26Mcy, MAX:171.95cy, MIN:29.935cy, 11994.9 calls/Mcy sin(): ITERS:8.6898e+07: TOTAL:5117.06Mcy, MAX:83.841cy, MIN:46.582cy, 16982 calls/Mcy tan(): ITERS:2.9473e+07: TOTAL:5115.39Mcy, MAX:191.017cy, MIN:172.352cy, 5761.63 calls/Mcy After: cos(): ITERS:2.05265e+08: TOTAL:5111.37Mcy, MAX:78.754cy, MIN:24.196cy, 40158.5 calls/Mcy exp(): ITERS:3.341e+07: TOTAL:5170.84Mcy, MAX:476.317cy, MIN:15.574cy, 6461.23 calls/Mcy pow(): ITERS:7.6153e+07: TOTAL:5129.1Mcy, MAX:147.5cy, MIN:30.916cy, 14847.2 calls/Mcy sin(): ITERS:1.58816e+08: TOTAL:5115.11Mcy, MAX:1490.39cy, MIN:22.341cy, 31048.4 calls/Mcy tan(): ITERS:3.4964e+07: TOTAL:5114.18Mcy, MAX:177.422cy, MIN:146.115cy, 6836.68 calls/Mcy --- ChangeLog | 16 ++ sysdeps/ieee754/ldbl-128ibm/e_expl.c | 31 ++-- sysdeps/ieee754/ldbl-128ibm/s_llrintl.c | 2 +- sysdeps/ieee754/ldbl-128ibm/s_lrintl.c | 2 +- sysdeps/ieee754/ldbl-128ibm/s_rintl.c | 2 +- sysdeps/powerpc/fpu/fenv_libc.h | 6 +- sysdeps/powerpc/fpu/fenv_private.h | 274 ++++++++++++++++++++++++++++++++ sysdeps/powerpc/fpu/math_private.h | 1 + 8 files changed, 309 insertions(+), 25 deletions(-) create mode 100644 sysdeps/powerpc/fpu/fenv_private.h diff --git a/ChangeLog b/ChangeLog index b9201fc..9e2a0c6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2013-11-25 Adhemerval Zanella + + * sysdeps/ieee754/ldbl-128ibm/e_expl.c (__ieee754_expl): Use + SET_RESTORE_ROUND instead of feholdexcept/fesetround/fesetenv. + * sysdeps/powerpc/fpu/fenv_libc.h (__fegetround): Remove define. + (__fesetround): Remove define. + * sysdeps/powerpc/fpu/fenv_private.h: New file: Inline floating point + rounding and exceptions handling. + * sysdeps/powerpc/fpu/math_private.h: Include fenv_private.h. + * sysdeps/powerpc/fpu/fenv_libc.h (__fe_mask_env): Define as hidden. + (__fe_nomask_env): Likewise. + * sysdeps/ieee754/ldbl-128ibm/s_llrintl.c (__llrintl): Use + __fegetround instead of fegetround. + * sysdeps/ieee754/ldbl-128ibm/s_lrintl.c (__lrintl): Likewise. + * sysdeps/ieee754/ldbl-128ibm/s_rintl.c (__rintl): Likewise. + 2013-11-21 Roland McGrath * malloc/malloc.c: Move #include to the top; comment why diff --git a/sysdeps/ieee754/ldbl-128ibm/e_expl.c b/sysdeps/ieee754/ldbl-128ibm/e_expl.c index f7c50bf..65ef185 100644 --- a/sysdeps/ieee754/ldbl-128ibm/e_expl.c +++ b/sysdeps/ieee754/ldbl-128ibm/e_expl.c @@ -134,18 +134,17 @@ static const long double C[] = { long double __ieee754_expl (long double x) { + long double result, x22; + union ibm_extended_long_double ex2_u, scale_u; + int unsafe; + /* Check for usual case. */ if (isless (x, himark) && isgreater (x, lomark)) { - int tval1, tval2, unsafe, n_i, exponent2; - long double x22, n, result, xl; - union ibm_extended_long_double ex2_u, scale_u; - fenv_t oldenv; - - feholdexcept (&oldenv); -#ifdef FE_TONEAREST - fesetround (FE_TONEAREST); -#endif + int tval1, tval2, n_i, exponent2; + long double n, xl; + + SET_RESTORE_ROUND (FE_TONEAREST); n = __roundl (x*M_1_LN2); x = x-n*M_LN2_0; @@ -201,11 +200,6 @@ __ieee754_expl (long double x) less than 4.8e-39. */ x22 = x + x*x*(P1+x*(P2+x*(P3+x*(P4+x*(P5+x*P6))))); - /* Return result. */ - fesetenv (&oldenv); - - result = x22 * ex2_u.ld + ex2_u.ld; - /* Now we can test whether the result is ultimate or if we are unsure. In the later case we should probably call a mpn based routine to give the ultimate result. @@ -235,10 +229,6 @@ __ieee754_expl (long double x) return __ieee754_expl_proc2 (origx); } */ - if (!unsafe) - return result; - else - return result * scale_u.ld; } /* Exceptional cases: */ else if (isless (x, himark)) @@ -253,5 +243,10 @@ __ieee754_expl (long double x) else /* Return x, if x is a NaN or Inf; or overflow, otherwise. */ return TWO1023*x; + + result = x22 * ex2_u.ld + ex2_u.ld; + if (!unsafe) + return result; + return result * scale_u.ld; } strong_alias (__ieee754_expl, __expl_finite) diff --git a/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c b/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c index 8560349..3503973 100644 --- a/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c +++ b/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c @@ -43,7 +43,7 @@ __llrintl (long double x) #endif ) { - save_round = fegetround (); + save_round = __fegetround (); if (__builtin_expect ((xh == -(double) (-__LONG_LONG_MAX__ - 1)), 0)) { diff --git a/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c b/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c index 588098d..49dbd42 100644 --- a/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c +++ b/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c @@ -49,7 +49,7 @@ __lrintl (long double x) #endif ) { - save_round = fegetround (); + save_round = __fegetround (); #if __LONG_MAX__ == 2147483647 long long llhi = (long long) xh; diff --git a/sysdeps/ieee754/ldbl-128ibm/s_rintl.c b/sysdeps/ieee754/ldbl-128ibm/s_rintl.c index 48dbe85..5fd6bb8 100644 --- a/sysdeps/ieee754/ldbl-128ibm/s_rintl.c +++ b/sysdeps/ieee754/ldbl-128ibm/s_rintl.c @@ -40,7 +40,7 @@ __rintl (long double x) __builtin_inf ()), 1)) { double orig_xh; - int save_round = fegetround (); + int save_round = __fegetround (); /* Long double arithmetic, including the canonicalisation below, only works in round-to-nearest mode. */ diff --git a/sysdeps/powerpc/fpu/fenv_libc.h b/sysdeps/powerpc/fpu/fenv_libc.h index cb15c1c..ecd6b91 100644 --- a/sysdeps/powerpc/fpu/fenv_libc.h +++ b/sysdeps/powerpc/fpu/fenv_libc.h @@ -23,9 +23,9 @@ #include #include -extern const fenv_t *__fe_nomask_env (void); +extern const fenv_t *__fe_nomask_env (void) attribute_hidden; -extern const fenv_t *__fe_mask_env (void); +extern const fenv_t *__fe_mask_env (void) attribute_hidden; /* The sticky bits in the FPSCR indicating exceptions have occurred. */ #define FPSCR_STICKY_BITS ((FE_ALL_EXCEPT | FE_ALL_INVALID) & ~FE_INVALID) @@ -83,7 +83,6 @@ __fegetround (void) "mfcr %0" : "=r"(result) : : "cr7"); return result & 3; } -#define fegetround() __fegetround() static inline int __fesetround (int round) @@ -107,7 +106,6 @@ __fesetround (int round) return 0; } -#define fesetround(mode) __fesetround(mode) /* Definitions of all the FPSCR bit numbers */ enum { diff --git a/sysdeps/powerpc/fpu/fenv_private.h b/sysdeps/powerpc/fpu/fenv_private.h new file mode 100644 index 0000000..293f840 --- /dev/null +++ b/sysdeps/powerpc/fpu/fenv_private.h @@ -0,0 +1,274 @@ +/* Private floating point rounding and exceptions handling. PowerPC version. + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef FENV_PRIVATE_H +#define FENV_PRIVATE_H 1 + +#include +#include +#include + +#define _FPU_MASK_ALL (_FPU_MASK_ZM | _FPU_MASK_OM | _FPU_MASK_UM \ + | _FPU_MASK_XM | _FPU_MASK_IM) + +/* Mask everything but the rounding moded and non-IEEE arithmetic flags. */ +#define _FPU_MASK_ROUNDING 0xffffffff00000007LL + +/* Mask restore rounding mode and exception enabled. */ +#define _FPU_MASK_EXCEPT_ROUND 0xffffffff1fffff00LL + +/* Mask exception enable but fraction rounded/inexact and FP result/CC + bits. */ +#define _FPU_MASK_FRAC_INEX_RET_CC 0x1ff80fff + +static __always_inline void +libc_feholdexcept_ppc (fenv_t *envp) +{ + fenv_union_t old, new; + + old.fenv = *envp = fegetenv_register (); + + new.l = old.l & _FPU_MASK_ROUNDING; + + /* If the old env had any enabled exceptions, then mask SIGFPE in the + MSR FE0/FE1 bits. This may allow the FPU to run faster because it + always takes the default action and can not generate SIGFPE. */ + if ((old.l & _FPU_MASK_ALL) != 0) + (void) __fe_mask_env (); + + fesetenv_register (new.fenv); +} + +static __always_inline void +libc_fesetround_ppc (int r) +{ + __fesetround (r); +} + +static __always_inline void +libc_feholdexcept_setround_ppc (fenv_t *envp, int r) +{ + fenv_union_t old, new; + + old.fenv = *envp = fegetenv_register (); + + new.l = (old.l & _FPU_MASK_ROUNDING) | r; + + if ((old.l & _FPU_MASK_ALL) != 0) + (void) __fe_mask_env (); + + fesetenv_register (new.fenv); +} + +static __always_inline int +libc_fetestexcept_ppc (int e) +{ + fenv_union_t u; + u.fenv = fegetenv_register (); + return u.l & e; +} + +static __always_inline void +libc_fesetenv_ppc (const fenv_t *envp) +{ + fenv_union_t old, new; + + new.fenv = *envp; + old.fenv = fegetenv_register (); + + /* If the old env has no enabled exceptions and the new env has any enabled + exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits. This will put the + hardware into "precise mode" and may cause the FPU to run slower on some + hardware. */ + if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0) + (void) __fe_nomask_env (); + + /* If the old env had any enabled exceptions and the new env has no enabled + exceptions, then mask SIGFPE in the MSR FE0/FE1 bits. This may allow the + FPU to run faster because it always takes the default action and can not + generate SIGFPE. */ + if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0) + (void) __fe_mask_env (); + + fesetenv_register (*envp); +} + +static __always_inline int +libc_feupdateenv_test_ppc (fenv_t *envp, int ex) +{ + fenv_union_t old, new; + + new.fenv = *envp; + old.fenv = fegetenv_register (); + + /* Restore rounding mode and exception enable from *envp and merge + exceptions. Leave fraction rounded/inexact and FP result/CC bits + unchanged. */ + new.l = (old.l & _FPU_MASK_EXCEPT_ROUND) + | (new.l & _FPU_MASK_FRAC_INEX_RET_CC); + + if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0) + (void) __fe_nomask_env (); + + if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0) + (void) __fe_mask_env (); + + fesetenv_register (new.fenv); + + return old.l & ex; +} + +static __always_inline void +libc_feupdateenv_ppc (fenv_t *e) +{ + libc_feupdateenv_test_ppc (e, 0); +} + +static __always_inline void +libc_feholdsetround_ppc (fenv_t *e, int r) +{ + fenv_union_t old, new; + + old.fenv = fegetenv_register (); + /* Clear current precision and set newer one. */ + new.l = (old.l & ~0x3) | r; + *e = old.fenv; + + if ((old.l & _FPU_MASK_ALL) != 0) + (void) __fe_mask_env (); + fesetenv_register (new.fenv); +} + +static __always_inline void +libc_feresetround_ppc (fenv_t *envp) +{ + fenv_union_t old, new; + + new.fenv = *envp; + old.fenv = fegetenv_register (); + + /* Restore rounding mode and exception enable from *envp and merge + exceptions. Leave fraction rounded/inexact and FP result/CC bits + unchanged. */ + new.l = (old.l & _FPU_MASK_EXCEPT_ROUND) + | (new.l & _FPU_MASK_FRAC_INEX_RET_CC); + + if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0) + (void) __fe_nomask_env (); + + if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0) + (void) __fe_mask_env (); + + /* Atomically enable and raise (if appropriate) exceptions set in `new'. */ + fesetenv_register (new.fenv); +} + +#define libc_feholdexceptf libc_feholdexcept_ppc +#define libc_feholdexcept libc_feholdexcept_ppc +#define libc_feholdexcept_setroundf libc_feholdexcept_setround_ppc +#define libc_feholdexcept_setround libc_feholdexcept_setround_ppc +#define libc_fetestexceptf libc_fetestexcept_ppc +#define libc_fetestexcept libc_fetestexcept_ppc +#define libc_fesetroundf libc_fesetround_ppc +#define libc_fesetround libc_fesetround_ppc +#define libc_fesetenvf libc_fesetenv_ppc +#define libc_fesetenv libc_fesetenv_ppc +#define libc_feupdateenv_testf libc_feupdateenv_test_ppc +#define libc_feupdateenv_test libc_feupdateenv_test_ppc +#define libc_feupdateenvf libc_feupdateenv_ppc +#define libc_feupdateenv libc_feupdateenv_ppc +#define libc_feholdsetroundf libc_feholdsetround_ppc +#define libc_feholdsetround libc_feholdsetround_ppc +#define libc_feresetroundf libc_feresetround_ppc +#define libc_feresetround libc_feresetround_ppc + + +/* We have support for rounding mode context. */ +#define HAVE_RM_CTX 1 + +static __always_inline void +libc_feholdexcept_setround_ppc_ctx (struct rm_ctx *ctx, int r) +{ + fenv_union_t old, new; + + old.fenv = fegetenv_register (); + + new.l = (old.l & _FPU_MASK_ROUNDING) | r; + ctx->env = old.fenv; + if (__glibc_unlikely (new.l != old.l)) + { + if ((old.l & _FPU_MASK_ALL) != 0) + (void) __fe_mask_env (); + fesetenv_register (new.fenv); + ctx->updated_status = true; + } + else + ctx->updated_status = false; +} + +static __always_inline void +libc_fesetenv_ppc_ctx (struct rm_ctx *ctx) +{ + libc_fesetenv_ppc (&ctx->env); +} + +static __always_inline void +libc_feupdateenv_ppc_ctx (struct rm_ctx *ctx) +{ + if (__glibc_unlikely (ctx->updated_status)) + libc_feupdateenv_test_ppc (&ctx->env, 0); +} + +static __always_inline void +libc_feholdsetround_ppc_ctx (struct rm_ctx *ctx, int r) +{ + fenv_union_t old, new; + + old.fenv = fegetenv_register (); + new.l = (old.l & ~0x3) | r; + ctx->env = old.fenv; + if (__glibc_unlikely (new.l != old.l)) + { + if ((old.l & _FPU_MASK_ALL) != 0) + (void) __fe_mask_env (); + fesetenv_register (new.fenv); + ctx->updated_status = true; + } + else + ctx->updated_status = false; +} + +static __always_inline void +libc_feresetround_ppc_ctx (struct rm_ctx *ctx) +{ + if (__glibc_unlikely (ctx->updated_status)) + libc_feresetround_ppc (&ctx->env); +} + +#define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_ppc_ctx +#define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_ppc_ctx +#define libc_fesetenv_ctx libc_fesetenv_ppc_ctx +#define libc_fesetenvf_ctx libc_fesetenv_ppc_ctx +#define libc_feholdsetround_ctx libc_feholdsetround_ppc_ctx +#define libc_feholdsetroundf_ctx libc_feholdsetround_ppc_ctx +#define libc_feresetround_ctx libc_feresetround_ppc_ctx +#define libc_feresetroundf_ctx libc_feresetround_ppc_ctx +#define libc_feupdateenvf_ctx libc_feupdateenv_ppc_ctx +#define libc_feupdateenv_ctx libc_feupdateenv_ppc_ctx + +#endif diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h index 6c00785..c8833d6 100644 --- a/sysdeps/powerpc/fpu/math_private.h +++ b/sysdeps/powerpc/fpu/math_private.h @@ -22,6 +22,7 @@ #include #include #include +#include #include_next # if __WORDSIZE == 64 || defined _ARCH_PWR4 -- 2.7.4