The branch prediction hints is actually hurts performance in this case.
The assembly implementation make two assumptions: 1. 'fabs (x) < 2^52'
is unlikely and 2. 'x > 0.0' is unlike (if 1. is true). Since it a
general floating point function, expected input is not bounded and then
it is better to let the hardware handle the branches.
# See pow-inputs for an example.
subdir := benchtests
-bench := exp pow
+bench := exp pow rint
exp-ITER = 100000
exp-ARGLIST = double
pow-RET = double
LDFLAGS-bench-pow = -lm
+rint-ITER = 250000000
+rint-ARGLIST = double
+rint-RET = double
+LDFLAGS-bench-rint = -lm
+
include ../Makeconfig
include ../Rules
--- /dev/null
+78.5
+-78.5
+4503599627370497.0
+-4503599627370497.0
fsub fp12,fp13,fp13 /* generate 0.0 */
fcmpu cr7,fp0,fp13 /* if (fabs(x) > TWO52) */
fcmpu cr6,fp1,fp12 /* if (x > 0.0) */
- bnllr- cr7
- bng- cr6,.L4
+ bnllr cr7
+ bng cr6,.L4
fadd fp1,fp1,fp13 /* x+= TWO52; */
fsub fp1,fp1,fp13 /* x-= TWO52; */
fabs fp1,fp1 /* if (x == 0.0) */
blr /* x = 0.0; */
.L4:
- bnllr- cr6 /* if (x < 0.0) */
+ bnllr cr6 /* if (x < 0.0) */
fsub fp1,fp1,fp13 /* x-= TWO52; */
fadd fp1,fp1,fp13 /* x+= TWO52; */
fnabs fp1,fp1 /* if (x == 0.0) */
fsubs fp12,fp13,fp13 /* generate 0.0 */
fcmpu cr7,fp0,fp13 /* if (fabs(x) > TWO23) */
fcmpu cr6,fp1,fp12 /* if (x > 0.0) */
- bnllr- cr7
- bng- cr6,.L4
+ bnllr cr7
+ bng cr6,.L4
fadds fp1,fp1,fp13 /* x+= TWO23; */
fsubs fp1,fp1,fp13 /* x-= TWO23; */
fabs fp1,fp1 /* if (x == 0.0) */
blr /* x = 0.0; */
.L4:
- bnllr- cr6 /* if (x < 0.0) */
+ bnllr cr6 /* if (x < 0.0) */
fsubs fp1,fp1,fp13 /* x-= TWO23; */
fadds fp1,fp1,fp13 /* x+= TWO23; */
fnabs fp1,fp1 /* if (x == 0.0) */
fsub fp12,fp13,fp13 /* generate 0.0 */
fcmpu cr7,fp0,fp13 /* if (fabs(x) > TWO52) */
fcmpu cr6,fp1,fp12 /* if (x > 0.0) */
- bnllr- cr7
- bng- cr6,.L4
+ bnllr cr7
+ bng cr6,.L4
fadd fp1,fp1,fp13 /* x+= TWO52; */
fsub fp1,fp1,fp13 /* x-= TWO52; */
fabs fp1,fp1 /* if (x == 0.0) */
blr /* x = 0.0; */
.L4:
- bnllr- cr6 /* if (x < 0.0) */
+ bnllr cr6 /* if (x < 0.0) */
fsub fp1,fp1,fp13 /* x-= TWO52; */
fadd fp1,fp1,fp13 /* x+= TWO52; */
fnabs fp1,fp1 /* if (x == 0.0) */
fsubs fp12,fp13,fp13 /* generate 0.0 */
fcmpu cr7,fp0,fp13 /* if (fabs(x) > TWO23) */
fcmpu cr6,fp1,fp12 /* if (x > 0.0) */
- bnllr- cr7
- bng- cr6,.L4
+ bnllr cr7
+ bng cr6,.L4
fadds fp1,fp1,fp13 /* x+= TWO23; */
fsubs fp1,fp1,fp13 /* x-= TWO23; */
fabs fp1,fp1 /* if (x == 0.0) */
blr /* x = 0.0; */
.L4:
- bnllr- cr6 /* if (x < 0.0) */
+ bnllr cr6 /* if (x < 0.0) */
fsubs fp1,fp1,fp13 /* x-= TWO23; */
fadds fp1,fp1,fp13 /* x+= TWO23; */
fnabs fp1,fp1 /* if (x == 0.0) */