From 84fa13e375c49759fad21fd7d4fe09996a81d3d5 Mon Sep 17 00:00:00 2001
From: Lv Meng
Date: Wed, 18 Dec 2013 15:24:14 +0800
Subject: [PATCH] GBE: improve precision of expm1

Signed-off-by: Lv Meng
Tested-by: "Yang, Rong R"
---
 backend/src/ocl_stdlib.tmpl.h | 118 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 1 deletion(-)

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index fa23fd0..14325c8 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1661,7 +1661,6 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
 INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
 INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
 INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
 INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   return __gen_ocl_pow(x, 0.3333333333f);
 }
@@ -2044,6 +2043,123 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
   return x; /* exact output */
 }
 
+/*
+ * Single-precision expm1: returns exp(x) - 1.
+ *
+ * Steps, in the order they appear below:
+ *  1. Special cases: NaN returns x+x, +inf returns x, -inf returns -1,
+ *     large positive x returns huge*huge (overflow), x below -27*ln2
+ *     returns tiny-one (i.e. -1), and |x| < 2**-25 returns x unchanged.
+ *  2. Argument reduction: for |x| > 0.5*ln2, x is rewritten as k*ln2 + r
+ *     using the split constant ln2_hi + ln2_lo; c keeps the rounding error
+ *     of r so it can be compensated later.
+ *  3. expm1(r) is approximated from r using the Q1..Q5 coefficients.
+ *  4. The result is scaled by 2^k by adding k to the float's exponent bits.
+ *
+ * Literals use the 'f' suffix so every operation stays in single precision.
+ */
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  const float one = 1.0f,
+    huge = 1.0e30f,
+    tiny = 1.0e-30f,
+    o_threshold = 8.8721679688e+01f, /* 0x42b17180 */
+    ln2_hi = 6.9313812256e-01f,      /* 0x3f317180 */
+    ln2_lo = 9.0580006145e-06f,      /* 0x3717f7d1 */
+    ivln2 = 1.4426950216e+00f,       /* 0x3fb8aa3b = 1/ln2 */
+    Q1 = -3.3333335072e-02f,         /* 0xbd088889 */
+    Q2 = 1.5873016091e-03f,          /* 0x3ad00d01 */
+    Q3 = -7.9365076090e-05f,         /* 0xb8a670cd */
+    Q4 = 4.0082177293e-06f,          /* 0x36867e54 */
+    Q5 = -2.0109921195e-07f;         /* 0xb457edbb */
+  float y, hi, lo, c, t, e, hxs, hfx, r1;
+  int k, xsb;
+  int hx;
+
+  GEN_OCL_GET_FLOAT_WORD(hx, x);
+  xsb = hx & 0x80000000; /* sign bit of x */
+  hx &= 0x7fffffff;      /* bit pattern of |x| */
+
+  /* filter out huge and non-finite argument */
+  if (hx >= 0x4195b844) {   /* if |x| >= 27*ln2 */
+    if (hx >= 0x42b17218) { /* if |x| >= 88.721... */
+      if (hx > 0x7f800000)
+        return x + x; /* NaN */
+      if (hx == 0x7f800000)
+        return (xsb == 0) ? x : -1.0f; /* expm1(+-inf) = {inf, -1} */
+      if (x > o_threshold)
+        return huge * huge; /* overflow */
+    }
+    if (xsb != 0) { /* x < -27*ln2, return -1.0 with inexact */
+      if (x + tiny < 0.0f) /* raise inexact */
+        return tiny - one; /* return -1 */
+    }
+  }
+
+  /* argument reduction */
+  if (hx > 0x3eb17218) {   /* if |x| > 0.5 ln2 */
+    if (hx < 0x3F851592) { /* and |x| < 1.5 ln2 */
+      if (xsb == 0) {
+        hi = x - ln2_hi; lo = ln2_lo; k = 1;
+      } else {
+        hi = x + ln2_hi; lo = -ln2_lo; k = -1;
+      }
+    } else {
+      k = ivln2 * x + ((xsb == 0) ? 0.5f : -0.5f); /* x/ln2 +- 0.5, truncated to int */
+      t = k;
+      hi = x - t * ln2_hi; /* t*ln2_hi is exact here */
+      lo = t * ln2_lo;
+    }
+    x = hi - lo;
+    c = (hi - x) - lo;
+  } else if (hx < 0x33000000) { /* when |x| < 2**-25, return x */
+    return x;
+  } else {
+    k = 0;
+  }
+
+  /* x is now in primary range */
+  hfx = 0.5f * x;
+  hxs = x * hfx;
+  r1 = one + hxs * (Q1 + hxs * (Q2 + hxs * (Q3 + hxs * (Q4 + hxs * Q5))));
+  t = 3.0f - r1 * hfx;
+  e = hxs * ((r1 - t) / (6.0f - x * t));
+  if (k == 0)
+    return x - (x * e - hxs); /* no reduction was done, so c is not needed */
+  /* fold the reduction error c into e, then rebuild the result from k */
+  e = (x * (e - c) - c);
+  e -= hxs;
+  if (k == -1)
+    return 0.5f * (x - e) - 0.5f;
+  if (k == 1) {
+    if (x < -0.25f)
+      return -2.0f * (e - (x + 0.5f));
+    return one + 2.0f * (x - e);
+  }
+  if (k <= -2 || k > 56) { /* suffice to return exp(x)-1 */
+    int i;
+    y = one - (e - x);
+    GEN_OCL_GET_FLOAT_WORD(i, y);
+    GEN_OCL_SET_FLOAT_WORD(y, i + (k << 23)); /* add k to y's exponent */
+    return y - one;
+  }
+  t = one;
+  if (k < 23) {
+    int i;
+    GEN_OCL_SET_FLOAT_WORD(t, 0x3f800000 - (0x1000000 >> k)); /* t=1-2^-k */
+    y = t - (e - x);
+    GEN_OCL_GET_FLOAT_WORD(i, y);
+    GEN_OCL_SET_FLOAT_WORD(y, i + (k << 23)); /* add k to y's exponent */
+  } else {
+    int i;
+    GEN_OCL_SET_FLOAT_WORD(t, ((0x7f - k) << 23)); /* 2^-k */
+    y = x - (e + t);
+    y += one;
+    GEN_OCL_GET_FLOAT_WORD(i, y);
+    GEN_OCL_SET_FLOAT_WORD(y, i + (k << 23)); /* add k to y's exponent */
+  }
+  return y;
+}
+
 // TODO use llvm intrinsics definitions
 #define cos native_cos
 #define cospi __gen_ocl_internal_cospi
-- 
2.7.4