From 5b0c2996f8b9919cf2ce27e068a827152bacf7ec Mon Sep 17 00:00:00 2001 From: Lv Meng Date: Wed, 18 Dec 2013 14:29:02 +0800 Subject: [PATCH] GBE: improve precision of exp Signed-off-by: Lv Meng Tested-by: "Yang, Rong R" --- backend/src/ocl_stdlib.tmpl.h | 101 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) mode change 100644 => 100755 backend/src/ocl_stdlib.tmpl.h diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h old mode 100644 new mode 100755 index a9e9419..a7414b7 --- a/backend/src/ocl_stdlib.tmpl.h +++ b/backend/src/ocl_stdlib.tmpl.h @@ -148,6 +148,31 @@ INLINE_OVERLOADABLE float __ocl_nanf(void) { u.u = 0x7F800001; return u.f; } +typedef union +{ + float value; + uint word; +} float_shape_type; + +/* Get a 32 bit int from a float. */ +#ifndef GEN_OCL_GET_FLOAT_WORD +# define GEN_OCL_GET_FLOAT_WORD(i,d) \ +do { \ + float_shape_type gf_u; \ + gf_u.value = (d); \ + (i) = gf_u.word; \ +} while (0) +#endif +/* Set a float from a 32 bit int. */ +#ifndef GEN_OCL_SET_FLOAT_WORD +# define GEN_OCL_SET_FLOAT_WORD(d,i) \ +do { \ + float_shape_type sf_u; \ + sf_u.word = (i); \ + (d) = sf_u.value; \ +} while (0) +#endif + #define HUGE_VALF (__ocl_inff()) #define INFINITY (__ocl_inff()) #define NAN (__ocl_nanf()) @@ -1864,13 +1889,87 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { } INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); } INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); } -INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) { return native_exp(x); } INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); } INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); } INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); } INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) { return __gen_ocl_rnde(x); } + +INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) { + //return native_exp(x); + float o_threshold = 8.8721679688e+01, /* 0x42b17180 */ + u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */ + twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */ + ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */ + one = 1.0, + huge = 1.0e+30, + P1 = 1.6666667163e-01, /* 0x3e2aaaab */ + P2 = -2.7777778450e-03, /* 0xbb360b61 */ + P3 = 6.6137559770e-05, /* 0x388ab355 */ + P4 = -1.6533901999e-06, /* 0xb5ddea0e */ + P5 = 4.1381369442e-08; /* 0x3331bb4c */ + float ln2HI[2],ln2LO[2],halF[2]; + float y,hi=0.0,lo=0.0,c,t; + int k=0,xsb; + unsigned hx; + ln2HI[0] = 6.9313812256e-01; /* 0x3f317180 */ + ln2HI[1] = -6.9313812256e-01; /* 0xbf317180 */ + ln2LO[0] = 9.0580006145e-06; /* 0x3717f7d1 */ + ln2LO[1] = -9.0580006145e-06; /* 0xb717f7d1 */ + halF[0] = 0.5; + halF[1] = -0.5; + + GEN_OCL_GET_FLOAT_WORD(hx,x); + xsb = (hx>>31)&1; /* sign bit of x */ + hx &= 0x7fffffff; /* high word of |x| */ + + /* filter out non-finite argument */ + if(hx >= 0x42b17218) { /* if |x|>=88.721... */ + if(hx>0x7f800000) + return x+x; /* NaN */ + if(hx==0x7f800000) + return (xsb==0)? x:0.0; /* exp(+-inf)={inf,0} */ + if(x > o_threshold) return huge*huge; /* overflow */ + if(x < u_threshold) return twom100*twom100; /* underflow */ + } + /* argument reduction */ + if(hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */ + if(hx < 0x3F851592) { /* and |x| < 1.5 ln2 */ + hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb; + } else { + k = ivln2*x+halF[xsb]; + t = k; + hi = x - t*ln2HI[0]; /* t*ln2HI is exact here */ + lo = t*ln2LO[0]; + } + x = hi - lo; + } + else if(hx < 0x31800000) { /* when |x|<2**-28 */ + if(huge+x>one) return one+x;/* trigger inexact */ + } + else k = 0; + + /* x is now in primary range */ + t = x*x; + c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))); + if(k==0) + return one-((x*c)/(c-(float)2.0)-x); + else + y = one-((lo-(x*c)/((float)2.0-c))-hi); + if(k >= -125) { + unsigned hy; + GEN_OCL_GET_FLOAT_WORD(hy,y); + GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */ + return y; + } else { + unsigned hy; + GEN_OCL_GET_FLOAT_WORD(hy,y); + GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */ + return y*twom100; + } +} + // TODO use llvm intrinsics definitions #define cos native_cos #define cospi __gen_ocl_internal_cospi -- 2.7.4