[Beignet] [PATCH] [PATCH]GBE: improve precision of expm1
Lv Meng
meng.lv at intel.com
Tue Dec 17 23:24:14 PST 2013
Signed-off-by: Lv Meng <meng.lv at intel.com>
---
backend/src/ocl_stdlib.tmpl.h | 107 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 106 insertions(+), 1 deletion(-)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index c99fde5..2e4d626 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1648,7 +1648,6 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
return __gen_ocl_pow(x, 0.3333333333f);
}
@@ -2031,6 +2030,112 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
return x; /* exact output */
}
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+ //return __gen_ocl_pow(M_E_F, x) - 1;
+ float Q1 = -3.3333335072e-02, /* 0xbd088889 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+ Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+ Q4 = 4.0082177293e-06, /* 0x36867e54 */
+ Q5 = -2.0109921195e-07, /* 0xb457edbb */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ one = 1.0,
+ o_threshold= 8.8721679688e+01; /* 0x42b17180 */
+ float y,hi,lo,c,t,e,hxs,hfx,r1;
+ int k,xsb;
+ int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ xsb = hx&0x80000000;
+ /* sign bit of x */
+ //if(xsb==0)
+ //y=x;
+ //else
+ //y= -x; /* y = |x| */
+ y = __gen_ocl_internal_fabs(x);
+ hx &= 0x7fffffff; /* high word of |x| */
+ /* filter out huge and non-finite argument */
+ if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ if(hx>0x7f800000)
+ return x+x; /* NaN */
+ if(hx==0x7f800000)
+ return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+ if(x > o_threshold)
+ return huge*huge; /* overflow */
+ }
+ if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+ if(x+tiny<(float)0.0) /* raise inexact */
+ return tiny-one; /* return -1 */
+ }
+ }
+ /* argument reduction */
+ if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+ if(xsb==0){
+ hi = x - ln2_hi; lo = ln2_lo; k = 1;
+ } else {
+ hi = x + ln2_hi; lo = -ln2_lo; k = -1;
+ }
+ } else {
+ k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+ t = k;
+ hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+ lo = t*ln2_lo;
+ }
+ x = hi - lo;
+ c = (hi-x)-lo;
+ } else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
+ //t = huge+x; /* return x with inexact flags when x!=0 */
+ //return x - (t-(huge+x));
+ return x;
+ } else k = 0;
+ /* x is now in primary range */
+ hfx = (float)0.5*x;
+ hxs = x*hfx;
+ r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ t = (float)3.0-r1*hfx;
+ e = hxs*((r1-t)/((float)6.0 - x*t));
+ if(k==0)
+ return x - (x*e-hxs); /* c is 0 */
+ else{
+ e = (x*(e-c)-c);
+ e -= hxs;
+ if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+ if(k==1){
+ if(x < (float)-0.25)
+ return -(float)2.0*(e-(x+(float)0.5));
+ else
+ return (one+(float)2.0*(x-e));
+ }
+ if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
+ int i;
+ y = one-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ return y-one;
+ }
+ t = one;
+ if(k<23) {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+ y = t-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ } else {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
+ y = x-(e+t);
+ y += one;
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ }
+ }
+ return y;
+}
+
// TODO use llvm intrinsics definitions
#define cos native_cos
#define cospi __gen_ocl_internal_cospi
--
1.7.10.4
More information about the Beignet
mailing list