[Beignet] [PATCH] [PATCH]GBE: improve precision of expm1, acosh, asinh, sinh
Lv Meng
Wed Dec 18 10:17:11 PST 2013
From: Lv Meng <meng.lv at intel.com>
Signed-off-by: Lv Meng <meng.lv at intel.com>
---
backend/src/ocl_stdlib.tmpl.h | 198 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 188 insertions(+), 10 deletions(-)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index e380b79..5e06919 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1623,7 +1623,6 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
return __gen_ocl_pow(x, 0.3333333333f);
}
@@ -1635,9 +1634,6 @@ INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
#undef BODY
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
- return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
}
@@ -1735,12 +1731,6 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
return __gen_ocl_internal_atan(x) / M_PI_F;
}
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
- return native_log(x + native_sqrt(x * x + 1));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
- return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
return 0.5f * native_sqrt((1 + x) / (1 - x));
}
@@ -2025,6 +2015,194 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
}
return x; /* exact output */
}
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+ /* Computes exp(x)-1 via argument reduction x = k*ln2 + r and a rational approximation in r; replaces the former __gen_ocl_pow(M_E_F, x) - 1. */
+ float Q1 = -3.3333335072e-02, /* 0xbd088889 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+ Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+ Q4 = 4.0082177293e-06, /* 0x36867e54 */
+ Q5 = -2.0109921195e-07, /* 0xb457edbb */
+ huge = 1.0e30, /* huge*huge is used below to produce the overflow result */
+ tiny = 1.0e-30,
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ one = 1.0,
+ o_threshold= 8.8721679688e+01; /* 0x42b17180 */
+ float y,hi,lo,c,t,e,hxs,hfx,r1;
+ int k,xsb;
+ int hx;
+ GET_FLOAT_WORD(hx,x); /* presumably copies the raw bit pattern of x into hx -- macro defined elsewhere */
+ xsb = hx&0x80000000;
+ /* sign bit of x */
+ //y = |x|; the call below replaces the
+ //explicit sign test (xsb==0 ? x : -x).
+ //y is reassigned before every later read
+ //in this function, so this value is unused.
+ y = __gen_ocl_internal_fabs(x);
+ hx &= 0x7fffffff; /* high word of |x| */
+ /* filter out huge and non-finite argument */
+ if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ if(hx>0x7f800000)
+ return x+x; /* NaN */
+ if(hx==0x7f800000)
+ return (xsb==0)? x:-1.0;/* expm1(+-inf)={inf,-1} */
+ if(x > o_threshold)
+ return huge*huge; /* overflow */
+ }
+ if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+ if(x+tiny<(float)0.0) /* raise inexact */
+ return tiny-one; /* return -1 */
+ }
+ }
+ /* argument reduction */
+ if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+ if(xsb==0){
+ hi = x - ln2_hi; lo = ln2_lo; k = 1;
+ } else {
+ hi = x + ln2_hi; lo = -ln2_lo; k = -1;
+ }
+ } else {
+ k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5); /* x/ln2 biased by +-0.5, then converted to int */
+ t = k;
+ hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+ lo = t*ln2_lo;
+ }
+ x = hi - lo; /* x now holds the reduced argument */
+ c = (hi-x)-lo; /* rounding error of the subtraction above */
+ } else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
+ //t = huge+x; /* return x with inexact flags when x!=0 */
+ //return x - (t-(huge+x));
+ return x;
+ } else k = 0; /* 2**-25 <= |x| <= 0.5 ln2: no reduction, c stays unset and is not read */
+ /* x is now in primary range */
+ hfx = (float)0.5*x;
+ hxs = x*hfx; /* x*x/2 */
+ r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ t = (float)3.0-r1*hfx;
+ e = hxs*((r1-t)/((float)6.0 - x*t));
+ if(k==0)
+ return x - (x*e-hxs); /* no reduction was done, so there is no correction term c */
+ else{
+ e = (x*(e-c)-c);
+ e -= hxs;
+ if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+ if(k==1){
+ if(x < (float)-0.25)
+ return -(float)2.0*(e-(x+(float)0.5));
+ else
+ return (one+(float)2.0*(x-e));
+ }
+ if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
+ int i;
+ y = one-(e-x);
+ GET_FLOAT_WORD(i,y);
+ SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ return y-one;
+ }
+ t = one;
+ if(k<23) {
+ int i;
+ SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+ y = t-(e-x);
+ GET_FLOAT_WORD(i,y);
+ SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ } else {
+ int i;
+ SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
+ y = x-(e+t);
+ y += one;
+ GET_FLOAT_WORD(i,y);
+ SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ }
+ }
+ return y; /* reached only from the two 2 <= k <= 56 branches above, both of which set y */
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+ /* Inverse hyperbolic cosine, evaluated with a different formula per magnitude range of x; replaces native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1)). */
+ float one = 1.0,
+ ln2 = 6.9314718246e-01;/* 0x3f317218 */
+ float t;
+ int hx;
+ GET_FLOAT_WORD(hx,x); /* presumably copies the raw bit pattern of x into hx -- macro defined elsewhere */
+ if(hx<0x3f800000) { /* x < 1; hx is a signed int, so sign-bit-set inputs also land here */
+ return (x-x)/(x-x); /* 0/0 for finite x */
+ } else if(hx >=0x4d800000) { /* x >= 2**28 */
+ if(hx >=0x7f800000) {/* x is inf or NaN */
+ return x+x;
+ } else
+ return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+ } else if (hx==0x3f800000) {
+ return 0.0; /* acosh(1) = 0 */
+ } else if (hx > 0x40000000) { /* 2**28 > x > 2 */
+ t=x*x;
+ return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
+ } else { /* 1<x<=2 */
+ t = x-one;
+ return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t)); /* 2t+t*t equals x*x-1 with t = x-1 */
+ }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+ /* Inverse hyperbolic sine: computes w from |x| with a formula chosen by magnitude, then applies x's sign; replaces native_log(x + native_sqrt(x * x + 1)). */
+ float one = 1.0000000000e+00, /* 0x3F800000 */
+ ln2 = 6.9314718246e-01, /* 0x3f317218 */
+ huge= 1.0000000000e+30;
+ float w;
+ int hx,ix;
+ GET_FLOAT_WORD(hx,x); /* presumably copies the raw bit pattern of x into hx -- macro defined elsewhere */
+ ix = hx&0x7fffffff; /* bit pattern with the sign bit cleared */
+ if(ix< 0x38000000) { /* |x|<2**-15 (0x38000000 has biased exponent 112) */
+ if(huge+x>one) return x; /* return x inexact except 0 */
+ }
+ if(ix>0x47000000) {/* |x| > 2**15 (0x47000000 has biased exponent 142) */
+ if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+ w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2; /* log(|x|)+ln2 = log(2|x|) */
+ } else {
+ float xa = __gen_ocl_internal_fabs(x);
+ if (ix>0x40000000) {/* 2**15 >= |x| > 2.0 */
+ w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+ } else { /* 2.0 >= |x| >= 2**-15 */
+ float t = xa*xa;
+ w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+ }
+ }
+ return __gen_ocl_internal_copysign(w, x); /* w was computed from |x|; presumably this gives the result the sign of x */
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+ /* Hyperbolic sine built on expm1/exp of |x| with the sign carried in h; replaces (1 - native_exp(-2 * x)) / (2 * native_exp(-x)). */
+ float one = 1.0,
+ shuge = 1.0e37;
+ float t,w,h;
+ int ix,jx;
+ GET_FLOAT_WORD(jx,x); /* presumably copies the raw bit pattern of x into jx -- macro defined elsewhere */
+ ix = jx&0x7fffffff; /* bit pattern with the sign bit cleared */
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) return x+x;
+ h = 0.5;
+ if (jx<0) h = -h; /* h = +-0.5, negative when the sign bit of x is set */
+ /* |x| in [0,22), return sign(x)*0.5*(E+E/(E+1)) with E = expm1(|x|) */
+ if (ix < 0x41b00000) { /* |x|<22 */
+ if (ix<0x31800000) /* |x|<2**-28 */
+ if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+ t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+ if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one)); /* |x|<1: algebraically the same as t+t/(t+one) */
+ return h*(t+t/(t+one));
+ }
+ /* |x| in [22, 0x42b17180 ~ 88.72) return 0.5*exp(|x|) */
+ if (ix < 0x42b17180) return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+ /* |x| in [0x42b17180, overflow threshold 0x42b2d4fc] */
+ if (ix<=0x42b2d4fc) {
+ w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x)); /* exp(|x|/2); the result is h*w*w, presumably so exp(|x|) itself is never formed */
+ t = h*w;
+ return t*w;
+ }
+ /* |x| > overflow threshold, sinh(x) overflow */
+ return x*shuge;
+}
+
// TODO use llvm intrinsics definitions
#define cos native_cos
#define cospi __gen_ocl_internal_cospi
--
1.7.10.4
More information about the Beignet mailing list