[Beignet] [PATCH] [PATCH]GBE: improve precision of expm1, acosh, asinh, sinh

Wed Dec 18 10:17:11 PST 2013

From: Lv Meng <meng.lv at intel.com>


Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 backend/src/ocl_stdlib.tmpl.h |  198 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 188 insertions(+), 10 deletions(-)

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index e380b79..5e06919 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1623,7 +1623,6 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
 INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
 INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
 INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
 INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   return __gen_ocl_pow(x, 0.3333333333f);
 }
@@ -1635,9 +1634,6 @@ INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
 INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
 #undef BODY
 
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
-  return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
 INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
   return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
 }
@@ -1735,12 +1731,6 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
 INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
 }
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
-  return native_log(x + native_sqrt(x * x + 1));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
 INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
   return 0.5f * native_sqrt((1 + x) / (1 - x));
 }
@@ -2025,6 +2015,194 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
   }
   return x;		/* exact output */
 }
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  //return __gen_ocl_pow(M_E_F, x) - 1;
+  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+  Q4 = 4.0082177293e-06, /* 0x36867e54 */
+  Q5 = -2.0109921195e-07, /* 0xb457edbb */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one	=  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GET_FLOAT_WORD(hx,x);
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;		/* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x; 	 /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)	/* raise inexact */
+        return tiny-one;	/* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      }	else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    c  = (hi-x)-lo;
+  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);		/* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GET_FLOAT_WORD(i,y);
+      SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GET_FLOAT_WORD(i,y);
+      SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    } else {
+      int i;
+      SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GET_FLOAT_WORD(i,y);
+      SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    }
+  }
+  return y;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+  float one	= 1.0,
+  ln2	= 6.9314718246e-01;/* 0x3f317218 */
+  float t;
+  int hx;
+  GET_FLOAT_WORD(hx,x);
+  if(hx<0x3f800000) {	/* x < 1 */
+    return (x-x)/(x-x);
+  } else if(hx >=0x4d800000) {	/* x > 2**28 */
+    if(hx >=0x7f800000) {/* x is inf of NaN */
+      return x+x;
+    } else
+      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+  } else if (hx==0x3f800000) {
+    return 0.0;			/* acosh(1) = 0 */
+  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
+    t=x*x;
+    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));			
+  } else {			/* 1<x<2 */
+    t = x-one;
+    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+  //return native_log(x + native_sqrt(x * x + 1));
+  float one =  1.0000000000e+00, /* 0x3F800000 */
+  ln2 =  6.9314718246e-01, /* 0x3f317218 */
+  huge=  1.0000000000e+30;
+  float w;
+  int hx,ix;
+  GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix< 0x38000000) {	/* |x|<2**-14 */
+    if(huge+x>one) return x;	/* return x inexact except 0 */
+  }
+  if(ix>0x47000000) {/* |x| > 2**14 */
+    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+  } else {
+    float xa = __gen_ocl_internal_fabs(x);
+    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
+      w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+    } else {		/* 2.0 > |x| > 2**-14 */
+      float t = xa*xa;
+      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+    }
+  }
+  return __gen_ocl_internal_copysign(w, x);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+  float one = 1.0,
+  shuge = 1.0e37;
+  float t,w,h;
+  int ix,jx;
+  GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x+x;
+  h = 0.5;
+  if (jx<0) h = -h;
+  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
+  if (ix < 0x41b00000) {		/* |x|<22 */
+    if (ix<0x31800000)	/* |x|<2**-28 */
+      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+      return h*(t+t/(t+one));
+  }
+  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+  /* |x| in [log(maxdouble), overflowthresold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+    t = h*w;
+    return t*w;
+  }
+  /* |x| > overflowthresold, sinh(x) overflow */
+  return x*shuge;
+}
+
 // TODO use llvm intrinsics definitions
 #define cos native_cos
 #define cospi __gen_ocl_internal_cospi
-- 
1.7.10.4