[Beignet] [PATCH] [PATCH]GBE: improve precision of exp and fmod

Tue Dec 17 16:26:08 PST 2013

From: Lv Meng <meng.lv at intel.com>


Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 backend/src/ocl_stdlib.tmpl.h |  172 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 backend/src/ocl_stdlib.tmpl.h

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
old mode 100644
new mode 100755
index e5f356e..e380b79
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1851,13 +1851,180 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
 INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
 INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return __gen_ocl_rnde(x);
 }
+
+typedef union{  float value;  int word;} ieee_float_shape_type;
+
+/* Get a 32 bit int from a float.  */
+#ifndef GET_FLOAT_WORD
+# define GET_FLOAT_WORD(i,d)        \
+do {                                \
+  ieee_float_shape_type gf_u;       \
+  gf_u.value = (d);                 \
+  (i) = gf_u.word;                  \
+} while (0)
+#endif
+/* Set a float from a 32 bit int.  */
+#ifndef SET_FLOAT_WORD
+# define SET_FLOAT_WORD(d,i)        \
+do {                                \
+  ieee_float_shape_type sf_u;       \
+  sf_u.word = (i);                  \
+  (d) = sf_u.value;                 \
+} while (0)
+#endif
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  //return native_exp(x);
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
+  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0,
+  huge = 1.0e+30,
+  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03, /* 0xbb360b61 */
+  P3 = 6.6137559770e-05, /* 0x388ab355 */
+  P4 = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5 =	4.1381369442e-08; /* 0x3331bb4c */
+  float ln2HI[2],ln2LO[2],halF[2];
+  float y,hi=0.0,lo=0.0,c,t;
+  int k=0,xsb;
+  unsigned hx;
+  ln2HI[0] = 6.9313812256e-01;	/* 0x3f317180 */
+  ln2HI[1] = -6.9313812256e-01;	/* 0xbf317180 */
+  ln2LO[0] = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  ln2LO[1] = -9.0580006145e-06; /* 0xb717f7d1 */
+  halF[0] = 0.5;
+  halF[1] =	-0.5;
+
+  GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* high word of |x| */
+
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    if(hx>0x7f800000)
+      return x+x;			/* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb;
+    } else {
+      k  = ivln2*x+halF[xsb];
+      t  = k;
+      hi = x - t*ln2HI[0];	/* t*ln2HI is exact here */
+      lo = t*ln2LO[0];
+    }
+    x  = hi - lo;
+  }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
+
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  if(k==0)
+    return one-((x*c)/(c-(float)2.0)-x);
+  else
+    y = one-((lo-(x*c)/((float)2.0-c))-hi);
+  if(k >= -125) {
+    unsigned hy;
+    GET_FLOAT_WORD(hy,y);
+    SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
+    return y;
+  } else {
+    unsigned hy;
+    GET_FLOAT_WORD(hy,y);
+    SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+    return y*twom100;
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+  //return x-y*__gen_ocl_rndz(x/y);
+  float one = 1.0;
+  float Zero[2];
+  int n,hx,hy,hz,ix,iy,sx,i;
+  Zero[0] = 0.0;
+  Zero[1] = -0.0;
+  GET_FLOAT_WORD(hx,x);
+  GET_FLOAT_WORD(hy,y);
+  sx = hx&0x80000000;		/* sign of x */
+  hx ^=sx;		/* |x| */
+  hy &= 0x7fffffff;	/* |y| */
+  /* purge off exception values */
+  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
+  (hy>0x7f800000))			/* or y is NaN */
+    return (x*y)/(x*y);
+  if(hx<hy) return x;			/* |x|<|y| return x */
+  if(hx==hy)
+    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
+
+  /* determine ix = ilogb(x) */
+  if(hx<0x00800000) {	/* subnormal x */
+    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+  } else ix = (hx>>23)-127;
+
+  /* determine iy = ilogb(y) */
+  if(hy<0x00800000) {	/* subnormal y */
+    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+  } else iy = (hy>>23)-127;
+
+  /* set up {hx,lx}, {hy,ly} and align y to x */
+  if(ix >= -126)
+    hx = 0x00800000|(0x007fffff&hx);
+  else {		/* subnormal x, shift x to normal */
+    n = -126-ix;
+    hx = hx<<n;
+  }
+  if(iy >= -126)
+    hy = 0x00800000|(0x007fffff&hy);
+  else {		/* subnormal y, shift y to normal */
+    n = -126-iy;
+    hy = hy<<n;
+  }
+  /* fix point fmod */
+  n = ix - iy;
+  while(n--) {
+    hz=hx-hy;
+    if(hz<0){hx = hx+hx;}
+    else {
+      if(hz==0)		/* return sign(x)*0 */
+        return Zero[(unsigned)sx>>31];
+      hx = hz+hz;
+    }
+  }
+  hz=hx-hy;
+  if(hz>=0) {hx=hz;}
+
+    /* convert back to floating value and restore the sign */
+  if(hx==0)			/* return sign(x)*0 */
+    return Zero[(unsigned)sx>>31];
+  while(hx<0x00800000) {		/* normalize x */
+    hx = hx+hx;
+    iy -= 1;
+  }
+  if(iy>= -126) {		/* normalize output */
+    hx = ((hx-0x00800000)|((iy+127)<<23));
+	SET_FLOAT_WORD(x,hx|sx);
+   } else {		/* subnormal output */
+     n = -126 - iy;
+     hx >>= n;
+     SET_FLOAT_WORD(x,hx|sx);
+     x *= one;		/* create necessary signal */
+  }
+  return x;		/* exact output */
+}
 // TODO use llvm intrinsics definitions
 #define cos native_cos
 #define cospi __gen_ocl_internal_cospi
@@ -1885,6 +2052,7 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
 #define copysign __gen_ocl_internal_copysign
 #define erf __gen_ocl_internal_erf
 #define erfc __gen_ocl_internal_erfc
+#define fmod __gen_ocl_internal_fmod
 
 PURE CONST float __gen_ocl_mad(float a, float b, float c);
 INLINE_OVERLOADABLE float mad(float a, float b, float c) {
-- 
1.7.10.4