[Beignet] [PATCH_V2] [PATCH]GBE: improve precision of exp

Meng Meng
Wed Dec 18 12:13:35 PST 2013


From: Lv Meng <meng.lv at intel.com>


Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 backend/src/ocl_stdlib.tmpl.h |  101 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 backend/src/ocl_stdlib.tmpl.h

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
old mode 100644
new mode 100755
index e5f356e..904904b
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -148,6 +148,31 @@ INLINE_OVERLOADABLE float __ocl_nanf(void) {
   u.u = 0x7F800001;
   return u.f;
 }
+typedef union
+{
+  float value;
+  uint  word;
+} float_shape_type;
+
+/* Get a 32 bit int from a float.  */
+#ifndef GEN_OCL_GET_FLOAT_WORD
+# define GEN_OCL_GET_FLOAT_WORD(i,d)  \
+do {                                  \
+  float_shape_type gf_u;              \
+  gf_u.value = (d);                   \
+  (i) = gf_u.word;                    \
+} while (0)
+#endif
+/* Set a float from a 32 bit int.  */
+#ifndef GEN_OCL_SET_FLOAT_WORD
+# define GEN_OCL_SET_FLOAT_WORD(d,i)  \
+do {                                  \
+  float_shape_type sf_u;              \
+  sf_u.word = (i);                    \
+  (d) = sf_u.value;                   \
+} while (0)
+#endif
+
 #define HUGE_VALF    (__ocl_inff())
 #define INFINITY     (__ocl_inff())
 #define NAN          (__ocl_nanf())
@@ -1851,13 +1876,87 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
 INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
 INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
 INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return __gen_ocl_rnde(x);
 }
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  //return native_exp(x);
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
+  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0,
+  huge = 1.0e+30,
+  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03, /* 0xbb360b61 */
+  P3 = 6.6137559770e-05, /* 0x388ab355 */
+  P4 = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5 =	4.1381369442e-08; /* 0x3331bb4c */
+  float ln2HI[2],ln2LO[2],halF[2];
+  float y,hi=0.0,lo=0.0,c,t;
+  int k=0,xsb;
+  unsigned hx;
+  ln2HI[0] = 6.9313812256e-01;	/* 0x3f317180 */
+  ln2HI[1] = -6.9313812256e-01;	/* 0xbf317180 */
+  ln2LO[0] = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  ln2LO[1] = -9.0580006145e-06; /* 0xb717f7d1 */
+  halF[0] = 0.5;
+  halF[1] =	-0.5;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* high word of |x| */
+
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    if(hx>0x7f800000)
+      return x+x;			/* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb;
+    } else {
+      k  = ivln2*x+halF[xsb];
+      t  = k;
+      hi = x - t*ln2HI[0];	/* t*ln2HI is exact here */
+      lo = t*ln2LO[0];
+    }
+    x  = hi - lo;
+  }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
+
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  if(k==0)
+    return one-((x*c)/(c-(float)2.0)-x);
+  else
+    y = one-((lo-(x*c)/((float)2.0-c))-hi);
+  if(k >= -125) {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
+    return y;
+  } else {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+    return y*twom100;
+  }
+}
+
 // TODO use llvm intrinsics definitions
 #define cos native_cos
 #define cospi __gen_ocl_internal_cospi
-- 
1.7.10.4



More information about the Beignet mailing list