[Beignet] [PATCH] [PATCH 1/3] GBE: improve precision of exp

Tue Dec 17 14:23:11 PST 2013

From: Lv Meng <meng.lv at intel.com>


Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 backend/src/ocl_stdlib.tmpl.h |  104 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 backend/src/ocl_stdlib.tmpl.h

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
old mode 100644
new mode 100755
index e5f356e..850acd3
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1851,13 +1851,115 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
 INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
 INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
 INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return __gen_ocl_rnde(x);
 }
+
+typedef union { float value; int word; } ieee_float_shape_type; /* overlays a float with an int to access its bits */
+
+/* Copy the raw bit pattern of float d into the integer lvalue i.  */
+#ifndef GET_FLOAT_WORD
+# define GET_FLOAT_WORD(i,d) \
+do { \
+  ieee_float_shape_type gf_bits; \
+  gf_bits.value = (d); \
+  (i) = gf_bits.word; \
+} while (0)
+#endif
+
+/* Build the float lvalue d from the raw bit pattern held in integer i.  */
+#ifndef SET_FLOAT_WORD
+# define SET_FLOAT_WORD(d,i) \
+do { \
+  ieee_float_shape_type sf_bits; \
+  sf_bits.word = (i); \
+  (d) = sf_bits.value; \
+} while (0)
+#endif
+
+/* Constant tables; the exp() code indexes ln2HI, ln2LO and halF with the sign bit of x. */
+__constant float ln2HI[2] = {  6.9313812256e-01,	/* 0x3f317180 */
+                              -6.9313812256e-01, };	/* 0xbf317180 */
+__constant float ln2LO[2] = {  9.0580006145e-06,	/* 0x3717f7d1 */
+                              -9.0580006145e-06, };	/* 0xb717f7d1 */
+__constant float halF[2] = { 0.5, -0.5, };
+__constant float Zero[] = { 0.0, -0.0, };
+__constant float bp[] = { 1.0, 1.5, };
+__constant float dp_h[] = { 0.0, 5.84960938e-01, }; /* 0x3f15c000 */
+__constant float dp_l[] = { 0.0, 1.56322085e-06, }; /* 0x35d1cfdc */
+
+/* Single-precision exp(x), computed in software rather than via native_exp().
+ * Steps: reduce x to hi-lo = x - k*ln2, evaluate a polynomial in the reduced
+ * argument, then scale by 2**k by adding k to the result's exponent bits.
+ * NaN inputs return x+x (NaN), exp(+inf) returns +inf and exp(-inf) returns 0. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   {
+  float o_threshold = 8.8721679688e+01f,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02f,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31f,  /* 2**-100=0x0d800000 */
+  ivln2 = 1.4426950216e+00f, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0f,
+  huge = 1.0e+30f,
+  P1 = 1.6666667163e-01f, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03f, /* 0xbb360b61 */
+  P3 = 6.6137559770e-05f, /* 0x388ab355 */
+  P4 = -1.6533901999e-06f, /* 0xb5ddea0e */
+  P5 = 4.1381369442e-08f; /* 0x3331bb4c */
+  float y,hi=0.0f,lo=0.0f,c,t;
+  int k=0,xsb;
+  unsigned hx;
+  GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* bit pattern of |x| */
+  /* filter out non-finite and out-of-range arguments */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    if(hx>0x7f800000)
+      return x+x;			/* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0f;	/* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
+  }
+
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb;
+    } else {
+      k  = ivln2*x+halF[xsb];	/* float-to-int conversion truncates */
+      t  = k;
+      hi = x - t*ln2HI[0];	/* t*ln2HI is exact here */
+      lo = t*ln2LO[0];
+    }
+    x  = hi - lo;
+  }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
+
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  if(k==0)
+    return one-((x*c)/(c-2.0f)-x);
+  else
+    y = one-((lo-(x*c)/(2.0f-c))-hi);
+  if(k >= -125) {
+    unsigned hy;
+    GET_FLOAT_WORD(hy,y);
+    SET_FLOAT_WORD(y,hy+((unsigned)k<<23));	/* add k to y's exponent; unsigned shift avoids shifting a negative int */
+    return y;
+  } else {
+    unsigned hy;
+    GET_FLOAT_WORD(hy,y);
+    SET_FLOAT_WORD(y,hy+((unsigned)(k+100)<<23)); /* add k+100 to y's exponent */
+    return y*twom100;		/* then scale back down by 2**-100 */
+  }
+}
+
 // TODO use llvm intrinsics definitions
 #define cos native_cos
 #define cospi __gen_ocl_internal_cospi
-- 
1.7.10.4