[Beignet] [PATCH 3/3] Backend: Optimization experiment by using AMD clc

Sat May 14 09:45:41 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl | 1020 ++++++++++++++++--------------
 backend/src/libocl/tmpl/ocl_math.tmpl.h  |   71 +++
 kernels/bench_math.cl                    |   30 +-
 3 files changed, 626 insertions(+), 495 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 4d70c4d..23b56b5 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -82,6 +82,9 @@ OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
 OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
     return __gen_ocl_cos(x * M_PI_F);
 }
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp2 (float x) {
+    return native_exp2(x);
+}
 OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
     return native_exp(x);
 }
@@ -164,6 +167,22 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
   return ux.f;
 }
 
+OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
+OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+OVERLOADABLE float __gen_ocl_internal_round(float x) {
+  float y = __gen_ocl_rndz(x);
+  if (__gen_ocl_fabs(x - y) >= 0.5f)
+    y += __gen_ocl_internal_copysign(1.f, x);
+  return y;
+}
+OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
+OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+  return __gen_ocl_rnde(x);
+}
+
+OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
+OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+
 OVERLOADABLE float __gen_ocl_internal_log(float x) {
 /*
  *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
@@ -361,7 +380,6 @@ OVERLOADABLE float __gen_ocl_internal_log2(float x) {
   }
 }
 
-
 float __gen_ocl_scalbnf (float x, int n){
   /* copy from fdlibm */
   float two25 = 3.355443200e+07,	/* 0x4c000000 */
@@ -545,9 +563,13 @@ OVERLOADABLE float __kernel_sinf(float x)
   float z,r,v;
   z =  x*x;
   v =  z*x;
-  r =  S2+z*(S3+z*(S4));
+  r = mad(z,
+		  mad(z,
+			  mad(z, S4, S3),
+		  S2),
+	  S1);
 
-  return x+v*(S1+z*r);
+  return mad(v, r, x);
 }
 
 float __kernel_cosf(float x, float y)
@@ -563,7 +585,9 @@ float __kernel_cosf(float x, float y)
   GEN_OCL_GET_FLOAT_WORD(ix,x);
   ix &= 0x7fffffff;     /* ix = |x|'s high word*/
   z  = x*x;
-  r = z*(C1+z*(C2+z*(C3)));
+  r = z * mad(z,
+		  mad(z, C3, C2),
+	  C1);
 
   if(ix < 0x3e99999a)       /* if |x| < 0.3 */
       return one - ((float)0.5*z - (z*r - x*y));
@@ -1239,6 +1263,271 @@ OVERLOADABLE int ilogb(float x) {
 OVERLOADABLE float nan(uint code) {
   return NAN;
 }
+
+OVERLOADABLE float __gen_ocl_internal_exp2(float x){
+	/* precision is enough on native/fastpath */
+	return __gen_ocl_internal_fastpath_exp2(x);
+}
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
+OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+	// This treats subnormals as zeros
+	int i = as_int(x);
+	int e = (i >> 23) & 0xff;
+	int m = i & 0x007fffff;
+	int s = i & 0x80000000;
+	int v = add_sat(e, n);
+	v = clamp(v, 0, 0xff);
+	int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+	int c = e == 0xff;
+	mr = c ? m : mr;
+	int er = c ? e : v;
+	er = e ? er : e;
+	return as_float( s | (er << 23) | mr );
+
+	/* supports denormal values */
+	const int multiplier = 24;
+	float val_f;
+	uint val_ui;
+	uint sign;
+	int exponent;
+	val_ui = as_uint(x);
+	sign = val_ui & 0x80000000;
+	val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
+	int val_x = val_ui;
+
+	exponent = val_ui >> 23; /* get the exponent */
+	int dexp = exponent;
+
+	/* denormal support */
+	int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+	int dexponent = 25 - fbh;
+	uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+	int ex = dexponent + n - multiplier;
+	dexponent = ex;
+	uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+	int ex1 = dexponent + multiplier;
+	ex1 = -ex1 +25;
+	dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
+	dval_ui = dexponent > 0 ? val :dval_ui;
+	dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui;  /*overflow*/
+	dval_ui = dexponent < -multiplier ? 0 : dval_ui;  /*underflow*/
+	dval_ui = dval_ui | sign;
+	val_f = as_float(dval_ui);
+
+	exponent += n;
+
+	val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+	ex1 = exponent + multiplier;
+	ex1 = -ex1 +25;
+	val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
+	val_ui = exponent > 0 ? val :val_ui;
+	val_ui = exponent > 254 ? 0x7f800000 :val_ui;  /*overflow*/
+	val_ui = exponent < -multiplier ? 0 : val_ui;  /*underflow*/
+	val_ui = val_ui | sign;
+
+	val_ui = dexp == 0? dval_ui : val_ui;
+	val_f = as_float(val_ui);
+
+	val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
+	return val_f;
+}
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
+OVERLOADABLE float __gen_ocl_internal_exp(float x){
+  // Reduce x
+  const float ln2HI = 0x1.62e300p-1f;
+  const float ln2LO = 0x1.2fefa2p-17f;
+  const float invln2 = 0x1.715476p+0f;
+
+  float fhalF = x < 0.0f ? -0.5f : 0.5f;
+  int p  = mad(x, invln2, fhalF);
+  float fp = (float)p;
+  float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here
+  float lo = -fp*ln2LO;
+
+  // Evaluate poly
+  float t = hi + lo;
+  float tt  = t*t;
+  float v = mad(tt,
+                -mad(tt,
+                     mad(tt,
+                         mad(tt,
+                             mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+                             0x1.1566aap-14f),
+                         -0x1.6c16c2p-9f),
+                     0x1.555556p-3f),
+                t);
+
+  float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+  // Scale by 2^p
+  float r =  as_float(as_int(y) + (p << 23));
+
+  const float ulim =  0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366
+  const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657
+
+  r = x < llim ? 0.0f : r;
+  r = x < ulim ? r : as_float(0x7f800000);
+  return isnan(x) ? x : r;
+}
+
+/* MODIFIED, copy from fdlib */
+OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+
+  float px, qx,ans;
+  short n;
+  int i;
+  float*p;
+  float MAXL10 = 38.230809449325611792;
+  float LOG210 = 3.32192809488736234787e0;
+  float LG102A = 3.00781250000000000000E-1;
+  float LG102B = 2.48745663981195213739E-4;
+
+  if( x < -MAXL10 ) return 0.0;
+
+  if( isinf(x))  return INFINITY;
+  /* The following is necessary because range reduction blows up: */
+  if( x == 0 )return 1.0;
+
+  /* Express 10**x = 10**g 2**n
+    *	 = 10**g 10**( n log10(2) )
+    *	 = 10**( g + n log10(2) )
+    */
+  px = x * LOG210;
+  qx = __gen_ocl_internal_floor( px + 0.5 );
+  n = qx;
+  x -= qx * LG102A;
+  x -= qx * LG102B;
+
+  /* rational approximation for exponential
+    * of the fractional part:
+    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
+    */
+  px = mad(x,
+		  mad(x,
+			  mad(x,
+				  mad(x,
+					  mad(x,
+						  mad(x, 2.063216740311022E-001, 5.420251702225484E-001),
+						1.171292686296281E+000),
+					2.034649854009453E+000),
+				2.650948748208892E+000),
+			2.302585167056758E+000),
+	  1.0f);
+
+  /* multiply by power of 2 */
+  x = __gen_ocl_internal_ldexp( px, n );
+
+  return x;
+}
+
+OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  //return __gen_ocl_pow(M_E_F, x) - 1;
+  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+  Q4 = 4.0082177293e-06, /* 0x36867e54 */
+  Q5 = -2.0109921195e-07, /* 0xb457edbb */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one	=  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;		/* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x; 	 /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)	/* raise inexact */
+        return tiny-one;	/* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      }	else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    c  = (hi-x)-lo;
+  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);		/* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    } else {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    }
+  }
+  return y;
+}
+
 OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
   float sign = 1.0f;
   int ix;
@@ -1422,233 +1711,134 @@ __constant float atanlo[4] = {
   7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
 };
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_atan(float x) {
-  /* copied from fdlibm */
-  float aT[11];
-  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
-  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
-  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
-  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
-  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
-  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
-  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
-  const float one = 1.0, huge = 1.0e30;
-
-  float w,s1,s2,z;
-  int ix,hx,id;
+    const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
-      if(ix>0x7f800000)
-    return x+x;   /* NaN */
-      if(hx>0) return  atanhi[3]+atanlo[3];
-      else     return -atanhi[3]-atanlo[3];
-  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
-      if (ix < 0x31000000) {  /* |x| < 2^-29 */
-    if(huge+x>one) return x;  /* raise inexact */
-      }
-      id = -1;
-  } else {
-  x = __gen_ocl_fabs(x);
-  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
-      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
-    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
-      } else {      /* 11/16<=|x|< 19/16 */
-    id = 1; x  = (x-one)/(x+one);
-      }
-  } else {
-      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
-    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
-      } else {      /* 2.4375 <= |x| < 2^66 */
-    id = 3; x  = -(float)1.0/x;
-      }
-  }}
-    /* end of argument reduction */
-  z = x*x;
-  w = z*z;
-    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
-  s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
-  s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
-  if (id<0) return x - x*(s1+s2);
-  else {
-      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
-      return (hx<0)? -z:z;
-  }
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint sx = ux ^ aux;
+
+    float spiby2 = as_float(sx | as_uint(piby2));
+
+    float v = as_float(aux);
+
+    // Return for NaN
+    float ret = x;
+
+    // 2^26 <= |x| <= Inf => atan(x) is close to piby2
+    ret = aux <= PINFBITPATT_SP32  ? spiby2 : ret;
 
+    // Reduce arguments 2^-19 <= |x| < 2^26
+
+    // 39/16 <= x < 2^26
+    x = -MATH_RECIP(v);
+    float c = 1.57079632679489655800f; // atan(infinity)
+
+    // 19/16 <= x < 39/16
+    int l = aux < 0x401c0000;
+    float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+    x = l ? xx : x;
+    c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+    // 11/16 <= x < 19/16
+    l = aux < 0x3f980000U;
+    xx =  MATH_DIVIDE(v - 1.0f, 1.0f + v);
+    x = l ? xx : x;
+    c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+    // 7/16 <= x < 11/16
+    l = aux < 0x3f300000;
+    xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+    x = l ? xx : x;
+    c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+    // 2^-19 <= x < 7/16
+    l = aux < 0x3ee00000;
+    x = l ? v : x;
+    c = l ? 0.0f : c;
+
+    // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+    float s = x * x;
+    float a = mad(s,
+                  mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+                  0.296528598819239217902158651186f);
+
+    float b = mad(s,
+                  mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+                  0.889585796862432286486651434570f);
+
+    float q = x * s * MATH_DIVIDE(a, b);
+
+    float z = c - (q - x);
+    float zs = as_float(sx | as_uint(z));
+
+    ret  = aux < 0x4c800000 ?  zs : ret;
+
+    // |x| < 2^-19
+    ret = aux < 0x36000000 ? as_float(ux) : ret;
+    return ret;
 }
 OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
 }
 
-// XXX work-around PTX profile
-OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
-OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
-  /* copied from fdlibm */
-  float z;
-  int k,m,hx,hy,ix,iy;
-  const float
-  tiny  = 1.0e-30,
-  zero  = 0.0,
-  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
-  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
-  pi      = 3.1415927410e+00, /* 0x40490fdb */
-  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+    const float pi = 0x1.921fb6p+1f;
+    const float piby2 = 0x1.921fb6p+0f;
+    const float piby4 = 0x1.921fb6p-1f;
+    const float threepiby4 = 0x1.2d97c8p+1f;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  iy = hy&0x7fffffff;
+    float ax = fabs(x);
+    float ay = fabs(y);
+    float v = min(ax, ay);
+    float u = max(ax, ay);
 
-  if((ix>0x7f800000)||
-     (iy>0x7f800000)) /* x or y is NaN */
-     return x+y;
-  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
-  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
+    // Scale since u could be large, as in "regular" divide
+    float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+    float vbyu = s * MATH_DIVIDE(v, s*u);
 
-    /* when y = 0 */
-  if(iy==0) {
-      switch(m) {
-    case 0:
-    case 1: return y;   /* atan(+-0,+anything)=+-0 */
-    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
-    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
-      }
-  }
-    /* when x = 0 */
-  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+    float vbyu2 = vbyu * vbyu;
 
-  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
-  if(ix <= 0x7fffff && iy <= 0x7fffff) {
-    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
-    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
-  }
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+    float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+    float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
 
-    /* when x is INF */
-  if(ix==0x7f800000) {
-      if(iy==0x7f800000) {
-    switch(m) {
-        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
-        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
-        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
-        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
-    }
-      } else {
-    switch(m) {
-        case 0: return  zero  ; /* atan(+...,+INF) */
-        case 1: return -zero  ; /* atan(-...,+INF) */
-        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
-        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
-    }
-      }
-  }
-    /* when y is INF */
-  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
-    /* compute y/x */
-  k = (iy-ix)>>23;
-  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
-  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
-  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
-  switch (m) {
-      case 0: return       z  ; /* atan(+,+) */
-      case 1: {
-              uint zh;
-          GEN_OCL_GET_FLOAT_WORD(zh,z);
-          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
-        }
-        return       z  ; /* atan(-,+) */
-      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
-      default: /* case 3 */
-            return  (z-pi_lo)-pi;/* atan(-,-) */
-  }
-}
+    // Octant 0 result
+    float a = mad(p, MATH_RECIP(q), vbyu);
 
-OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
-  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
-}
-OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
-OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-OVERLOADABLE float __gen_ocl_internal_round(float x) {
-  float y = __gen_ocl_rndz(x);
-  if (__gen_ocl_fabs(x - y) >= 0.5f)
-    y += __gen_ocl_internal_copysign(1.f, x);
-  return y;
-}
-OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-OVERLOADABLE float __gen_ocl_internal_rint(float x) {
-  return __gen_ocl_rnde(x);
-}
+    // Fix up 3 other octants
+    float at = piby2 - a;
+    a = ay > ax ? at : a;
+    at = pi - a;
+    a = x < 0.0F ? at : a;
 
-OVERLOADABLE float __gen_ocl_internal_exp(float x) {
-  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
-  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
-  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one = 1.0,
-  huge = 1.0e+30,
-  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
-  P2 = -2.7777778450e-03; /* 0xbb360b61 */
-  float y,hi=0.0,lo=0.0,c,t;
-  int k=0,xsb;
-  unsigned hx;
-  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
-  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
-  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
-  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
-  float half_0 = 0.5;
-  float half_1 =	-0.5;
+    // y == 0 => 0 for x >= 0, pi for x < 0
+    at = as_int(x) < 0 ? pi : 0.0f;
+    a = y == 0.0f ? at : a;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = (hx>>31)&1;		/* sign bit of x */
-  hx &= 0x7fffffff;		/* high word of |x| */
+    // if (!FINITE_ONLY()) {
+        // x and y are +- Inf
+        at = x > 0.0f ? piby4 : threepiby4;
+        a = ax == INFINITY & ay == INFINITY ? at : a;
 
-  /* filter out non-finite argument */
-  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    if(hx>0x7f800000)
-      return x+x;			/* NaN */
-    if(hx==0x7f800000)
-      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
-    if(x > o_threshold) return huge*huge; /* overflow */
-    if(x < u_threshold) return twom100*twom100; /* underflow */
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
-    } else {
-      float tmp = xsb == 1 ? half_1 : half_0;
-      k  = ivln2*x+tmp;
-      t  = k;
-      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
-      lo = t*ln2LO_0;
-    }
-    x  = hi - lo;
-  }
-  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
-    if(huge+x>one) return one+x;/* trigger inexact */
-  }
-  else k = 0;
+	// x or y is NaN
+	a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+    // }
 
-  /* x is now in primary range */
-  t  = x*x;
-  c  = x - t*(P1+t*P2);
-  if(k==0)
-    return one-((x*c)/(c-(float)2.0)-x);
-  else
-    y = one-((lo-(x*c)/((float)2.0-c))-hi);
-  if(k >= -125) {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
-    return y;
-  } else {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
-    return y*twom100;
-  }
+    // Fixup sign and return
+    return copysign(a, y);
+}
+
+OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
 }
 
 /* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
@@ -2008,159 +2198,75 @@ OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
   return x;		/* exact output */
 }
 
-OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
-  //return __gen_ocl_pow(M_E_F, x) - 1;
-  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
-  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
-  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
-  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
-  huge = 1.0e30,
-  tiny = 1.0e-30,
-  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one	=  1.0,
-  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
-  float y,hi,lo,c,t,e,hxs,hfx,r1;
-  int k,xsb;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = hx&0x80000000;
-  /* sign bit of x */
-  //if(xsb==0)
-  //y=x;
-  //else
-  //y= -x; /* y = |x| */
-  y = __gen_ocl_internal_fabs(x);
-  hx &= 0x7fffffff;		/* high word of |x| */
-  /* filter out huge and non-finite argument */
-  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
-    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
-      if(hx>0x7f800000)
-        return x+x; 	 /* NaN */
-      if(hx==0x7f800000)
-        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
-      if(x > o_threshold)
-        return huge*huge; /* overflow */
-    }
-    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
-      if(x+tiny<(float)0.0)	/* raise inexact */
-        return tiny-one;	/* return -1 */
-    }
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
-      if(xsb==0){
-        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
-      }	else {
-        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
-      }
-    } else {
-      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
-      t  = k;
-      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
-      lo = t*ln2_lo;
-    }
-    x  = hi - lo;
-    c  = (hi-x)-lo;
-  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
-    //t = huge+x; /* return x with inexact flags when x!=0 */
-    //return x - (t-(huge+x));
-    return x;
-  } else k = 0;
-  /* x is now in primary range */
-  hfx = (float)0.5*x;
-  hxs = x*hfx;
-  r1 = one+hxs*(Q1+hxs*Q2);
-  t = (float)3.0-r1*hfx;
-  e = hxs*((r1-t)/((float)6.0 - x*t));
-  if(k==0)
-    return x - (x*e-hxs);		/* c is 0 */
-  else{
-    e = (x*(e-c)-c);
-    e -= hxs;
-    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
-    if(k==1){
-      if(x < (float)-0.25)
-        return -(float)2.0*(e-(x+(float)0.5));
-      else
-        return  (one+(float)2.0*(x-e));
-    }
-    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
-      int i;
-      y = one-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-      return y-one;
-    }
-    t = one;
-    if(k<23) {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
-      y = t-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    } else {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
-      y = x-(e+t);
-      y += one;
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    }
-  }
-  return y;
-}
-
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-  float one	= 1.0,
-  ln2	= 6.9314718246e-01;/* 0x3f317218 */
-  float t;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if(hx<0x3f800000) {	/* x < 1 */
-    return (x-x)/(x-x);
-  } else if(hx >=0x4d800000) {	/* x > 2**28 */
-    if(hx >=0x7f800000) {/* x is inf of NaN */
-      return x+x;
-    } else
-      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
-  } else if (hx==0x3f800000) {
-    return 0.0;			/* acosh(1) = 0 */
-  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
-    t=x*x;
-    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
-  } else {			/* 1<x<2 */
-    t = x-one;
-    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
-  }
+    uint ux = as_uint(x);
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by acosh(x) = ln(2) + ln(x)
+    // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+    // acosh(x) = ln(x + sqrt(x*x-1)) */
+    int high = ux > 0x46000000U;
+    int med = ux > 0x40000000U;
+
+    float w = x - 1.0f;
+    float s = w*w + 2.0f*w;
+    float t = x*x - 1.0f;
+    float r = sqrt(med ? t : s) + (med ? x : w);
+    float v = (high ? x : r) - (med ? 1.0f : 0.0f);
+    float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f);
+
+    z = ux >= PINFBITPATT_SP32 ? x : z;
+    z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z;
+
+    return z;
 }
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_asinh(float x){
-  //return native_log(x + native_sqrt(x * x + 1));
-  float one =  1.0000000000e+00, /* 0x3F800000 */
-  ln2 =  6.9314718246e-01, /* 0x3f317218 */
-  huge=  1.0000000000e+30;
-  float w;
-  int hx,ix;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix< 0x38000000) {	/* |x|<2**-14 */
-    if(huge+x>one) return x;	/* return x inexact except 0 */
-  }
-  if(ix>0x47000000) {/* |x| > 2**14 */
-    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
-    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
-  } else {
-    float xa = __gen_ocl_internal_fabs(x);
-    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
-      w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
-    } else {		/* 2.0 > |x| > 2**-14 */
-      float t = xa*xa;
-      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
-    }
-  }
-  return __gen_ocl_internal_copysign(w, x);
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    uint xsgn = ax ^ ux;
+
+    // |x| <= 2
+    float t = x * x;
+    float a = mad(t,
+                  mad(t,
+		      mad(t,
+		          mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f),
+		          -5.063201055468483248e-1f),
+		      -1.480204186473758321f),
+	          -1.152965835871758072f);
+    float b = mad(t,
+	          mad(t,
+		      mad(t,
+			  mad(t, 6.284381367285534560e-2f, 1.260024978680227945f),
+			  6.582362487198468066f),
+		      11.99423176003939087f),
+		  6.917795026025976739f);
+
+    float q = MATH_DIVIDE(a, b);
+    float z1 = mad(x*t, q, x);
+
+    // |x| > 2
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x
+    // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are
+    // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+    // with the sign of x (see Abramowitz and Stegun 4.6.20)
+
+    float absx = as_float(ax);
+    int hi = ax > 0x46000000U;
+    float y = MATH_SQRT(absx * absx + 1.0f) + absx;
+    y = hi ? absx : y;
+    float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f);
+    float z2 = as_float(xsgn | as_uint(r));
+
+    float z = ax <= 0x40000000 ? z1 : z2;
+    z = ax < 0x39800000U | ax >= PINFBITPATT_SP32 ? x : z;
+
+    return z;
 }
 
 OVERLOADABLE float __gen_ocl_internal_sinh(float x){
@@ -2195,38 +2301,51 @@ OVERLOADABLE float __gen_ocl_internal_sinh(float x){
   return x*shuge;
 }
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
-  //float y = native_exp(-2 * x);
-  //return (1 - y) / (1 + y);
-  float one=1.0, two=2.0, tiny = 1.0e-30;
-  float t,z;
-  int jx,ix;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) {
-    if (jx>=0)
-      return one/x+one; /* tanh(+-inf)=+-1 */
-    else
-      return one/x-one; /* tanh(NaN) = NaN */
-  }
+    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+    // to the following three formulae:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // but computationally, some formulae are better on some ranges.
 
-  if (ix < 0x41b00000) { /* |x|<22 */
-    if (ix == 0)
-      return x;		/* x == +-0 */
-    if (ix<0x24000000) 	/* |x|<2**-55 */
-      return x*(one+x);    	/* tanh(small) = small */
-    if (ix>=0x3f800000) {	/* |x|>=1  */
-      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
-      z = one - two/(t+two);
-    } else {
-      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
-      z= -t/(t+two);
-    }
-  } else { /* |x| > 22, return +-1 */
-    z = one - tiny;		/* raised inexact flag */
-  }
-  return (jx>=0)? z: -z;
+    const float large_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ aux;
+
+    float y = as_float(aux);
+    float y2 = y*y;
+
+    float a1 = mad(y2,
+                   mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+                   -0.28192806108402678e0F);
+    float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);
+
+    float a2 = mad(y2,
+                   mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+                   -0.24069858695196524e0F);
+    float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);
+
+    int c = y < 0.9f;
+    float a = c ? a1 : a2;
+    float b = c ? b1 : b2;
+    float zlo = mad(MATH_DIVIDE(a, b), y*y2, y);
+
+    float p = exp(2.0f * y) + 1.0f;
+    float zhi = 1.0F - MATH_DIVIDE(2.0F, p);
+
+    float z = y <= 1.0f ? zlo : zhi;
+    z = as_float(xs | as_uint(z));
+
+    // Edge cases
+    float sone = as_float(0x3f800000U | xs);
+    z = y > large_threshold ? sone : z;
+    z = aux < 0x39000000 | aux > 0x7f800000 ? x : z;
+
+    return z;
 }
 
 OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
@@ -2303,77 +2422,38 @@ OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
   return x;
 }
 
-OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
-  x = __gen_ocl_scalbnf(x,n);
-  return x;
-}
-
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
-  //return 0.5f * native_sqrt((1 + x) / (1 - x));
-  float xa = __gen_ocl_fabs (x);
-  float t;
-  if (isless (xa, 0.5f)){
-    if (xa < 0x1.0p-28f) return x;
-    t = xa + xa;
-    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
-  } else if (isless (xa, 1.0f)){
-    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
-  } else{
-    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
-    return x / 0.0f;
-  }
-  return __gen_ocl_internal_copysign(t, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_exp10(float x){
-  float px, qx,ans;
-  short n;
-  int i;
-  float*p;
-  float MAXL10 = 38.230809449325611792;
-  float LOG210 = 3.32192809488736234787e0;
-  float LG102A = 3.00781250000000000000E-1;
-  float LG102B = 2.48745663981195213739E-4;
-  float P[6];
-  P[0] = 2.063216740311022E-001;
-  P[1] = 5.420251702225484E-001;
-  P[2] = 1.171292686296281E+000;
-  P[3] = 2.034649854009453E+000;
-  P[4] = 2.650948748208892E+000;
-  P[5] = 2.302585167056758E+000;
-
-  if( x < -MAXL10 ) return 0.0;
-
-  if( isinf(x))  return INFINITY;
-  /* The following is necessary because range reduction blows up: */
-  if( x == 0 )return 1.0;
-
-  /* Express 10**x = 10**g 2**n
-    *	 = 10**g 10**( n log10(2) )
-    *	 = 10**( g + n log10(2) )
-    */
-  px = x * LOG210;
-  qx = __gen_ocl_internal_floor( px + 0.5 );
-  n = qx;
-  x -= qx * LG102A;
-  x -= qx * LG102B;
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ ax;
+
+    // |x| > 1 or NaN
+    float z = as_float(QNANBITPATT_SP32);
+
+    // |x| == 1
+    float t = as_float(xs | PINFBITPATT_SP32);
+    z = ax == 0x3f800000U ? t : z;
+
+    // 1/2 <= |x| < 1
+    t = as_float(ax);
+    t = MATH_DIVIDE(2.0f*t, 1.0f - t);
+    t = 0.5f * log1p(t);
+    t = as_float(xs | as_uint(t));
+    z = ax < 0x3f800000U ? t : z;
+
+    // |x| < 1/2
+    t = x * x;
+    float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f);
+    float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f);
+    float p = MATH_DIVIDE(a, b);
+    t = mad(x*t, p, x);
+    z = ax < 0x3f000000 ? t : z;
+
+    // |x| < 2^-13
+    z = ax < 0x39000000U ? x : z;
 
-  /* rational approximation for exponential
-    * of the fractional part:
-    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
-    */
-  p = P;
-  ans = *p++;
-  i = 5;
-  do{
-    ans = ans * x  +  *p++;
-  }
-  while( --i );
-  px = 1.0 + x * ans;
-
-  /* multiply by power of 2 */
-  x = __gen_ocl_internal_ldexp( px, n );
-  return x;
+    return z;
 }
 
 OVERLOADABLE float cospi(float x) {
@@ -3417,10 +3497,6 @@ OVERLOADABLE float log(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_log(x);
 
-  /* Use native/faster instruction when it has enough precision */
-  if(x > 0x1.1p0)
-    return __gen_ocl_internal_fastpath_log(x);
-
   return  __gen_ocl_internal_log(x);
 }
 
@@ -3428,10 +3504,6 @@ OVERLOADABLE float log2(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_log2(x);
 
-  /* Use native/faster instruction when it has enough precision */
-  if(x > 0x1.1p0)
-    return __gen_ocl_internal_fastpath_log2(x);
-
   return  __gen_ocl_internal_log2(x);
 }
 
@@ -3439,10 +3511,6 @@ OVERLOADABLE float log10(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_log10(x);
 
-  /* Use native/faster instruction when it has enough precision */
-  if(x > 0x1.1p0)
-    return __gen_ocl_internal_fastpath_log10(x);
-
   return  __gen_ocl_internal_log10(x);
 }
 
@@ -3450,10 +3518,6 @@ OVERLOADABLE float exp(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_exp(x);
 
-  /* Use native/faster instruction when it has enough precision */
-  if (x > -0x1.6p1 && x < 0x1.6p1)
-    return __gen_ocl_internal_fastpath_exp(x);
-
   return  __gen_ocl_internal_exp(x);
 }
 
@@ -3466,10 +3530,6 @@ OVERLOADABLE float exp10(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_exp10(x);
 
-  /* Use native/faster instruction when it has enough precision */
-  if((x < -0x1.4p+5) || (x > +0x1.4p+5))
-    return __gen_ocl_internal_fastpath_exp10(x);
-
   return  __gen_ocl_internal_exp10(x);
 }
 
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 0de3642..4987b18 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -20,6 +20,77 @@
 
 #include "ocl_types.h"
 
+/******************** START AMD ***********************/
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_HW_FMA32() (1)
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0f / (X))
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32      0x80000000
+#define EXSIGNBIT_SP32    0x7fffffff
+#define EXPBITS_SP32      0x7f800000
+#define MANTBITS_SP32     0x007fffff
+#define ONEEXPBITS_SP32   0x3f800000
+#define TWOEXPBITS_SP32   0x40000000
+#define HALFEXPBITS_SP32  0x3f000000
+#define IMPBIT_SP32       0x00800000
+#define QNANBITPATT_SP32  0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32  0x7f800000
+#define NINFBITPATT_SP32  0xff800000
+#define EXPBIAS_SP32      127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32   1
+#define EMIN_SP32         -126
+#define BIASEDEMAX_SP32   254
+#define EMAX_SP32         127
+#define LAMBDA_SP32       1.0e30
+#define MANTLENGTH_SP32   24
+#define BASEDIGITS_SP32   7
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64      0x8000000000000000L
+#define EXSIGNBIT_DP64    0x7fffffffffffffffL
+#define EXPBITS_DP64      0x7ff0000000000000L
+#define MANTBITS_DP64     0x000fffffffffffffL
+#define ONEEXPBITS_DP64   0x3ff0000000000000L
+#define TWOEXPBITS_DP64   0x4000000000000000L
+#define HALFEXPBITS_DP64  0x3fe0000000000000L
+#define IMPBIT_DP64       0x0010000000000000L
+#define QNANBITPATT_DP64  0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64  0x7ff0000000000000L
+#define NINFBITPATT_DP64  0xfff0000000000000L
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046 /* 0x7fe */
+#define EMAX_DP64         1023 /* 0x3ff */
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+
+#endif // cl_khr_fp64
+
+/******************** END AMD ***********************/
+
 OVERLOADABLE float cospi(float x);
 OVERLOADABLE float cosh(float x);
 OVERLOADABLE float acos(float x);
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
index a6b6b94..f2090bc 100644
--- a/kernels/bench_math.cl
+++ b/kernels/bench_math.cl
@@ -49,11 +49,11 @@ kernel void bench_math_exp(
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_exp(-0x1.6p1 - result); /* calls native */
+    result = native_exp((float)-0x1.6p1 - result); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = exp(-0x1.6p1 + result); /* calls internal fast */
+    result = exp((float)-0x1.6p1 + result); /* calls internal fast */
 #else
-    result = exp(-0x1.6p1 - result); /* calls internal slow */
+    result = exp((float)-0x1.6p1 - result); /* calls internal slow */
 #endif
   }
 
@@ -73,11 +73,11 @@ kernel void bench_math_exp10(
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_exp10(-0x1.4p+5 - result); /* calls native */
+    result = native_exp10((float)-0x1.4p+5 - result); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = exp10(-0x1.4p+5 - result); /* calls internal fast */
+    result = exp10((float)-0x1.4p+5 - result); /* calls internal fast */
 #else
-    result = exp10(-0x1.3p+5 - result); /* calls internal slow */
+    result = exp10((float)-0x1.3p+5 - result); /* calls internal slow */
 #endif
   }
 
@@ -97,11 +97,11 @@ kernel void bench_math_log2(
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log2(0x1.1p0 + result); /* calls native */
+    result = native_log2((float)0x1.1p0 + result); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log2(0x1.1p0 + result); /* calls internal fast */
+    result = log2((float)0x1.1p0 + result); /* calls internal fast */
 #else
-    result = log2(0x1.1p0 - result); /* calls internal slow */
+    result = log2((float)0x1.1p0 - result); /* calls internal slow */
 #endif
   }
 
@@ -121,11 +121,11 @@ kernel void bench_math_log(
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log(0x1.1p0 + result); /* calls native */
+    result = native_log((float)0x1.1p0 + result); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log(0x1.1p0 + result); /* calls internal fast */
+    result = log((float)0x1.1p0 + result); /* calls internal fast */
 #else
-    result = log(0x1.1p0 - result); /* calls internal slow */
+    result = log((float)0x1.1p0 - result); /* calls internal slow */
 #endif
   }
 
@@ -145,11 +145,11 @@ kernel void bench_math_log10(
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log10(0x1.1p0 + result); /* calls native */
+    result = native_log10((float)0x1.1p0 + result); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log10(0x1.1p0 + result); /* calls internal fast */
+    result = log10((float)0x1.1p0 + result); /* calls internal fast */
 #else
-    result = log10(0x1.1p0 - result); /* calls internal slow */
+    result = log10((float)0x1.1p0 - result); /* calls internal slow */
 #endif
   }
 
-- 
2.5.0