[Beignet] [PATCH 5/5] Backend: Optimization internal math, experiment clc AMD

Wed May 25 19:17:42 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

Affected functions:
__gen_ocl_internal_atan
__gen_ocl_internal_atanpi
__gen_ocl_internal_atan2
__gen_ocl_internal_atan2pi
__gen_ocl_internal_atanh
__gen_ocl_internal_acosh
__gen_ocl_internal_sinh
__gen_ocl_internal_cosh
__gen_ocl_internal_tanh
__gen_ocl_internal_ldexp

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl | 631 +++++++++++++++++--------------
 backend/src/libocl/tmpl/ocl_math.tmpl.h  |  71 ++++
 2 files changed, 419 insertions(+), 283 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 9414c8e..a4413a6 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -1352,61 +1352,76 @@ __constant float atanlo[4] = {
   7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
 };
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_atan(float x) {
-  /* copied from fdlibm */
-  float aT[11];
-  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
-  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
-  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
-  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
-  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
-  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
-  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
-  const float one = 1.0, huge = 1.0e30;
-
-  float w,s1,s2,z;
-  int ix,hx,id;
+    const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
-      if(ix>0x7f800000)
-    return x+x;   /* NaN */
-      if(hx>0) return  atanhi[3]+atanlo[3];
-      else     return -atanhi[3]-atanlo[3];
-  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
-      if (ix < 0x31000000) {  /* |x| < 2^-29 */
-    if(huge+x>one) return x;  /* raise inexact */
-      }
-      id = -1;
-  } else {
-  x = __gen_ocl_fabs(x);
-  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
-      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
-    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
-      } else {      /* 11/16<=|x|< 19/16 */
-    id = 1; x  = (x-one)/(x+one);
-      }
-  } else {
-      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
-    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
-      } else {      /* 2.4375 <= |x| < 2^66 */
-    id = 3; x  = -(float)1.0/x;
-      }
-  }}
-    /* end of argument reduction */
-  z = x*x;
-  w = z*z;
-    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
-  s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
-  s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
-  if (id<0) return x - x*(s1+s2);
-  else {
-      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
-      return (hx<0)? -z:z;
-  }
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint sx = ux ^ aux;
+
+    float spiby2 = as_float(sx | as_uint(piby2));
+
+    float v = as_float(aux);
+
+    // Return for NaN
+    float ret = x;
 
+    // 2^26 <= |x| <= Inf => atan(x) is close to piby2
+    ret = aux <= PINFBITPATT_SP32  ? spiby2 : ret;
+
+    // Reduce arguments 2^-19 <= |x| < 2^26
+
+    // 39/16 <= x < 2^26
+    x = -MATH_RECIP(v);
+    float c = 1.57079632679489655800f; // atan(infinity)
+
+    // 19/16 <= x < 39/16
+    int l = aux < 0x401c0000;
+    float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+    x = l ? xx : x;
+    c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+    // 11/16 <= x < 19/16
+    l = aux < 0x3f980000U;
+    xx =  MATH_DIVIDE(v - 1.0f, 1.0f + v);
+    x = l ? xx : x;
+    c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+    // 7/16 <= x < 11/16
+    l = aux < 0x3f300000;
+    xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+    x = l ? xx : x;
+    c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+    // 2^-19 <= x < 7/16
+    l = aux < 0x3ee00000;
+    x = l ? v : x;
+    c = l ? 0.0f : c;
+
+    // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+    float s = x * x;
+    float a = mad(s,
+                  mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+                  0.296528598819239217902158651186f);
+
+    float b = mad(s,
+                  mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+                  0.889585796862432286486651434570f);
+
+    float q = x * s * MATH_DIVIDE(a, b);
+
+    float z = c - (q - x);
+    float zs = as_float(sx | as_uint(z));
+
+    ret  = aux < 0x4c800000 ?  zs : ret;
+
+    // |x| < 2^-19
+    ret = aux < 0x36000000 ? as_float(ux) : ret;
+    return ret;
 }
+
 OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
 }
@@ -1414,85 +1429,58 @@ OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
 // XXX work-around PTX profile
 OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
 OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
-  /* copied from fdlibm */
-  float z;
-  int k,m,hx,hy,ix,iy;
-  const float
-  tiny  = 1.0e-30,
-  zero  = 0.0,
-  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
-  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
-  pi      = 3.1415927410e+00, /* 0x40490fdb */
-  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+    const float pi = 0x1.921fb6p+1f;
+    const float piby2 = 0x1.921fb6p+0f;
+    const float piby4 = 0x1.921fb6p-1f;
+    const float threepiby4 = 0x1.2d97c8p+1f;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  iy = hy&0x7fffffff;
+    float ax = fabs(x);
+    float ay = fabs(y);
+    float v = min(ax, ay);
+    float u = max(ax, ay);
 
-  if((ix>0x7f800000)||
-     (iy>0x7f800000)) /* x or y is NaN */
-     return x+y;
-  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
-  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
+    // Scale since u could be large, as in "regular" divide
+    float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+    float vbyu = s * MATH_DIVIDE(v, s*u);
 
-    /* when y = 0 */
-  if(iy==0) {
-      switch(m) {
-    case 0:
-    case 1: return y;   /* atan(+-0,+anything)=+-0 */
-    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
-    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
-      }
-  }
-    /* when x = 0 */
-  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+    float vbyu2 = vbyu * vbyu;
 
-  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
-  if(ix <= 0x7fffff && iy <= 0x7fffff) {
-    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
-    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
-  }
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+    float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+    float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
 
-    /* when x is INF */
-  if(ix==0x7f800000) {
-      if(iy==0x7f800000) {
-    switch(m) {
-        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
-        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
-        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
-        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
-    }
-      } else {
-    switch(m) {
-        case 0: return  zero  ; /* atan(+...,+INF) */
-        case 1: return -zero  ; /* atan(-...,+INF) */
-        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
-        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
-    }
-      }
-  }
-    /* when y is INF */
-  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
-    /* compute y/x */
-  k = (iy-ix)>>23;
-  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
-  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
-  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
-  switch (m) {
-      case 0: return       z  ; /* atan(+,+) */
-      case 1: {
-              uint zh;
-          GEN_OCL_GET_FLOAT_WORD(zh,z);
-          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
-        }
-        return       z  ; /* atan(-,+) */
-      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
-      default: /* case 3 */
-            return  (z-pi_lo)-pi;/* atan(-,-) */
-  }
+    // Octant 0 result
+    float a = mad(p, MATH_RECIP(q), vbyu);
+
+    // Fix up 3 other octants
+    float at = piby2 - a;
+    a = ay > ax ? at : a;
+    at = pi - a;
+    a = x < 0.0F ? at : a;
+
+    // y == 0 => 0 for x >= 0, pi for x < 0
+    at = as_int(x) < 0 ? pi : 0.0f;
+    a = y == 0.0f ? at : a;
+
+    // if (!FINITE_ONLY()) {
+        // x and y are +- Inf
+        at = x > 0.0f ? piby4 : threepiby4;
+        a = ax == INFINITY & ay == INFINITY ? at : a;
+
+	// x or y is NaN
+	a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+    // }
+
+    // Fixup sign and return
+    return copysign(a, y);
 }
 
 OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
@@ -1511,74 +1499,43 @@ OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return __gen_ocl_rnde(x);
 }
 
-OVERLOADABLE float __gen_ocl_internal_exp(float x) {
-  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
-  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
-  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one = 1.0,
-  huge = 1.0e+30,
-  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
-  P2 = -2.7777778450e-03; /* 0xbb360b61 */
-  float y,hi=0.0,lo=0.0,c,t;
-  int k=0,xsb;
-  unsigned hx;
-  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
-  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
-  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
-  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
-  float half_0 = 0.5;
-  float half_1 =	-0.5;
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
+OVERLOADABLE float __gen_ocl_internal_exp(float x){
+  // Reduce x
+  const float ln2HI = 0x1.62e300p-1f;
+  const float ln2LO = 0x1.2fefa2p-17f;
+  const float invln2 = 0x1.715476p+0f;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = (hx>>31)&1;		/* sign bit of x */
-  hx &= 0x7fffffff;		/* high word of |x| */
+  float fhalF = x < 0.0f ? -0.5f : 0.5f;
+  int p  = mad(x, invln2, fhalF);
+  float fp = (float)p;
+  float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here
+  float lo = -fp*ln2LO;
 
-  /* filter out non-finite argument */
-  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    if(hx>0x7f800000)
-      return x+x;			/* NaN */
-    if(hx==0x7f800000)
-      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
-    if(x > o_threshold) return huge*huge; /* overflow */
-    if(x < u_threshold) return twom100*twom100; /* underflow */
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
-    } else {
-      float tmp = xsb == 1 ? half_1 : half_0;
-      k  = ivln2*x+tmp;
-      t  = k;
-      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
-      lo = t*ln2LO_0;
-    }
-    x  = hi - lo;
-  }
-  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
-    if(huge+x>one) return one+x;/* trigger inexact */
-  }
-  else k = 0;
+  // Evaluate poly
+  float t = hi + lo;
+  float tt  = t*t;
+  float v = mad(tt,
+                -mad(tt,
+                     mad(tt,
+                         mad(tt,
+                             mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+                             0x1.1566aap-14f),
+                         -0x1.6c16c2p-9f),
+                     0x1.555556p-3f),
+                t);
 
-  /* x is now in primary range */
-  t  = x*x;
-  c  = x - t*(P1+t*P2);
-  if(k==0)
-    return one-((x*c)/(c-(float)2.0)-x);
-  else
-    y = one-((lo-(x*c)/((float)2.0-c))-hi);
-  if(k >= -125) {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
-    return y;
-  } else {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
-    return y*twom100;
-  }
+  float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+  // Scale by 2^p
+  float r =  as_float(as_int(y) + (p << 23));
+
+  const float ulim =  0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366
+  const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657
+
+  r = x < llim ? 0.0f : r;
+  r = x < ulim ? r : as_float(0x7f800000);
+  return isnan(x) ? x : r;
 }
 
 /* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
@@ -2041,56 +1998,75 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
   return y;
 }
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-  float one	= 1.0,
-  ln2	= 6.9314718246e-01;/* 0x3f317218 */
-  float t;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if(hx<0x3f800000) {	/* x < 1 */
-    return (x-x)/(x-x);
-  } else if(hx >=0x4d800000) {	/* x > 2**28 */
-    if(hx >=0x7f800000) {/* x is inf of NaN */
-      return x+x;
-    } else
-      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
-  } else if (hx==0x3f800000) {
-    return 0.0;			/* acosh(1) = 0 */
-  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
-    t=x*x;
-    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
-  } else {			/* 1<x<2 */
-    t = x-one;
-    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
-  }
+    uint ux = as_uint(x);
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by acosh(x) = ln(2) + ln(x)
+    // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+    // acosh(x) = ln(x + sqrt(x*x-1)) */
+    int high = ux > 0x46000000U;
+    int med = ux > 0x40000000U;
+
+    float w = x - 1.0f;
+    float s = w*w + 2.0f*w;
+    float t = x*x - 1.0f;
+    float r = sqrt(med ? t : s) + (med ? x : w);
+    float v = (high ? x : r) - (med ? 1.0f : 0.0f);
+    float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f);
+
+    z = ux >= PINFBITPATT_SP32 ? x : z;
+    z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z;
+
+    return z;
 }
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_asinh(float x){
-  //return native_log(x + native_sqrt(x * x + 1));
-  float one =  1.0000000000e+00, /* 0x3F800000 */
-  ln2 =  6.9314718246e-01, /* 0x3f317218 */
-  huge=  1.0000000000e+30;
-  float w;
-  int hx,ix;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix< 0x38000000) {	/* |x|<2**-14 */
-    if(huge+x>one) return x;	/* return x inexact except 0 */
-  }
-  if(ix>0x47000000) {/* |x| > 2**14 */
-    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
-    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
-  } else {
-    float xa = __gen_ocl_internal_fabs(x);
-    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
-      w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
-    } else {		/* 2.0 > |x| > 2**-14 */
-      float t = xa*xa;
-      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
-    }
-  }
-  return __gen_ocl_internal_copysign(w, x);
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    uint xsgn = ax ^ ux;
+
+    // |x| <= 2
+    float t = x * x;
+    float a = mad(t,
+                  mad(t,
+		      mad(t,
+		          mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f),
+		          -5.063201055468483248e-1f),
+		      -1.480204186473758321f),
+	          -1.152965835871758072f);
+    float b = mad(t,
+	          mad(t,
+		      mad(t,
+			  mad(t, 6.284381367285534560e-2f, 1.260024978680227945f),
+			  6.582362487198468066f),
+		      11.99423176003939087f),
+		  6.917795026025976739f);
+
+    float q = MATH_DIVIDE(a, b);
+    float z1 = mad(x*t, q, x);
+
+    // |x| > 2
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x
+    // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are
+    // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+    // with the sign of x (see Abramowitz and Stegun 4.6.20)
+
+    float absx = as_float(ax);
+    int hi = ax > 0x46000000U;
+    float y = MATH_SQRT(absx * absx + 1.0f) + absx;
+    y = hi ? absx : y;
+    float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f);
+    float z2 = as_float(xsgn | as_uint(r));
+
+    float z = ax <= 0x40000000 ? z1 : z2;
+    z = ax < 0x39800000U | ax >= PINFBITPATT_SP32 ? x : z;
+
+    return z;
 }
 
 OVERLOADABLE float __gen_ocl_internal_sinh(float x){
@@ -2125,38 +2101,51 @@ OVERLOADABLE float __gen_ocl_internal_sinh(float x){
   return x*shuge;
 }
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
-  //float y = native_exp(-2 * x);
-  //return (1 - y) / (1 + y);
-  float one=1.0, two=2.0, tiny = 1.0e-30;
-  float t,z;
-  int jx,ix;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) {
-    if (jx>=0)
-      return one/x+one; /* tanh(+-inf)=+-1 */
-    else
-      return one/x-one; /* tanh(NaN) = NaN */
-  }
+    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+    // to the following three formulae:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // but computationally, some formulae are better on some ranges.
 
-  if (ix < 0x41b00000) { /* |x|<22 */
-    if (ix == 0)
-      return x;		/* x == +-0 */
-    if (ix<0x24000000) 	/* |x|<2**-55 */
-      return x*(one+x);    	/* tanh(small) = small */
-    if (ix>=0x3f800000) {	/* |x|>=1  */
-      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
-      z = one - two/(t+two);
-    } else {
-      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
-      z= -t/(t+two);
-    }
-  } else { /* |x| > 22, return +-1 */
-    z = one - tiny;		/* raised inexact flag */
-  }
-  return (jx>=0)? z: -z;
+    const float large_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ aux;
+
+    float y = as_float(aux);
+    float y2 = y*y;
+
+    float a1 = mad(y2,
+                   mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+                   -0.28192806108402678e0F);
+    float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);
+
+    float a2 = mad(y2,
+                   mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+                   -0.24069858695196524e0F);
+    float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);
+
+    int c = y < 0.9f;
+    float a = c ? a1 : a2;
+    float b = c ? b1 : b2;
+    float zlo = mad(MATH_DIVIDE(a, b), y*y2, y);
+
+    float p = exp(2.0f * y) + 1.0f;
+    float zhi = 1.0F - MATH_DIVIDE(2.0F, p);
+
+    float z = y <= 1.0f ? zlo : zhi;
+    z = as_float(xs | as_uint(z));
+
+    // Edge cases
+    float sone = as_float(0x3f800000U | xs);
+    z = y > large_threshold ? sone : z;
+    z = aux < 0x39000000 | aux > 0x7f800000 ? x : z;
+
+    return z;
 }
 
 OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
@@ -2233,26 +2222,102 @@ OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
   return x;
 }
 
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
-  x = __gen_ocl_scalbnf(x,n);
-  return x;
-}
-
+	// This treats subnormals as zeros
+	int i = as_int(x);
+	int e = (i >> 23) & 0xff;
+	int m = i & 0x007fffff;
+	int s = i & 0x80000000;
+	int v = add_sat(e, n);
+	v = clamp(v, 0, 0xff);
+	int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+	int c = e == 0xff;
+	mr = c ? m : mr;
+	int er = c ? e : v;
+	er = e ? er : e;
+	return as_float( s | (er << 23) | mr );
+
+	/* supports denormal values */
+	const int multiplier = 24;
+	float val_f;
+	uint val_ui;
+	uint sign;
+	int exponent;
+	val_ui = as_uint(x);
+	sign = val_ui & 0x80000000;
+	val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
+	int val_x = val_ui;
+
+	exponent = val_ui >> 23; /* get the exponent */
+	int dexp = exponent;
+
+	/* denormal support */
+	int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+	int dexponent = 25 - fbh;
+	uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+	int ex = dexponent + n - multiplier;
+	dexponent = ex;
+	uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+	int ex1 = dexponent + multiplier;
+	ex1 = -ex1 +25;
+	dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
+	dval_ui = dexponent > 0 ? val :dval_ui;
+	dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui;  /*overflow*/
+	dval_ui = dexponent < -multiplier ? 0 : dval_ui;  /*underflow*/
+	dval_ui = dval_ui | sign;
+	val_f = as_float(dval_ui);
+
+	exponent += n;
+
+	val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+	ex1 = exponent + multiplier;
+	ex1 = -ex1 +25;
+	val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
+	val_ui = exponent > 0 ? val :val_ui;
+	val_ui = exponent > 254 ? 0x7f800000 :val_ui;  /*overflow*/
+	val_ui = exponent < -multiplier ? 0 : val_ui;  /*underflow*/
+	val_ui = val_ui | sign;
+
+	val_ui = dexp == 0? dval_ui : val_ui;
+	val_f = as_float(val_ui);
+
+	val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
+	return val_f;
+}
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
 OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
-  //return 0.5f * native_sqrt((1 + x) / (1 - x));
-  float xa = __gen_ocl_fabs (x);
-  float t;
-  if (isless (xa, 0.5f)){
-    if (xa < 0x1.0p-28f) return x;
-    t = xa + xa;
-    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
-  } else if (isless (xa, 1.0f)){
-    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
-  } else{
-    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
-    return x / 0.0f;
-  }
-  return __gen_ocl_internal_copysign(t, x);
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    uint xs = ux ^ ax;
+
+    // |x| > 1 or NaN
+    float z = as_float(QNANBITPATT_SP32);
+
+    // |x| == 1
+    float t = as_float(xs | PINFBITPATT_SP32);
+    z = ax == 0x3f800000U ? t : z;
+
+    // 1/2 <= |x| < 1
+    t = as_float(ax);
+    t = MATH_DIVIDE(2.0f*t, 1.0f - t);
+    t = 0.5f * log1p(t);
+    t = as_float(xs | as_uint(t));
+    z = ax < 0x3f800000U ? t : z;
+
+    // |x| < 1/2
+    t = x * x;
+    float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f);
+    float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f);
+    float p = MATH_DIVIDE(a, b);
+    t = mad(x*t, p, x);
+    z = ax < 0x3f000000 ? t : z;
+
+    // |x| < 2^-13
+    z = ax < 0x39000000U ? x : z;
+
+    return z;
 }
 
 OVERLOADABLE float __gen_ocl_internal_exp10(float x){
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 0de3642..4987b18 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -20,6 +20,77 @@
 
 #include "ocl_types.h"
 
+/******************** START AMD ***********************/
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_HW_FMA32() (1)
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0f / (X))
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32      0x80000000
+#define EXSIGNBIT_SP32    0x7fffffff
+#define EXPBITS_SP32      0x7f800000
+#define MANTBITS_SP32     0x007fffff
+#define ONEEXPBITS_SP32   0x3f800000
+#define TWOEXPBITS_SP32   0x40000000
+#define HALFEXPBITS_SP32  0x3f000000
+#define IMPBIT_SP32       0x00800000
+#define QNANBITPATT_SP32  0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32  0x7f800000
+#define NINFBITPATT_SP32  0xff800000
+#define EXPBIAS_SP32      127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32   1
+#define EMIN_SP32         -126
+#define BIASEDEMAX_SP32   254
+#define EMAX_SP32         127
+#define LAMBDA_SP32       1.0e30
+#define MANTLENGTH_SP32   24
+#define BASEDIGITS_SP32   7
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64      0x8000000000000000L
+#define EXSIGNBIT_DP64    0x7fffffffffffffffL
+#define EXPBITS_DP64      0x7ff0000000000000L
+#define MANTBITS_DP64     0x000fffffffffffffL
+#define ONEEXPBITS_DP64   0x3ff0000000000000L
+#define TWOEXPBITS_DP64   0x4000000000000000L
+#define HALFEXPBITS_DP64  0x3fe0000000000000L
+#define IMPBIT_DP64       0x0010000000000000L
+#define QNANBITPATT_DP64  0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64  0x7ff0000000000000L
+#define NINFBITPATT_DP64  0xfff0000000000000L
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046 /* 0x7fe */
+#define EMAX_DP64         1023 /* 0x3ff */
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+
+#endif // cl_khr_fp64
+
+/******************** END AMD ***********************/
+
 OVERLOADABLE float cospi(float x);
 OVERLOADABLE float cosh(float x);
 OVERLOADABLE float acos(float x);
-- 
2.5.0