[Beignet] [PATCH 5/5] Backend: Optimization internal math, experiment clc AMD
Grigore Lupescu
grigore.lupescu at intel.com
Wed May 25 19:17:42 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Affected functions:
__gen_ocl_internal_atan
__gen_ocl_internal_atanpi
__gen_ocl_internal_atan2
__gen_ocl_internal_atan2pi
__gen_ocl_internal_atanh
__gen_ocl_internal_acosh
__gen_ocl_internal_sinh
__gen_ocl_internal_cosh
__gen_ocl_internal_tanh
__gen_ocl_internal_ldexp
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 631 +++++++++++++++++--------------
backend/src/libocl/tmpl/ocl_math.tmpl.h | 71 ++++
2 files changed, 419 insertions(+), 283 deletions(-)
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 9414c8e..a4413a6 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -1352,61 +1352,76 @@ __constant float atanlo[4] = {
7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
};
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_atan(float x) {
- /* copied from fdlibm */
- float aT[11];
- aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
- aT[1] = -2.0000000298e-01; /* 0xbe4ccccd */
- aT[2] = 1.4285714924e-01; /* 0x3e124925 */
- aT[3] = -1.1111110449e-01; /* 0xbde38e38 */
- aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
- aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
- aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
- const float one = 1.0, huge = 1.0e30;
-
- float w,s1,s2,z;
- int ix,hx,id;
+ const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- ix = hx&0x7fffffff;
- if(ix>=0x50800000) { /* if |x| >= 2^34 */
- if(ix>0x7f800000)
- return x+x; /* NaN */
- if(hx>0) return atanhi[3]+atanlo[3];
- else return -atanhi[3]-atanlo[3];
- } if (ix < 0x3ee00000) { /* |x| < 0.4375 */
- if (ix < 0x31000000) { /* |x| < 2^-29 */
- if(huge+x>one) return x; /* raise inexact */
- }
- id = -1;
- } else {
- x = __gen_ocl_fabs(x);
- if (ix < 0x3f980000) { /* |x| < 1.1875 */
- if (ix < 0x3f300000) { /* 7/16 <=|x|<11/16 */
- id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
- } else { /* 11/16<=|x|< 19/16 */
- id = 1; x = (x-one)/(x+one);
- }
- } else {
- if (ix < 0x401c0000) { /* |x| < 2.4375 */
- id = 2; x = (x-(float)1.5)/(one+(float)1.5*x);
- } else { /* 2.4375 <= |x| < 2^66 */
- id = 3; x = -(float)1.0/x;
- }
- }}
- /* end of argument reduction */
- z = x*x;
- w = z*z;
- /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
- s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
- s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
- if (id<0) return x - x*(s1+s2);
- else {
- z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
- return (hx<0)? -z:z;
- }
+ uint ux = as_uint(x);
+ uint aux = ux & EXSIGNBIT_SP32;
+ uint sx = ux ^ aux;
+
+ float spiby2 = as_float(sx | as_uint(piby2));
+
+ float v = as_float(aux);
+
+ // Return for NaN
+ float ret = x;
+ // 2^26 <= |x| <= Inf => atan(x) is close to piby2
+ ret = aux <= PINFBITPATT_SP32 ? spiby2 : ret;
+
+ // Reduce arguments 2^-19 <= |x| < 2^26
+
+ // 39/16 <= x < 2^26
+ x = -MATH_RECIP(v);
+ float c = 1.57079632679489655800f; // atan(infinity)
+
+ // 19/16 <= x < 39/16
+ int l = aux < 0x401c0000;
+ float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+ x = l ? xx : x;
+ c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+ // 11/16 <= x < 19/16
+ l = aux < 0x3f980000U;
+ xx = MATH_DIVIDE(v - 1.0f, 1.0f + v);
+ x = l ? xx : x;
+ c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+ // 7/16 <= x < 11/16
+ l = aux < 0x3f300000;
+ xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+ x = l ? xx : x;
+ c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+ // 2^-19 <= x < 7/16
+ l = aux < 0x3ee00000;
+ x = l ? v : x;
+ c = l ? 0.0f : c;
+
+ // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+ float s = x * x;
+ float a = mad(s,
+ mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+ 0.296528598819239217902158651186f);
+
+ float b = mad(s,
+ mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+ 0.889585796862432286486651434570f);
+
+ float q = x * s * MATH_DIVIDE(a, b);
+
+ float z = c - (q - x);
+ float zs = as_float(sx | as_uint(z));
+
+ ret = aux < 0x4c800000 ? zs : ret;
+
+ // |x| < 2^-19
+ ret = aux < 0x36000000 ? as_float(ux) : ret;
+ return ret;
}
+
OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
return __gen_ocl_internal_atan(x) / M_PI_F;
}
@@ -1414,85 +1429,58 @@ OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
// XXX work-around PTX profile
OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
- /* copied from fdlibm */
- float z;
- int k,m,hx,hy,ix,iy;
- const float
- tiny = 1.0e-30,
- zero = 0.0,
- pi_o_4 = 7.8539818525e-01, /* 0x3f490fdb */
- pi_o_2 = 1.5707963705e+00, /* 0x3fc90fdb */
- pi = 3.1415927410e+00, /* 0x40490fdb */
- pi_lo = -8.7422776573e-08; /* 0xb3bbbd2e */
+ const float pi = 0x1.921fb6p+1f;
+ const float piby2 = 0x1.921fb6p+0f;
+ const float piby4 = 0x1.921fb6p-1f;
+ const float threepiby4 = 0x1.2d97c8p+1f;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- ix = hx&0x7fffffff;
- GEN_OCL_GET_FLOAT_WORD(hy,y);
- iy = hy&0x7fffffff;
+ float ax = fabs(x);
+ float ay = fabs(y);
+ float v = min(ax, ay);
+ float u = max(ax, ay);
- if((ix>0x7f800000)||
- (iy>0x7f800000)) /* x or y is NaN */
- return x+y;
- if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y); /* x=1.0 */
- m = ((hy>>31)&1)|((hx>>30)&2); /* 2*sign(x)+sign(y) */
+ // Scale since u could be large, as in "regular" divide
+ float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+ float vbyu = s * MATH_DIVIDE(v, s*u);
- /* when y = 0 */
- if(iy==0) {
- switch(m) {
- case 0:
- case 1: return y; /* atan(+-0,+anything)=+-0 */
- case 2: return pi+tiny;/* atan(+0,-anything) = pi */
- case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
- }
- }
- /* when x = 0 */
- if(ix==0) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+ float vbyu2 = vbyu * vbyu;
- /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
- if(ix <= 0x7fffff && iy <= 0x7fffff) {
- x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
- y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
- }
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+ float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+ float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+ float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+ float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
- /* when x is INF */
- if(ix==0x7f800000) {
- if(iy==0x7f800000) {
- switch(m) {
- case 0: return pi_o_4+tiny;/* atan(+INF,+INF) */
- case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
- case 2: return (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
- case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
- }
- } else {
- switch(m) {
- case 0: return zero ; /* atan(+...,+INF) */
- case 1: return -zero ; /* atan(-...,+INF) */
- case 2: return pi+tiny ; /* atan(+...,-INF) */
- case 3: return -pi-tiny ; /* atan(-...,-INF) */
- }
- }
- }
- /* when y is INF */
- if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
- /* compute y/x */
- k = (iy-ix)>>23;
- if(k > 60) z=pi_o_2+(float)0.5*pi_lo; /* |y/x| > 2**60 */
- else if(hx<0&&k<-60) z=0.0; /* |y|/x < -2**60 */
- else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
- switch (m) {
- case 0: return z ; /* atan(+,+) */
- case 1: {
- uint zh;
- GEN_OCL_GET_FLOAT_WORD(zh,z);
- GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
- }
- return z ; /* atan(-,+) */
- case 2: return pi-(z-pi_lo);/* atan(+,-) */
- default: /* case 3 */
- return (z-pi_lo)-pi;/* atan(-,-) */
- }
+ // Octant 0 result
+ float a = mad(p, MATH_RECIP(q), vbyu);
+
+ // Fix up 3 other octants
+ float at = piby2 - a;
+ a = ay > ax ? at : a;
+ at = pi - a;
+ a = x < 0.0F ? at : a;
+
+ // y == 0 => 0 for x >= 0, pi for x < 0
+ at = as_int(x) < 0 ? pi : 0.0f;
+ a = y == 0.0f ? at : a;
+
+ // if (!FINITE_ONLY()) {
+ // x and y are +- Inf
+ at = x > 0.0f ? piby4 : threepiby4;
+ a = ax == INFINITY & ay == INFINITY ? at : a;
+
+ // x or y is NaN
+ a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+ // }
+
+ // Fixup sign and return
+ return copysign(a, y);
}
OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
@@ -1511,74 +1499,43 @@ OVERLOADABLE float __gen_ocl_internal_rint(float x) {
return __gen_ocl_rnde(x);
}
-OVERLOADABLE float __gen_ocl_internal_exp(float x) {
- float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
- u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
- twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
- ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
- one = 1.0,
- huge = 1.0e+30,
- P1 = 1.6666667163e-01, /* 0x3e2aaaab */
- P2 = -2.7777778450e-03; /* 0xbb360b61 */
- float y,hi=0.0,lo=0.0,c,t;
- int k=0,xsb;
- unsigned hx;
- float ln2HI_0 = 6.9313812256e-01; /* 0x3f317180 */
- float ln2HI_1 = -6.9313812256e-01; /* 0xbf317180 */
- float ln2LO_0 = 9.0580006145e-06; /* 0x3717f7d1 */
- float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
- float half_0 = 0.5;
- float half_1 = -0.5;
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
+OVERLOADABLE float __gen_ocl_internal_exp(float x){
+ // Reduce x
+ const float ln2HI = 0x1.62e300p-1f;
+ const float ln2LO = 0x1.2fefa2p-17f;
+ const float invln2 = 0x1.715476p+0f;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- xsb = (hx>>31)&1; /* sign bit of x */
- hx &= 0x7fffffff; /* high word of |x| */
+ float fhalF = x < 0.0f ? -0.5f : 0.5f;
+ int p = mad(x, invln2, fhalF);
+ float fp = (float)p;
+ float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here
+ float lo = -fp*ln2LO;
- /* filter out non-finite argument */
- if(hx >= 0x42b17218) { /* if |x|>=88.721... */
- if(hx>0x7f800000)
- return x+x; /* NaN */
- if(hx==0x7f800000)
- return (xsb==0)? x:0.0; /* exp(+-inf)={inf,0} */
- if(x > o_threshold) return huge*huge; /* overflow */
- if(x < u_threshold) return twom100*twom100; /* underflow */
- }
- /* argument reduction */
- if(hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */
- if(hx < 0x3F851592) { /* and |x| < 1.5 ln2 */
- hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
- } else {
- float tmp = xsb == 1 ? half_1 : half_0;
- k = ivln2*x+tmp;
- t = k;
- hi = x - t*ln2HI_0; /* t*ln2HI is exact here */
- lo = t*ln2LO_0;
- }
- x = hi - lo;
- }
- else if(hx < 0x31800000) { /* when |x|<2**-28 */
- if(huge+x>one) return one+x;/* trigger inexact */
- }
- else k = 0;
+ // Evaluate poly
+ float t = hi + lo;
+ float tt = t*t;
+ float v = mad(tt,
+ -mad(tt,
+ mad(tt,
+ mad(tt,
+ mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+ 0x1.1566aap-14f),
+ -0x1.6c16c2p-9f),
+ 0x1.555556p-3f),
+ t);
- /* x is now in primary range */
- t = x*x;
- c = x - t*(P1+t*P2);
- if(k==0)
- return one-((x*c)/(c-(float)2.0)-x);
- else
- y = one-((lo-(x*c)/((float)2.0-c))-hi);
- if(k >= -125) {
- unsigned hy;
- GEN_OCL_GET_FLOAT_WORD(hy,y);
- GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */
- return y;
- } else {
- unsigned hy;
- GEN_OCL_GET_FLOAT_WORD(hy,y);
- GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
- return y*twom100;
- }
+ float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+ // Scale by 2^p
+ float r = as_float(as_int(y) + (p << 23));
+
+ const float ulim = 0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366
+ const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657
+
+ r = x < llim ? 0.0f : r;
+ r = x < ulim ? r : as_float(0x7f800000);
+ return isnan(x) ? x : r;
}
/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
@@ -2041,56 +1998,75 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
return y;
}
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
- //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
- float one = 1.0,
- ln2 = 6.9314718246e-01;/* 0x3f317218 */
- float t;
- int hx;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- if(hx<0x3f800000) { /* x < 1 */
- return (x-x)/(x-x);
- } else if(hx >=0x4d800000) { /* x > 2**28 */
- if(hx >=0x7f800000) {/* x is inf of NaN */
- return x+x;
- } else
- return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
- } else if (hx==0x3f800000) {
- return 0.0; /* acosh(1) = 0 */
- } else if (hx > 0x40000000) { /* 2**28 > x > 2 */
- t=x*x;
- return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
- } else { /* 1<x<2 */
- t = x-one;
- return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
- }
+ uint ux = as_uint(x);
+
+ // Arguments greater than 1/sqrt(epsilon) in magnitude are
+ // approximated by acosh(x) = ln(2) + ln(x)
+ // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+ // acosh(x) = ln(x + sqrt(x*x-1)) */
+ int high = ux > 0x46000000U;
+ int med = ux > 0x40000000U;
+
+ float w = x - 1.0f;
+ float s = w*w + 2.0f*w;
+ float t = x*x - 1.0f;
+ float r = sqrt(med ? t : s) + (med ? x : w);
+ float v = (high ? x : r) - (med ? 1.0f : 0.0f);
+ float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f);
+
+ z = ux >= PINFBITPATT_SP32 ? x : z;
+ z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z;
+
+ return z;
}
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_asinh(float x){
- //return native_log(x + native_sqrt(x * x + 1));
- float one = 1.0000000000e+00, /* 0x3F800000 */
- ln2 = 6.9314718246e-01, /* 0x3f317218 */
- huge= 1.0000000000e+30;
- float w;
- int hx,ix;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- ix = hx&0x7fffffff;
- if(ix< 0x38000000) { /* |x|<2**-14 */
- if(huge+x>one) return x; /* return x inexact except 0 */
- }
- if(ix>0x47000000) {/* |x| > 2**14 */
- if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
- w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
- } else {
- float xa = __gen_ocl_internal_fabs(x);
- if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
- w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
- } else { /* 2.0 > |x| > 2**-14 */
- float t = xa*xa;
- w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
- }
- }
- return __gen_ocl_internal_copysign(w, x);
+ uint ux = as_uint(x);
+ uint ax = ux & EXSIGNBIT_SP32;
+ uint xsgn = ax ^ ux;
+
+ // |x| <= 2
+ float t = x * x;
+ float a = mad(t,
+ mad(t,
+ mad(t,
+ mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f),
+ -5.063201055468483248e-1f),
+ -1.480204186473758321f),
+ -1.152965835871758072f);
+ float b = mad(t,
+ mad(t,
+ mad(t,
+ mad(t, 6.284381367285534560e-2f, 1.260024978680227945f),
+ 6.582362487198468066f),
+ 11.99423176003939087f),
+ 6.917795026025976739f);
+
+ float q = MATH_DIVIDE(a, b);
+ float z1 = mad(x*t, q, x);
+
+ // |x| > 2
+
+ // Arguments greater than 1/sqrt(epsilon) in magnitude are
+ // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x
+ // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are
+ // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+ // with the sign of x (see Abramowitz and Stegun 4.6.20)
+
+ float absx = as_float(ax);
+ int hi = ax > 0x46000000U;
+ float y = MATH_SQRT(absx * absx + 1.0f) + absx;
+ y = hi ? absx : y;
+ float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f);
+ float z2 = as_float(xsgn | as_uint(r));
+
+ float z = ax <= 0x40000000 ? z1 : z2;
+ z = ax < 0x39800000U | ax >= PINFBITPATT_SP32 ? x : z;
+
+ return z;
}
OVERLOADABLE float __gen_ocl_internal_sinh(float x){
@@ -2125,38 +2101,51 @@ OVERLOADABLE float __gen_ocl_internal_sinh(float x){
return x*shuge;
}
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
- //float y = native_exp(-2 * x);
- //return (1 - y) / (1 + y);
- float one=1.0, two=2.0, tiny = 1.0e-30;
- float t,z;
- int jx,ix;
- GEN_OCL_GET_FLOAT_WORD(jx,x);
- ix = jx&0x7fffffff;
- /* x is INF or NaN */
- if(ix>=0x7f800000) {
- if (jx>=0)
- return one/x+one; /* tanh(+-inf)=+-1 */
- else
- return one/x-one; /* tanh(NaN) = NaN */
- }
+ // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+ // to the following three formulae:
+ // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
+ // 2. (1 - (2/(exp(2*x) + 1 )))
+ // 3. (exp(2*x) - 1)/(exp(2*x) + 1)
+ // but computationally, some formulae are better on some ranges.
- if (ix < 0x41b00000) { /* |x|<22 */
- if (ix == 0)
- return x; /* x == +-0 */
- if (ix<0x24000000) /* |x|<2**-55 */
- return x*(one+x); /* tanh(small) = small */
- if (ix>=0x3f800000) { /* |x|>=1 */
- t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
- z = one - two/(t+two);
- } else {
- t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
- z= -t/(t+two);
- }
- } else { /* |x| > 22, return +-1 */
- z = one - tiny; /* raised inexact flag */
- }
- return (jx>=0)? z: -z;
+ const float large_threshold = 0x1.0a2b24p+3f;
+
+ uint ux = as_uint(x);
+ uint aux = ux & EXSIGNBIT_SP32;
+ uint xs = ux ^ aux;
+
+ float y = as_float(aux);
+ float y2 = y*y;
+
+ float a1 = mad(y2,
+ mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+ -0.28192806108402678e0F);
+ float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);
+
+ float a2 = mad(y2,
+ mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+ -0.24069858695196524e0F);
+ float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);
+
+ int c = y < 0.9f;
+ float a = c ? a1 : a2;
+ float b = c ? b1 : b2;
+ float zlo = mad(MATH_DIVIDE(a, b), y*y2, y);
+
+ float p = exp(2.0f * y) + 1.0f;
+ float zhi = 1.0F - MATH_DIVIDE(2.0F, p);
+
+ float z = y <= 1.0f ? zlo : zhi;
+ z = as_float(xs | as_uint(z));
+
+ // Edge cases
+ float sone = as_float(0x3f800000U | xs);
+ z = y > large_threshold ? sone : z;
+ z = aux < 0x39000000 | aux > 0x7f800000 ? x : z;
+
+ return z;
}
OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
@@ -2233,26 +2222,102 @@ OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
return x;
}
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
- x = __gen_ocl_scalbnf(x,n);
- return x;
-}
-
+ // This treats subnormals as zeros
+ int i = as_int(x);
+ int e = (i >> 23) & 0xff;
+ int m = i & 0x007fffff;
+ int s = i & 0x80000000;
+ int v = add_sat(e, n);
+ v = clamp(v, 0, 0xff);
+ int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+ int c = e == 0xff;
+ mr = c ? m : mr;
+ int er = c ? e : v;
+ er = e ? er : e;
+ return as_float( s | (er << 23) | mr );
+
+ /* supports denormal values */
+ const int multiplier = 24;
+ float val_f;
+ uint val_ui;
+ uint sign;
+ int exponent;
+ val_ui = as_uint(x);
+ sign = val_ui & 0x80000000;
+ val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
+ int val_x = val_ui;
+
+ exponent = val_ui >> 23; /* get the exponent */
+ int dexp = exponent;
+
+ /* denormal support */
+ int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+ int dexponent = 25 - fbh;
+ uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+ int ex = dexponent + n - multiplier;
+ dexponent = ex;
+ uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+ int ex1 = dexponent + multiplier;
+ ex1 = -ex1 +25;
+ dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
+ dval_ui = dexponent > 0 ? val :dval_ui;
+ dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/
+ dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/
+ dval_ui = dval_ui | sign;
+ val_f = as_float(dval_ui);
+
+ exponent += n;
+
+ val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+ ex1 = exponent + multiplier;
+ ex1 = -ex1 +25;
+ val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
+ val_ui = exponent > 0 ? val :val_ui;
+ val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/
+ val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/
+ val_ui = val_ui | sign;
+
+ val_ui = dexp == 0? dval_ui : val_ui;
+ val_f = as_float(val_ui);
+
+ val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
+ return val_f;
+}
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
- //return 0.5f * native_sqrt((1 + x) / (1 - x));
- float xa = __gen_ocl_fabs (x);
- float t;
- if (isless (xa, 0.5f)){
- if (xa < 0x1.0p-28f) return x;
- t = xa + xa;
- t = 0.5f * log1p (t + t * xa / (1.0f - xa));
- } else if (isless (xa, 1.0f)){
- t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
- } else{
- if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
- return x / 0.0f;
- }
- return __gen_ocl_internal_copysign(t, x);
+ uint ux = as_uint(x);
+ uint ax = ux & EXSIGNBIT_SP32;
+ uint xs = ux ^ ax;
+
+ // |x| > 1 or NaN
+ float z = as_float(QNANBITPATT_SP32);
+
+ // |x| == 1
+ float t = as_float(xs | PINFBITPATT_SP32);
+ z = ax == 0x3f800000U ? t : z;
+
+ // 1/2 <= |x| < 1
+ t = as_float(ax);
+ t = MATH_DIVIDE(2.0f*t, 1.0f - t);
+ t = 0.5f * log1p(t);
+ t = as_float(xs | as_uint(t));
+ z = ax < 0x3f800000U ? t : z;
+
+ // |x| < 1/2
+ t = x * x;
+ float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f);
+ float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f);
+ float p = MATH_DIVIDE(a, b);
+ t = mad(x*t, p, x);
+ z = ax < 0x3f000000 ? t : z;
+
+ // |x| < 2^-13
+ z = ax < 0x39000000U ? x : z;
+
+ return z;
}
OVERLOADABLE float __gen_ocl_internal_exp10(float x){
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 0de3642..4987b18 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -20,6 +20,77 @@
#include "ocl_types.h"
+/******************** START AMD ***********************/
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_HW_FMA32() (1)
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0f / (X))
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32 0x80000000
+#define EXSIGNBIT_SP32 0x7fffffff
+#define EXPBITS_SP32 0x7f800000
+#define MANTBITS_SP32 0x007fffff
+#define ONEEXPBITS_SP32 0x3f800000
+#define TWOEXPBITS_SP32 0x40000000
+#define HALFEXPBITS_SP32 0x3f000000
+#define IMPBIT_SP32 0x00800000
+#define QNANBITPATT_SP32 0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32 0x7f800000
+#define NINFBITPATT_SP32 0xff800000
+#define EXPBIAS_SP32 127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32 1
+#define EMIN_SP32 -126
+#define BIASEDEMAX_SP32 254
+#define EMAX_SP32 127
+#define LAMBDA_SP32 1.0e30
+#define MANTLENGTH_SP32 24
+#define BASEDIGITS_SP32 7
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64 0x8000000000000000L
+#define EXSIGNBIT_DP64 0x7fffffffffffffffL
+#define EXPBITS_DP64 0x7ff0000000000000L
+#define MANTBITS_DP64 0x000fffffffffffffL
+#define ONEEXPBITS_DP64 0x3ff0000000000000L
+#define TWOEXPBITS_DP64 0x4000000000000000L
+#define HALFEXPBITS_DP64 0x3fe0000000000000L
+#define IMPBIT_DP64 0x0010000000000000L
+#define QNANBITPATT_DP64 0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64 0x7ff0000000000000L
+#define NINFBITPATT_DP64 0xfff0000000000000L
+#define EXPBIAS_DP64 1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64 1
+#define EMIN_DP64 -1022
+#define BIASEDEMAX_DP64 2046 /* 0x7fe */
+#define EMAX_DP64 1023 /* 0x3ff */
+#define LAMBDA_DP64 1.0e300
+#define MANTLENGTH_DP64 53
+#define BASEDIGITS_DP64 15
+
+#endif // cl_khr_fp64
+
+/******************** END AMD ***********************/
+
OVERLOADABLE float cospi(float x);
OVERLOADABLE float cosh(float x);
OVERLOADABLE float acos(float x);
--
2.5.0
More information about the Beignet
mailing list