[Beignet] [PATCH 3/3] Backend: Optimization experiment by using AMD clc
Grigore Lupescu
grigore.lupescu at intel.com
Sat May 14 09:45:41 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 1020 ++++++++++++++++--------------
backend/src/libocl/tmpl/ocl_math.tmpl.h | 71 +++
kernels/bench_math.cl | 30 +-
3 files changed, 626 insertions(+), 495 deletions(-)
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 4d70c4d..23b56b5 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -82,6 +82,9 @@ OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
return __gen_ocl_cos(x * M_PI_F);
}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp2 (float x) {
+ return native_exp2(x);
+}
OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
return native_exp(x);
}
@@ -164,6 +167,22 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
return ux.f;
}
+OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
+OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+OVERLOADABLE float __gen_ocl_internal_round(float x) {
+ float y = __gen_ocl_rndz(x);
+ if (__gen_ocl_fabs(x - y) >= 0.5f)
+ y += __gen_ocl_internal_copysign(1.f, x);
+ return y;
+}
+OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
+OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+ return __gen_ocl_rnde(x);
+}
+
+OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
+OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+
OVERLOADABLE float __gen_ocl_internal_log(float x) {
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
@@ -361,7 +380,6 @@ OVERLOADABLE float __gen_ocl_internal_log2(float x) {
}
}
-
float __gen_ocl_scalbnf (float x, int n){
/* copy from fdlibm */
float two25 = 3.355443200e+07, /* 0x4c000000 */
@@ -545,9 +563,13 @@ OVERLOADABLE float __kernel_sinf(float x)
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4));
+ r = mad(z,
+ mad(z,
+ mad(z, S4, S3),
+ S2),
+ S1);
- return x+v*(S1+z*r);
+ return mad(v, r, x);
}
float __kernel_cosf(float x, float y)
@@ -563,7 +585,9 @@ float __kernel_cosf(float x, float y)
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3)));
+ r = z * mad(z,
+ mad(z, C3, C2),
+ C1);
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
@@ -1239,6 +1263,271 @@ OVERLOADABLE int ilogb(float x) {
OVERLOADABLE float nan(uint code) {
return NAN;
}
+
+OVERLOADABLE float __gen_ocl_internal_exp2(float x){
+ /* precision is enough on native/fastpath */
+ return __gen_ocl_internal_fastpath_exp2(x);
+}
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
+OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+ // This treats subnormals as zeros
+ int i = as_int(x);
+ int e = (i >> 23) & 0xff;
+ int m = i & 0x007fffff;
+ int s = i & 0x80000000;
+ int v = add_sat(e, n);
+ v = clamp(v, 0, 0xff);
+ int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+ int c = e == 0xff;
+ mr = c ? m : mr;
+ int er = c ? e : v;
+ er = e ? er : e;
+ return as_float( s | (er << 23) | mr );
+
+ /* supports denormal values */
+ const int multiplier = 24;
+ float val_f;
+ uint val_ui;
+ uint sign;
+ int exponent;
+ val_ui = as_uint(x);
+ sign = val_ui & 0x80000000;
+ val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
+ int val_x = val_ui;
+
+ exponent = val_ui >> 23; /* get the exponent */
+ int dexp = exponent;
+
+ /* denormal support */
+ int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+ int dexponent = 25 - fbh;
+ uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+ int ex = dexponent + n - multiplier;
+ dexponent = ex;
+ uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+ int ex1 = dexponent + multiplier;
+ ex1 = -ex1 +25;
+ dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
+ dval_ui = dexponent > 0 ? val :dval_ui;
+ dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/
+ dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/
+ dval_ui = dval_ui | sign;
+ val_f = as_float(dval_ui);
+
+ exponent += n;
+
+ val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+ ex1 = exponent + multiplier;
+ ex1 = -ex1 +25;
+ val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
+ val_ui = exponent > 0 ? val :val_ui;
+ val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/
+ val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/
+ val_ui = val_ui | sign;
+
+ val_ui = dexp == 0? dval_ui : val_ui;
+ val_f = as_float(val_ui);
+
+ val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
+ return val_f;
+}
+
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
+OVERLOADABLE float __gen_ocl_internal_exp(float x){
+ // Reduce x
+ const float ln2HI = 0x1.62e300p-1f;
+ const float ln2LO = 0x1.2fefa2p-17f;
+ const float invln2 = 0x1.715476p+0f;
+
+ float fhalF = x < 0.0f ? -0.5f : 0.5f;
+ int p = mad(x, invln2, fhalF);
+ float fp = (float)p;
+ float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here
+ float lo = -fp*ln2LO;
+
+ // Evaluate poly
+ float t = hi + lo;
+ float tt = t*t;
+ float v = mad(tt,
+ -mad(tt,
+ mad(tt,
+ mad(tt,
+ mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+ 0x1.1566aap-14f),
+ -0x1.6c16c2p-9f),
+ 0x1.555556p-3f),
+ t);
+
+ float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+ // Scale by 2^p
+ float r = as_float(as_int(y) + (p << 23));
+
+ const float ulim = 0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366
+ const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657
+
+ r = x < llim ? 0.0f : r;
+ r = x < ulim ? r : as_float(0x7f800000);
+ return isnan(x) ? x : r;
+}
+
+/* MODIFIED, copy from fdlib */
+OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+
+ float px, qx,ans;
+ short n;
+ int i;
+ float*p;
+ float MAXL10 = 38.230809449325611792;
+ float LOG210 = 3.32192809488736234787e0;
+ float LG102A = 3.00781250000000000000E-1;
+ float LG102B = 2.48745663981195213739E-4;
+
+ if( x < -MAXL10 ) return 0.0;
+
+ if( isinf(x)) return INFINITY;
+ /* The following is necessary because range reduction blows up: */
+ if( x == 0 )return 1.0;
+
+ /* Express 10**x = 10**g 2**n
+ * = 10**g 10**( n log10(2) )
+ * = 10**( g + n log10(2) )
+ */
+ px = x * LOG210;
+ qx = __gen_ocl_internal_floor( px + 0.5 );
+ n = qx;
+ x -= qx * LG102A;
+ x -= qx * LG102B;
+
+ /* rational approximation for exponential
+ * of the fractional part:
+ * 10**x - 1 = 2x P(x**2)/( Q(x**2) - P(x**2) )
+ */
+ px = mad(x,
+ mad(x,
+ mad(x,
+ mad(x,
+ mad(x,
+ mad(x, 2.063216740311022E-001, 5.420251702225484E-001),
+ 1.171292686296281E+000),
+ 2.034649854009453E+000),
+ 2.650948748208892E+000),
+ 2.302585167056758E+000),
+ 1.0f);
+
+ /* multiply by power of 2 */
+ x = __gen_ocl_internal_ldexp( px, n );
+
+ return x;
+}
+
+OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+ //return __gen_ocl_pow(M_E_F, x) - 1;
+ float Q1 = -3.3333335072e-02, /* 0xbd088889 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+ Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+ Q4 = 4.0082177293e-06, /* 0x36867e54 */
+ Q5 = -2.0109921195e-07, /* 0xb457edbb */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ one = 1.0,
+ o_threshold= 8.8721679688e+01; /* 0x42b17180 */
+ float y,hi,lo,c,t,e,hxs,hfx,r1;
+ int k,xsb;
+ int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ xsb = hx&0x80000000;
+ /* sign bit of x */
+ //if(xsb==0)
+ //y=x;
+ //else
+ //y= -x; /* y = |x| */
+ y = __gen_ocl_internal_fabs(x);
+ hx &= 0x7fffffff; /* high word of |x| */
+ /* filter out huge and non-finite argument */
+ if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ if(hx>0x7f800000)
+ return x+x; /* NaN */
+ if(hx==0x7f800000)
+ return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+ if(x > o_threshold)
+ return huge*huge; /* overflow */
+ }
+ if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+ if(x+tiny<(float)0.0) /* raise inexact */
+ return tiny-one; /* return -1 */
+ }
+ }
+ /* argument reduction */
+ if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+ if(xsb==0){
+ hi = x - ln2_hi; lo = ln2_lo; k = 1;
+ } else {
+ hi = x + ln2_hi; lo = -ln2_lo; k = -1;
+ }
+ } else {
+ k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+ t = k;
+ hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+ lo = t*ln2_lo;
+ }
+ x = hi - lo;
+ c = (hi-x)-lo;
+ } else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
+ //t = huge+x; /* return x with inexact flags when x!=0 */
+ //return x - (t-(huge+x));
+ return x;
+ } else k = 0;
+ /* x is now in primary range */
+ hfx = (float)0.5*x;
+ hxs = x*hfx;
+ r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ t = (float)3.0-r1*hfx;
+ e = hxs*((r1-t)/((float)6.0 - x*t));
+ if(k==0)
+ return x - (x*e-hxs); /* c is 0 */
+ else{
+ e = (x*(e-c)-c);
+ e -= hxs;
+ if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+ if(k==1){
+ if(x < (float)-0.25)
+ return -(float)2.0*(e-(x+(float)0.5));
+ else
+ return (one+(float)2.0*(x-e));
+ }
+ if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
+ int i;
+ y = one-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ return y-one;
+ }
+ t = one;
+ if(k<23) {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+ y = t-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ } else {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
+ y = x-(e+t);
+ y += one;
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ }
+ }
+ return y;
+}
+
OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
float sign = 1.0f;
int ix;
@@ -1422,233 +1711,134 @@ __constant float atanlo[4] = {
7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
};
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_atan(float x) {
- /* copied from fdlibm */
- float aT[11];
- aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
- aT[1] = -2.0000000298e-01; /* 0xbe4ccccd */
- aT[2] = 1.4285714924e-01; /* 0x3e124925 */
- aT[3] = -1.1111110449e-01; /* 0xbde38e38 */
- aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
- aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
- aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
- const float one = 1.0, huge = 1.0e30;
-
- float w,s1,s2,z;
- int ix,hx,id;
+ const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- ix = hx&0x7fffffff;
- if(ix>=0x50800000) { /* if |x| >= 2^34 */
- if(ix>0x7f800000)
- return x+x; /* NaN */
- if(hx>0) return atanhi[3]+atanlo[3];
- else return -atanhi[3]-atanlo[3];
- } if (ix < 0x3ee00000) { /* |x| < 0.4375 */
- if (ix < 0x31000000) { /* |x| < 2^-29 */
- if(huge+x>one) return x; /* raise inexact */
- }
- id = -1;
- } else {
- x = __gen_ocl_fabs(x);
- if (ix < 0x3f980000) { /* |x| < 1.1875 */
- if (ix < 0x3f300000) { /* 7/16 <=|x|<11/16 */
- id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
- } else { /* 11/16<=|x|< 19/16 */
- id = 1; x = (x-one)/(x+one);
- }
- } else {
- if (ix < 0x401c0000) { /* |x| < 2.4375 */
- id = 2; x = (x-(float)1.5)/(one+(float)1.5*x);
- } else { /* 2.4375 <= |x| < 2^66 */
- id = 3; x = -(float)1.0/x;
- }
- }}
- /* end of argument reduction */
- z = x*x;
- w = z*z;
- /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
- s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
- s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
- if (id<0) return x - x*(s1+s2);
- else {
- z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
- return (hx<0)? -z:z;
- }
+ uint ux = as_uint(x);
+ uint aux = ux & EXSIGNBIT_SP32;
+ uint sx = ux ^ aux;
+
+ float spiby2 = as_float(sx | as_uint(piby2));
+
+ float v = as_float(aux);
+
+ // Return for NaN
+ float ret = x;
+
+ // 2^26 <= |x| <= Inf => atan(x) is close to piby2
+ ret = aux <= PINFBITPATT_SP32 ? spiby2 : ret;
+ // Reduce arguments 2^-19 <= |x| < 2^26
+
+ // 39/16 <= x < 2^26
+ x = -MATH_RECIP(v);
+ float c = 1.57079632679489655800f; // atan(infinity)
+
+ // 19/16 <= x < 39/16
+ int l = aux < 0x401c0000;
+ float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+ x = l ? xx : x;
+ c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+ // 11/16 <= x < 19/16
+ l = aux < 0x3f980000U;
+ xx = MATH_DIVIDE(v - 1.0f, 1.0f + v);
+ x = l ? xx : x;
+ c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+ // 7/16 <= x < 11/16
+ l = aux < 0x3f300000;
+ xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+ x = l ? xx : x;
+ c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+ // 2^-19 <= x < 7/16
+ l = aux < 0x3ee00000;
+ x = l ? v : x;
+ c = l ? 0.0f : c;
+
+ // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+ float s = x * x;
+ float a = mad(s,
+ mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+ 0.296528598819239217902158651186f);
+
+ float b = mad(s,
+ mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+ 0.889585796862432286486651434570f);
+
+ float q = x * s * MATH_DIVIDE(a, b);
+
+ float z = c - (q - x);
+ float zs = as_float(sx | as_uint(z));
+
+ ret = aux < 0x4c800000 ? zs : ret;
+
+ // |x| < 2^-19
+ ret = aux < 0x36000000 ? as_float(ux) : ret;
+ return ret;
}
OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
return __gen_ocl_internal_atan(x) / M_PI_F;
}
-// XXX work-around PTX profile
-OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
-OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
- /* copied from fdlibm */
- float z;
- int k,m,hx,hy,ix,iy;
- const float
- tiny = 1.0e-30,
- zero = 0.0,
- pi_o_4 = 7.8539818525e-01, /* 0x3f490fdb */
- pi_o_2 = 1.5707963705e+00, /* 0x3fc90fdb */
- pi = 3.1415927410e+00, /* 0x40490fdb */
- pi_lo = -8.7422776573e-08; /* 0xb3bbbd2e */
+ const float pi = 0x1.921fb6p+1f;
+ const float piby2 = 0x1.921fb6p+0f;
+ const float piby4 = 0x1.921fb6p-1f;
+ const float threepiby4 = 0x1.2d97c8p+1f;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- ix = hx&0x7fffffff;
- GEN_OCL_GET_FLOAT_WORD(hy,y);
- iy = hy&0x7fffffff;
+ float ax = fabs(x);
+ float ay = fabs(y);
+ float v = min(ax, ay);
+ float u = max(ax, ay);
- if((ix>0x7f800000)||
- (iy>0x7f800000)) /* x or y is NaN */
- return x+y;
- if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y); /* x=1.0 */
- m = ((hy>>31)&1)|((hx>>30)&2); /* 2*sign(x)+sign(y) */
+ // Scale since u could be large, as in "regular" divide
+ float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+ float vbyu = s * MATH_DIVIDE(v, s*u);
- /* when y = 0 */
- if(iy==0) {
- switch(m) {
- case 0:
- case 1: return y; /* atan(+-0,+anything)=+-0 */
- case 2: return pi+tiny;/* atan(+0,-anything) = pi */
- case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
- }
- }
- /* when x = 0 */
- if(ix==0) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+ float vbyu2 = vbyu * vbyu;
- /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
- if(ix <= 0x7fffff && iy <= 0x7fffff) {
- x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
- y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
- }
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+ float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+ float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+ float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu;
+ float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
- /* when x is INF */
- if(ix==0x7f800000) {
- if(iy==0x7f800000) {
- switch(m) {
- case 0: return pi_o_4+tiny;/* atan(+INF,+INF) */
- case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
- case 2: return (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
- case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
- }
- } else {
- switch(m) {
- case 0: return zero ; /* atan(+...,+INF) */
- case 1: return -zero ; /* atan(-...,+INF) */
- case 2: return pi+tiny ; /* atan(+...,-INF) */
- case 3: return -pi-tiny ; /* atan(-...,-INF) */
- }
- }
- }
- /* when y is INF */
- if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
- /* compute y/x */
- k = (iy-ix)>>23;
- if(k > 60) z=pi_o_2+(float)0.5*pi_lo; /* |y/x| > 2**60 */
- else if(hx<0&&k<-60) z=0.0; /* |y|/x < -2**60 */
- else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
- switch (m) {
- case 0: return z ; /* atan(+,+) */
- case 1: {
- uint zh;
- GEN_OCL_GET_FLOAT_WORD(zh,z);
- GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
- }
- return z ; /* atan(-,+) */
- case 2: return pi-(z-pi_lo);/* atan(+,-) */
- default: /* case 3 */
- return (z-pi_lo)-pi;/* atan(-,-) */
- }
-}
+ // Octant 0 result
+ float a = mad(p, MATH_RECIP(q), vbyu);
-OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
- return __gen_ocl_internal_atan2(y, x) / M_PI_F;
-}
-OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
-OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-OVERLOADABLE float __gen_ocl_internal_round(float x) {
- float y = __gen_ocl_rndz(x);
- if (__gen_ocl_fabs(x - y) >= 0.5f)
- y += __gen_ocl_internal_copysign(1.f, x);
- return y;
-}
-OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
-OVERLOADABLE float __gen_ocl_internal_rint(float x) {
- return __gen_ocl_rnde(x);
-}
+ // Fix up 3 other octants
+ float at = piby2 - a;
+ a = ay > ax ? at : a;
+ at = pi - a;
+ a = x < 0.0F ? at : a;
-OVERLOADABLE float __gen_ocl_internal_exp(float x) {
- float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
- u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
- twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
- ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
- one = 1.0,
- huge = 1.0e+30,
- P1 = 1.6666667163e-01, /* 0x3e2aaaab */
- P2 = -2.7777778450e-03; /* 0xbb360b61 */
- float y,hi=0.0,lo=0.0,c,t;
- int k=0,xsb;
- unsigned hx;
- float ln2HI_0 = 6.9313812256e-01; /* 0x3f317180 */
- float ln2HI_1 = -6.9313812256e-01; /* 0xbf317180 */
- float ln2LO_0 = 9.0580006145e-06; /* 0x3717f7d1 */
- float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
- float half_0 = 0.5;
- float half_1 = -0.5;
+ // y == 0 => 0 for x >= 0, pi for x < 0
+ at = as_int(x) < 0 ? pi : 0.0f;
+ a = y == 0.0f ? at : a;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- xsb = (hx>>31)&1; /* sign bit of x */
- hx &= 0x7fffffff; /* high word of |x| */
+ // if (!FINITE_ONLY()) {
+ // x and y are +- Inf
+ at = x > 0.0f ? piby4 : threepiby4;
+ a = ax == INFINITY & ay == INFINITY ? at : a;
- /* filter out non-finite argument */
- if(hx >= 0x42b17218) { /* if |x|>=88.721... */
- if(hx>0x7f800000)
- return x+x; /* NaN */
- if(hx==0x7f800000)
- return (xsb==0)? x:0.0; /* exp(+-inf)={inf,0} */
- if(x > o_threshold) return huge*huge; /* overflow */
- if(x < u_threshold) return twom100*twom100; /* underflow */
- }
- /* argument reduction */
- if(hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */
- if(hx < 0x3F851592) { /* and |x| < 1.5 ln2 */
- hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
- } else {
- float tmp = xsb == 1 ? half_1 : half_0;
- k = ivln2*x+tmp;
- t = k;
- hi = x - t*ln2HI_0; /* t*ln2HI is exact here */
- lo = t*ln2LO_0;
- }
- x = hi - lo;
- }
- else if(hx < 0x31800000) { /* when |x|<2**-28 */
- if(huge+x>one) return one+x;/* trigger inexact */
- }
- else k = 0;
+ // x or y is NaN
+ a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+ // }
- /* x is now in primary range */
- t = x*x;
- c = x - t*(P1+t*P2);
- if(k==0)
- return one-((x*c)/(c-(float)2.0)-x);
- else
- y = one-((lo-(x*c)/((float)2.0-c))-hi);
- if(k >= -125) {
- unsigned hy;
- GEN_OCL_GET_FLOAT_WORD(hy,y);
- GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */
- return y;
- } else {
- unsigned hy;
- GEN_OCL_GET_FLOAT_WORD(hy,y);
- GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
- return y*twom100;
- }
+ // Fixup sign and return
+ return copysign(a, y);
+}
+
+OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+ return __gen_ocl_internal_atan2(y, x) / M_PI_F;
}
/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
@@ -2008,159 +2198,75 @@ OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
return x; /* exact output */
}
-OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
- //return __gen_ocl_pow(M_E_F, x) - 1;
- float Q1 = -3.3333335072e-02, /* 0xbd088889 */
- ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
- ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
- Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
- huge = 1.0e30,
- tiny = 1.0e-30,
- ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
- one = 1.0,
- o_threshold= 8.8721679688e+01; /* 0x42b17180 */
- float y,hi,lo,c,t,e,hxs,hfx,r1;
- int k,xsb;
- int hx;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- xsb = hx&0x80000000;
- /* sign bit of x */
- //if(xsb==0)
- //y=x;
- //else
- //y= -x; /* y = |x| */
- y = __gen_ocl_internal_fabs(x);
- hx &= 0x7fffffff; /* high word of |x| */
- /* filter out huge and non-finite argument */
- if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
- if(hx >= 0x42b17218) { /* if |x|>=88.721... */
- if(hx>0x7f800000)
- return x+x; /* NaN */
- if(hx==0x7f800000)
- return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
- if(x > o_threshold)
- return huge*huge; /* overflow */
- }
- if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
- if(x+tiny<(float)0.0) /* raise inexact */
- return tiny-one; /* return -1 */
- }
- }
- /* argument reduction */
- if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
- if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
- if(xsb==0){
- hi = x - ln2_hi; lo = ln2_lo; k = 1;
- } else {
- hi = x + ln2_hi; lo = -ln2_lo; k = -1;
- }
- } else {
- k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
- t = k;
- hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
- lo = t*ln2_lo;
- }
- x = hi - lo;
- c = (hi-x)-lo;
- } else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
- //t = huge+x; /* return x with inexact flags when x!=0 */
- //return x - (t-(huge+x));
- return x;
- } else k = 0;
- /* x is now in primary range */
- hfx = (float)0.5*x;
- hxs = x*hfx;
- r1 = one+hxs*(Q1+hxs*Q2);
- t = (float)3.0-r1*hfx;
- e = hxs*((r1-t)/((float)6.0 - x*t));
- if(k==0)
- return x - (x*e-hxs); /* c is 0 */
- else{
- e = (x*(e-c)-c);
- e -= hxs;
- if(k== -1)return (float)0.5*(x-e)-(float)0.5;
- if(k==1){
- if(x < (float)-0.25)
- return -(float)2.0*(e-(x+(float)0.5));
- else
- return (one+(float)2.0*(x-e));
- }
- if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
- int i;
- y = one-(e-x);
- GEN_OCL_GET_FLOAT_WORD(i,y);
- GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
- return y-one;
- }
- t = one;
- if(k<23) {
- int i;
- GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
- y = t-(e-x);
- GEN_OCL_GET_FLOAT_WORD(i,y);
- GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
- } else {
- int i;
- GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
- y = x-(e+t);
- y += one;
- GEN_OCL_GET_FLOAT_WORD(i,y);
- GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
- }
- }
- return y;
-}
-
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
- //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
- float one = 1.0,
- ln2 = 6.9314718246e-01;/* 0x3f317218 */
- float t;
- int hx;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- if(hx<0x3f800000) { /* x < 1 */
- return (x-x)/(x-x);
- } else if(hx >=0x4d800000) { /* x > 2**28 */
- if(hx >=0x7f800000) {/* x is inf of NaN */
- return x+x;
- } else
- return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
- } else if (hx==0x3f800000) {
- return 0.0; /* acosh(1) = 0 */
- } else if (hx > 0x40000000) { /* 2**28 > x > 2 */
- t=x*x;
- return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
- } else { /* 1<x<2 */
- t = x-one;
- return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
- }
+ uint ux = as_uint(x);
+
+ // Arguments greater than 1/sqrt(epsilon) in magnitude are
+ // approximated by acosh(x) = ln(2) + ln(x)
+ // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+ // acosh(x) = ln(x + sqrt(x*x-1)) */
+ int high = ux > 0x46000000U;
+ int med = ux > 0x40000000U;
+
+ float w = x - 1.0f;
+ float s = w*w + 2.0f*w;
+ float t = x*x - 1.0f;
+ float r = sqrt(med ? t : s) + (med ? x : w);
+ float v = (high ? x : r) - (med ? 1.0f : 0.0f);
+ float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f);
+
+ z = ux >= PINFBITPATT_SP32 ? x : z;
+ z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z;
+
+ return z;
}
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_asinh(float x){
- //return native_log(x + native_sqrt(x * x + 1));
- float one = 1.0000000000e+00, /* 0x3F800000 */
- ln2 = 6.9314718246e-01, /* 0x3f317218 */
- huge= 1.0000000000e+30;
- float w;
- int hx,ix;
- GEN_OCL_GET_FLOAT_WORD(hx,x);
- ix = hx&0x7fffffff;
- if(ix< 0x38000000) { /* |x|<2**-14 */
- if(huge+x>one) return x; /* return x inexact except 0 */
- }
- if(ix>0x47000000) {/* |x| > 2**14 */
- if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
- w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
- } else {
- float xa = __gen_ocl_internal_fabs(x);
- if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
- w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
- } else { /* 2.0 > |x| > 2**-14 */
- float t = xa*xa;
- w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
- }
- }
- return __gen_ocl_internal_copysign(w, x);
+ uint ux = as_uint(x);
+ uint ax = ux & EXSIGNBIT_SP32;
+ uint xsgn = ax ^ ux;
+
+ // |x| <= 2
+ float t = x * x;
+ float a = mad(t,
+ mad(t,
+ mad(t,
+ mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f),
+ -5.063201055468483248e-1f),
+ -1.480204186473758321f),
+ -1.152965835871758072f);
+ float b = mad(t,
+ mad(t,
+ mad(t,
+ mad(t, 6.284381367285534560e-2f, 1.260024978680227945f),
+ 6.582362487198468066f),
+ 11.99423176003939087f),
+ 6.917795026025976739f);
+
+ float q = MATH_DIVIDE(a, b);
+ float z1 = mad(x*t, q, x);
+
+ // |x| > 2
+
+ // Arguments greater than 1/sqrt(epsilon) in magnitude are
+ // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x
+ // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are
+ // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+ // with the sign of x (see Abramowitz and Stegun 4.6.20)
+
+ float absx = as_float(ax);
+ int hi = ax > 0x46000000U;
+ float y = MATH_SQRT(absx * absx + 1.0f) + absx;
+ y = hi ? absx : y;
+ float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f);
+ float z2 = as_float(xsgn | as_uint(r));
+
+ float z = ax <= 0x40000000 ? z1 : z2;
+ z = ax < 0x39800000U | ax >= PINFBITPATT_SP32 ? x : z;
+
+ return z;
}
OVERLOADABLE float __gen_ocl_internal_sinh(float x){
@@ -2195,38 +2301,51 @@ OVERLOADABLE float __gen_ocl_internal_sinh(float x){
return x*shuge;
}
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
- //float y = native_exp(-2 * x);
- //return (1 - y) / (1 + y);
- float one=1.0, two=2.0, tiny = 1.0e-30;
- float t,z;
- int jx,ix;
- GEN_OCL_GET_FLOAT_WORD(jx,x);
- ix = jx&0x7fffffff;
- /* x is INF or NaN */
- if(ix>=0x7f800000) {
- if (jx>=0)
- return one/x+one; /* tanh(+-inf)=+-1 */
- else
- return one/x-one; /* tanh(NaN) = NaN */
- }
+ // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+ // to the following three formulae:
+ // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
+ // 2. (1 - (2/(exp(2*x) + 1 )))
+ // 3. (exp(2*x) - 1)/(exp(2*x) + 1)
+ // but computationally, some formulae are better on some ranges.
- if (ix < 0x41b00000) { /* |x|<22 */
- if (ix == 0)
- return x; /* x == +-0 */
- if (ix<0x24000000) /* |x|<2**-55 */
- return x*(one+x); /* tanh(small) = small */
- if (ix>=0x3f800000) { /* |x|>=1 */
- t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
- z = one - two/(t+two);
- } else {
- t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
- z= -t/(t+two);
- }
- } else { /* |x| > 22, return +-1 */
- z = one - tiny; /* raised inexact flag */
- }
- return (jx>=0)? z: -z;
+ const float large_threshold = 0x1.0a2b24p+3f;
+
+ uint ux = as_uint(x);
+ uint aux = ux & EXSIGNBIT_SP32;
+ uint xs = ux ^ aux;
+
+ float y = as_float(aux);
+ float y2 = y*y;
+
+ float a1 = mad(y2,
+ mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+ -0.28192806108402678e0F);
+ float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);
+
+ float a2 = mad(y2,
+ mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+ -0.24069858695196524e0F);
+ float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);
+
+ int c = y < 0.9f;
+ float a = c ? a1 : a2;
+ float b = c ? b1 : b2;
+ float zlo = mad(MATH_DIVIDE(a, b), y*y2, y);
+
+ float p = exp(2.0f * y) + 1.0f;
+ float zhi = 1.0F - MATH_DIVIDE(2.0F, p);
+
+ float z = y <= 1.0f ? zlo : zhi;
+ z = as_float(xs | as_uint(z));
+
+ // Edge cases
+ float sone = as_float(0x3f800000U | xs);
+ z = y > large_threshold ? sone : z;
+ z = aux < 0x39000000 | aux > 0x7f800000 ? x : z;
+
+ return z;
}
OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
@@ -2303,77 +2422,38 @@ OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
return x;
}
-OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
- x = __gen_ocl_scalbnf(x,n);
- return x;
-}
-
+/* EXPERIMENTAL, copy from libclc, AMD copyright */
OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
- //return 0.5f * native_sqrt((1 + x) / (1 - x));
- float xa = __gen_ocl_fabs (x);
- float t;
- if (isless (xa, 0.5f)){
- if (xa < 0x1.0p-28f) return x;
- t = xa + xa;
- t = 0.5f * log1p (t + t * xa / (1.0f - xa));
- } else if (isless (xa, 1.0f)){
- t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
- } else{
- if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
- return x / 0.0f;
- }
- return __gen_ocl_internal_copysign(t, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_exp10(float x){
- float px, qx,ans;
- short n;
- int i;
- float*p;
- float MAXL10 = 38.230809449325611792;
- float LOG210 = 3.32192809488736234787e0;
- float LG102A = 3.00781250000000000000E-1;
- float LG102B = 2.48745663981195213739E-4;
- float P[6];
- P[0] = 2.063216740311022E-001;
- P[1] = 5.420251702225484E-001;
- P[2] = 1.171292686296281E+000;
- P[3] = 2.034649854009453E+000;
- P[4] = 2.650948748208892E+000;
- P[5] = 2.302585167056758E+000;
-
- if( x < -MAXL10 ) return 0.0;
-
- if( isinf(x)) return INFINITY;
- /* The following is necessary because range reduction blows up: */
- if( x == 0 )return 1.0;
-
- /* Express 10**x = 10**g 2**n
- * = 10**g 10**( n log10(2) )
- * = 10**( g + n log10(2) )
- */
- px = x * LOG210;
- qx = __gen_ocl_internal_floor( px + 0.5 );
- n = qx;
- x -= qx * LG102A;
- x -= qx * LG102B;
+ uint ux = as_uint(x);
+ uint ax = ux & EXSIGNBIT_SP32;
+ uint xs = ux ^ ax;
+
+ // |x| > 1 or NaN
+ float z = as_float(QNANBITPATT_SP32);
+
+ // |x| == 1
+ float t = as_float(xs | PINFBITPATT_SP32);
+ z = ax == 0x3f800000U ? t : z;
+
+ // 1/2 <= |x| < 1
+ t = as_float(ax);
+ t = MATH_DIVIDE(2.0f*t, 1.0f - t);
+ t = 0.5f * log1p(t);
+ t = as_float(xs | as_uint(t));
+ z = ax < 0x3f800000U ? t : z;
+
+ // |x| < 1/2
+ t = x * x;
+ float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f);
+ float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f);
+ float p = MATH_DIVIDE(a, b);
+ t = mad(x*t, p, x);
+ z = ax < 0x3f000000 ? t : z;
+
+ // |x| < 2^-13
+ z = ax < 0x39000000U ? x : z;
- /* rational approximation for exponential
- * of the fractional part:
- * 10**x - 1 = 2x P(x**2)/( Q(x**2) - P(x**2) )
- */
- p = P;
- ans = *p++;
- i = 5;
- do{
- ans = ans * x + *p++;
- }
- while( --i );
- px = 1.0 + x * ans;
-
- /* multiply by power of 2 */
- x = __gen_ocl_internal_ldexp( px, n );
- return x;
+ return z;
}
OVERLOADABLE float cospi(float x) {
@@ -3417,10 +3497,6 @@ OVERLOADABLE float log(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log(x);
- /* Use native/faster instruction when it has enough precision */
- if(x > 0x1.1p0)
- return __gen_ocl_internal_fastpath_log(x);
-
return __gen_ocl_internal_log(x);
}
@@ -3428,10 +3504,6 @@ OVERLOADABLE float log2(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log2(x);
- /* Use native/faster instruction when it has enough precision */
- if(x > 0x1.1p0)
- return __gen_ocl_internal_fastpath_log2(x);
-
return __gen_ocl_internal_log2(x);
}
@@ -3439,10 +3511,6 @@ OVERLOADABLE float log10(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log10(x);
- /* Use native/faster instruction when it has enough precision */
- if(x > 0x1.1p0)
- return __gen_ocl_internal_fastpath_log10(x);
-
return __gen_ocl_internal_log10(x);
}
@@ -3450,10 +3518,6 @@ OVERLOADABLE float exp(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_exp(x);
- /* Use native/faster instruction when it has enough precision */
- if (x > -0x1.6p1 && x < 0x1.6p1)
- return __gen_ocl_internal_fastpath_exp(x);
-
return __gen_ocl_internal_exp(x);
}
@@ -3466,10 +3530,6 @@ OVERLOADABLE float exp10(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_exp10(x);
- /* Use native/faster instruction when it has enough precision */
- if((x < -0x1.4p+5) || (x > +0x1.4p+5))
- return __gen_ocl_internal_fastpath_exp10(x);
-
return __gen_ocl_internal_exp10(x);
}
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 0de3642..4987b18 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -20,6 +20,77 @@
#include "ocl_types.h"
+/******************** START AMD ***********************/
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_HW_FMA32() (1)
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0f / (X))
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32 0x80000000
+#define EXSIGNBIT_SP32 0x7fffffff
+#define EXPBITS_SP32 0x7f800000
+#define MANTBITS_SP32 0x007fffff
+#define ONEEXPBITS_SP32 0x3f800000
+#define TWOEXPBITS_SP32 0x40000000
+#define HALFEXPBITS_SP32 0x3f000000
+#define IMPBIT_SP32 0x00800000
+#define QNANBITPATT_SP32 0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32 0x7f800000
+#define NINFBITPATT_SP32 0xff800000
+#define EXPBIAS_SP32 127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32 1
+#define EMIN_SP32 -126
+#define BIASEDEMAX_SP32 254
+#define EMAX_SP32 127
+#define LAMBDA_SP32 1.0e30
+#define MANTLENGTH_SP32 24
+#define BASEDIGITS_SP32 7
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64 0x8000000000000000L
+#define EXSIGNBIT_DP64 0x7fffffffffffffffL
+#define EXPBITS_DP64 0x7ff0000000000000L
+#define MANTBITS_DP64 0x000fffffffffffffL
+#define ONEEXPBITS_DP64 0x3ff0000000000000L
+#define TWOEXPBITS_DP64 0x4000000000000000L
+#define HALFEXPBITS_DP64 0x3fe0000000000000L
+#define IMPBIT_DP64 0x0010000000000000L
+#define QNANBITPATT_DP64 0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64 0x7ff0000000000000L
+#define NINFBITPATT_DP64 0xfff0000000000000L
+#define EXPBIAS_DP64 1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64 1
+#define EMIN_DP64 -1022
+#define BIASEDEMAX_DP64 2046 /* 0x7fe */
+#define EMAX_DP64 1023 /* 0x3ff */
+#define LAMBDA_DP64 1.0e300
+#define MANTLENGTH_DP64 53
+#define BASEDIGITS_DP64 15
+
+#endif // cl_khr_fp64
+
+/******************** END AMD ***********************/
+
OVERLOADABLE float cospi(float x);
OVERLOADABLE float cosh(float x);
OVERLOADABLE float acos(float x);
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
index a6b6b94..f2090bc 100644
--- a/kernels/bench_math.cl
+++ b/kernels/bench_math.cl
@@ -49,11 +49,11 @@ kernel void bench_math_exp(
for(; loop > 0; loop--)
{
#if defined(BENCHMARK_NATIVE)
- result = native_exp(-0x1.6p1 - result); /* calls native */
+ result = native_exp((float)-0x1.6p1 - result); /* calls native */
#elif defined(BENCHMARK_INTERNAL_FAST)
- result = exp(-0x1.6p1 + result); /* calls internal fast */
+ result = exp((float)-0x1.6p1 + result); /* calls internal fast */
#else
- result = exp(-0x1.6p1 - result); /* calls internal slow */
+ result = exp((float)-0x1.6p1 - result); /* calls internal slow */
#endif
}
@@ -73,11 +73,11 @@ kernel void bench_math_exp10(
for(; loop > 0; loop--)
{
#if defined(BENCHMARK_NATIVE)
- result = native_exp10(-0x1.4p+5 - result); /* calls native */
+ result = native_exp10((float)-0x1.4p+5 - result); /* calls native */
#elif defined(BENCHMARK_INTERNAL_FAST)
- result = exp10(-0x1.4p+5 - result); /* calls internal fast */
+ result = exp10((float)-0x1.4p+5 - result); /* calls internal fast */
#else
- result = exp10(-0x1.3p+5 - result); /* calls internal slow */
+ result = exp10((float)-0x1.3p+5 - result); /* calls internal slow */
#endif
}
@@ -97,11 +97,11 @@ kernel void bench_math_log2(
for(; loop > 0; loop--)
{
#if defined(BENCHMARK_NATIVE)
- result = native_log2(0x1.1p0 + result); /* calls native */
+ result = native_log2((float)0x1.1p0 + result); /* calls native */
#elif defined(BENCHMARK_INTERNAL_FAST)
- result = log2(0x1.1p0 + result); /* calls internal fast */
+ result = log2((float)0x1.1p0 + result); /* calls internal fast */
#else
- result = log2(0x1.1p0 - result); /* calls internal slow */
+ result = log2((float)0x1.1p0 - result); /* calls internal slow */
#endif
}
@@ -121,11 +121,11 @@ kernel void bench_math_log(
for(; loop > 0; loop--)
{
#if defined(BENCHMARK_NATIVE)
- result = native_log(0x1.1p0 + result); /* calls native */
+ result = native_log((float)0x1.1p0 + result); /* calls native */
#elif defined(BENCHMARK_INTERNAL_FAST)
- result = log(0x1.1p0 + result); /* calls internal fast */
+ result = log((float)0x1.1p0 + result); /* calls internal fast */
#else
- result = log(0x1.1p0 - result); /* calls internal slow */
+ result = log((float)0x1.1p0 - result); /* calls internal slow */
#endif
}
@@ -145,11 +145,11 @@ kernel void bench_math_log10(
for(; loop > 0; loop--)
{
#if defined(BENCHMARK_NATIVE)
- result = native_log10(0x1.1p0 + result); /* calls native */
+ result = native_log10((float)0x1.1p0 + result); /* calls native */
#elif defined(BENCHMARK_INTERNAL_FAST)
- result = log10(0x1.1p0 + result); /* calls internal fast */
+ result = log10((float)0x1.1p0 + result); /* calls internal fast */
#else
- result = log10(0x1.1p0 - result); /* calls internal slow */
+ result = log10((float)0x1.1p0 - result); /* calls internal slow */
#endif
}
--
2.5.0
More information about the Beignet
mailing list