[Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad
grigore.lupescu at intel.com
grigore.lupescu at intel.com
Tue Jul 26 13:24:53 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Affected functions:
__gen_ocl_internal_log
__gen_ocl_internal_log10
__gen_ocl_internal_log2
__gen_ocl_internal_log_valid
__kernel_sinf
__kernel_cosf
__gen_ocl_internal_cbrt
__gen_ocl_internal_asinh
__gen_ocl_internal_atan
__gen_ocl_asin_util
tan
log1p
lgamma_r
lgamma
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 417 ++++++++++++-------------------
1 file changed, 164 insertions(+), 253 deletions(-)
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index c8969a1..0d2a57d 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -164,7 +164,7 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
return ux.f;
}
-OVERLOADABLE float __gen_ocl_internal_log(float x) {
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
* ====================================================
@@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
*/
union { unsigned int i; float f; } u;
const float
- ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
- ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
- two25 = 3.355443200e+07, /* 0x4c000000 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
Lg3 = 2.8571429849e-01, /* 3E924925 */
Lg4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
- float hfsq,f,s,z,R,w,t1,t2,dk;
- int k,ix,i,j;
+ float fsq, f, s, z, R, w, t1, t2, partial;
+ int k, ix, i, j;
u.f = x; ix = u.i;
- k=0;
- if (ix < 0x00800000) { /* x < 2**-126 */
- if ((ix&0x7fffffff)==0)
- return -two25/zero; /* log(+-0)=-inf */
- if (ix<0) return (x-x)/zero; /* log(-#) = NaN */
- return -INFINITY; /* Gen does not support subnormal number now */
- //k -= 25; x *= two25; /* subnormal number, scale up x */
- //u.f = x; ix = u.i;
- }
- if (ix >= 0x7f800000) return x+x;
- k += (ix>>23)-127;
+ k = 0;
+
+ k += (ix>>23) - 127;
ix &= 0x007fffff;
- i = (ix+(0x95f64<<3))&0x800000;
- u.i = ix|(i^0x3f800000); x = u.f;
+ i = (ix + (0x95f64<<3)) & 0x800000;
+ u.i = ix | (i^0x3f800000); x = u.f;
k += (i>>23);
- f = x-(float)1.0;
- if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
- if(f==zero) {
- if(k==0) return zero;
- else {
- dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
- }
- }
- R = f*f*((float)0.5-(float)0.33333333333333333*f);
- if(k==0)
- return f-R;
- else {
- dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f);
- }
+ f = x - 1.0f;
+ fsq = f * f;
+
+ if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+ R = fsq * (0.5f - 0.33333333333333333f * f);
+ return k * ln2_hi + k * ln2_lo + f - R;
}
- s = f/((float)2.0+f);
- dk = (float)k;
- z = s*s;
- i = ix-(0x6147a<<3);
- w = z*z;
- j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*Lg4);
- t2= z*(Lg1+w*Lg3);
+
+ s = f / (2.0f + f);
+ z = s * s;
+ i = ix - (0x6147a << 3);
+ w = z * z;
+ j = (0x6b851 << 3) - ix;
+ t1= w * mad(w, Lg4, Lg2);
+ t2= z * mad(w, Lg3, Lg1);
i |= j;
- R = t2+t1;
- if(i>0) {
- hfsq=(float)0.5*f*f;
- if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
- } else {
- if(k==0) return f-s*(f-R); else
- return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
- }
+ R = t2 + t1;
+ partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+ return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
}
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+ union { unsigned int i; float f; } u;
+ u.f = x;
+ int ix = u.i;
-OVERLOADABLE float __gen_ocl_internal_log10(float x) {
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+ if (ix < 0 )
+ return NAN; /* log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
- union {float f; unsigned i; }u;
+ return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+ union { float f; unsigned i; } u;
const float
- zero = 0.0,
- two25 = 3.3554432000e+07, /* 0x4c000000 */
ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
log10_2lo = 7.9034151668e-07; /* 0x355427db */
- float y,z;
- int i,k,hx;
+ float y, z;
+ int i, k, hx;
u.f = x; hx = u.i;
- k=0;
- if (hx < 0x00800000) { /* x < 2**-126 */
- if ((hx&0x7fffffff)==0)
- return -two25/zero; /* log(+-0)=-inf */
- if (hx<0) return NAN; /* log(-#) = NaN */
- return -INFINITY; /* Gen does not support subnormal now */
- //k -= 25; x *= two25; /* subnormal number, scale up x */
- //u.f = x; hx = u.i;
- }
- if (hx >= 0x7f800000) return x+x;
- k += (hx>>23)-127;
- i = ((unsigned)k&0x80000000)>>31;
- hx = (hx&0x007fffff)|((0x7f-i)<<23);
- y = (float)(k+i);
+
+ if (hx<0)
+ return NAN; /* log(-#) = NaN */
+ if (hx >= 0x7f800000)
+ return NAN;
+
+ k = (hx >> 23) - 127;
+ i = ((unsigned)k & 0x80000000) >> 31;
+ hx = (hx&0x007fffff) | ((0x7f-i) << 23);
+ y = (float)(k + i);
u.i = hx; x = u.f;
- z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
- return z+y*log10_2hi;
+
+ return y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
}
-OVERLOADABLE float __gen_ocl_internal_log2(float x) {
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
const float zero = 0.0,
- ln2 = 0.69314718055994530942,
- two25 = 3.355443200e+07, /** 0x4c000000 */
- Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
- Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
- Lg3 = 2.8571429849e-01, /** 3E924925 */
- Lg4 = 2.2222198546e-01; /** 3E638E29 */
-
- float hfsq,f,s,z,R,w,t1,t2,dk;
- int k,ix,i,j;
+ invln2 = 0x1.715476p+0f;
+ int ix;
- union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
+ union { float f; int i; } u;
u.f = x; ix = u.i;
- k=0;
- if (ix < 0x00800000) { /** x < 2**-126 */
- if ((ix&0x7fffffff)==0)
- return -two25/(x-x); /** log(+-0)=-inf */
-
- if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */
- return -INFINITY;
- k -= 25; x *= two25; /** subnormal number, scale up x */
- u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
- }
-
- if (ix >= 0x7f800000) return x+x;
-
- k += (ix>>23)-127;
- ix &= 0x007fffff;
- i = (ix+(0x95f64<<3))&0x800000;
-
- u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000)); /** normalize x or x/2 */
- k += (i>>23);
- dk = (float)k;
- f = x-(float)1.0;
+ if (ix < 0)
+ return NAN; /** log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
- if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */
- if(f==zero) return dk;
-
- R = f*f*((float)0.5-(float)0.33333333333333333*f);
- return dk-(R-f)/ln2;
- }
-
- s = f/((float)2.0+f);
- z = s*s;
- i = ix-(0x6147a<<3);
- w = z*z;
- j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*Lg4);
- t2= z*(Lg1+w*Lg3);
- i |= j;
- R = t2+t1;
-
- if(i>0) {
- hfsq=(float)0.5*f*f;
- return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
- } else {
- return dk-((s*(f-R))-f)/ln2;
- }
+ return invln2 * __gen_ocl_internal_log_valid(x);
}
@@ -545,9 +463,9 @@ OVERLOADABLE float __kernel_sinf(float x)
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4));
+ r = mad(z, mad(z, mad(z, S4, S3), S2), S1);
- return x+v*(S1+z*r);
+ return mad(v, r, x);
}
float __kernel_cosf(float x, float y)
@@ -563,7 +481,7 @@ float __kernel_cosf(float x, float y)
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3)));
+ r = z * mad(z, mad(z, C3, C2), C1);
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
@@ -671,24 +589,22 @@ float __kernel_tanf(float x, float y, int iy)
}
if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
if(hx<0) {x = -x; y = -y;}
-
-
z = pio4-x;
w = pio4lo-y;
x = z+w; y = 0.0;
}
z = x*x;
w = z*z;
- /* Break x^5*(T[1]+x^2*T[2]+...) into
- * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
- * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
- */
+ /* Break x^5*(T[1]+x^2*T[2]+...) into
+ * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+ * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+ */
- r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
- v = z*(T[2]+w*(T[4]+w*T[6]));
+ r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
+ v = z* mad(w, mad(w, T[6], T[4]), T[2]);
s = z*x;
- r = y + z*(s*(r+v)+y);
+ r = mad(z, mad(s, r + v, y), y);
r += T[0]*s;
w = x+r;
if(ix>=0x3f2ca140) {
@@ -696,21 +612,8 @@ float __kernel_tanf(float x, float y, int iy)
return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
}
if(iy==1) return w;
- else { /* if allow error up to 2 ulp
- simply return -1.0/(x+r) here */
- /* compute -1.0/(x+r) accurately */
- float a,t;
- int i;
- z = w;
- GEN_OCL_GET_FLOAT_WORD(i,z);
- GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
- v = r-(z - x); /* z+v = r+x */
- t = a = -(float)1.0/w; /* a = -1.0/w */
- GEN_OCL_GET_FLOAT_WORD(i,t);
- GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
- s = (float)1.0+t*z;
- return t+a*(s+t*v);
- }
+ else
+ return -1.0/(x+r);
}
OVERLOADABLE float tan(float x)
@@ -931,44 +834,46 @@ OVERLOADABLE float lgamma(float x) {
switch (i) {
case 0:
z = y * y;
- p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
- p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
- p = y * p1 + p2;
+ p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
+ p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
+ p = mad(y, p1, p2);
r += (p - (float) 0.5 * y);
break;
case 1:
z = y * y;
w = z * y;
- p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
- p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
- p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
- p = z * p1 - (tt - w * (p2 + y * p3));
+ p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
+ p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
+ p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
+ p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
r += (tf + p);
break;
case 2:
- p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
- p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
+ p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
r += (-(float) 0.5 * y + p1 / p2);
}
} else if (ix < 0x41000000) {
i = (int) x;
t = zero;
y = x - (float) i;
- p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
- q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+
+ p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
+ q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
r = .5f * y + p / q;
z = one;
+
switch (i) {
case 7:
- z *= (y + (float) 6.0);
+ z *= (y + 6.0f);
case 6:
- z *= (y + (float) 5.0);
+ z *= (y + 5.0f);
case 5:
- z *= (y + (float) 4.0);
+ z *= (y + 4.0f);
case 4:
- z *= (y + (float) 3.0);
+ z *= (y + 3.0f);
case 3:
- z *= (y + (float) 2.0);
+ z *= (y + 2.0f);
r += native_log(z);
break;
}
@@ -977,7 +882,7 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x);
z = one / x;
y = z * z;
- w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
r = (x - .5f) * (t - one) + w;
} else
r = x * (native_log(x) - one);
@@ -1123,32 +1028,32 @@ OVERLOADABLE float lgamma(float x) {
switch (i) { \
case 0: \
z = y * y; \
- p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); \
- p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); \
- p = y * p1 + p2; \
- r += (p - (float) 0.5 * y); \
+ p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0); \
+ p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1); \
+ p = mad(y, p1, p2); \
+ r = r - mad(y, 0.5f, -p); \
break; \
case 1: \
z = y * y; \
w = z * y; \
- p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \
- p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \
- p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \
- p = z * p1 - (tt - w * (p2 + y * p3)); \
+ p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0); \
+ p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1); \
+ p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2); \
+ p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
r += (tf + p); \
break; \
case 2: \
- p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); \
- p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); \
- r += (-(float) 0.5 * y + p1 / p2); \
+ p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0); \
+ p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one); \
+ r = r + mad(y, -0.5f, p1 / p2); \
} \
} else if (ix < 0x41000000) { \
i = (int) x; \
t = zero; \
y = x - (float) i; \
- p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); \
- q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); \
- r = .5f * y + p / q; \
+ p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0); \
+ q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one); \
+ r = mad(y, 0.5f, p / q); \
z = one; \
switch (i) { \
case 7: \
@@ -1169,10 +1074,10 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x); \
z = one / x; \
y = z * z; \
- w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); \
+ w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0); \
r = (x - .5f) * (t - one) + w; \
} else \
- r = x * (native_log(x) - one); \
+ r = x * (native_log(x) - one); \
if (hx < 0) \
r = nadj - r; \
return r;
@@ -1253,20 +1158,26 @@ OVERLOADABLE float log1p(float x) {
f = u-(float)1.0;
}
hfsq=(float)0.5*f*f;
- if(hu==0) { /* |f| < 2**-20 */
- if(f==zero) { if(k==0) return zero;
- else {c += k*ln2_lo; return k*ln2_hi+c;} }
- R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+ if(hu==0)
+ { /* |f| < 2**-20 */
+ if(f==zero)
+ {
+ if(k==0) return zero;
+ else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+ }
+ R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
if(k==0) return f-R; else
- return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+ return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
}
s = f/((float)2.0+f);
z = s*s;
- R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
- if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
-
+ R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+ if(k==0)
+ return f + mad(hfsq + R, s, -hfsq);
+ else
+ return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
}
+
OVERLOADABLE float logb(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_logb(x);
@@ -1378,14 +1289,14 @@ OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
/* new cbrt to 23 bits */
r=t*t/x;
- s=C+r*t;
+ s=mad(r, t, C);
t*=G+F/(s+E+D/s);
/* one step newton iteration to 53 bits with error less than 0.667 ulps */
s=t*t; /* t*t is exact */
r=x/s;
w=t+t;
r=(r-t)/(w+r); /* r-s is exact */
- t=t+t*r;
+ t=mad(t, r, t);
/* retore the sign bit */
GEN_OCL_GET_FLOAT_WORD(high,t);
@@ -1437,10 +1348,10 @@ INLINE float __gen_ocl_asin_util(float x) {
qS4 = 7.70381505559019352791e-02;
float t = x*x;
- float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
- float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
+ float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
+ float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
float w = p / q;
- return x + x*w;
+ return mad(x, w, x);
}
OVERLOADABLE float __gen_ocl_internal_asin(float x) {
@@ -1538,8 +1449,8 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
- s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
- s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
+ s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
+ s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
if (id<0) return x - x*(s1+s2);
else {
z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
@@ -1829,15 +1740,15 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
return x + efx*x;
}
z = x*x;
- r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
- s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
- y = r/s;
- return x + x*y;
+ r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+ s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
+ y = r / s;
+ return mad(x, y, x);
}
if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
s = __gen_ocl_internal_fabs(x)-one;
- P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
- Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
+ P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
if(hx>=0) return erx + P/Q; else return -erx - P/Q;
}
if (ix >= 0x40c00000) { /* inf>|x|>=6 */
@@ -1846,15 +1757,15 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
x = __gen_ocl_internal_fabs(x);
s = one/(x*x);
if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */
- R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
- ra5+s*(ra6+s*ra7))))));
- S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
- sa5+s*(sa6+s*(sa7+s*sa8)))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
} else { /* |x| >= 1/0.35 */
- R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
- rb5+s*rb6)))));
- S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
- sb5+s*(sb6+s*sb7))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
}
GEN_OCL_GET_FLOAT_WORD(ix,x);
GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
@@ -1949,8 +1860,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
if(ix < 0x23800000) /* |x|<2**-56 */
return one-x;
z = x*x;
- r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
- s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
+ r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+ s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
y = r/s;
if(hx < 0x3e800000) { /* x<1/4 */
return one-(x+x*y);
@@ -1962,8 +1873,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
}
if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
s = __gen_ocl_internal_fabs(x)-one;
- P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
- Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
+ P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
if(hx>=0) {
z = one-erx; return z - P/Q;
} else {
@@ -1974,16 +1885,16 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
x = __gen_ocl_internal_fabs(x);
s = one/(x*x);
if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/
- R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
- ra5+s*(ra6+s*ra7))))));
- S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
- sa5+s*(sa6+s*(sa7+s*sa8)))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
} else { /* |x| >= 1/.35 ~ 2.857143 */
if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
- R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
- rb5+s*rb6)))));
- S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
- sb5+s*(sb6+s*sb7))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
}
GEN_OCL_GET_FLOAT_WORD(ix,x);
GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
@@ -2224,7 +2135,7 @@ OVERLOADABLE float __gen_ocl_internal_asinh(float x){
} else {
float xa = __gen_ocl_internal_fabs(x);
if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
- w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+ w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
} else { /* 2.0 > |x| > 2**-14 */
float t = xa*xa;
w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
--
2.5.0
More information about the Beignet
mailing list