[Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad
Yang, Rong R
rong.r.yang at intel.com
Wed Aug 3 08:52:09 UTC 2016
The patchset pushed, thanks.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> grigore.lupescu at intel.com
> Sent: Tuesday, July 26, 2016 21:25
> To: beignet at lists.freedesktop.org
> Subject: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use
> mad
>
> From: Grigore Lupescu <grigore.lupescu at intel.com>
>
> Affected functions:
> __gen_ocl_internal_log
> __gen_ocl_internal_log10
> __gen_ocl_internal_log2
> __gen_ocl_internal_log_valid
> __kernel_sinf
> __kernel_cosf
> __gen_ocl_internal_cbrt
> __gen_ocl_internal_asinh
> __gen_ocl_internal_atan
> __gen_ocl_asin_util
> tan
> log1p
> lgamma_r
> lgamma
>
> Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
> ---
> backend/src/libocl/tmpl/ocl_math.tmpl.cl | 417 ++++++++++++----------------
> ---
> 1 file changed, 164 insertions(+), 253 deletions(-)
>
> diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> index c8969a1..0d2a57d 100644
> --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> @@ -164,7 +164,7 @@ OVERLOADABLE float
> __gen_ocl_internal_copysign(float x, float y) {
> return ux.f;
> }
>
> -OVERLOADABLE float __gen_ocl_internal_log(float x) {
> +OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
> /*
> * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
> * ====================================================
> @@ -178,187 +178,105 @@ OVERLOADABLE float
> __gen_ocl_internal_log(float x) {
> */
> union { unsigned int i; float f; } u;
> const float
> - ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
> - ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
> - two25 = 3.355443200e+07, /* 0x4c000000 */
> + ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ ln2_lo =
> + 9.0580006145e-06, /* 0x3717f7d1 */
> + two25 = 3.355443200e+07, /* 0x4c000000 */
> Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
> Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
> Lg3 = 2.8571429849e-01, /* 3E924925 */
> Lg4 = 2.2222198546e-01; /* 3E638E29 */
>
> const float zero = 0.0;
> - float hfsq,f,s,z,R,w,t1,t2,dk;
> - int k,ix,i,j;
> + float fsq, f, s, z, R, w, t1, t2, partial; int k, ix, i, j;
>
> u.f = x; ix = u.i;
> - k=0;
> - if (ix < 0x00800000) { /* x < 2**-126 */
> - if ((ix&0x7fffffff)==0)
> - return -two25/zero; /* log(+-0)=-inf */
> - if (ix<0) return (x-x)/zero; /* log(-#) = NaN */
> - return -INFINITY; /* Gen does not support subnormal number now */
> - //k -= 25; x *= two25; /* subnormal number, scale up x */
> - //u.f = x; ix = u.i;
> - }
> - if (ix >= 0x7f800000) return x+x;
> - k += (ix>>23)-127;
> + k = 0;
> +
> + k += (ix>>23) - 127;
> ix &= 0x007fffff;
> - i = (ix+(0x95f64<<3))&0x800000;
> - u.i = ix|(i^0x3f800000); x = u.f;
> + i = (ix + (0x95f64<<3)) & 0x800000;
> + u.i = ix | (i^0x3f800000); x = u.f;
> k += (i>>23);
> - f = x-(float)1.0;
> - if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
> - if(f==zero) {
> - if(k==0) return zero;
> - else {
> - dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
> - }
> - }
> - R = f*f*((float)0.5-(float)0.33333333333333333*f);
> - if(k==0)
> - return f-R;
> - else {
> - dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f);
> - }
> + f = x - 1.0f;
> + fsq = f * f;
> +
> + if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
> + R = fsq * (0.5f - 0.33333333333333333f * f);
> + return k * ln2_hi + k * ln2_lo + f - R;
> }
> - s = f/((float)2.0+f);
> - dk = (float)k;
> - z = s*s;
> - i = ix-(0x6147a<<3);
> - w = z*z;
> - j = (0x6b851<<3)-ix;
> - t1= w*(Lg2+w*Lg4);
> - t2= z*(Lg1+w*Lg3);
> +
> + s = f / (2.0f + f);
> + z = s * s;
> + i = ix - (0x6147a << 3);
> + w = z * z;
> + j = (0x6b851 << 3) - ix;
> + t1= w * mad(w, Lg4, Lg2);
> + t2= z * mad(w, Lg3, Lg1);
> i |= j;
> - R = t2+t1;
> - if(i>0) {
> - hfsq=(float)0.5*f*f;
> - if(k==0) return f-(hfsq-s*(hfsq+R)); else
> - return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
> - } else {
> - if(k==0) return f-s*(f-R); else
> - return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
> - }
> + R = t2 + t1;
> + partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
> +
> + return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
> }
>
> +OVERLOADABLE float __gen_ocl_internal_log(float x) {
> + union { unsigned int i; float f; } u;
> + u.f = x;
> + int ix = u.i;
>
> -OVERLOADABLE float __gen_ocl_internal_log10(float x) {
> -/*
> - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> + if (ix < 0 )
> + return NAN; /* log(-#) = NaN */
> + if (ix >= 0x7f800000)
> + return NAN;
>
> - union {float f; unsigned i; }u;
> + return __gen_ocl_internal_log_valid(x); }
> +
> +OVERLOADABLE float __gen_ocl_internal_log10(float x) {
> + union { float f; unsigned i; } u;
> const float
> - zero = 0.0,
> - two25 = 3.3554432000e+07, /* 0x4c000000 */
> ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
> log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
> log10_2lo = 7.9034151668e-07; /* 0x355427db */
>
> - float y,z;
> - int i,k,hx;
> + float y, z;
> + int i, k, hx;
>
> u.f = x; hx = u.i;
> - k=0;
> - if (hx < 0x00800000) { /* x < 2**-126 */
> - if ((hx&0x7fffffff)==0)
> - return -two25/zero; /* log(+-0)=-inf */
> - if (hx<0) return NAN; /* log(-#) = NaN */
> - return -INFINITY; /* Gen does not support subnormal now */
> - //k -= 25; x *= two25; /* subnormal number, scale up x */
> - //u.f = x; hx = u.i;
> - }
> - if (hx >= 0x7f800000) return x+x;
> - k += (hx>>23)-127;
> - i = ((unsigned)k&0x80000000)>>31;
> - hx = (hx&0x007fffff)|((0x7f-i)<<23);
> - y = (float)(k+i);
> +
> + if (hx<0)
> + return NAN; /* log(-#) = NaN */
> + if (hx >= 0x7f800000)
> + return NAN;
> +
> + k = (hx >> 23) - 127;
> + i = ((unsigned)k & 0x80000000) >> 31; hx = (hx&0x007fffff) |
> + ((0x7f-i) << 23); y = (float)(k + i);
> u.i = hx; x = u.f;
> - z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
> - return z+y*log10_2hi;
> +
> + return y * log10_2lo + y * log10_2hi + ivln10 *
> + __gen_ocl_internal_log_valid(x);
> }
>
>
> -OVERLOADABLE float __gen_ocl_internal_log2(float x) {
> -/*
> - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
> - * adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> +OVERLOADABLE float __gen_ocl_internal_log2(float x) {
> const float zero = 0.0,
> - ln2 = 0.69314718055994530942,
> - two25 = 3.355443200e+07, /** 0x4c000000 */
> - Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
> - Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
> - Lg3 = 2.8571429849e-01, /** 3E924925 */
> - Lg4 = 2.2222198546e-01; /** 3E638E29 */
> -
> - float hfsq,f,s,z,R,w,t1,t2,dk;
> - int k,ix,i,j;
> + invln2 = 0x1.715476p+0f;
> + int ix;
>
> - union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
> + union { float f; int i; } u;
> u.f = x; ix = u.i;
>
> - k=0;
> - if (ix < 0x00800000) { /** x < 2**-126 */
> - if ((ix&0x7fffffff)==0)
> - return -two25/(x-x); /** log(+-0)=-inf */
> -
> - if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */
> - return -INFINITY;
> - k -= 25; x *= two25; /** subnormal number, scale up x */
> - u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
> - }
> -
> - if (ix >= 0x7f800000) return x+x;
> -
> - k += (ix>>23)-127;
> - ix &= 0x007fffff;
> - i = (ix+(0x95f64<<3))&0x800000;
> -
> - u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000));
> /** normalize x or x/2 */
> - k += (i>>23);
> - dk = (float)k;
> - f = x-(float)1.0;
> + if (ix < 0)
> + return NAN; /** log(-#) = NaN */
> + if (ix >= 0x7f800000)
> + return NAN;
>
> - if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */
> - if(f==zero) return dk;
> -
> - R = f*f*((float)0.5-(float)0.33333333333333333*f);
> - return dk-(R-f)/ln2;
> - }
> -
> - s = f/((float)2.0+f);
> - z = s*s;
> - i = ix-(0x6147a<<3);
> - w = z*z;
> - j = (0x6b851<<3)-ix;
> - t1= w*(Lg2+w*Lg4);
> - t2= z*(Lg1+w*Lg3);
> - i |= j;
> - R = t2+t1;
> -
> - if(i>0) {
> - hfsq=(float)0.5*f*f;
> - return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
> - } else {
> - return dk-((s*(f-R))-f)/ln2;
> - }
> + return invln2 * __gen_ocl_internal_log_valid(x);
> }
>
>
> @@ -545,9 +463,9 @@ OVERLOADABLE float __kernel_sinf(float x)
> float z,r,v;
> z = x*x;
> v = z*x;
> - r = S2+z*(S3+z*(S4));
> + r = mad(z, mad(z, mad(z, S4, S3), S2), S1);
>
> - return x+v*(S1+z*r);
> + return mad(v, r, x);
> }
>
> float __kernel_cosf(float x, float y)
> @@ -563,7 +481,7 @@ float __kernel_cosf(float x, float y)
> GEN_OCL_GET_FLOAT_WORD(ix,x);
> ix &= 0x7fffffff; /* ix = |x|'s high word*/
> z = x*x;
> - r = z*(C1+z*(C2+z*(C3)));
> + r = z * mad(z, mad(z, C3, C2), C1);
>
> if(ix < 0x3e99999a) /* if |x| < 0.3 */
> return one - ((float)0.5*z - (z*r - x*y)); @@ -671,24 +589,22 @@ float
> __kernel_tanf(float x, float y, int iy)
> }
> if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
> if(hx<0) {x = -x; y = -y;}
> -
> -
> z = pio4-x;
> w = pio4lo-y;
> x = z+w; y = 0.0;
> }
> z = x*x;
> w = z*z;
> - /* Break x^5*(T[1]+x^2*T[2]+...) into
> - * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
> - * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
> - */
> + /* Break x^5*(T[1]+x^2*T[2]+...) into
> + * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
> + * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
> + */
>
> - r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
> - v = z*(T[2]+w*(T[4]+w*T[6]));
> + r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
> + v = z* mad(w, mad(w, T[6], T[4]), T[2]);
>
> s = z*x;
> - r = y + z*(s*(r+v)+y);
> + r = mad(z, mad(s, r + v, y), y);
> r += T[0]*s;
> w = x+r;
> if(ix>=0x3f2ca140) {
> @@ -696,21 +612,8 @@ float __kernel_tanf(float x, float y, int iy)
> return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
> }
> if(iy==1) return w;
> - else { /* if allow error up to 2 ulp
> - simply return -1.0/(x+r) here */
> - /* compute -1.0/(x+r) accurately */
> - float a,t;
> - int i;
> - z = w;
> - GEN_OCL_GET_FLOAT_WORD(i,z);
> - GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
> - v = r-(z - x); /* z+v = r+x */
> - t = a = -(float)1.0/w; /* a = -1.0/w */
> - GEN_OCL_GET_FLOAT_WORD(i,t);
> - GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
> - s = (float)1.0+t*z;
> - return t+a*(s+t*v);
> - }
> + else
> + return -1.0/(x+r);
> }
>
> OVERLOADABLE float tan(float x)
> @@ -931,44 +834,46 @@ OVERLOADABLE float lgamma(float x) {
> switch (i) {
> case 0:
> z = y * y;
> - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z *
> a10))));
> - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z *
> a11)))));
> - p = y * p1 + p2;
> + p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8),
> a6), a4), a2), a0);
> + p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9),
> a7), a5), a3), a1);
> + p = mad(y, p1, p2);
> r += (p - (float) 0.5 * y);
> break;
> case 1:
> z = y * y;
> w = z * y;
> - p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
> - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
> - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
> - p = z * p1 - (tt - w * (p2 + y * p3));
> + p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3),
> t0);
> + p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7),
> t4), t1);
> + p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8),
> t5), t2);
> + p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
> r += (tf + p);
> break;
> case 2:
> - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y *
> u5)))));
> - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y *
> v5))));
> + p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4),
> u3), u2), u1), u0);
> + p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3),
> v2), v1),
> +one);
> r += (-(float) 0.5 * y + p1 / p2);
> }
> } else if (ix < 0x41000000) {
> i = (int) x;
> t = zero;
> y = x - (float) i;
> - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y *
> s6))))));
> - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y *
> r6)))));
> +
> + p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5),
> s4), s3), s2), s1), s0);
> + q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4),
> r3), r2),
> +r1), one);
> r = .5f * y + p / q;
> z = one;
> +
> switch (i) {
> case 7:
> - z *= (y + (float) 6.0);
> + z *= (y + 6.0f);
> case 6:
> - z *= (y + (float) 5.0);
> + z *= (y + 5.0f);
> case 5:
> - z *= (y + (float) 4.0);
> + z *= (y + 4.0f);
> case 4:
> - z *= (y + (float) 3.0);
> + z *= (y + 3.0f);
> case 3:
> - z *= (y + (float) 2.0);
> + z *= (y + 2.0f);
> r += native_log(z);
> break;
> }
> @@ -977,7 +882,7 @@ OVERLOADABLE float lgamma(float x) {
> t = native_log(x);
> z = one / x;
> y = z * z;
> - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y *
> w6)))));
> + w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5),
> w4), w3), w2),
> +w1), w0);
> r = (x - .5f) * (t - one) + w;
> } else
> r = x * (native_log(x) - one);
> @@ -1123,32 +1028,32 @@ OVERLOADABLE float lgamma(float x) {
> switch (i) { \
> case 0: \
> z = y * y; \
> - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z *
> a10)))); \
> - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z *
> a11))))); \
> - p = y * p1 + p2; \
> - r += (p - (float) 0.5 * y); \
> + p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8),
> a6), a4), a2), a0); \
> + p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9),
> a7), a5), a3), a1); \
> + p = mad(y, p1, p2); \
> + r = r - mad(y, 0.5f, -p); \
> break; \
> case 1: \
> z = y * y; \
> w = z * y; \
> - p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \
> - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \
> - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \
> - p = z * p1 - (tt - w * (p2 + y * p3)); \
> + p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3),
> t0); \
> + p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7),
> t4), t1); \
> + p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8),
> t5), t2); \
> + p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
> r += (tf + p); \
> break; \
> case 2: \
> - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y *
> u5))))); \
> - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y *
> v5)))); \
> - r += (-(float) 0.5 * y + p1 / p2); \
> + p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4),
> u3), u2), u1), u0); \
> + p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3),
> v2), v1), one); \
> + r = r + mad(y, -0.5f, p1 / p2); \
> } \
> } else if (ix < 0x41000000) { \
> i = (int) x; \
> t = zero; \
> y = x - (float) i; \
> - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y *
> s6)))))); \
> - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y *
> r6))))); \
> - r = .5f * y + p / q; \
> + p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5),
> s4), s3), s2), s1), s0); \
> + q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4),
> r3), r2), r1), one); \
> + r = mad(y, 0.5f, p / q); \
> z = one; \
> switch (i) { \
> case 7: \
> @@ -1169,10 +1074,10 @@ OVERLOADABLE float lgamma(float x) {
> t = native_log(x); \
> z = one / x; \
> y = z * z; \
> - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y *
> w6))))); \
> + w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5),
> w4), w3), w2),
> +w1), w0); \
> r = (x - .5f) * (t - one) + w; \
> } else \
> - r = x * (native_log(x) - one); \
> + r = x * (native_log(x) - one); \
> if (hx < 0) \
> r = nadj - r; \
> return r;
> @@ -1253,20 +1158,26 @@ OVERLOADABLE float log1p(float x) {
> f = u-(float)1.0;
> }
> hfsq=(float)0.5*f*f;
> - if(hu==0) { /* |f| < 2**-20 */
> - if(f==zero) { if(k==0) return zero;
> - else {c += k*ln2_lo; return k*ln2_hi+c;} }
> - R = hfsq*((float)1.0-(float)0.66666666666666666*f);
> + if(hu==0)
> + { /* |f| < 2**-20 */
> + if(f==zero)
> + {
> + if(k==0) return zero;
> + else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
> + }
> + R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
> if(k==0) return f-R; else
> - return k*ln2_hi-((R-(k*ln2_lo+c))-f);
> + return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
> }
> s = f/((float)2.0+f);
> z = s*s;
> - R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
> - if(k==0) return f-(hfsq-s*(hfsq+R)); else
> - return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
> -
> + R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
> + if(k==0)
> + return f + mad(hfsq + R, s, -hfsq);
> + else
> + return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
> }
> +
> OVERLOADABLE float logb(float x) {
> if (__ocl_math_fastpath_flag)
> return __gen_ocl_internal_fastpath_logb(x);
> @@ -1378,14 +1289,14 @@ OVERLOADABLE float
> __gen_ocl_internal_cbrt(float x) {
>
> /* new cbrt to 23 bits */
> r=t*t/x;
> - s=C+r*t;
> + s=mad(r, t, C);
> t*=G+F/(s+E+D/s);
> /* one step newton iteration to 53 bits with error less than 0.667 ulps */
> s=t*t; /* t*t is exact */
> r=x/s;
> w=t+t;
> r=(r-t)/(w+r); /* r-s is exact */
> - t=t+t*r;
> + t=mad(t, r, t);
>
> /* retore the sign bit */
> GEN_OCL_GET_FLOAT_WORD(high,t);
> @@ -1437,10 +1348,10 @@ INLINE float __gen_ocl_asin_util(float x) {
> qS4 = 7.70381505559019352791e-02;
>
> float t = x*x;
> - float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
> - float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
> + float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1),
> + pS0); float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1),
> + 1.0f);
> float w = p / q;
> - return x + x*w;
> + return mad(x, w, x);
> }
>
> OVERLOADABLE float __gen_ocl_internal_asin(float x) { @@ -1538,8 +1449,8
> @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
> z = x*x;
> w = z*z;
> /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
> - s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
> - s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
> + s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
> + s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
> if (id<0) return x - x*(s1+s2);
> else {
> z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x); @@ -1829,15 +1740,15 @@
> sb7 = -2.2440952301e+01; /* 0xc1b38712 */
> return x + efx*x;
> }
> z = x*x;
> - r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
> - s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
> - y = r/s;
> - return x + x*y;
> + r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
> + s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1),
> one);
> + y = r / s;
> + return mad(x, y, x);
> }
> if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
> s = __gen_ocl_internal_fabs(x)-one;
> - P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
> - Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
> + P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4),
> pa3), pa2), pa1), pa0);
> + Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4),
> +qa3), qa2), qa1), one);
> if(hx>=0) return erx + P/Q; else return -erx - P/Q;
> }
> if (ix >= 0x40c00000) { /* inf>|x|>=6 */
> @@ -1846,15 +1757,15 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
> x = __gen_ocl_internal_fabs(x);
> s = one/(x*x);
> if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */
> - R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
> - ra5+s*(ra6+s*ra7))))));
> - S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
> - sa5+s*(sa6+s*(sa7+s*sa8)))))));
> + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
> + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
> } else { /* |x| >= 1/0.35 */
> - R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
> - rb5+s*rb6)))));
> - S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
> - sb5+s*(sb6+s*sb7))))));
> + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + rb6, rb5), rb4), rb3), rb2), rb1), rb0);
> + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
> }
> GEN_OCL_GET_FLOAT_WORD(ix,x);
> GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
> @@ -1949,8 +1860,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
> if(ix < 0x23800000) /* |x|<2**-56 */
> return one-x;
> z = x*x;
> - r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
> - s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
> + r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
> + s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2),
> +qq1), one);
> y = r/s;
> if(hx < 0x3e800000) { /* x<1/4 */
> return one-(x+x*y);
> @@ -1962,8 +1873,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
> }
> if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
> s = __gen_ocl_internal_fabs(x)-one;
> - P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
> - Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
> + P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4),
> pa3), pa2), pa1), pa0);
> + Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4),
> +qa3), qa2), qa1), one);
> if(hx>=0) {
> z = one-erx; return z - P/Q;
> } else {
> @@ -1974,16 +1885,16 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
> x = __gen_ocl_internal_fabs(x);
> s = one/(x*x);
> if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/
> - R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
> - ra5+s*(ra6+s*ra7))))));
> - S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
> - sa5+s*(sa6+s*(sa7+s*sa8)))))));
> + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
> + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> mad(s,
> + sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1),
> one);
> } else { /* |x| >= 1/.35 ~ 2.857143 */
> if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
> - R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
> - rb5+s*rb6)))));
> - S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
> - sb5+s*(sb6+s*sb7))))));
> + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + rb6, rb5), rb4), rb3), rb2), rb1), rb0);
> + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
> + sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
> }
> GEN_OCL_GET_FLOAT_WORD(ix,x);
> GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
> @@ -2224,7 +2135,7 @@ OVERLOADABLE float
> __gen_ocl_internal_asinh(float x){
> } else {
> float xa = __gen_ocl_internal_fabs(x);
> if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
> - w =
> __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
> + w = __gen_ocl_internal_log(mad(xa, 2.0f, one /
> + (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
> } else { /* 2.0 > |x| > 2**-14 */
> float t = xa*xa;
> w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
> --
> 2.5.0
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list