[Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad
Song, Ruiling
ruiling.song at intel.com
Tue Jul 19 06:21:20 UTC 2016
This one plus the previous version three patches looks good. This is big performance improvement to the math functions.
Thanks!
Ruiling
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Grigore Lupescu
> Sent: Tuesday, July 19, 2016 4:38 AM
> To: beignet at lists.freedesktop.org
> Subject: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad
>
> From: Grigore Lupescu <grigore.lupescu at intel.com>
>
> Affected functions:
> __gen_ocl_internal_log
> __gen_ocl_internal_log10
> __gen_ocl_internal_log2
> __kernel_sinf
> __kernel_cosf
> __gen_ocl_internal_cbrt
> __gen_ocl_asin_util
> tan
> log1p
>
> Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
> ---
> backend/src/libocl/tmpl/ocl_math.tmpl.cl | 385 +++++++++++++------------------
> 1 file changed, 159 insertions(+), 226 deletions(-)
>
> diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> index 9ea0817..149ba01 100644
> --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> @@ -164,7 +164,7 @@ OVERLOADABLE float
> __gen_ocl_internal_copysign(float x, float y) {
> return ux.f;
> }
>
> -OVERLOADABLE float __gen_ocl_internal_log(float x) {
> +OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
> /*
> * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
> * ====================================================
> @@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float
> x) {
> */
> union { unsigned int i; float f; } u;
> const float
> - ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
> - ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
> - two25 = 3.355443200e+07, /* 0x4c000000 */
> + ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
> + ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
> + two25 = 3.355443200e+07, /* 0x4c000000 */
> Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
> Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
> Lg3 = 2.8571429849e-01, /* 3E924925 */
> Lg4 = 2.2222198546e-01; /* 3E638E29 */
>
> const float zero = 0.0;
> - float hfsq,f,s,z,R,w,t1,t2,dk;
> - int k,ix,i,j;
> + float fsq, f, s, z, R, w, t1, t2, partial;
> + int k, ix, i, j;
>
> u.f = x; ix = u.i;
> - k=0;
> - if (ix < 0x00800000) { /* x < 2**-126 */
> - if ((ix&0x7fffffff)==0)
> - return -two25/zero; /* log(+-0)=-inf */
> - if (ix<0) return (x-x)/zero; /* log(-#) = NaN */
> - return -INFINITY; /* Gen does not support subnormal number now */
> - //k -= 25; x *= two25; /* subnormal number, scale up x */
> - //u.f = x; ix = u.i;
> - }
> - if (ix >= 0x7f800000) return x+x;
> - k += (ix>>23)-127;
> + k = 0;
> +
> + k += (ix>>23) - 127;
> ix &= 0x007fffff;
> - i = (ix+(0x95f64<<3))&0x800000;
> - u.i = ix|(i^0x3f800000); x = u.f;
> + i = (ix + (0x95f64<<3)) & 0x800000;
> + u.i = ix | (i^0x3f800000); x = u.f;
> k += (i>>23);
> - f = x-(float)1.0;
> - if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
> - if(f==zero) {
> - if(k==0) return zero;
> - else {
> - dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
> - }
> - }
> - R = f*f*((float)0.5-(float)0.33333333333333333*f);
> - if(k==0)
> - return f-R;
> - else {
> - dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f);
> - }
> + f = x - 1.0f;
> + fsq = f * f;
> +
> + if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
> + R = fsq * (0.5f - 0.33333333333333333f * f);
> + return k * ln2_hi + k * ln2_lo + f - R;
> }
> - s = f/((float)2.0+f);
> - dk = (float)k;
> - z = s*s;
> - i = ix-(0x6147a<<3);
> - w = z*z;
> - j = (0x6b851<<3)-ix;
> - t1= w*(Lg2+w*Lg4);
> - t2= z*(Lg1+w*Lg3);
> +
> + s = f / (2.0f + f);
> + z = s * s;
> + i = ix - (0x6147a << 3);
> + w = z * z;
> + j = (0x6b851 << 3) - ix;
> + t1= w * mad(w, Lg4, Lg2);
> + t2= z * mad(w, Lg3, Lg1);
> i |= j;
> - R = t2+t1;
> - if(i>0) {
> - hfsq=(float)0.5*f*f;
> - if(k==0) return f-(hfsq-s*(hfsq+R)); else
> - return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
> - } else {
> - if(k==0) return f-s*(f-R); else
> - return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
> - }
> + R = t2 + t1;
> + partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
> +
> + return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
> }
>
> +OVERLOADABLE float __gen_ocl_internal_log(float x)
> +{
> + union { unsigned int i; float f; } u;
> + u.f = x;
> + int ix = u.i;
>
> -OVERLOADABLE float __gen_ocl_internal_log10(float x) {
> -/*
> - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> + if (ix < 0 )
> + return NAN; /* log(-#) = NaN */
> + if (ix >= 0x7f800000)
> + return NAN;
>
> - union {float f; unsigned i; }u;
> + return __gen_ocl_internal_log_valid(x);
> +}
> +
> +OVERLOADABLE float __gen_ocl_internal_log10(float x)
> +{
> + union { float f; unsigned i; } u;
> const float
> - zero = 0.0,
> - two25 = 3.3554432000e+07, /* 0x4c000000 */
> ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
> log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
> log10_2lo = 7.9034151668e-07; /* 0x355427db */
>
> - float y,z;
> - int i,k,hx;
> + float y, z;
> + int i, k, hx;
>
> u.f = x; hx = u.i;
> - k=0;
> - if (hx < 0x00800000) { /* x < 2**-126 */
> - if ((hx&0x7fffffff)==0)
> - return -two25/zero; /* log(+-0)=-inf */
> - if (hx<0) return NAN; /* log(-#) = NaN */
> - return -INFINITY; /* Gen does not support subnormal now */
> - //k -= 25; x *= two25; /* subnormal number, scale up x */
> - //u.f = x; hx = u.i;
> - }
> - if (hx >= 0x7f800000) return x+x;
> - k += (hx>>23)-127;
> - i = ((unsigned)k&0x80000000)>>31;
> - hx = (hx&0x007fffff)|((0x7f-i)<<23);
> - y = (float)(k+i);
> +
> + if (hx<0)
> + return NAN; /* log(-#) = NaN */
> + if (hx >= 0x7f800000)
> + return NAN;
> +
> + k = (hx >> 23) - 127;
> + i = ((unsigned)k & 0x80000000) >> 31;
> + hx = (hx&0x007fffff) | ((0x7f-i) << 23);
> + y = (float)(k + i);
> u.i = hx; x = u.f;
> - z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
> - return z+y*log10_2hi;
> +
> + return y * log10_2lo + y * log10_2hi + ivln10 *
> __gen_ocl_internal_log_valid(x);
> }
>
>
> -OVERLOADABLE float __gen_ocl_internal_log2(float x) {
> -/*
> - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
> - * adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> +OVERLOADABLE float __gen_ocl_internal_log2(float x)
> +{
> const float zero = 0.0,
> - ln2 = 0.69314718055994530942,
> - two25 = 3.355443200e+07, /** 0x4c000000 */
> - Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
> - Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
> - Lg3 = 2.8571429849e-01, /** 3E924925 */
> - Lg4 = 2.2222198546e-01; /** 3E638E29 */
> -
> - float hfsq,f,s,z,R,w,t1,t2,dk;
> - int k,ix,i,j;
> + invln2 = 0x1.715476p+0f;
> + int ix;
>
> - union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
> + union { float f; int i; } u;
> u.f = x; ix = u.i;
>
> - k=0;
> - if (ix < 0x00800000) { /** x < 2**-126 */
> - if ((ix&0x7fffffff)==0)
> - return -two25/(x-x); /** log(+-0)=-inf */
> -
> - if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */
> - return -INFINITY;
> - k -= 25; x *= two25; /** subnormal number, scale up x */
> - u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
> - }
> -
> - if (ix >= 0x7f800000) return x+x;
> -
> - k += (ix>>23)-127;
> - ix &= 0x007fffff;
> - i = (ix+(0x95f64<<3))&0x800000;
> -
> - u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000));
> /** normalize x or x/2 */
> - k += (i>>23);
> - dk = (float)k;
> - f = x-(float)1.0;
> -
> - if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */
> - if(f==zero) return dk;
> + if (ix < 0)
> + return NAN; /** log(-#) = NaN */
> + if (ix >= 0x7f800000)
> + return NAN;
>
> - R = f*f*((float)0.5-(float)0.33333333333333333*f);
> - return dk-(R-f)/ln2;
> - }
> -
> - s = f/((float)2.0+f);
> - z = s*s;
> - i = ix-(0x6147a<<3);
> - w = z*z;
> - j = (0x6b851<<3)-ix;
> - t1= w*(Lg2+w*Lg4);
> - t2= z*(Lg1+w*Lg3);
> - i |= j;
> - R = t2+t1;
> -
> - if(i>0) {
> - hfsq=(float)0.5*f*f;
> - return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
> - } else {
> - return dk-((s*(f-R))-f)/ln2;
> - }
> + return invln2 * __gen_ocl_internal_log_valid(x);
> }
>
>
> @@ -545,9 +463,13 @@ OVERLOADABLE float __kernel_sinf(float x)
> float z,r,v;
> z = x*x;
> v = z*x;
> - r = S2+z*(S3+z*(S4));
> + r = mad(z,
> + mad(z,
> + mad(z, S4, S3),
> + S2),
> + S1);
>
> - return x+v*(S1+z*r);
> + return mad(v, r, x);
> }
>
> float __kernel_cosf(float x, float y)
> @@ -563,7 +485,9 @@ float __kernel_cosf(float x, float y)
> GEN_OCL_GET_FLOAT_WORD(ix,x);
> ix &= 0x7fffffff; /* ix = |x|'s high word*/
> z = x*x;
> - r = z*(C1+z*(C2+z*(C3)));
> + r = z * mad(z,
> + mad(z, C3, C2),
> + C1);
>
> if(ix < 0x3e99999a) /* if |x| < 0.3 */
> return one - ((float)0.5*z - (z*r - x*y));
> @@ -671,24 +595,27 @@ float __kernel_tanf(float x, float y, int iy)
> }
> if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
> if(hx<0) {x = -x; y = -y;}
> -
> -
> z = pio4-x;
> w = pio4lo-y;
> x = z+w; y = 0.0;
> }
> z = x*x;
> w = z*z;
> - /* Break x^5*(T[1]+x^2*T[2]+...) into
> - * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
> - * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
> - */
> -
> - r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
> - v = z*(T[2]+w*(T[4]+w*T[6]));
> + /* Break x^5*(T[1]+x^2*T[2]+...) into
> + * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
> + * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
> + */
> +
> + r = mad(w,
> + mad(w,
> + mad(w, T[7],
> + T[5]),
> + T[3]),
> + T[1]);
> + v = z* mad(w, mad(w, T[6], T[4]), T[2]);
>
> s = z*x;
> - r = y + z*(s*(r+v)+y);
> + r = mad(z, mad(s, r + v, y), y);
> r += T[0]*s;
> w = x+r;
> if(ix>=0x3f2ca140) {
> @@ -696,21 +623,8 @@ float __kernel_tanf(float x, float y, int iy)
> return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
> }
> if(iy==1) return w;
> - else { /* if allow error up to 2 ulp
> - simply return -1.0/(x+r) here */
> - /* compute -1.0/(x+r) accurately */
> - float a,t;
> - int i;
> - z = w;
> - GEN_OCL_GET_FLOAT_WORD(i,z);
> - GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
> - v = r-(z - x); /* z+v = r+x */
> - t = a = -(float)1.0/w; /* a = -1.0/w */
> - GEN_OCL_GET_FLOAT_WORD(i,t);
> - GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
> - s = (float)1.0+t*z;
> - return t+a*(s+t*v);
> - }
> + else
> + return -1.0/(x+r);
> }
>
> OVERLOADABLE float tan(float x)
> @@ -897,44 +811,45 @@ OVERLOADABLE float lgamma(float x) {
> switch (i) {
> case 0:
> z = y * y;
> - p1 = a0 + z * a2;
> - p2 = z * (a1 + z * a3);
> - p = y * p1 + p2;
> + p1 = mad(z, a2, a0);
> + p2 = z * mad(z, a3, a1);
> + p = mad(y, p1, p2);
> r += (p - (float) 0.5 * y);
> break;
> case 1:
> z = y * y;
> w = z * y;
> - p1 = t0 + w * t3;
> - p2 = t1 + w * t4;
> - p3 = t2 + w * t5;
> - p = z * p1 - (tt - w * (p2 + y * p3));
> + p1 = mad(w, t3, t0);
> + p2 = mad(w, t4, t1);
> + p3 = mad(w, t5, t2);
> + p = z * p1 - (tt - w * mad(y, p3, p2));
> r += (tf + p);
> break;
> case 2:
> - p1 = y * (u0 + y * u1);
> - p2 = one + y * (v1 + y * v2);
> + p1 = y * mad(y, u1, u0);
> + p2 = one + y * mad(y, v2, v1);
> r += (-(float) 0.5 * y + p1 / p2);
> }
> } else if (ix < 0x41000000) {
> i = (int) x;
> t = zero;
> y = x - (float) i;
> - p = y * (s0 + y * (s1 + y * s2));
> - q = one + y * (r1 + y * r2);
> + p = y * mad(y, mad(y, s2, s1), s0);
> + q = mad(y, mad(y, r2, r1), 1.0f);
> r = .5f * y + p / q;
> z = one;
> +
> switch (i) {
> case 7:
> - z *= (y + (float) 6.0);
> + z *= (y + 6.0f);
> case 6:
> - z *= (y + (float) 5.0);
> + z *= (y + 5.0f);
> case 5:
> - z *= (y + (float) 4.0);
> + z *= (y + 4.0f);
> case 4:
> - z *= (y + (float) 3.0);
> + z *= (y + 3.0f);
> case 3:
> - z *= (y + (float) 2.0);
> + z *= (y + 2.0f);
> r += native_log(z);
> break;
> }
> @@ -943,7 +858,7 @@ OVERLOADABLE float lgamma(float x) {
> t = native_log(x);
> z = one / x;
> y = z * z;
> - w = w0 + z * (w1 + y * w2);
> + w = mad(z, mad(y, w2, 21), w0);
> r = (x - .5f) * (t - one) + w;
> } else
> r = x * (native_log(x) - one);
> @@ -1053,32 +968,32 @@ OVERLOADABLE float lgamma(float x) {
> switch (i) { \
> case 0: \
> z = y * y; \
> - p1 = a0 + z * a2; \
> - p2 = z * (a1 + z *a3); \
> - p = y * p1 + p2; \
> - r += (p - (float) 0.5 * y); \
> + p1 = mad(z, a2, a0); \
> + p2 = z * mad(z, a3, a1); \
> + p = mad(y, p1, p2); \
> + r -= mad(y, 0.5f, -p); \
> break; \
> case 1: \
> z = y * y; \
> w = z * y; \
> - p1 = t0 + w * t3; \
> - p2 = t1 + w * t4; \
> - p3 = t2 + w * t5; \
> - p = z * p1 - (tt - w * (p2 + y * p3)); \
> + p1 = mad(w, t3, t0); \
> + p2 = mad(w, t4, t1); \
> + p3 = mad(w, t5, t2); \
> + p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
> r += (tf + p); \
> break; \
> case 2: \
> - p1 = y * (u0 + y * u1); \
> - p2 = one + y * (v1 + y * v2); \
> - r += (-(float) 0.5 * y + p1 / p2); \
> + p1 = y * mad(y, u1, u0); \
> + p2 = mad(y, mad(y, v2, v1), 1.0f); \
> + r -= mad(y, 0.5f, -p1 / p2); \
> } \
> } else if (ix < 0x41000000) { \
> i = (int) x; \
> t = zero; \
> y = x - (float) i; \
> - p = y * (s0 + y * (s1 + y * s2)); \
> - q = one + y * (r1 + y * r2); \
> - r = .5f * y + p / q; \
> + p = y * mad(y, mad(y, s2, s1), s0); \
> + q = mad(y, mad(y, r2, r1), 1.0f); \
> + r = mad(y, 0.5f, p / q); \
> z = one; \
> switch (i) { \
> case 7: \
> @@ -1099,10 +1014,10 @@ OVERLOADABLE float lgamma(float x) {
> t = native_log(x); \
> z = one / x; \
> y = z * z; \
> - w = w0 + z * (w1 + y * w2); \
> - r = (x - .5f) * (t - one) + w; \
> + w = mad(z, mad(y, w2, w1), w0); \
> + r = mad(x - 0.5f, t - 1.0f, w); \
> } else \
> - r = x * (native_log(x) - one); \
> + r = mad(x, native_log(x), -1.0f); \
> if (hx < 0) \
> r = nadj - r; \
> return r;
> @@ -1183,20 +1098,26 @@ OVERLOADABLE float log1p(float x) {
> f = u-(float)1.0;
> }
> hfsq=(float)0.5*f*f;
> - if(hu==0) { /* |f| < 2**-20 */
> - if(f==zero) { if(k==0) return zero;
> - else {c += k*ln2_lo; return k*ln2_hi+c;} }
> - R = hfsq*((float)1.0-(float)0.66666666666666666*f);
> + if(hu==0)
> + { /* |f| < 2**-20 */
> + if(f==zero)
> + {
> + if(k==0) return zero;
> + else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
> + }
> + R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
> if(k==0) return f-R; else
> - return k*ln2_hi-((R-(k*ln2_lo+c))-f);
> + return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
> }
> s = f/((float)2.0+f);
> z = s*s;
> - R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
> - if(k==0) return f-(hfsq-s*(hfsq+R)); else
> - return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
> -
> + R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
> + if(k==0)
> + return f + mad(hfsq + R, s, -hfsq);
> + else
> + return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
> }
> +
> OVERLOADABLE float logb(float x) {
> if (__ocl_math_fastpath_flag)
> return __gen_ocl_internal_fastpath_logb(x);
> @@ -1308,14 +1229,14 @@ OVERLOADABLE float
> __gen_ocl_internal_cbrt(float x) {
>
> /* new cbrt to 23 bits */
> r=t*t/x;
> - s=C+r*t;
> + s=mad(r, t, C);
> t*=G+F/(s+E+D/s);
> /* one step newton iteration to 53 bits with error less than 0.667 ulps */
> s=t*t; /* t*t is exact */
> r=x/s;
> w=t+t;
> r=(r-t)/(w+r); /* r-s is exact */
> - t=t+t*r;
> + t=mad(t, r, t);
>
> /* retore the sign bit */
> GEN_OCL_GET_FLOAT_WORD(high,t);
> @@ -1367,10 +1288,22 @@ INLINE float __gen_ocl_asin_util(float x) {
> qS4 = 7.70381505559019352791e-02;
>
> float t = x*x;
> - float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
> - float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
> + float p = t * mad(t,
> + mad(t,
> + mad(t,
> + mad(t, pS4, pS3),
> + pS2),
> + pS1),
> + pS0);
> + float q = mad(t,
> + mad(t,
> + mad(t,
> + mad(t, qS4, qS3),
> + qS2),
> + qS1),
> + 1.0f);
> float w = p / q;
> - return x + x*w;
> + return mad(x, w, x);
> }
>
> OVERLOADABLE float __gen_ocl_internal_asin(float x) {
> --
> 2.5.0
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list