[Beignet] [PATCH 4/5] Backend: Optimization internal math, use mad
Grigore Lupescu
grigore.lupescu at intel.com
Wed May 25 19:17:28 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Affected functions:
__gen_ocl_internal_log
__gen_ocl_internal_log10
__gen_ocl_internal_log2
__kernel_sinf
__kernel_cosf
__gen_ocl_internal_cbrt
__gen_ocl_asin_util
tan
log1p
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 380 +++++++++++++------------------
1 file changed, 155 insertions(+), 225 deletions(-)
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index c262ae7..9414c8e 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -164,7 +164,7 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
return ux.f;
}
-OVERLOADABLE float __gen_ocl_internal_log(float x) {
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
* ====================================================
@@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
*/
union { unsigned int i; float f; } u;
const float
- ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
- ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
- two25 = 3.355443200e+07, /* 0x4c000000 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
Lg3 = 2.8571429849e-01, /* 3E924925 */
Lg4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
- float hfsq,f,s,z,R,w,t1,t2,dk;
- int k,ix,i,j;
+ float fsq, f, s, z, R, w, t1, t2, partial;
+ int k, ix, i, j;
u.f = x; ix = u.i;
- k=0;
- if (ix < 0x00800000) { /* x < 2**-126 */
- if ((ix&0x7fffffff)==0)
- return -two25/zero; /* log(+-0)=-inf */
- if (ix<0) return (x-x)/zero; /* log(-#) = NaN */
- return -INFINITY; /* Gen does not support subnormal number now */
- //k -= 25; x *= two25; /* subnormal number, scale up x */
- //u.f = x; ix = u.i;
- }
- if (ix >= 0x7f800000) return x+x;
- k += (ix>>23)-127;
+ k = 0;
+
+ k += (ix>>23) - 127;
ix &= 0x007fffff;
- i = (ix+(0x95f64<<3))&0x800000;
- u.i = ix|(i^0x3f800000); x = u.f;
+ i = (ix + (0x95f64<<3)) & 0x800000;
+ u.i = ix | (i^0x3f800000); x = u.f;
k += (i>>23);
- f = x-(float)1.0;
- if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
- if(f==zero) {
- if(k==0) return zero;
- else {
- dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
- }
- }
- R = f*f*((float)0.5-(float)0.33333333333333333*f);
- if(k==0)
- return f-R;
- else {
- dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f);
- }
+ f = x - 1.0f;
+ fsq = f * f;
+
+ if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+ R = fsq * (0.5f - 0.33333333333333333f * f);
+ return k * ln2_hi + k * ln2_lo + f - R;
}
- s = f/((float)2.0+f);
- dk = (float)k;
- z = s*s;
- i = ix-(0x6147a<<3);
- w = z*z;
- j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*Lg4);
- t2= z*(Lg1+w*Lg3);
+
+ s = f / (2.0f + f);
+ z = s * s;
+ i = ix - (0x6147a << 3);
+ w = z * z;
+ j = (0x6b851 << 3) - ix;
+ t1= w * mad(w, Lg4, Lg2);
+ t2= z * mad(w, Lg3, Lg1);
i |= j;
- R = t2+t1;
- if(i>0) {
- hfsq=(float)0.5*f*f;
- if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
- } else {
- if(k==0) return f-s*(f-R); else
- return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
- }
+ R = t2 + t1;
+ partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+ return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
}
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+ union { unsigned int i; float f; } u;
+ u.f = x;
+ int ix = u.i;
-OVERLOADABLE float __gen_ocl_internal_log10(float x) {
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+ if (ix < 0 )
+ return NAN; /* log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
- union {float f; unsigned i; }u;
+ return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+ union { float f; unsigned i; } u;
const float
- zero = 0.0,
- two25 = 3.3554432000e+07, /* 0x4c000000 */
ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
log10_2lo = 7.9034151668e-07; /* 0x355427db */
- float y,z;
- int i,k,hx;
+ float y, z;
+ int i, k, hx;
u.f = x; hx = u.i;
- k=0;
- if (hx < 0x00800000) { /* x < 2**-126 */
- if ((hx&0x7fffffff)==0)
- return -two25/zero; /* log(+-0)=-inf */
- if (hx<0) return NAN; /* log(-#) = NaN */
- return -INFINITY; /* Gen does not support subnormal now */
- //k -= 25; x *= two25; /* subnormal number, scale up x */
- //u.f = x; hx = u.i;
- }
- if (hx >= 0x7f800000) return x+x;
- k += (hx>>23)-127;
- i = ((unsigned)k&0x80000000)>>31;
- hx = (hx&0x007fffff)|((0x7f-i)<<23);
- y = (float)(k+i);
+
+ if (hx<0)
+ return NAN; /* log(-#) = NaN */
+ if (hx >= 0x7f800000)
+ return NAN;
+
+ k = (hx >> 23) - 127;
+ i = ((unsigned)k & 0x80000000) >> 31;
+ hx = (hx&0x007fffff) | ((0x7f-i) << 23);
+ y = (float)(k + i);
u.i = hx; x = u.f;
- z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
- return z+y*log10_2hi;
+
+ return y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
}
-OVERLOADABLE float __gen_ocl_internal_log2(float x) {
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
const float zero = 0.0,
- ln2 = 0.69314718055994530942,
- two25 = 3.355443200e+07, /** 0x4c000000 */
- Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
- Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
- Lg3 = 2.8571429849e-01, /** 3E924925 */
- Lg4 = 2.2222198546e-01; /** 3E638E29 */
-
- float hfsq,f,s,z,R,w,t1,t2,dk;
- int k,ix,i,j;
+ invln2 = 0x1.715476p+0f;
+ int ix;
- union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
+ union { float f; int i; } u;
u.f = x; ix = u.i;
- k=0;
- if (ix < 0x00800000) { /** x < 2**-126 */
- if ((ix&0x7fffffff)==0)
- return -two25/(x-x); /** log(+-0)=-inf */
-
- if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */
- return -INFINITY;
- k -= 25; x *= two25; /** subnormal number, scale up x */
- u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
- }
-
- if (ix >= 0x7f800000) return x+x;
-
- k += (ix>>23)-127;
- ix &= 0x007fffff;
- i = (ix+(0x95f64<<3))&0x800000;
-
- u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000)); /** normalize x or x/2 */
- k += (i>>23);
- dk = (float)k;
- f = x-(float)1.0;
-
- if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */
- if(f==zero) return dk;
+ if (ix < 0)
+ return NAN; /** log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
- R = f*f*((float)0.5-(float)0.33333333333333333*f);
- return dk-(R-f)/ln2;
- }
-
- s = f/((float)2.0+f);
- z = s*s;
- i = ix-(0x6147a<<3);
- w = z*z;
- j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*Lg4);
- t2= z*(Lg1+w*Lg3);
- i |= j;
- R = t2+t1;
-
- if(i>0) {
- hfsq=(float)0.5*f*f;
- return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
- } else {
- return dk-((s*(f-R))-f)/ln2;
- }
+ return invln2 * __gen_ocl_internal_log_valid(x);
}
@@ -545,9 +463,13 @@ OVERLOADABLE float __kernel_sinf(float x)
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4));
+ r = mad(z,
+ mad(z,
+ mad(z, S4, S3),
+ S2),
+ S1);
- return x+v*(S1+z*r);
+ return mad(v, r, x);
}
float __kernel_cosf(float x, float y)
@@ -563,7 +485,9 @@ float __kernel_cosf(float x, float y)
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3)));
+ r = z * mad(z,
+ mad(z, C3, C2),
+ C1);
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
@@ -671,24 +595,27 @@ float __kernel_tanf(float x, float y, int iy)
}
if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
if(hx<0) {x = -x; y = -y;}
-
-
z = pio4-x;
w = pio4lo-y;
x = z+w; y = 0.0;
}
z = x*x;
w = z*z;
- /* Break x^5*(T[1]+x^2*T[2]+...) into
- * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
- * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
- */
-
- r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
- v = z*(T[2]+w*(T[4]+w*T[6]));
+ /* Break x^5*(T[1]+x^2*T[2]+...) into
+ * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+ * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+ */
+
+ r = mad(w,
+ mad(w,
+ mad(w, T[7],
+ T[5]),
+ T[3]),
+ T[1]);
+ v = z* mad(w, mad(w, T[6], T[4]), T[2]);
s = z*x;
- r = y + z*(s*(r+v)+y);
+ r = mad(z, mad(s, r + v, y), y);
r += T[0]*s;
w = x+r;
if(ix>=0x3f2ca140) {
@@ -696,21 +623,8 @@ float __kernel_tanf(float x, float y, int iy)
return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
}
if(iy==1) return w;
- else { /* if allow error up to 2 ulp
- simply return -1.0/(x+r) here */
- /* compute -1.0/(x+r) accurately */
- float a,t;
- int i;
- z = w;
- GEN_OCL_GET_FLOAT_WORD(i,z);
- GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
- v = r-(z - x); /* z+v = r+x */
- t = a = -(float)1.0/w; /* a = -1.0/w */
- GEN_OCL_GET_FLOAT_WORD(i,t);
- GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
- s = (float)1.0+t*z;
- return t+a*(s+t*v);
- }
+ else
+ return -1.0/(x+r);
}
OVERLOADABLE float tan(float x)
@@ -897,44 +811,45 @@ OVERLOADABLE float lgamma(float x) {
switch (i) {
case 0:
z = y * y;
- p1 = a0 + z * a2;
- p2 = z * (a1 + z * a3);
- p = y * p1 + p2;
+ p1 = mad(z, a2, a0);
+ p2 = z * mad(z, a3, a1);
+ p = mad(y, p1, p2);
r += (p - (float) 0.5 * y);
break;
case 1:
z = y * y;
w = z * y;
- p1 = t0 + w * t3;
- p2 = t1 + w * t4;
- p3 = t2 + w * t5;
- p = z * p1 - (tt - w * (p2 + y * p3));
+ p1 = mad(w, t3, t0);
+ p2 = mad(w, t4, t1);
+ p3 = mad(w, t5, t2);
+ p = z * p1 - (tt - w * mad(y, p3, p2));
r += (tf + p);
break;
case 2:
- p1 = y * (u0 + y * u1);
- p2 = one + y * (v1 + y * v2);
+ p1 = y * mad(y, u1, u0);
+ p2 = one + y * mad(y, v2, v1);
r += (-(float) 0.5 * y + p1 / p2);
}
} else if (ix < 0x41000000) {
i = (int) x;
t = zero;
y = x - (float) i;
- p = y * (s0 + y * (s1 + y * s2));
- q = one + y * (r1 + y * r2);
+ p = y * mad(y, mad(y, s2, s1), s0);
+ q = mad(y, mad(y, r2, r1), 1.0f);
r = .5f * y + p / q;
z = one;
+
switch (i) {
case 7:
- z *= (y + (float) 6.0);
+ z *= (y + 6.0f);
case 6:
- z *= (y + (float) 5.0);
+ z *= (y + 5.0f);
case 5:
- z *= (y + (float) 4.0);
+ z *= (y + 4.0f);
case 4:
- z *= (y + (float) 3.0);
+ z *= (y + 3.0f);
case 3:
- z *= (y + (float) 2.0);
+ z *= (y + 2.0f);
r += native_log(z);
break;
}
@@ -943,7 +858,7 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x);
z = one / x;
y = z * z;
- w = w0 + z * (w1 + y * w2);
+ w = mad(z, mad(y, w2, 21), w0);
r = (x - .5f) * (t - one) + w;
} else
r = x * (native_log(x) - one);
@@ -1053,32 +968,32 @@ OVERLOADABLE float lgamma(float x) {
switch (i) { \
case 0: \
z = y * y; \
- p1 = a0 + z * a2; \
- p2 = z * (a1 + z *a3); \
- p = y * p1 + p2; \
- r += (p - (float) 0.5 * y); \
+ p1 = mad(z, a2, a0); \
+ p2 = z * mad(z, a3, a1); \
+ p = mad(y, p1, p2); \
+ r -= mad(y, 0.5f, -p); \
break; \
case 1: \
z = y * y; \
w = z * y; \
- p1 = t0 + w * t3; \
- p2 = t1 + w * t4; \
- p3 = t2 + w * t5; \
- p = z * p1 - (tt - w * (p2 + y * p3)); \
+ p1 = mad(w, t3, t0); \
+ p2 = mad(w, t4, t1); \
+ p3 = mad(w, t5, t2); \
+ p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
r += (tf + p); \
break; \
case 2: \
- p1 = y * (u0 + y * u1); \
- p2 = one + y * (v1 + y * v2); \
- r += (-(float) 0.5 * y + p1 / p2); \
+ p1 = y * mad(y, u1, u0); \
+ p2 = mad(y, mad(y, v2, v1), 1.0f); \
+ r -= mad(y, 0.5f, -p1 / p2); \
} \
} else if (ix < 0x41000000) { \
i = (int) x; \
t = zero; \
y = x - (float) i; \
- p = y * (s0 + y * (s1 + y * s2)); \
- q = one + y * (r1 + y * r2); \
- r = .5f * y + p / q; \
+ p = y * mad(y, mad(y, s2, s1), s0); \
+ q = mad(y, mad(y, r2, r1), 1.0f); \
+ r = mad(y, 0.5f, p / q); \
z = one; \
switch (i) { \
case 7: \
@@ -1099,10 +1014,10 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x); \
z = one / x; \
y = z * z; \
- w = w0 + z * (w1 + y * w2); \
- r = (x - .5f) * (t - one) + w; \
+ w = mad(z, mad(y, w2, w1), w0); \
+ r = mad(x - 0.5f, t - 1.0f, w); \
} else \
- r = x * (native_log(x) - one); \
+ r = mad(x, native_log(x), -1.0f); \
if (hx < 0) \
r = nadj - r; \
return r;
@@ -1184,19 +1099,22 @@ OVERLOADABLE float log1p(float x) {
}
hfsq=(float)0.5*f*f;
if(hu==0) { /* |f| < 2**-20 */
- if(f==zero) { if(k==0) return zero;
- else {c += k*ln2_lo; return k*ln2_hi+c;} }
- R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+ if(f==zero) {
+ if(k==0) return zero;
+ else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+ }
+ R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
if(k==0) return f-R; else
- return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+ return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
}
s = f/((float)2.0+f);
z = s*s;
- R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
- if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
-
+ R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+ if(k==0) return f - mad(hfsq + R, s, -hfsq);
+ else
+ return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
}
+
OVERLOADABLE float logb(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_logb(x);
@@ -1308,14 +1226,14 @@ OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
/* new cbrt to 23 bits */
r=t*t/x;
- s=C+r*t;
+ s=mad(r, t, C);
t*=G+F/(s+E+D/s);
/* one step newton iteration to 53 bits with error less than 0.667 ulps */
s=t*t; /* t*t is exact */
r=x/s;
w=t+t;
r=(r-t)/(w+r); /* r-s is exact */
- t=t+t*r;
+ t=mad(t, r, t);
/* retore the sign bit */
GEN_OCL_GET_FLOAT_WORD(high,t);
@@ -1367,10 +1285,22 @@ INLINE float __gen_ocl_asin_util(float x) {
qS4 = 7.70381505559019352791e-02;
float t = x*x;
- float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
- float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
+ float p = t * mad(t,
+ mad(t,
+ mad(t,
+ mad(t, pS4, pS3),
+ pS2),
+ pS1),
+ pS0);
+ float q = mad(t,
+ mad(t,
+ mad(t,
+ mad(t, qS4, qS3),
+ qS2),
+ qS1),
+ 1.0f);
float w = p / q;
- return x + x*w;
+ return mad(x, w, x);
}
OVERLOADABLE float __gen_ocl_internal_asin(float x) {
--
2.5.0
More information about the Beignet
mailing list