[Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad

Mon Jul 18 20:38:22 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

Affected functions:
__gen_ocl_internal_log
__gen_ocl_internal_log10
__gen_ocl_internal_log2
__kernel_sinf
__kernel_cosf
__gen_ocl_internal_cbrt
__gen_ocl_asin_util
tan
log1p

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl | 385 +++++++++++++------------------
 1 file changed, 159 insertions(+), 226 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 9ea0817..149ba01 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -164,7 +164,7 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
   return ux.f;
 }
 
-OVERLOADABLE float __gen_ocl_internal_log(float x) {
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
 /*
  *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
  * ====================================================
@@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
  */
   union { unsigned int i; float f; } u;
   const float
-  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =    3.355443200e+07, /* 0x4c000000 */
+  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =  3.355443200e+07, /* 0x4c000000 */
   Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
   Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
   Lg3 = 2.8571429849e-01, /* 3E924925 */
   Lg4 = 2.2222198546e-01; /* 3E638E29 */
 
   const float zero   =  0.0;
-  float hfsq,f,s,z,R,w,t1,t2,dk;
-  int k,ix,i,j;
+  float fsq, f, s, z, R, w, t1, t2, partial;
+  int k, ix, i, j;
 
   u.f = x;  ix = u.i;
-  k=0;
-  if (ix < 0x00800000) {      /* x < 2**-126  */
-      if ((ix&0x7fffffff)==0)
-    return -two25/zero;   /* log(+-0)=-inf */
-      if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
-      return -INFINITY;  /* Gen does not support subnormal number now */
-      //k -= 25; x *= two25; /* subnormal number, scale up x */
-      //u.f = x;  ix = u.i;
-  }
-  if (ix >= 0x7f800000) return x+x;
-  k += (ix>>23)-127;
+  k = 0;
+
+  k += (ix>>23) - 127;
   ix &= 0x007fffff;
-  i = (ix+(0x95f64<<3))&0x800000;
-  u.i = ix|(i^0x3f800000); x = u.f;
+  i = (ix + (0x95f64<<3)) & 0x800000;
+  u.i = ix | (i^0x3f800000); x = u.f;
   k += (i>>23);
-  f = x-(float)1.0;
-  if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
-      if(f==zero) {
-        if(k==0) return zero;
-        else {
-          dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
-        }
-      }
-      R = f*f*((float)0.5-(float)0.33333333333333333*f);
-      if(k==0)
-        return f-R;
-      else {
-        dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
-      }
+  f = x - 1.0f;
+  fsq = f * f;
+
+  if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+      R = fsq * (0.5f - 0.33333333333333333f * f);
+      return k * ln2_hi + k * ln2_lo + f - R;
   }
-  s = f/((float)2.0+f);
-  dk = (float)k;
-  z = s*s;
-  i = ix-(0x6147a<<3);
-  w = z*z;
-  j = (0x6b851<<3)-ix;
-  t1= w*(Lg2+w*Lg4);
-  t2= z*(Lg1+w*Lg3);
+
+  s = f / (2.0f + f);
+  z = s * s;
+  i = ix - (0x6147a << 3);
+  w = z * z;
+  j = (0x6b851 << 3) - ix;
+  t1= w * mad(w, Lg4, Lg2);
+  t2= z * mad(w, Lg3, Lg1);
   i |= j;
-  R = t2+t1;
-  if(i>0) {
-      hfsq=(float)0.5*f*f;
-      if(k==0) return f-(hfsq-s*(hfsq+R)); else
-         return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
-  } else {
-      if(k==0) return f-s*(f-R); else
-         return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
-  }
+  R = t2 + t1;
+  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
 }
 
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+  union { unsigned int i; float f; } u;
+  u.f = x;
+  int ix = u.i;
 
-OVERLOADABLE float __gen_ocl_internal_log10(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+  if (ix < 0 )
+	return NAN;  /* log(-#) = NaN */
+  if (ix >= 0x7f800000)
+    return NAN;
 
-  union {float f; unsigned i; }u;
+  return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+  union { float f; unsigned i; } u;
   const float
-  zero       = 0.0,
-  two25      =  3.3554432000e+07, /* 0x4c000000 */
   ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
   log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
   log10_2lo  =  7.9034151668e-07; /* 0x355427db */
 
-  float y,z;
-  int i,k,hx;
+  float y, z;
+  int i, k, hx;
 
   u.f = x; hx = u.i;
-  k=0;
-  if (hx < 0x00800000) {                  /* x < 2**-126  */
-    if ((hx&0x7fffffff)==0)
-      return -two25/zero;             /* log(+-0)=-inf */
-    if (hx<0) return NAN;        /* log(-#) = NaN */
-    return -INFINITY;      /* Gen does not support subnormal now */
-    //k -= 25; x *= two25; /* subnormal number, scale up x */
-    //u.f = x; hx = u.i;
-  }
-  if (hx >= 0x7f800000) return x+x;
-  k += (hx>>23)-127;
-  i  = ((unsigned)k&0x80000000)>>31;
-  hx = (hx&0x007fffff)|((0x7f-i)<<23);
-  y  = (float)(k+i);
+
+  if (hx<0)
+    return NAN; /* log(-#) = NaN */
+  if (hx >= 0x7f800000)
+    return NAN;
+
+  k = (hx >> 23) - 127;
+  i  = ((unsigned)k & 0x80000000) >> 31;
+  hx = (hx&0x007fffff) | ((0x7f-i) << 23);
+  y  = (float)(k + i);
   u.i = hx; x = u.f;
-  z  = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
-  return  z+y*log10_2hi;
+
+  return  y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
 }
 
 
-OVERLOADABLE float __gen_ocl_internal_log2(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- *  adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
   const float zero   =  0.0,
-  ln2 = 0.69314718055994530942,
-  two25 =    3.355443200e+07, /** 0x4c000000 */
-  Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
-  Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
-  Lg3 = 2.8571429849e-01, /** 3E924925 */
-  Lg4 = 2.2222198546e-01; /** 3E638E29 */
-
-  float hfsq,f,s,z,R,w,t1,t2,dk;
-  int k,ix,i,j;
+  invln2 = 0x1.715476p+0f;
+  int ix;
 
-  union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
+  union { float f; int i; } u;
   u.f = x; ix = u.i;
 
-  k=0;
-  if (ix < 0x00800000) {           /** x < 2**-126  */
-      if ((ix&0x7fffffff)==0)
-      return -two25/(x-x);        /** log(+-0)=-inf */
-
-      if (ix<0) return (x-x)/(x-x);    /** log(-#) = NaN */
-      return -INFINITY;
-      k -= 25; x *= two25; /** subnormal number, scale up x */
-      u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
-  }
-
-  if (ix >= 0x7f800000) return x+x;
-
-  k += (ix>>23)-127;
-  ix &= 0x007fffff;
-  i = (ix+(0x95f64<<3))&0x800000;
-
-  u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000));    /** normalize x or x/2 */
-  k += (i>>23);
-  dk = (float)k;
-  f = x-(float)1.0;
-
-  if((0x007fffff&(15+ix))<16) {    /** |f| < 2**-20 */
-      if(f==zero) return dk;
+  if (ix < 0)
+	return NAN;    /** log(-#) = NaN */
+  if (ix >= 0x7f800000)
+	return NAN;
 
-      R = f*f*((float)0.5-(float)0.33333333333333333*f);
-      return dk-(R-f)/ln2;
-  }
-
-  s = f/((float)2.0+f);
-  z = s*s;
-  i = ix-(0x6147a<<3);
-  w = z*z;
-  j = (0x6b851<<3)-ix;
-  t1= w*(Lg2+w*Lg4);
-  t2= z*(Lg1+w*Lg3);
-  i |= j;
-  R = t2+t1;
-
-  if(i>0) {
-      hfsq=(float)0.5*f*f;
-      return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
-  } else {
-      return dk-((s*(f-R))-f)/ln2;
-  }
+  return invln2 * __gen_ocl_internal_log_valid(x);
 }
 
 
@@ -545,9 +463,13 @@ OVERLOADABLE float __kernel_sinf(float x)
   float z,r,v;
   z =  x*x;
   v =  z*x;
-  r =  S2+z*(S3+z*(S4));
+  r = mad(z,
+		  mad(z,
+			  mad(z, S4, S3),
+		  S2),
+	  S1);
 
-  return x+v*(S1+z*r);
+  return mad(v, r, x);
 }
 
 float __kernel_cosf(float x, float y)
@@ -563,7 +485,9 @@ float __kernel_cosf(float x, float y)
   GEN_OCL_GET_FLOAT_WORD(ix,x);
   ix &= 0x7fffffff;     /* ix = |x|'s high word*/
   z  = x*x;
-  r = z*(C1+z*(C2+z*(C3)));
+  r = z * mad(z,
+		  mad(z, C3, C2),
+	  C1);
 
   if(ix < 0x3e99999a)       /* if |x| < 0.3 */
       return one - ((float)0.5*z - (z*r - x*y));
@@ -671,24 +595,27 @@ float __kernel_tanf(float x, float y, int iy)
             }
         if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
             if(hx<0) {x = -x; y = -y;}
-
-
             z = pio4-x;
             w = pio4lo-y;
             x = z+w; y = 0.0;
         }
         z       =  x*x;
         w       =  z*z;
-    /* Break x^5*(T[1]+x^2*T[2]+...) into
-     *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
-     *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
-     */
-
-        r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
-        v = z*(T[2]+w*(T[4]+w*T[6]));
+		/* Break x^5*(T[1]+x^2*T[2]+...) into
+		 *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+		 *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+		 */
+
+        r = mad(w,
+        		mad(w,
+        				mad(w, T[7],
+					T[5]),
+				T[3]),
+			T[1]);
+        v = z* mad(w, mad(w, T[6], T[4]), T[2]);
 
         s = z*x;
-        r = y + z*(s*(r+v)+y);
+        r = mad(z, mad(s, r + v, y), y);
         r += T[0]*s;
         w = x+r;
         if(ix>=0x3f2ca140) {
@@ -696,21 +623,8 @@ float __kernel_tanf(float x, float y, int iy)
             return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
         }
         if(iy==1) return w;
-        else {          /* if allow error up to 2 ulp
-                           simply return -1.0/(x+r) here */
-     /*  compute -1.0/(x+r) accurately */
-            float a,t;
-            int i;
-            z  = w;
-            GEN_OCL_GET_FLOAT_WORD(i,z);
-            GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
-            v  = r-(z - x);     /* z+v = r+x */
-            t = a  = -(float)1.0/w;     /* a = -1.0/w */
-            GEN_OCL_GET_FLOAT_WORD(i,t);
-            GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
-            s  = (float)1.0+t*z;
-            return t+a*(s+t*v);
-        }
+        else
+        	return -1.0/(x+r);
 }
 
 OVERLOADABLE float tan(float x)
@@ -897,44 +811,45 @@ OVERLOADABLE float lgamma(float x) {
 		switch (i) {
 		case 0:
 			z = y * y;
-			p1 = a0 + z * a2;
-			p2 = z * (a1 + z * a3);
-			p = y * p1 + p2;
+			p1 = mad(z, a2, a0);
+			p2 = z * mad(z, a3, a1);
+			p = mad(y, p1, p2);
 			r += (p - (float) 0.5 * y);
 			break;
 		case 1:
 			z = y * y;
 			w = z * y;
-			p1 = t0 + w * t3;
-			p2 = t1 + w * t4;
-			p3 = t2 + w * t5;
-			p = z * p1 - (tt - w * (p2 + y * p3));
+			p1 = mad(w, t3, t0);
+			p2 = mad(w, t4, t1);
+			p3 = mad(w, t5, t2);
+			p = z * p1 - (tt - w * mad(y, p3, p2));
 			r += (tf + p);
 			break;
 		case 2:
-			p1 = y * (u0 + y * u1);
-			p2 = one + y * (v1 + y * v2);
+			p1 = y * mad(y, u1, u0);
+			p2 = one + y * mad(y, v2, v1);
 			r += (-(float) 0.5 * y + p1 / p2);
 		}
 	} else if (ix < 0x41000000) {
 		i = (int) x;
 		t = zero;
 		y = x - (float) i;
-		p = y * (s0 + y * (s1 + y * s2));
-		q = one + y * (r1 + y * r2);
+		p = y * mad(y, mad(y, s2, s1), s0);
+		q = mad(y, mad(y, r2, r1), 1.0f);
 		r = .5f * y + p / q;
 		z = one;
+
 		switch (i) {
 		case 7:
-			z *= (y + (float) 6.0);
+			z *= (y + 6.0f);
 		case 6:
-			z *= (y + (float) 5.0);
+			z *= (y + 5.0f);
 		case 5:
-			z *= (y + (float) 4.0);
+			z *= (y + 4.0f);
 		case 4:
-			z *= (y + (float) 3.0);
+			z *= (y + 3.0f);
 		case 3:
-			z *= (y + (float) 2.0);
+			z *= (y + 2.0f);
 			r += native_log(z);
 			break;
 		}
@@ -943,7 +858,7 @@ OVERLOADABLE float lgamma(float x) {
 		t = native_log(x);
 		z = one / x;
 		y = z * z;
-		w = w0 + z * (w1 + y * w2);
+		w = mad(z, mad(y, w2, 21), w0);
 		r = (x - .5f) * (t - one) + w;
 	} else
 		r = x * (native_log(x) - one);
@@ -1053,32 +968,32 @@ OVERLOADABLE float lgamma(float x) {
 		switch (i) {  \
 		case 0:  \
 			z = y * y;  \
-			p1 = a0 + z * a2;  \
-			p2 = z * (a1 + z *a3);  \
-			p = y * p1 + p2;  \
-			r += (p - (float) 0.5 * y);  \
+			p1 = mad(z, a2, a0);  \
+			p2 = z * mad(z, a3, a1);  \
+			p = mad(y, p1, p2); \
+			r -= mad(y, 0.5f, -p);  \
 			break;  \
 		case 1:  \
 			z = y * y;  \
 			w = z * y;  \
-			p1 = t0 + w * t3;  \
-			p2 = t1 + w * t4;  \
-			p3 = t2 + w * t5;  \
-			p = z * p1 - (tt - w * (p2 + y * p3));  \
+			p1 = mad(w, t3, t0); \
+			p2 = mad(w, t4, t1); \
+			p3 = mad(w, t5, t2); \
+			p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
 			r += (tf + p);  \
 			break;  \
 		case 2:  \
-			p1 = y * (u0 + y * u1);  \
-			p2 = one + y * (v1 + y * v2);  \
-			r += (-(float) 0.5 * y + p1 / p2);  \
+			p1 = y * mad(y, u1, u0); \
+			p2 = mad(y, mad(y, v2, v1), 1.0f);  \
+			r -= mad(y, 0.5f, -p1 / p2);  \
 		}  \
 	} else if (ix < 0x41000000) {  \
 		i = (int) x;  \
 		t = zero;  \
 		y = x - (float) i;  \
-		p = y * (s0 + y * (s1 + y * s2));  \
-		q = one + y * (r1 + y * r2);  \
-		r = .5f * y + p / q;  \
+		p = y * mad(y, mad(y, s2, s1), s0); \
+		q = mad(y, mad(y, r2, r1), 1.0f);  \
+		r = mad(y, 0.5f, p / q); \
 		z = one;  \
 		switch (i) {  \
 		case 7:  \
@@ -1099,10 +1014,10 @@ OVERLOADABLE float lgamma(float x) {
 		t = native_log(x);  \
 		z = one / x;  \
 		y = z * z;  \
-		w = w0 + z * (w1 + y * w2);  \
-		r = (x - .5f) * (t - one) + w;  \
+		w = mad(z, mad(y, w2, w1), w0); \
+		r = mad(x - 0.5f, t - 1.0f, w); \
 	} else  \
-		r = x * (native_log(x) - one);  \
+		r = mad(x, native_log(x), -1.0f); \
 	if (hx < 0)  \
 		r = nadj - r;  \
 	return r;
@@ -1183,20 +1098,26 @@ OVERLOADABLE float log1p(float x) {
       f = u-(float)1.0;
   }
   hfsq=(float)0.5*f*f;
-  if(hu==0) { /* |f| < 2**-20 */
-      if(f==zero) { if(k==0) return zero;
-      else {c += k*ln2_lo; return k*ln2_hi+c;} }
-      R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+  if(hu==0)
+  { /* |f| < 2**-20 */
+      if(f==zero)
+      {
+    	  if(k==0) return zero;
+    	  else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+      }
+      R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
       if(k==0) return f-R; else
-             return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+    	  return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
   }
   s = f/((float)2.0+f);
   z = s*s;
-  R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
-  if(k==0) return f-(hfsq-s*(hfsq+R)); else
-     return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
-
+  R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+  if(k==0)
+	  return f + mad(hfsq + R, s, -hfsq);
+  else
+	  return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
 }
+
 OVERLOADABLE float logb(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_logb(x);
@@ -1308,14 +1229,14 @@ OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
 
     /* new cbrt to 23 bits */
   r=t*t/x;
-  s=C+r*t;
+  s=mad(r, t, C);
   t*=G+F/(s+E+D/s);
     /* one step newton iteration to 53 bits with error less than 0.667 ulps */
   s=t*t;    /* t*t is exact */
   r=x/s;
   w=t+t;
   r=(r-t)/(w+r);  /* r-s is exact */
-  t=t+t*r;
+  t=mad(t, r, t);
 
     /* retore the sign bit */
   GEN_OCL_GET_FLOAT_WORD(high,t);
@@ -1367,10 +1288,22 @@ INLINE float __gen_ocl_asin_util(float x) {
   qS4 =  7.70381505559019352791e-02;
 
   float t = x*x;
-  float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
-  float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
+  float p = t * mad(t,
+		  	  	  mad(t,
+					  mad(t,
+						  mad(t, pS4, pS3),
+					  pS2),
+				  pS1),
+			  pS0);
+  float q = mad(t,
+		  	  mad(t,
+				  mad(t,
+					  mad(t, qS4, qS3),
+				  qS2),
+			  qS1),
+		  1.0f);
   float w = p / q;
-  return x + x*w;
+  return mad(x, w, x);
 }
 
 OVERLOADABLE float __gen_ocl_internal_asin(float x) {
-- 
2.5.0