[Beignet] [PATCH] backend: refine pow function

Thu Jun 22 09:41:17 UTC 2017

Now save 40% time than before
	(1) group many branches which deal with corner case  to one branch.
        (2) using HW exp2 and log2 to replace some instructions

	pass conformance tests and utest

Signed-off-by: rander.wang <rander.wang at intel.com>
---
 backend/src/libocl/tmpl/ocl_math_common.tmpl.cl | 294 ++++++++++++------------
 1 file changed, 148 insertions(+), 146 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
index 2c0a702..6026629 100644
--- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
@@ -2352,7 +2352,8 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   float z,ax,z_h,z_l,p_h,p_l;
   float y1,t1,t2,r,s,sn,t,u,v,w;
   int i,j,k,yisint,n;
-  int hx,hy,ix,iy,is;
+  int hy,ix,iy,is;
+  unsigned int hx;
   float bp,dp_h,dp_l,
   zero    =  0.0,
   one	=  1.0,
@@ -2382,17 +2383,17 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   float retVal = 0.0f;
   bool bRet = false;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  hx = as_uint(x);
   GEN_OCL_GET_FLOAT_WORD(hy,y);
   ax   = __gen_ocl_fabs(x);
   ix = as_int(ax);  iy = as_int(fabs(y));
 
-  if(iy < 0x00800000 || hx==0x3f800000)
+  if(iy < 0x00800000)
   {
      bRet = true;
      retVal = one;
   }
-  else if (ix > 0x7f800000 || iy > 0x7f800000)
+  else if (iy > 0x7f800000)
   {
        bRet = true;
        retVal = NAN;
@@ -2403,120 +2404,152 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
      * yisint = 1	... y is an odd int
      * yisint = 2	... y is an even int
      */
-  yisint  = 0;
-  if(hx<0) {
-    k = (iy>>23)-0x7f;		 /* exponent */
-    j = iy>>(23-k);
-    yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint;
-    yisint = (iy>=0x4b800000) ? 2:yisint;
-  }
-
-    /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-    z = ax;			/*x is +-0,+-inf,+-1*/
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
 
-    z = (hy < 0)? one/z:z;
-    z = ((hx<0) && (((ix-0x3f800000)|yisint)==0))? NAN:z;
-    z = ((hx<0) && (yisint==1))? -z:z;
+  if(hx >= 0x7f800000)
+  {
+    yisint  = 0;
+    n = (hx>>31)-1;
 
-    retVal = (bRet)? retVal:z;
-    bRet = true;
-  }
+    if (!retVal && ix > 0x7f800000)
+    {
+      bRet = true;
+      retVal = NAN;
+    }
 
-  n = ((uint)hx>>31)-1;
+    if(hx >= 0x80000000) {
+      k = (iy>>23)-0x7f;		 /* exponent */
+      j = iy>>(23-k);
+      yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint;
+      yisint = (iy>=0x4b800000) ? 2:yisint;
+    }
 
-  /* (x<0)**(non-int) is NaN */
-  if(!bRet && (n|yisint)==0)
-  {
-     bRet= true;
-     retVal = NAN;
-  }
+      /* special value of x */
+    if(ix==0x7f800000||ix==0||ix==0x3f800000){
+      z = ax;     /*x is +-0,+-inf,+-1*/
+      z = (hy < 0)? one/z:z;
+      z = (((ix-0x3f800000)|yisint)==0)? NAN:z;
+      z = (yisint==1)? -z:z;
+      retVal = (bRet)? retVal:z;
+      bRet = true;
+    }
 
-  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+    /* (x<0)**(non-int) is NaN */
+    if(!bRet && (n|yisint)==0)
+    {
+       bRet= true;
+       retVal = NAN;
+    }
 
-  /* |y| is huge */
-  if(iy>0x4d000000)
-  { /* if |y| > 2**27 */
-	/* over/underflow if x is not close to one */
-	/* special value of y */
-	float b1 = (hy>=0)? y: zero;
-	float b2 = (hy<0)?-y: zero;
-	b1 = (ix > 0x3f800000)? b1:b2;
-	retVal = (iy==0x7f800000 && !bRet)? b1:retVal;
-	bRet = (iy==0x7f800000 && !bRet)? true: bRet;
+    if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+  }
 
+    /* special value of x */
+  if((ix&0x7fffff) == 0) {
+    if(hx == 0x3f800000)
+    {
+      retVal = one;
+      bRet = true;
+    }
 
-	b1 = (hy>0)? sn*huge*huge:0;
-	retVal = (ix>0x3f800007 && !bRet)? b1:retVal;
-	bRet = (ix>0x3f800007 && !bRet)? true:bRet;
+    if(ix==0x7f800000||ix==0) {
+      z = ax;			/*x is +0,+inf*/
+      z = (hy < 0)? one/z:z;
+      retVal = (bRet)? retVal:z;
+      bRet = true;
+    }
+  }
 
-	/* now |1-x| is tiny <= 2**-20, suffice to compute
-	      log(x) by x-x^2/2+x^3/3-x^4/4 */
-	t = ax-1;		/* t has 20 trailing zeros */
-	w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
-	u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
-	v = t*ivln2_l-w*ivln2;
-	t1 = u+v;
-	GEN_OCL_GET_FLOAT_WORD(is,t1);
-	GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-	t2 = v-(t1-u);
-  } 
-  else
+  /* |y| is not huge */
+  if(iy <= 0x4d000000)
   {
-	float s2,s_h,s_l,t_h,t_l;
-	n = 0;
-	n  += ((ix)>>23)-0x7f;
-	j  = ix&0x007fffff;
-	/* determine interval */
-	ix = j|0x3f800000; /* normalize ix */
-
-	n = (j >= 0x5db3d7)?n+1:n;
-	ix = (j >= 0x5db3d7)? ix - 0x00800000:ix;
-	k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1;
+    float s2,s_h,s_l,t_h,t_l;
+    n  = ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+    /* determine interval */
+    ix = j|0x3f800000; /* normalize ix */
 
-	GEN_OCL_SET_FLOAT_WORD(ax,ix);
+    if(x > 0x1.0p-6 && x < 0x1.2p7 && fabs(y) < 0x1.0p4)
+    {
+      GEN_OCL_SET_FLOAT_WORD(ax,ix);
+      t1 = n;
+      t2 = native_log2(ax);
+    }
+    else
+    {
+      n = (j >= 0x5db3d7)?n+1:n;
+      ix = (j >= 0x5db3d7)? ix - 0x00800000:ix;
+      k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1;
+
+      GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+      bp = k? 1.5:bp;
+      dp_h = k? 5.84960938e-01:dp_h;
+      dp_l = k? 1.56322085e-06:dp_l;
+
+      /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+      u = ax-bp;		/* bp[0]=1.0, bp[1]=1.5 */
+      v = one/(ax+bp);
+      s = u*v;
+      s_h = s;
+      GEN_OCL_GET_FLOAT_WORD(is,s_h);
+      GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+      /* t_h=ax+bp[k] High */
+      is = ((ix>>1)&0xfffff000)|0x20000000;
+      GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+      t_l = ax - (t_h-bp);
+      s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u)));
+
+      /* compute log(ax) */
+      s2 = s*s;
+      r = s2*s2*(mad(s2, L2, L1));
+      r = mad(s_l, (s_h+s), r);
+      t_h = mad(s_h, s_h, 3.0f)+r;
+      GEN_OCL_GET_FLOAT_WORD(is,t_h);
+      GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+      t_l = r-(mad(-s_h, s_h, (t_h-3.0f)));
+      /* u+v = s*(1+...) */
+      u = s_h*t_h;
+      v = mad(s_l, t_h, t_l*s);
+      /* 2/(3log2)*(s+...) */
+      p_h = mad(s_h, t_h, v);
+      GEN_OCL_GET_FLOAT_WORD(is,p_h);
+      GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+      p_l = v-(mad(-s_h, t_h, p_h));
+      z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l));
+      /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+      t = (float)n;
+      t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t);
+      GEN_OCL_GET_FLOAT_WORD(is,t1);
+      GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+      t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h));
+    }
+  }
+  else
+  { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    /* special value of y */
+    float b1 = (hy>=0)? y: zero;
+    float b2 = (hy<0)?-y: zero;
+    b1 = (ix > 0x3f800000)? b1:b2;
+    retVal = (iy==0x7f800000 && !bRet)? b1:retVal;
+    bRet = (iy==0x7f800000 && !bRet)? true: bRet;
 
-	bp = k? 1.5:bp;
-	dp_h = k? 5.84960938e-01:dp_h;
-	dp_l = k? 1.56322085e-06:dp_l;
 
-	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-	u = ax-bp;		/* bp[0]=1.0, bp[1]=1.5 */
-	v = one/(ax+bp);
-	s = u*v;
-	s_h = s;
-	GEN_OCL_GET_FLOAT_WORD(is,s_h);
-	GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-	/* t_h=ax+bp[k] High */
-	is = ((ix>>1)&0xfffff000)|0x20000000;
-	GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
-	t_l = ax - (t_h-bp);
-	s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u)));
+    b1 = (hy>0)? sn*huge*huge:0;
+    retVal = (ix>0x3f800007 && !bRet)? b1:retVal;
+    bRet = (ix>0x3f800007 && !bRet)? true:bRet;
 
-	/* compute log(ax) */
-	s2 = s*s;
-	r = s2*s2*(mad(s2, L2, L1));
-	r = mad(s_l, (s_h+s), r);
-	t_h = mad(s_h, s_h, 3.0f)+r;
-	GEN_OCL_GET_FLOAT_WORD(is,t_h);
-	GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
-	t_l = r-(mad(-s_h, s_h, (t_h-3.0f)));
-	/* u+v = s*(1+...) */
-	u = s_h*t_h;
-	v = mad(s_l, t_h, t_l*s);
-	/* 2/(3log2)*(s+...) */
-	p_h = mad(s_h, t_h, v);
-	GEN_OCL_GET_FLOAT_WORD(is,p_h);
-	GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
-	p_l = v-(mad(-s_h, t_h, p_h));
-	z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l));
-	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-	t = (float)n;
-	t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t);
-	GEN_OCL_GET_FLOAT_WORD(is,t1);
-	GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
-	t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h));
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+          log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = v-(t1-u);
   }
 
   /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
@@ -2524,52 +2557,21 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
   p_l = mad((y-y1), t1, y*t2);
   p_h = y1*t1;
-  z = mad(y1, t1, p_l);
 
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if((j&0x7fffffff) >= 0x43000000 && !bRet)
+  z = mad(y1, t1, p_l);
+  if(fabs(z) >= 128.0f)
   {
-     retVal = ((j&0x7fffffff)>0x43160000)? 0.0:retVal;
-     retVal = (j > 0x43000000) ? sn*huge*huge:retVal;
-     retVal = (j == 0x43000000 && p_l+ovt>z-p_h)? sn*huge*huge:retVal;
-     retVal = (j == 0xc3160000 && p_l<=z-p_h)? 0.0:retVal;
-     bRet = (((j&0x7fffffff)>0x43160000) || (j == 0xc3160000 && p_l<=z-p_h) || retVal == sn*huge*huge)? true:false;
-  }
-
-  /*
-    * compute 2**(p_h+p_l)
-    */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
-    n = j+(0x00800000>>(k+1));
-    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
-    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-    n = ((n&0x007fffff)|0x00800000)>>(23-k);
-    n = (j<0)?-n:n;
-    p_h = mad(y1, t1, -t);
+    if(!bRet)
+    {
+       retVal = (fabs(z) > 150.0f)? 0.0:retVal;
+       retVal = (z > 128.0f) ? sn*INFINITY:retVal;
+       retVal = (z == 128.0f && p_l+ovt>z-p_h)? sn*INFINITY:retVal;
+       retVal = (z == -150.0f && p_l<=z-p_h)? 0.0:retVal;
+       bRet = ((fabs(z) > 150.0f) || (z == -150.0f && p_l<=z-p_h) || retVal == sn*INFINITY)? true:false;
+    }
   }
 
-  t = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
-
-  v = mad((p_l-(t-p_h)), lg2, t*lg2_l);
-  z = mad(t, lg2_h, v);
-  w = v-mad(-t, lg2_h, z);
-  t  = z*z;
-  t1  = mad(-t, (mad(t, P2, P1)), z);
-  r  = (z*t1)/(t1-two)-(mad(z, w, w));
-  z  = one-(r-z);
-
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  j += (n<<23);
-
-  if((j>>23)<=0)
-	z = 0;//__gen_ocl_scalbnf(z,n);	/* subnormal output */
-  else
-	GEN_OCL_SET_FLOAT_WORD(z,j);
+  z = exp2(p_l)*exp2(p_h);
 
   return bRet ? retVal:sn*z;
 }
-- 
2.7.4