[Beignet] [PATCH 2/2] Backend: Optimization of internal sine and cosine

Grigore Lupescu grigore.lupescu at intel.com
Thu Mar 17 17:10:25 UTC 2016


From: Grigore Lupescu <grigore.lupescu at intel.com>

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl | 41 +++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 6460755..f8a6e3d 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -544,17 +544,18 @@ OVERLOADABLE float __kernel_sinf(float x)
 {
   /* copied from fdlibm */
   const float
-  half_value =  5.0000000000e-01,/* 0x3f000000 */
   S1  = -1.6666667163e-01, /* 0xbe2aaaab */
   S2  =  8.3333337680e-03, /* 0x3c088889 */
   S3  = -1.9841270114e-04, /* 0xb9500d01 */
   S4  =  2.7557314297e-06, /* 0x3638ef1b */
-  S5  = -2.5050759689e-08, /* 0xb2d72f34 */
-  S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
+  S5  = -2.5050759689e-08; /* 0xb2d72f34 */
   float z,r,v;
   z =  x*x;
   v =  z*x;
-  r =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
+
+  /* for more accuracy use higher polynomial e.g. S2+z*(S3+z*(S4+z*(S5))) */
+  r =  S2+z*(S3+z*(S4));
+
   return x+v*(S1+z*r);
 }
 
@@ -566,15 +567,16 @@ float __kernel_cosf(float x, float y)
   C1  =  4.1666667908e-02, /* 0x3d2aaaab */
   C2  = -1.3888889225e-03, /* 0xbab60b61 */
   C3  =  2.4801587642e-05, /* 0x37d00d01 */
-  C4  = -2.7557314297e-07, /* 0xb493f27c */
-  C5  =  2.0875723372e-09, /* 0x310f74f6 */
-  C6  = -1.1359647598e-11; /* 0xad47d74e */
+  C4  = -2.7557314297e-07; /* 0xb493f27c */
   float a,hz,z,r,qx;
   int ix;
   GEN_OCL_GET_FLOAT_WORD(ix,x);
   ix &= 0x7fffffff;     /* ix = |x|'s high word*/
   z  = x*x;
-  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+
+  /* for more accuracy use higher polynomial e.g. z*(C1+z*(C2+z*(C3+z*(C4)))) */
+  r = z*(C1+z*(C2+z*(C3)));
+
   if(ix < 0x3e99999a)       /* if |x| < 0.3 */
       return one - ((float)0.5*z - (z*r - x*y));
   else {
@@ -585,24 +587,27 @@ float __kernel_cosf(float x, float y)
   }
 }
 
-OVERLOADABLE float sin(float x) {
+OVERLOADABLE float sin(float x)
+{
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_sin(x);
 
+  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
   float y,z=0.0;
   int n, ix;
 
   float negative = x < 0.0f? -1.0f : 1.0f;
-  x = negative * x;
+  x = fabs(x);
 
   GEN_OCL_GET_FLOAT_WORD(ix,x);
-
   ix &= 0x7fffffff;
 
     /* sin(Inf or NaN) is NaN */
-  if (ix>=0x7f800000) return x-x;
+  if (ix >= 0x7f800000) return x-x;
 
-    /* argument reduction needed */
+  if(x <= pio4)
+	  return negative * __kernel_sinf(x);
+  /* argument reduction needed */
   else {
       n = __ieee754_rem_pio2f(x,&y);
       float s = __kernel_sinf(y);
@@ -612,10 +617,12 @@ OVERLOADABLE float sin(float x) {
   }
 }
 
-OVERLOADABLE float cos(float x) {
+OVERLOADABLE float cos(float x)
+{
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_cos(x);
 
+  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
   float y,z=0.0;
   int n, ix;
   x = __gen_ocl_fabs(x);
@@ -624,9 +631,11 @@ OVERLOADABLE float cos(float x) {
   ix &= 0x7fffffff;
 
     /* cos(Inf or NaN) is NaN */
-  if (ix>=0x7f800000) return x-x;
+  if (ix >= 0x7f800000) return x-x;
 
-    /* argument reduction needed */
+  if(x <= pio4)
+	  return __kernel_cosf(x, 0.f);
+  /* argument reduction needed */
   else {
       n = __ieee754_rem_pio2f(x,&y);
       n &= 3;
-- 
2.5.0



More information about the Beignet mailing list