[Beignet] [PATCH 2/2] Backend: Optimization of internal sine and cosine
Grigore Lupescu
grigore.lupescu at intel.com
Thu Mar 17 17:10:25 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 41 +++++++++++++++++++-------------
1 file changed, 25 insertions(+), 16 deletions(-)
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 6460755..f8a6e3d 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -544,17 +544,18 @@ OVERLOADABLE float __kernel_sinf(float x)
{
/* copied from fdlibm */
const float
- half_value = 5.0000000000e-01,/* 0x3f000000 */
S1 = -1.6666667163e-01, /* 0xbe2aaaab */
S2 = 8.3333337680e-03, /* 0x3c088889 */
S3 = -1.9841270114e-04, /* 0xb9500d01 */
S4 = 2.7557314297e-06, /* 0x3638ef1b */
- S5 = -2.5050759689e-08, /* 0xb2d72f34 */
- S6 = 1.5896910177e-10; /* 0x2f2ec9d3 */
+ S5 = -2.5050759689e-08; /* 0xb2d72f34 */
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
+
+ /* for more accuracy use higher polynomial e.g. S2+z*(S3+z*(S4+z*(S5))) */
+ r = S2+z*(S3+z*(S4));
+
return x+v*(S1+z*r);
}
@@ -566,15 +567,16 @@ float __kernel_cosf(float x, float y)
C1 = 4.1666667908e-02, /* 0x3d2aaaab */
C2 = -1.3888889225e-03, /* 0xbab60b61 */
C3 = 2.4801587642e-05, /* 0x37d00d01 */
- C4 = -2.7557314297e-07, /* 0xb493f27c */
- C5 = 2.0875723372e-09, /* 0x310f74f6 */
- C6 = -1.1359647598e-11; /* 0xad47d74e */
+ C4 = -2.7557314297e-07; /* 0xb493f27c */
float a,hz,z,r,qx;
int ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+
+ /* for more accuracy use higher polynomial e.g. z*(C1+z*(C2+z*(C3+z*(C4)))) */
+ r = z*(C1+z*(C2+z*(C3)));
+
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
else {
@@ -585,24 +587,27 @@ float __kernel_cosf(float x, float y)
}
}
-OVERLOADABLE float sin(float x) {
+OVERLOADABLE float sin(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sin(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
float negative = x < 0.0f? -1.0f : 1.0f;
- x = negative * x;
+ x = fabs(x);
GEN_OCL_GET_FLOAT_WORD(ix,x);
-
ix &= 0x7fffffff;
/* sin(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return negative * __kernel_sinf(x);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
float s = __kernel_sinf(y);
@@ -612,10 +617,12 @@ OVERLOADABLE float sin(float x) {
}
}
-OVERLOADABLE float cos(float x) {
+OVERLOADABLE float cos(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cos(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
x = __gen_ocl_fabs(x);
@@ -624,9 +631,11 @@ OVERLOADABLE float cos(float x) {
ix &= 0x7fffffff;
/* cos(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return __kernel_cosf(x, 0.f);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
n &= 3;
--
2.5.0
More information about the Beignet
mailing list