[Beignet] [PATCH 2/4] Backend: Optimization internal math, lower polynomials
Grigore Lupescu
grigore.lupescu at intel.com
Mon Jun 27 19:04:05 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Use lower-degree polynomials for the approximations while keeping the conformance tests passing.
LOG Use polynomial degree 4 (was 7)
LOG2 Use polynomial degree 4 (was 7)
SIN Use polynomial degree 4 (was 6)
COS Use polynomial degree 3 (was 6)
TANF Use polynomial degree 7 (was 12)
GAMMA Use polynomial degree 3 (was 12)
GAMMA_R Use polynomial degree 3 (was 12)
LOG1P Use polynomial degree 4 (was 7)
ASIN Use polynomial degree 4 (was 5)
ATAN Use polynomial degree 6 (was 10)
EXP Use polynomial degree 2 (was 5)
EXPM1 Use polynomial degree 3 (was 5)
POW Use polynomial degree 2 (was 6)
POWN Use polynomial degree 2 (was 6)
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/include/ocl_float.h | 1 +
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 245 +++++++++----------------------
2 files changed, 69 insertions(+), 177 deletions(-)
diff --git a/backend/src/libocl/include/ocl_float.h b/backend/src/libocl/include/ocl_float.h
index e63eaf9..6be6c7c 100644
--- a/backend/src/libocl/include/ocl_float.h
+++ b/backend/src/libocl/include/ocl_float.h
@@ -81,6 +81,7 @@ INLINE_OVERLOADABLE int __ocl_finitef (float x){
#define M_E_F 2.718281828459045F
#define M_LOG2E_F 1.4426950408889634F
#define M_LOG10E_F 0.43429448190325176F
+#define M_LOG210_F 3.3219280948873626F
#define M_LN2_F 0.6931471805599453F
#define M_LN10_F 2.302585092994046F
#define M_PI_F 3.141592653589793F
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 4cd5add..57b09f0 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -57,7 +57,7 @@ OVERLOADABLE float native_tan(float x) {
}
OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
-OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
OVERLOADABLE float native_divide(float x, float y) { return x/y; }
/* Fast path */
@@ -184,10 +184,7 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
Lg3 = 2.8571429849e-01, /* 3E924925 */
- Lg4 = 2.2222198546e-01, /* 3E638E29 */
- Lg5 = 1.8183572590e-01, /* 3E3A3325 */
- Lg6 = 1.5313838422e-01, /* 3E1CD04F */
- Lg7 = 1.4798198640e-01; /* 3E178897 */
+ Lg4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float hfsq,f,s,z,R,w,t1,t2,dk;
@@ -230,8 +227,8 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
i = ix-(0x6147a<<3);
w = z*z;
j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*(Lg4+w*Lg6));
- t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+ t1= w*(Lg2+w*Lg4);
+ t2= z*(Lg1+w*Lg3);
i |= j;
R = t2+t1;
if(i>0) {
@@ -257,6 +254,7 @@ OVERLOADABLE float __gen_ocl_internal_log10(float x) {
* is preserved.
* ====================================================
*/
+
union {float f; unsigned i; }u;
const float
zero = 0.0,
@@ -308,10 +306,7 @@ OVERLOADABLE float __gen_ocl_internal_log2(float x) {
Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
Lg3 = 2.8571429849e-01, /** 3E924925 */
- Lg4 = 2.2222198546e-01, /** 3E638E29 */
- Lg5 = 1.8183572590e-01, /** 3E3A3325 */
- Lg6 = 1.5313838422e-01, /** 3E1CD04F */
- Lg7 = 1.4798198640e-01; /** 3E178897 */
+ Lg4 = 2.2222198546e-01; /** 3E638E29 */
float hfsq,f,s,z,R,w,t1,t2,dk;
int k,ix,i,j;
@@ -353,8 +348,8 @@ OVERLOADABLE float __gen_ocl_internal_log2(float x) {
i = ix-(0x6147a<<3);
w = z*z;
j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*(Lg4+w*Lg6));
- t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+ t1= w*(Lg2+w*Lg4);
+ t2= z*(Lg1+w*Lg3);
i |= j;
R = t2+t1;
@@ -543,17 +538,15 @@ OVERLOADABLE float __kernel_sinf(float x)
{
/* copied from fdlibm */
const float
- half_value = 5.0000000000e-01,/* 0x3f000000 */
S1 = -1.6666667163e-01, /* 0xbe2aaaab */
S2 = 8.3333337680e-03, /* 0x3c088889 */
S3 = -1.9841270114e-04, /* 0xb9500d01 */
- S4 = 2.7557314297e-06, /* 0x3638ef1b */
- S5 = -2.5050759689e-08, /* 0xb2d72f34 */
- S6 = 1.5896910177e-10; /* 0x2f2ec9d3 */
+ S4 = 2.7557314297e-06; /* 0x3638ef1b */
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
+ r = S2+z*(S3+z*(S4));
+
return x+v*(S1+z*r);
}
@@ -564,16 +557,14 @@ float __kernel_cosf(float x, float y)
one = 1.0000000000e+00, /* 0x3f800000 */
C1 = 4.1666667908e-02, /* 0x3d2aaaab */
C2 = -1.3888889225e-03, /* 0xbab60b61 */
- C3 = 2.4801587642e-05, /* 0x37d00d01 */
- C4 = -2.7557314297e-07, /* 0xb493f27c */
- C5 = 2.0875723372e-09, /* 0x310f74f6 */
- C6 = -1.1359647598e-11; /* 0xad47d74e */
+ C3 = 2.4801587642e-05; /* 0x37d00d01 */
float a,hz,z,r,qx;
int ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+ r = z*(C1+z*(C2+z*(C3)));
+
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
else {
@@ -584,24 +575,27 @@ float __kernel_cosf(float x, float y)
}
}
-OVERLOADABLE float sin(float x) {
+OVERLOADABLE float sin(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sin(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
float negative = x < 0.0f? -1.0f : 1.0f;
- x = negative * x;
+ x = fabs(x);
GEN_OCL_GET_FLOAT_WORD(ix,x);
-
ix &= 0x7fffffff;
/* sin(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return negative * __kernel_sinf(x);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
float s = __kernel_sinf(y);
@@ -611,10 +605,12 @@ OVERLOADABLE float sin(float x) {
}
}
-OVERLOADABLE float cos(float x) {
+OVERLOADABLE float cos(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cos(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
x = __gen_ocl_fabs(x);
@@ -623,9 +619,11 @@ OVERLOADABLE float cos(float x) {
ix &= 0x7fffffff;
/* cos(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return __kernel_cosf(x, 0.f);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
n &= 3;
@@ -662,12 +660,6 @@ float __kernel_tanf(float x, float y, int iy)
T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
T[6] = 1.4562094584e-03; /* 0x3abede48 */
T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
- T[8] = 2.4646313977e-04; /* 0x398137b9 */
- T[9] = 7.8179444245e-05; /* 0x38a3f445 */
- T[10] = 7.1407252108e-05; /* 0x3895c07a */
- T[11] = -1.8558637748e-05; /* 0xb79bae5f */
- T[12] = 2.5907305826e-05; /* 0x37d95384 */
-
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff; /* high word of |x| */
@@ -691,8 +683,10 @@ float __kernel_tanf(float x, float y, int iy)
* x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
* x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
*/
- r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
- v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+
+ r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
+ v = z*(T[2]+w*(T[4]+w*T[6]));
+
s = z*x;
r = y + z*(s*(r+v)+y);
r += T[0]*s;
@@ -823,14 +817,6 @@ OVERLOADABLE float lgamma(float x) {
a1 = 3.2246702909e-01,
a2 = 6.7352302372e-02,
a3 = 2.0580807701e-02,
- a4 = 7.3855509982e-03,
- a5 = 2.8905137442e-03,
- a6 = 1.1927076848e-03,
- a7 = 5.1006977446e-04,
- a8 = 2.2086278477e-04,
- a9 = 1.0801156895e-04,
- a10 = 2.5214456400e-05,
- a11 = 4.4864096708e-05,
tc = 1.4616321325e+00,
tf = -1.2148628384e-01,
tt = 6.6971006518e-09,
@@ -840,46 +826,20 @@ OVERLOADABLE float lgamma(float x) {
t3 = -3.2788541168e-02,
t4 = 1.7970675603e-02,
t5 = -1.0314224288e-02,
- t6 = 6.1005386524e-03,
- t7 = -3.6845202558e-03,
- t8 = 2.2596477065e-03,
- t9 = -1.4034647029e-03,
- t10 = 8.8108185446e-04,
- t11 = -5.3859531181e-04,
- t12 = 3.1563205994e-04,
- t13 = -3.1275415677e-04,
- t14 = 3.3552918467e-04,
u0 = -7.7215664089e-02,
u1 = 6.3282704353e-01,
u2 = 1.4549225569e+00,
- u3 = 9.7771751881e-01,
- u4 = 2.2896373272e-01,
- u5 = 1.3381091878e-02,
v1 = 2.4559779167e+00,
v2 = 2.1284897327e+00,
v3 = 7.6928514242e-01,
- v4 = 1.0422264785e-01,
- v5 = 3.2170924824e-03,
s0 = -7.7215664089e-02,
s1 = 2.1498242021e-01,
s2 = 3.2577878237e-01,
- s3 = 1.4635047317e-01,
- s4 = 2.6642270386e-02,
- s5 = 1.8402845599e-03,
- s6 = 3.1947532989e-05,
r1 = 1.3920053244e+00,
r2 = 7.2193557024e-01,
- r3 = 1.7193385959e-01,
- r4 = 1.8645919859e-02,
- r5 = 7.7794247773e-04,
- r6 = 7.3266842264e-06,
w0 = 4.1893854737e-01,
w1 = 8.3333335817e-02,
- w2 = -2.7777778450e-03,
- w3 = 7.9365057172e-04,
- w4 = -5.9518753551e-04,
- w5 = 8.3633989561e-04,
- w6 = -1.6309292987e-03;
+ w2 = -2.7777778450e-03;
float t, y, z, nadj, p, p1, p2, p3, q, r, w;
int i, hx, ix;
nadj = 0;
@@ -937,31 +897,31 @@ OVERLOADABLE float lgamma(float x) {
switch (i) {
case 0:
z = y * y;
- p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
- p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+ p1 = a0 + z * a2;
+ p2 = z * (a1 + z * a3);
p = y * p1 + p2;
r += (p - (float) 0.5 * y);
break;
case 1:
z = y * y;
w = z * y;
- p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
- p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
- p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+ p1 = t0 + w * t3;
+ p2 = t1 + w * t4;
+ p3 = t2 + w * t5;
p = z * p1 - (tt - w * (p2 + y * p3));
r += (tf + p);
break;
case 2:
- p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
- p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ p1 = y * (u0 + y * u1);
+ p2 = one + y * (v1 + y * v2);
r += (-(float) 0.5 * y + p1 / p2);
}
} else if (ix < 0x41000000) {
i = (int) x;
t = zero;
y = x - (float) i;
- p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
- q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+ p = y * (s0 + y * (s1 + y * s2));
+ q = one + y * (r1 + y * r2);
r = .5f * y + p / q;
z = one;
switch (i) {
@@ -983,7 +943,7 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x);
z = one / x;
y = z * z;
- w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ w = w0 + z * (w1 + y * w2);
r = (x - .5f) * (t - one) + w;
} else
r = x * (native_log(x) - one);
@@ -1011,14 +971,6 @@ OVERLOADABLE float lgamma(float x) {
a1 = 3.2246702909e-01, \
a2 = 6.7352302372e-02, \
a3 = 2.0580807701e-02, \
- a4 = 7.3855509982e-03, \
- a5 = 2.8905137442e-03, \
- a6 = 1.1927076848e-03, \
- a7 = 5.1006977446e-04, \
- a8 = 2.2086278477e-04, \
- a9 = 1.0801156895e-04, \
- a10 = 2.5214456400e-05, \
- a11 = 4.4864096708e-05, \
tc = 1.4616321325e+00, \
tf = -1.2148628384e-01, \
tt = 6.6971006518e-09, \
@@ -1028,46 +980,18 @@ OVERLOADABLE float lgamma(float x) {
t3 = -3.2788541168e-02, \
t4 = 1.7970675603e-02, \
t5 = -1.0314224288e-02, \
- t6 = 6.1005386524e-03, \
- t7 = -3.6845202558e-03, \
- t8 = 2.2596477065e-03, \
- t9 = -1.4034647029e-03, \
- t10 = 8.8108185446e-04, \
- t11 = -5.3859531181e-04, \
- t12 = 3.1563205994e-04, \
- t13 = -3.1275415677e-04, \
- t14 = 3.3552918467e-04, \
u0 = -7.7215664089e-02, \
u1 = 6.3282704353e-01, \
- u2 = 1.4549225569e+00, \
- u3 = 9.7771751881e-01, \
- u4 = 2.2896373272e-01, \
- u5 = 1.3381091878e-02, \
v1 = 2.4559779167e+00, \
v2 = 2.1284897327e+00, \
- v3 = 7.6928514242e-01, \
- v4 = 1.0422264785e-01, \
- v5 = 3.2170924824e-03, \
s0 = -7.7215664089e-02, \
s1 = 2.1498242021e-01, \
s2 = 3.2577878237e-01, \
- s3 = 1.4635047317e-01, \
- s4 = 2.6642270386e-02, \
- s5 = 1.8402845599e-03, \
- s6 = 3.1947532989e-05, \
r1 = 1.3920053244e+00, \
r2 = 7.2193557024e-01, \
- r3 = 1.7193385959e-01, \
- r4 = 1.8645919859e-02, \
- r5 = 7.7794247773e-04, \
- r6 = 7.3266842264e-06, \
w0 = 4.1893854737e-01, \
w1 = 8.3333335817e-02, \
- w2 = -2.7777778450e-03, \
- w3 = 7.9365057172e-04, \
- w4 = -5.9518753551e-04, \
- w5 = 8.3633989561e-04, \
- w6 = -1.6309292987e-03; \
+ w2 = -2.7777778450e-03; \
float t, y, z, nadj, p, p1, p2, p3, q, r, w; \
int i, hx, ix; \
nadj = 0; \
@@ -1129,31 +1053,31 @@ OVERLOADABLE float lgamma(float x) {
switch (i) { \
case 0: \
z = y * y; \
- p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); \
- p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); \
+ p1 = a0 + z * a2; \
+ p2 = z * (a1 + z *a3); \
p = y * p1 + p2; \
r += (p - (float) 0.5 * y); \
break; \
case 1: \
z = y * y; \
w = z * y; \
- p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \
- p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \
- p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \
+ p1 = t0 + w * t3; \
+ p2 = t1 + w * t4; \
+ p3 = t2 + w * t5; \
p = z * p1 - (tt - w * (p2 + y * p3)); \
r += (tf + p); \
break; \
case 2: \
- p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); \
- p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); \
+ p1 = y * (u0 + y * u1); \
+ p2 = one + y * (v1 + y * v2); \
r += (-(float) 0.5 * y + p1 / p2); \
} \
} else if (ix < 0x41000000) { \
i = (int) x; \
t = zero; \
y = x - (float) i; \
- p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); \
- q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); \
+ p = y * (s0 + y * (s1 + y * s2)); \
+ q = one + y * (r1 + y * r2); \
r = .5f * y + p / q; \
z = one; \
switch (i) { \
@@ -1175,7 +1099,7 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x); \
z = one / x; \
y = z * z; \
- w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); \
+ w = w0 + z * (w1 + y * w2); \
r = (x - .5f) * (t - one) + w; \
} else \
r = x * (native_log(x) - one); \
@@ -1208,10 +1132,7 @@ OVERLOADABLE float log1p(float x) {
Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
Lp3 = 2.8571429849e-01, /* 3E924925 */
- Lp4 = 2.2222198546e-01, /* 3E638E29 */
- Lp5 = 1.8183572590e-01, /* 3E3A3325 */
- Lp6 = 1.5313838422e-01, /* 3E1CD04F */
- Lp7 = 1.4798198640e-01; /* 3E178897 */
+ Lp4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float hfsq,f,c,s,z,R,u;
int k,hx,hu,ax;
@@ -1271,7 +1192,7 @@ OVERLOADABLE float log1p(float x) {
}
s = f/((float)2.0+f);
z = s*s;
- R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+ R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
if(k==0) return f-(hfsq-s*(hfsq+R)); else
return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
@@ -1440,14 +1361,13 @@ INLINE float __gen_ocl_asin_util(float x) {
pS2 = 2.01212532134862925881e-01,
pS3 = -4.00555345006794114027e-02,
pS4 = 7.91534994289814532176e-04,
- pS5 = 3.47933107596021167570e-05,
qS1 = -2.40339491173441421878e+00,
qS2 = 2.02094576023350569471e+00,
qS3 = -6.88283971605453293030e-01,
qS4 = 7.70381505559019352791e-02;
float t = x*x;
- float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
+ float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
float w = p / q;
return x + x*w;
@@ -1512,10 +1432,6 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
- aT[7] = -5.8335702866e-02; /* 0xbd6ef16b */
- aT[8] = 4.9768779427e-02; /* 0x3d4bda59 */
- aT[9] = -3.6531571299e-02; /* 0xbd15a221 */
- aT[10] = 1.6285819933e-02; /* 0x3c8569d7 */
const float one = 1.0, huge = 1.0e30;
float w,s1,s2,z;
@@ -1552,8 +1468,8 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
- s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
- s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+ s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
+ s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
if (id<0) return x - x*(s1+s2);
else {
z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
@@ -1666,12 +1582,6 @@ OVERLOADABLE float __gen_ocl_internal_rint(float x) {
}
OVERLOADABLE float __gen_ocl_internal_exp(float x) {
- //use native instruction when it has enough precision
- if (x > -0x1.6p1 && x < 0x1.6p1)
- {
- return native_exp(x);
- }
-
float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
@@ -1679,10 +1589,7 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
one = 1.0,
huge = 1.0e+30,
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
- P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08; /* 0x3331bb4c */
+ P2 = -2.7777778450e-03; /* 0xbb360b61 */
float y,hi=0.0,lo=0.0,c,t;
int k=0,xsb;
unsigned hx;
@@ -1726,7 +1633,7 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
/* x is now in primary range */
t = x*x;
- c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ c = x - t*(P1+t*P2);
if(k==0)
return one-((x*c)/(c-(float)2.0)-x);
else
@@ -2107,9 +2014,6 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
- Q3 = -7.9365076090e-05, /* 0xb8a670cd */
- Q4 = 4.0082177293e-06, /* 0x36867e54 */
- Q5 = -2.0109921195e-07, /* 0xb457edbb */
huge = 1.0e30,
tiny = 1.0e-30,
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
@@ -2166,7 +2070,7 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
/* x is now in primary range */
hfx = (float)0.5*x;
hxs = x*hfx;
- r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ r1 = one+hxs*(Q1+hxs*Q2);
t = (float)3.0-r1*hfx;
e = hxs*((r1-t)/((float)6.0 - x*t));
if(k==0)
@@ -2749,15 +2653,8 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
- L3 = 3.3333334327e-01, /* 0x3eaaaaab */
- L4 = 2.7272811532e-01, /* 0x3e8ba305 */
- L5 = 2.3066075146e-01, /* 0x3e6c3255 */
- L6 = 2.0697501302e-01, /* 0x3e53f142 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08, /* 0x3331bb4c */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 6.93145752e-01, /* 0x3f317200 */
lg2_l = 1.42860654e-06, /* 0x35bfbe8c */
@@ -2885,7 +2782,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
/* compute log(ax) */
s2 = s*s;
- r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = 3.0f+s2+r;
@@ -2950,7 +2847,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
z = u+v;
w = v-(z-u);
t = z*z;
- t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
@@ -3063,15 +2960,8 @@ float __gen_ocl_internal_pown(float x, int y) {
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
- L3 = 3.3333334327e-01, /* 0x3eaaaaab */
- L4 = 2.7272811532e-01, /* 0x3e8ba305 */
- L5 = 2.3066075146e-01, /* 0x3e6c3255 */
- L6 = 2.0697501302e-01, /* 0x3e53f142 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08, /* 0x3331bb4c */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 0x1.62ep-1,
lg2_l = 0x1.0bfbe8p-15,
@@ -3171,7 +3061,7 @@ float __gen_ocl_internal_pown(float x, int y) {
/* compute log(ax) */
s2 = s*s;
- r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = (float)3.0+s2+r;
@@ -3243,7 +3133,7 @@ float __gen_ocl_internal_pown(float x, int y) {
z = u+v;
w = v-(z-u);
t = z*z;
- t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
@@ -3556,6 +3446,7 @@ OVERLOADABLE float exp(float x) {
}
OVERLOADABLE float exp2(float x) {
+ /* The native exp2 instruction always has enough precision, so always use the faster native path */
return native_exp2(x);
}
--
2.5.0
More information about the Beignet
mailing list