[Beignet] [PATCH 2/4] Backend: Optimization internal math, lower polynomials
grigore.lupescu at intel.com
grigore.lupescu at intel.com
Tue Jul 26 13:24:40 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Use lower grade polynomials for approximations, keep conformance passing.
LOG Use polynomial grade 4 (was 7)
LOG2 Use polynomial grade 4 (was 7)
SIN Use polynomial grade 4 (was 6)
COS Use polynomial grade 3 (was 6)
TANF Use polynomial grade 7 (was 12)
LOG1P Use polynomial grade 4 (was 7)
ASIN Use polynomial grade 4 (was 5)
ATAN Use polynomial grade 6 (was 10)
EXP Use polynomial grade 2 (was 5)
EXPM1 Use polynomial grade 3 (was 5)
POW Use polynomial grade 2 (was 6)
POWN Use polynomial grade 2 (was 6)
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/libocl/include/ocl_float.h | 1 +
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 131 +++++++++++--------------------
2 files changed, 47 insertions(+), 85 deletions(-)
diff --git a/backend/src/libocl/include/ocl_float.h b/backend/src/libocl/include/ocl_float.h
index e63eaf9..6be6c7c 100644
--- a/backend/src/libocl/include/ocl_float.h
+++ b/backend/src/libocl/include/ocl_float.h
@@ -81,6 +81,7 @@ INLINE_OVERLOADABLE int __ocl_finitef (float x){
#define M_E_F 2.718281828459045F
#define M_LOG2E_F 1.4426950408889634F
#define M_LOG10E_F 0.43429448190325176F
+#define M_LOG210_F 3.3219280948873626F
#define M_LN2_F 0.6931471805599453F
#define M_LN10_F 2.302585092994046F
#define M_PI_F 3.141592653589793F
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 4cd5add..55a4fed 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -57,7 +57,7 @@ OVERLOADABLE float native_tan(float x) {
}
OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
-OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
OVERLOADABLE float native_divide(float x, float y) { return x/y; }
/* Fast path */
@@ -184,10 +184,7 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
Lg3 = 2.8571429849e-01, /* 3E924925 */
- Lg4 = 2.2222198546e-01, /* 3E638E29 */
- Lg5 = 1.8183572590e-01, /* 3E3A3325 */
- Lg6 = 1.5313838422e-01, /* 3E1CD04F */
- Lg7 = 1.4798198640e-01; /* 3E178897 */
+ Lg4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float hfsq,f,s,z,R,w,t1,t2,dk;
@@ -230,8 +227,8 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
i = ix-(0x6147a<<3);
w = z*z;
j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*(Lg4+w*Lg6));
- t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+ t1= w*(Lg2+w*Lg4);
+ t2= z*(Lg1+w*Lg3);
i |= j;
R = t2+t1;
if(i>0) {
@@ -257,6 +254,7 @@ OVERLOADABLE float __gen_ocl_internal_log10(float x) {
* is preserved.
* ====================================================
*/
+
union {float f; unsigned i; }u;
const float
zero = 0.0,
@@ -308,10 +306,7 @@ OVERLOADABLE float __gen_ocl_internal_log2(float x) {
Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
Lg3 = 2.8571429849e-01, /** 3E924925 */
- Lg4 = 2.2222198546e-01, /** 3E638E29 */
- Lg5 = 1.8183572590e-01, /** 3E3A3325 */
- Lg6 = 1.5313838422e-01, /** 3E1CD04F */
- Lg7 = 1.4798198640e-01; /** 3E178897 */
+ Lg4 = 2.2222198546e-01; /** 3E638E29 */
float hfsq,f,s,z,R,w,t1,t2,dk;
int k,ix,i,j;
@@ -353,8 +348,8 @@ OVERLOADABLE float __gen_ocl_internal_log2(float x) {
i = ix-(0x6147a<<3);
w = z*z;
j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*(Lg4+w*Lg6));
- t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+ t1= w*(Lg2+w*Lg4);
+ t2= z*(Lg1+w*Lg3);
i |= j;
R = t2+t1;
@@ -543,17 +538,15 @@ OVERLOADABLE float __kernel_sinf(float x)
{
/* copied from fdlibm */
const float
- half_value = 5.0000000000e-01,/* 0x3f000000 */
S1 = -1.6666667163e-01, /* 0xbe2aaaab */
S2 = 8.3333337680e-03, /* 0x3c088889 */
S3 = -1.9841270114e-04, /* 0xb9500d01 */
- S4 = 2.7557314297e-06, /* 0x3638ef1b */
- S5 = -2.5050759689e-08, /* 0xb2d72f34 */
- S6 = 1.5896910177e-10; /* 0x2f2ec9d3 */
+ S4 = 2.7557314297e-06; /* 0x3638ef1b */
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
+ r = S2+z*(S3+z*(S4));
+
return x+v*(S1+z*r);
}
@@ -564,16 +557,14 @@ float __kernel_cosf(float x, float y)
one = 1.0000000000e+00, /* 0x3f800000 */
C1 = 4.1666667908e-02, /* 0x3d2aaaab */
C2 = -1.3888889225e-03, /* 0xbab60b61 */
- C3 = 2.4801587642e-05, /* 0x37d00d01 */
- C4 = -2.7557314297e-07, /* 0xb493f27c */
- C5 = 2.0875723372e-09, /* 0x310f74f6 */
- C6 = -1.1359647598e-11; /* 0xad47d74e */
+ C3 = 2.4801587642e-05; /* 0x37d00d01 */
float a,hz,z,r,qx;
int ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+ r = z*(C1+z*(C2+z*(C3)));
+
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
else {
@@ -584,24 +575,27 @@ float __kernel_cosf(float x, float y)
}
}
-OVERLOADABLE float sin(float x) {
+OVERLOADABLE float sin(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sin(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
float negative = x < 0.0f? -1.0f : 1.0f;
- x = negative * x;
+ x = fabs(x);
GEN_OCL_GET_FLOAT_WORD(ix,x);
-
ix &= 0x7fffffff;
/* sin(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return negative * __kernel_sinf(x);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
float s = __kernel_sinf(y);
@@ -611,10 +605,12 @@ OVERLOADABLE float sin(float x) {
}
}
-OVERLOADABLE float cos(float x) {
+OVERLOADABLE float cos(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cos(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
x = __gen_ocl_fabs(x);
@@ -623,9 +619,11 @@ OVERLOADABLE float cos(float x) {
ix &= 0x7fffffff;
/* cos(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return __kernel_cosf(x, 0.f);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
n &= 3;
@@ -662,12 +660,6 @@ float __kernel_tanf(float x, float y, int iy)
T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
T[6] = 1.4562094584e-03; /* 0x3abede48 */
T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
- T[8] = 2.4646313977e-04; /* 0x398137b9 */
- T[9] = 7.8179444245e-05; /* 0x38a3f445 */
- T[10] = 7.1407252108e-05; /* 0x3895c07a */
- T[11] = -1.8558637748e-05; /* 0xb79bae5f */
- T[12] = 2.5907305826e-05; /* 0x37d95384 */
-
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff; /* high word of |x| */
@@ -691,8 +683,10 @@ float __kernel_tanf(float x, float y, int iy)
* x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
* x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
*/
- r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
- v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+
+ r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
+ v = z*(T[2]+w*(T[4]+w*T[6]));
+
s = z*x;
r = y + z*(s*(r+v)+y);
r += T[0]*s;
@@ -1208,10 +1202,7 @@ OVERLOADABLE float log1p(float x) {
Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
Lp3 = 2.8571429849e-01, /* 3E924925 */
- Lp4 = 2.2222198546e-01, /* 3E638E29 */
- Lp5 = 1.8183572590e-01, /* 3E3A3325 */
- Lp6 = 1.5313838422e-01, /* 3E1CD04F */
- Lp7 = 1.4798198640e-01; /* 3E178897 */
+ Lp4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float hfsq,f,c,s,z,R,u;
int k,hx,hu,ax;
@@ -1271,7 +1262,7 @@ OVERLOADABLE float log1p(float x) {
}
s = f/((float)2.0+f);
z = s*s;
- R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+ R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
if(k==0) return f-(hfsq-s*(hfsq+R)); else
return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
@@ -1440,14 +1431,13 @@ INLINE float __gen_ocl_asin_util(float x) {
pS2 = 2.01212532134862925881e-01,
pS3 = -4.00555345006794114027e-02,
pS4 = 7.91534994289814532176e-04,
- pS5 = 3.47933107596021167570e-05,
qS1 = -2.40339491173441421878e+00,
qS2 = 2.02094576023350569471e+00,
qS3 = -6.88283971605453293030e-01,
qS4 = 7.70381505559019352791e-02;
float t = x*x;
- float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
+ float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
float w = p / q;
return x + x*w;
@@ -1512,10 +1502,6 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
- aT[7] = -5.8335702866e-02; /* 0xbd6ef16b */
- aT[8] = 4.9768779427e-02; /* 0x3d4bda59 */
- aT[9] = -3.6531571299e-02; /* 0xbd15a221 */
- aT[10] = 1.6285819933e-02; /* 0x3c8569d7 */
const float one = 1.0, huge = 1.0e30;
float w,s1,s2,z;
@@ -1552,8 +1538,8 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
- s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
- s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+ s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6])));
+ s2 = w*(aT[1]+w*(aT[3]+w*(aT[5])));
if (id<0) return x - x*(s1+s2);
else {
z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
@@ -1666,12 +1652,6 @@ OVERLOADABLE float __gen_ocl_internal_rint(float x) {
}
OVERLOADABLE float __gen_ocl_internal_exp(float x) {
- //use native instruction when it has enough precision
- if (x > -0x1.6p1 && x < 0x1.6p1)
- {
- return native_exp(x);
- }
-
float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
@@ -1679,10 +1659,7 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
one = 1.0,
huge = 1.0e+30,
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
- P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08; /* 0x3331bb4c */
+ P2 = -2.7777778450e-03; /* 0xbb360b61 */
float y,hi=0.0,lo=0.0,c,t;
int k=0,xsb;
unsigned hx;
@@ -1726,7 +1703,7 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
/* x is now in primary range */
t = x*x;
- c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ c = x - t*(P1+t*P2);
if(k==0)
return one-((x*c)/(c-(float)2.0)-x);
else
@@ -2107,9 +2084,6 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
- Q3 = -7.9365076090e-05, /* 0xb8a670cd */
- Q4 = 4.0082177293e-06, /* 0x36867e54 */
- Q5 = -2.0109921195e-07, /* 0xb457edbb */
huge = 1.0e30,
tiny = 1.0e-30,
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
@@ -2166,7 +2140,7 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
/* x is now in primary range */
hfx = (float)0.5*x;
hxs = x*hfx;
- r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ r1 = one+hxs*(Q1+hxs*Q2);
t = (float)3.0-r1*hfx;
e = hxs*((r1-t)/((float)6.0 - x*t));
if(k==0)
@@ -2749,15 +2723,8 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
- L3 = 3.3333334327e-01, /* 0x3eaaaaab */
- L4 = 2.7272811532e-01, /* 0x3e8ba305 */
- L5 = 2.3066075146e-01, /* 0x3e6c3255 */
- L6 = 2.0697501302e-01, /* 0x3e53f142 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08, /* 0x3331bb4c */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 6.93145752e-01, /* 0x3f317200 */
lg2_l = 1.42860654e-06, /* 0x35bfbe8c */
@@ -2885,7 +2852,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
/* compute log(ax) */
s2 = s*s;
- r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = 3.0f+s2+r;
@@ -2950,7 +2917,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
z = u+v;
w = v-(z-u);
t = z*z;
- t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
@@ -3063,15 +3030,8 @@ float __gen_ocl_internal_pown(float x, int y) {
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
- L3 = 3.3333334327e-01, /* 0x3eaaaaab */
- L4 = 2.7272811532e-01, /* 0x3e8ba305 */
- L5 = 2.3066075146e-01, /* 0x3e6c3255 */
- L6 = 2.0697501302e-01, /* 0x3e53f142 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08, /* 0x3331bb4c */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 0x1.62ep-1,
lg2_l = 0x1.0bfbe8p-15,
@@ -3171,7 +3131,7 @@ float __gen_ocl_internal_pown(float x, int y) {
/* compute log(ax) */
s2 = s*s;
- r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = (float)3.0+s2+r;
@@ -3243,7 +3203,7 @@ float __gen_ocl_internal_pown(float x, int y) {
z = u+v;
w = v-(z-u);
t = z*z;
- t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
@@ -3556,6 +3516,7 @@ OVERLOADABLE float exp(float x) {
}
OVERLOADABLE float exp2(float x) {
+ /* Use native/faster instruction when it has enough precision, exp2 always */
return native_exp2(x);
}
--
2.5.0
More information about the Beignet
mailing list