[Beignet] [PATCH] Adaptions for LLVM 3.3 / SPIR
Zhigang Gong
zhigang.gong at linux.intel.com
Tue Jun 4 00:04:12 PDT 2013
And one minor comment below:
On Mon, Jun 03, 2013 at 11:09:56AM +0200, Dag Lem wrote:
> diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
> index 613b844..97b1f24 100644
> --- a/backend/src/ocl_stdlib.h
> +++ b/backend/src/ocl_stdlib.h
> @@ -20,11 +20,11 @@
> #ifndef __GEN_OCL_STDLIB_H__
> #define __GEN_OCL_STDLIB_H__
>
> -#define INLINE __attribute__((always_inline)) inline
> +#define INLINE inline __attribute__((always_inline))
> #define OVERLOADABLE __attribute__((overloadable))
> #define PURE __attribute__((pure))
> #define CONST __attribute__((const))
> -#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline))
> +#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
>
> /////////////////////////////////////////////////////////////////////////////
> // OpenCL built-in scalar data types
> @@ -41,6 +41,8 @@ typedef unsigned int uintptr_t;
> /////////////////////////////////////////////////////////////////////////////
> // OpenCL address space
> /////////////////////////////////////////////////////////////////////////////
> +// These are built-ins in LLVM 3.3.
> +#if 100*__clang_major__ + __clang_minor__ <= 302
> #define __private __attribute__((address_space(0)))
> #define __global __attribute__((address_space(1)))
> #define __constant __attribute__((address_space(2)))
> @@ -50,6 +52,7 @@ typedef unsigned int uintptr_t;
> //#define local __local
> #define constant __constant
> #define private __private
> +#endif
>
> /////////////////////////////////////////////////////////////////////////////
> // OpenCL built-in vector data types
> @@ -72,12 +75,20 @@ DEF(float);
> /////////////////////////////////////////////////////////////////////////////
> // OpenCL other built-in data types
> /////////////////////////////////////////////////////////////////////////////
> +// FIXME:
> +// This is a transitional hack to bypass the LLVM 3.3 built-in types.
> +// See the Khronos SPIR specification for handling of these types.
Should not define the image address space again for 3.2/3.1.
#if 100 * __clang_major__ + __clang_minor__ == 303
> +#define __texture __attribute__((address_space(4)))
#endif
> struct _image2d_t;
> -typedef __texture struct _image2d_t* image2d_t;
> +typedef __texture struct _image2d_t* __image2d_t;
> struct _image3d_t;
> -typedef __texture struct _image3d_t* image3d_t;
> -typedef uint sampler_t;
> -typedef size_t event_t;
> +typedef __texture struct _image3d_t* __image3d_t;
> +typedef uint __sampler_t;
> +typedef size_t __event_t;
> +#define image2d_t __image2d_t
> +#define image3d_t __image3d_t
> +#define sampler_t __sampler_t
> +#define event_t __event_t
> /////////////////////////////////////////////////////////////////////////////
> // OpenCL conversions & type casting
> /////////////////////////////////////////////////////////////////////////////
> @@ -202,8 +213,8 @@ DEF;
> #undef DEF
>
> #define SDEF(TYPE) \
> -INLINE_OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y); \
> -INLINE_OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y); \
> +OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y); \
> +OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y); \
> INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
> INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
> SDEF(char);
> @@ -212,8 +223,8 @@ SDEF(int);
> SDEF(long);
> #undef SDEF
> #define UDEF(TYPE) \
> -INLINE_OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y); \
> -INLINE_OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y); \
> +OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y); \
> +OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y); \
> INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
> INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
> UDEF(uchar);
> @@ -361,7 +372,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
> #undef DECL_INTERNAL_WORK_ITEM_FN
>
> #define DECL_PUBLIC_WORK_ITEM_FN(NAME) \
> -inline unsigned NAME(unsigned int dim) { \
> +INLINE unsigned NAME(unsigned int dim) { \
> if (dim == 0) return __gen_ocl_##NAME##0(); \
> else if (dim == 1) return __gen_ocl_##NAME##1(); \
> else if (dim == 2) return __gen_ocl_##NAME##2(); \
> @@ -393,84 +404,84 @@ PURE CONST float __gen_ocl_rndz(float x);
> PURE CONST float __gen_ocl_rnde(float x);
> PURE CONST float __gen_ocl_rndu(float x);
> PURE CONST float __gen_ocl_rndd(float x);
> -INLINE OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
> -INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
> +INLINE_OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
> +INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
> return __gen_ocl_cos(x * M_PI_F);
> }
> -INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
> +INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
> return __gen_ocl_sin(x * M_PI_F);
> }
> -INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
> -INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
> -INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
> -INLINE OVERLOADABLE float native_log(float x) {
> +INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
> +INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
> +INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
> +INLINE_OVERLOADABLE float native_log(float x) {
> return native_log2(x) * 0.6931472002f;
> }
> -INLINE OVERLOADABLE float native_log10(float x) {
> +INLINE_OVERLOADABLE float native_log10(float x) {
> return native_log2(x) * 0.3010299956f;
> }
> -INLINE OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
> -INLINE OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
> -INLINE OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
> -INLINE OVERLOADABLE int2 ilogb(float2 x) {
> +INLINE_OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
> +INLINE_OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
> +INLINE_OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
> +INLINE_OVERLOADABLE int2 ilogb(float2 x) {
> return (int2)(ilogb(x.s0), ilogb(x.s1));
> }
> -INLINE OVERLOADABLE int4 ilogb(float4 x) {
> +INLINE_OVERLOADABLE int4 ilogb(float4 x) {
> return (int4)(ilogb(x.s01), ilogb(x.s23));
> }
> -INLINE OVERLOADABLE int8 ilogb(float8 x) {
> +INLINE_OVERLOADABLE int8 ilogb(float8 x) {
> return (int8)(ilogb(x.s0123), ilogb(x.s4567));
> }
> -INLINE OVERLOADABLE int16 ilogb(float16 x) {
> +INLINE_OVERLOADABLE int16 ilogb(float16 x) {
> return (int16)(ilogb(x.s01234567), ilogb(x.s89abcdef));
> }
> -INLINE OVERLOADABLE float nan(uint code) {
> +INLINE_OVERLOADABLE float nan(uint code) {
> return NAN;
> }
> -INLINE OVERLOADABLE float2 nan(uint2 code) {
> +INLINE_OVERLOADABLE float2 nan(uint2 code) {
> return (float2)(nan(code.s0), nan(code.s1));
> }
> -INLINE OVERLOADABLE float4 nan(uint4 code) {
> +INLINE_OVERLOADABLE float4 nan(uint4 code) {
> return (float4)(nan(code.s01), nan(code.s23));
> }
> -INLINE OVERLOADABLE float8 nan(uint8 code) {
> +INLINE_OVERLOADABLE float8 nan(uint8 code) {
> return (float8)(nan(code.s0123), nan(code.s4567));
> }
> -INLINE OVERLOADABLE float16 nan(uint16 code) {
> +INLINE_OVERLOADABLE float16 nan(uint16 code) {
> return (float16)(nan(code.s01234567), nan(code.s89abcdef));
> }
> -INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
> -INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
> -INLINE OVERLOADABLE float native_tan(float x) {
> +INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
> +INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
> +INLINE_OVERLOADABLE float native_tan(float x) {
> return native_sin(x) / native_cos(x);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
> return native_tan(x * M_PI_F);
> }
> -INLINE OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
> -INLINE OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
> -INLINE OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
> -INLINE OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
> +INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
> +INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
> +INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
> return __gen_ocl_pow(x, 0.3333333333f);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
> *cosval = native_cos(x);
> return native_sin(x);
> }
> -INLINE OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
> +INLINE_OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
> return (float2)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
> __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval));
> }
> -INLINE OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
> +INLINE_OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
> return (float4)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
> __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
> __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
> __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval));
> }
> -INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
> +INLINE_OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
> return (float8)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
> __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
> __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
> @@ -480,7 +491,7 @@ INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
> __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
> __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval));
> }
> -INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
> +INLINE_OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
> return (float16)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
> __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
> __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
> @@ -498,29 +509,29 @@ INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval
> __gen_ocl_internal_sincos(x.se, 14 + (float *)cosval),
> __gen_ocl_internal_sincos(x.sf, 15 + (float *)cosval));
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
> return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
> return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
> float y = native_exp(-2 * x);
> return (1 - y) / (1 + y);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_asin(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
> return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
> return __gen_ocl_internal_asin(x) / M_PI_F;
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_acos(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
> return M_PI_2_F - __gen_ocl_internal_asin(x);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
> return __gen_ocl_internal_acos(x) / M_PI_F;
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
> float a = 0, c = 1;
> if (x <= -1) {
> a = - M_PI_2_F;
> @@ -534,44 +545,44 @@ INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
> }
> return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
> return __gen_ocl_internal_atan(x) / M_PI_F;
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
> return native_log(x + native_sqrt(x * x + 1));
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
> return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
> return 0.5f * native_sqrt((1 + x) / (1 - x));
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
> return x * y < 0 ? -x : x;
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_erf(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
> return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
> return 1 - __gen_ocl_internal_erf(x);
> }
>
> // XXX work-around PTX profile
> #define sqrt native_sqrt
> -INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_log(float x) { return native_log(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_log2(float x) { return native_log2(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_exp(float x) { return native_exp(x); }
> -INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
> -INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
> -INLINE OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
> +INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x) { return native_log(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x) { return native_log2(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) { return native_exp(x); }
> +INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
> +INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
> +INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
> return 2 * __gen_ocl_internal_round(x / 2);
> }
> // TODO use llvm intrinsics definitions
> @@ -601,32 +612,32 @@ INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
> #define erf __gen_ocl_internal_erf
> #define erfc __gen_ocl_internal_erfc
>
> -INLINE OVERLOADABLE float mad(float a, float b, float c) {
> +INLINE_OVERLOADABLE float mad(float a, float b, float c) {
> return a*b+c;
> }
>
> -INLINE OVERLOADABLE uint select(uint src0, uint src1, int cond) {
> +INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
> return cond ? src1 : src0;
> }
> -INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
> +INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
> return cond ? src1 : src0;
> }
> -INLINE OVERLOADABLE int select(int src0, int src1, int cond) {
> +INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
> return cond ? src1 : src0;
> }
> -INLINE OVERLOADABLE int select(int src0, int src1, uint cond) {
> +INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
> return cond ? src1 : src0;
> }
> -INLINE OVERLOADABLE float select(float src0, float src1, int cond) {
> +INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
> return cond ? src1 : src0;
> }
> -INLINE OVERLOADABLE float select(float src0, float src1, uint cond) {
> +INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
> return cond ? src1 : src0;
> }
>
> // This will be optimized out by LLVM and will output LLVM select instructions
> #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
> -INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
> +INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
> TYPE4 dst; \
> const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
> const TYPE x1 = src1.x; \
> @@ -652,13 +663,13 @@ DECL_SELECT4(float4, float, uint4, 0x80000000)
> // Common Functions (see 6.11.4 of OCL 1.1 spec)
> /////////////////////////////////////////////////////////////////////////////
> #define DECL_MIN_MAX_CLAMP(TYPE) \
> -INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
> +INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
> return a > b ? a : b; \
> } \
> -INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
> +INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
> return a < b ? a : b; \
> } \
> -INLINE OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
> +INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
> return max(min(v, u), l); \
> }
> DECL_MIN_MAX_CLAMP(float)
> @@ -670,35 +681,35 @@ DECL_MIN_MAX_CLAMP(unsigned short)
> DECL_MIN_MAX_CLAMP(unsigned char)
> #undef DECL_MIN_MAX_CLAMP
>
> -INLINE OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
> -INLINE OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
> +INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
> float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
> return a > b ? x : b > a ? y : max(x, y);
> }
> -INLINE OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
> +INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
> float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
> return a < b ? x : b < a ? y : min(x, y);
> }
> -INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
> -INLINE OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
> +INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
> +INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
> return __gen_ocl_internal_fmax(x, y) - y;
> }
> -INLINE OVERLOADABLE float fract(float x, float *p) {
> +INLINE_OVERLOADABLE float fract(float x, float *p) {
> *p = __gen_ocl_internal_floor(x);
> return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
> }
> -INLINE OVERLOADABLE float2 fract(float2 x, float2 *p) {
> +INLINE_OVERLOADABLE float2 fract(float2 x, float2 *p) {
> return (float2)(fract(x.s0, (float *)p),
> fract(x.s1, 1 + (float *)p));
> }
> -INLINE OVERLOADABLE float4 fract(float4 x, float4 *p) {
> +INLINE_OVERLOADABLE float4 fract(float4 x, float4 *p) {
> return (float4)(fract(x.s0, (float *)p),
> fract(x.s1, 1 + (float *)p),
> fract(x.s2, 2 + (float *)p),
> fract(x.s3, 3 + (float *)p));
> }
> -INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
> +INLINE_OVERLOADABLE float8 fract(float8 x, float8 *p) {
> return (float8)(fract(x.s0, (float *)p),
> fract(x.s1, 1 + (float *)p),
> fract(x.s2, 2 + (float *)p),
> @@ -708,7 +719,7 @@ INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
> fract(x.s6, 6 + (float *)p),
> fract(x.s7, 7 + (float *)p));
> }
> -INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
> +INLINE_OVERLOADABLE float16 fract(float16 x, float16 *p) {
> return (float16)(fract(x.s0, (float *)p),
> fract(x.s1, 1 + (float *)p),
> fract(x.s2, 2 + (float *)p),
> @@ -726,85 +737,85 @@ INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
> fract(x.se, 14 + (float *)p),
> fract(x.sf, 15 + (float *)p));
> }
> -INLINE OVERLOADABLE float native_divide(float x, float y) { return x/y; }
> -INLINE OVERLOADABLE float ldexp(float x, int n) {
> +INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
> +INLINE_OVERLOADABLE float ldexp(float x, int n) {
> return __gen_ocl_pow(2, n) * x;
> }
> -INLINE OVERLOADABLE float pown(float x, int n) {
> +INLINE_OVERLOADABLE float pown(float x, int n) {
> if (x == 0 && n == 0)
> return 1;
> return powr(x, n);
> }
> -INLINE OVERLOADABLE float rootn(float x, int n) {
> +INLINE_OVERLOADABLE float rootn(float x, int n) {
> return powr(x, 1.f / n);
> }
>
> /////////////////////////////////////////////////////////////////////////////
> // Geometric functions (see 6.11.5 of OCL 1.1 spec)
> /////////////////////////////////////////////////////////////////////////////
> -INLINE OVERLOADABLE float dot(float2 p0, float2 p1) {
> +INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
> return mad(p0.x,p1.x,p0.y*p1.y);
> }
> -INLINE OVERLOADABLE float dot(float3 p0, float3 p1) {
> +INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
> return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
> }
> -INLINE OVERLOADABLE float dot(float4 p0, float4 p1) {
> +INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
> return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
> }
>
> -INLINE OVERLOADABLE float dot(float8 p0, float8 p1) {
> +INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
> return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
> mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
> }
> -INLINE OVERLOADABLE float dot(float16 p0, float16 p1) {
> +INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
> return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
> mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
> mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
> mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
> }
>
> -INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
> -INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); }
> -INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
> -INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
> -INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
> -INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
> -INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
> -INLINE OVERLOADABLE float normalize(float x) { return 1.f; }
> -INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
> -
> -INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
> -INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
> -INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
> -INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
> -INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
> -INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
> -INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
> -INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
> -INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; }
> -INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
> -INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
> -
> -INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) {
> +INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
> +INLINE_OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
> +INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float normalize(float x) { return 1.f; }
> +INLINE_OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
> +
> +INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
> +INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
> +INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
> +INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
> +INLINE_OVERLOADABLE float fast_normalize(float x) { return 1.f; }
> +INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
> +INLINE_OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
> +
> +INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
> return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
> }
> -INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
> +INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
> return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
> }
>
> @@ -816,10 +827,10 @@ INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
> // cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
> // Well we do not care, we do not activate TBAA in the compiler
> #define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
> -INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
> +INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
> return *(SPACE TYPE##DIM *) (p + DIM * offset); \
> } \
> -INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
> +INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
> *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
> }
>
> @@ -854,22 +865,22 @@ DECL_UNTYPED_RW_ALL(float)
> // Declare functions for vector types which are derived from scalar ones
> /////////////////////////////////////////////////////////////////////////////
> #define DECL_VECTOR_1OP(NAME, TYPE) \
> - INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
> + INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
> return (TYPE##2)(NAME(v.x), NAME(v.y)); \
> }\
> - INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
> + INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
> return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \
> }\
> - INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
> + INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
> return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \
> }\
> - INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
> + INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
> TYPE##8 dst;\
> dst.s0123 = NAME(v.s0123);\
> dst.s4567 = NAME(v.s4567);\
> return dst;\
> }\
> - INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
> + INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
> TYPE##16 dst;\
> dst.s01234567 = NAME(v.s01234567);\
> dst.s89abcdef = NAME(v.s89abcdef);\
> @@ -920,22 +931,22 @@ DECL_VECTOR_1OP(__gen_ocl_internal_erfc, float);
> /////////////////////////////////////////////////////////////////////////////
>
> #define DECL_VECTOR_2OP(NAME, TYPE) \
> - INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
> + INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
> return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
> }\
> - INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
> + INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
> return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
> }\
> - INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
> + INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
> return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
> }\
> - INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
> + INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
> TYPE##8 dst;\
> dst.s0123 = NAME(v0.s0123, v1.s0123);\
> dst.s4567 = NAME(v0.s4567, v1.s4567);\
> return dst;\
> }\
> - INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
> + INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
> TYPE##16 dst;\
> dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
> dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
> @@ -957,22 +968,22 @@ DECL_VECTOR_2OP(__gen_ocl_internal_minmag, float);
> #undef DECL_VECTOR_2OP
>
> #define DECL_VECTOR_2OP(NAME, TYPE, TYPE2) \
> - INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
> + INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
> return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
> }\
> - INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
> + INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
> return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
> }\
> - INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
> + INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
> return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
> }\
> - INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
> + INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
> TYPE##8 dst;\
> dst.s0123 = NAME(v0.s0123, v1.s0123);\
> dst.s4567 = NAME(v0.s4567, v1.s4567);\
> return dst;\
> }\
> - INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
> + INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
> TYPE##16 dst;\
> dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
> dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
> @@ -984,22 +995,22 @@ DECL_VECTOR_2OP(rootn, float, int);
> #undef DECL_VECTOR_2OP
>
> #define DECL_VECTOR_3OP(NAME, TYPE) \
> - INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
> + INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
> return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \
> }\
> - INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
> + INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
> return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \
> }\
> - INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
> + INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
> return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \
> }\
> - INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
> + INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
> TYPE##8 dst;\
> dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\
> dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\
> return dst;\
> }\
> - INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
> + INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
> TYPE##16 dst;\
> dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\
> dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\
> @@ -1010,11 +1021,11 @@ DECL_VECTOR_3OP(mix, float);
> #undef DECL_VECTOR_3OP
>
> // mix requires more variants
> -INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
> -INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
> -INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
> -INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
> -INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
> +INLINE_OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
> +INLINE_OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
> +INLINE_OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
> +INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
> +INLINE_OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
>
> // XXX workaround ptx profile
> #define fabs __gen_ocl_internal_fabs
> diff --git a/kernels/compiler_clod.cl b/kernels/compiler_clod.cl
> index e21d9f5..dba7d6f 100644
> --- a/kernels/compiler_clod.cl
> +++ b/kernels/compiler_clod.cl
> @@ -28,7 +28,7 @@ inline uint pack_fp4(float4 u4) {
>
> #define time 1.f
>
> -float f(vec3 o)
> +inline float f(vec3 o)
> {
> float a=(sin(o.x)+o.y*.25f)*.35f;
> o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z);
> @@ -36,7 +36,7 @@ float f(vec3 o)
> }
>
> // XXX front end does not inline this function
> -__attribute((always_inline)) vec3 s(vec3 o,vec3 d)
> +inline __attribute((always_inline)) vec3 s(vec3 o,vec3 d)
> {
> float t=0.0f;
> float dt = 0.2f;
> diff --git a/kernels/compiler_julia.cl b/kernels/compiler_julia.cl
> index 98c5799..21672f6 100644
> --- a/kernels/compiler_julia.cl
> +++ b/kernels/compiler_julia.cl
> @@ -27,7 +27,7 @@ inline uint pack_fp4(float4 u4) {
> dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
> } while (0)
>
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
> {
> float mz2,md2,dist,t;
> @@ -74,7 +74,7 @@ float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
> }
>
> #if 1
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> vec3 calcNormal(vec3 p, vec4 c)
> {
> vec4 nz,ndz,dz[4];
> diff --git a/kernels/compiler_julia_no_break.cl b/kernels/compiler_julia_no_break.cl
> index 1a9be64..5c357b1 100644
> --- a/kernels/compiler_julia_no_break.cl
> +++ b/kernels/compiler_julia_no_break.cl
> @@ -27,7 +27,7 @@ inline uint pack_fp4(float4 u4) {
> dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
> } while (0)
>
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
> {
> float mz2,md2,dist,t;
> @@ -75,7 +75,7 @@ float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
> }
>
> #if 1
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> vec3 calcNormal(vec3 p, vec4 c)
> {
> vec4 nz,ndz,dz[4];
> diff --git a/kernels/compiler_mandelbrot.cl b/kernels/compiler_mandelbrot.cl
> index 42295ab..d15ccd0 100644
> --- a/kernels/compiler_mandelbrot.cl
> +++ b/kernels/compiler_mandelbrot.cl
> @@ -1,8 +1,8 @@
> // Used to ID into the 1D array, so that we can use
> // it effectively as a 2D array
> -int ID(int x, int y, int width) { return 4*width*y + x*4; }
> -float mapX(float x) { return x*3.25f - 2.f; }
> -float mapY(float y) { return y*2.5f - 1.25f; }
> +inline int ID(int x, int y, int width) { return 4*width*y + x*4; }
> +inline float mapX(float x) { return x*3.25f - 2.f; }
> +inline float mapY(float y) { return y*2.5f - 1.25f; }
>
> __kernel void compiler_mandelbrot(__global char *out) {
> int x_dim = get_global_id(0);
> diff --git a/kernels/compiler_mandelbrot_alternate.cl b/kernels/compiler_mandelbrot_alternate.cl
> index fc99326..ab6fb07 100644
> --- a/kernels/compiler_mandelbrot_alternate.cl
> +++ b/kernels/compiler_mandelbrot_alternate.cl
> @@ -1,6 +1,6 @@
> -int offset(int x, int y, int width) { return width*y + x; }
> -float mapX(float x) {return x*3.25f - 2.f;}
> -float mapY(float y) {return y*2.5f - 1.25f;}
> +inline int offset(int x, int y, int width) { return width*y + x; }
> +inline float mapX(float x) {return x*3.25f - 2.f;}
> +inline float mapY(float y) {return y*2.5f - 1.25f;}
>
> __kernel void compiler_mandelbrot_alternate(__global uint *out,
> float rcpWidth,
> diff --git a/kernels/compiler_menger_sponge_no_shadow.cl b/kernels/compiler_menger_sponge_no_shadow.cl
> index 95469c5..4de6c10 100644
> --- a/kernels/compiler_menger_sponge_no_shadow.cl
> +++ b/kernels/compiler_menger_sponge_no_shadow.cl
> @@ -14,11 +14,11 @@ typedef float4 vec4;
> #define time 1.f
>
> // fmod is not like glsl mod!
> -__attribute__((always_inline, overloadable))
> +inline __attribute__((always_inline, overloadable))
> float glsl_mod(float x,float y) { return x-y*floor(x/y); }
> -__attribute__((always_inline, overloadable))
> +inline __attribute__((always_inline, overloadable))
> float2 glsl_mod(float2 a,float2 b) { return (float2)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y)); }
> -__attribute__((always_inline, overloadable))
> +inline __attribute__((always_inline, overloadable))
> float3 glsl_mod(float3 a,float3 b) { return (float3)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y), glsl_mod(a.z,b.z)); }
>
> inline vec3 reflect(vec3 I, vec3 N) {
> @@ -38,10 +38,10 @@ inline uint pack_fp4(float4 u4) {
> dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
> } while (0)
>
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> float maxcomp(vec3 p) { return max(p.x,max(p.y,p.z));}
>
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> float sdBox(vec3 p, vec3 b)
> {
> vec3 di = fabs(p) - b;
> @@ -49,7 +49,7 @@ float sdBox(vec3 p, vec3 b)
> return min(mc,length(max(di,0.0f)));
> }
>
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> vec4 map(vec3 p)
> {
> float d = sdBox(p,(vec3)(1.0f));
> @@ -78,7 +78,7 @@ vec4 map(vec3 p)
> }
>
> // GLSL ES doesn't seem to like loops with conditional break/return...
> -__attribute__((always_inline))
> +inline __attribute__((always_inline))
> vec4 intersect( vec3 ro, vec3 rd )
> {
> float t = 0.0f;
> diff --git a/kernels/compiler_ribbon.cl b/kernels/compiler_ribbon.cl
> index 92375e7..41b446e 100644
> --- a/kernels/compiler_ribbon.cl
> +++ b/kernels/compiler_ribbon.cl
> @@ -27,7 +27,7 @@ inline float ob(vec3 q) {
> inline float o(vec3 q) { return min(oa(q),ob(q)); }
>
> // Get Normal XXX Not inline by LLVM
> -__attribute__((always_inline)) vec3 gn(vec3 q) {
> +inline __attribute__((always_inline)) vec3 gn(vec3 q) {
> const vec3 fxyy = (vec3)(.01f, 0.f, 0.f);
> const vec3 fyxy = (vec3)(0.f, .01f, 0.f);
> const vec3 fyyx = (vec3)(0.f, 0.f, .01f);
> --
> 1.8.1.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list