[Beignet] [PATCH] Adaptions for LLVM 3.3 / SPIR

Mon Jun 3 02:09:56 PDT 2013

Handle the fact that several include files are moved from llvm/ to
llvm/IR/ in LLVM 3.3.

"__attribute__((always_inline)) inline" no longer works as intended,
and is replaced by "inline __attribute__((always_inline))".

For LLVM 3.3, the target is changed from "nvptx" to "spir", and
built-in address space qualifiers are used. For now, the built-in
types image2d_t, image3d_t, sampler_t, and event_t are overridden by
defines.

Signed-off-by: Dag Lem <dag at nimrod.no>
---
 backend/src/backend/program.cpp             |  17 +-
 backend/src/ir/unit.cpp                     |   5 +
 backend/src/ir/unit.hpp                     |   5 +
 backend/src/llvm/llvm_gen_backend.cpp       |  38 ++-
 backend/src/llvm/llvm_passes.cpp            |  20 +-
 backend/src/llvm/llvm_scalarize.cpp         |  17 +-
 backend/src/llvm/llvm_to_gen.cpp            |  13 +-
 backend/src/ocl_stdlib.h                    | 359 ++++++++++++++--------------
 kernels/compiler_clod.cl                    |   4 +-
 kernels/compiler_julia.cl                   |   4 +-
 kernels/compiler_julia_no_break.cl          |   4 +-
 kernels/compiler_mandelbrot.cl              |   6 +-
 kernels/compiler_mandelbrot_alternate.cl    |   6 +-
 kernels/compiler_menger_sponge_no_shadow.cl |  14 +-
 kernels/compiler_ribbon.cl                  |   2 +-
 15 files changed, 305 insertions(+), 209 deletions(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 9e37bdb..e41e5b6 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -59,7 +59,11 @@
 #include <clang/Basic/TargetOptions.h>
 #include <llvm/ADT/IntrusiveRefCntPtr.h>
 #include <llvm/ADT/OwningPtr.h>
+#if LLVM_VERSION_MINOR <= 2
 #include <llvm/Module.h>
+#else
+#include <llvm/IR/Module.h>
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/raw_ostream.h>
 
@@ -147,8 +151,15 @@ namespace gbe {
 
     args.push_back("-emit-llvm");
     if(bOpt)  args.push_back("-O3");
+#if LLVM_VERSION_MINOR <= 2
     args.push_back("-triple");
     args.push_back("nvptx");
+#else
+    args.push_back("-x");
+    args.push_back("cl");
+    args.push_back("-triple");
+    args.push_back("spir");
+#endif /* LLVM_VERSION_MINOR <= 2 */
     args.push_back(input);
 
     // The compiler invocation needs a DiagnosticsEngine so it can report problems
@@ -162,8 +173,6 @@ namespace gbe {
     clang::DiagnosticsEngine Diags(DiagID, DiagClient);
 #else
     args.push_back("-ffp-contract=off");
-    args.push_back("-triple");
-    args.push_back("nvptx");
 
     llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
     clang::TextDiagnosticPrinter *DiagClient =
@@ -183,7 +192,11 @@ namespace gbe {
     clang::CompilerInstance Clang;
     Clang.setInvocation(CI.take());
     // Get ready to report problems
+#if LLVM_VERSION_MINOR <= 2
     Clang.createDiagnostics(args.size(), &args[0]);
+#else
+    Clang.createDiagnostics();
+#endif /* LLVM_VERSION_MINOR <= 2 */
     if (!Clang.hasDiagnostics())
       return;
 
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index 44cec3c..01e1eb1 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -21,7 +21,12 @@
  * \file unit.cpp
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Instructions.h"
+#else
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "ir/unit.hpp"
 #include "ir/function.hpp"
 
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index f19fd7e..1017f5f 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -24,7 +24,12 @@
 #ifndef __GBE_IR_UNIT_HPP__
 #define __GBE_IR_UNIT_HPP__
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Value.h"
+#else
+#include "llvm/IR/Value.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index a9c726b..99fbadb 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -71,16 +71,31 @@
  *   is intercepted, we just abort
  */
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
 #include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
@@ -101,9 +116,10 @@
 #include "llvm/MC/MCSymbol.h"
 #if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
 #include "llvm/Target/TargetData.h"
-#endif
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 2)
+#elif LLVM_VERSION_MINOR == 2
 #include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
 #endif
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
@@ -138,9 +154,9 @@
 #define LLVM_VERSION_MINOR 0
 #endif /* !defined(LLVM_VERSION_MINOR) */
 
-#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 2)
-#error "Only LLVM 3.0 / 3.1 is supported"
-#endif /* (LLVM_VERSION_MAJOR != 3) && (LLVM_VERSION_MINOR >= 2) */
+#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 3)
+#error "Only LLVM 3.0 - 3.3 is supported"
+#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 3) */
 
 using namespace llvm;
 
@@ -1139,9 +1155,13 @@ namespace gbe
   void GenWriter::emitFunction(Function &F)
   {
     switch (F.getCallingConv()) {
+#if LLVM_VERSION_MINOR <= 2
       case CallingConv::PTX_Device: // we do not emit device function
         return;
       case CallingConv::PTX_Kernel:
+#else
+      case CallingConv::C:
+#endif
         break;
       default: GBE_ASSERTM(false, "Unsupported calling convention");
     }
@@ -1597,14 +1617,14 @@ namespace gbe
           break;
           case Intrinsic::stackrestore:
           break;
-#if LLVM_VERSION_MINOR == 2
+#if LLVM_VERSION_MINOR >= 2
           case Intrinsic::lifetime_start:
           case Intrinsic::lifetime_end:
           break;
           case Intrinsic::fmuladd:
             this->newRegister(&I);
           break;
-#endif /* LLVM_VERSION_MINOR == 2 */
+#endif /* LLVM_VERSION_MINOR >= 2 */
           default:
           GBE_ASSERTM(false, "Unsupported intrinsics");
         }
@@ -1775,7 +1795,7 @@ namespace gbe
             ctx.MOV(ir::getType(family), dst, src);
           }
           break;
-#if LLVM_VERSION_MINOR == 2
+#if LLVM_VERSION_MINOR >= 2
           case Intrinsic::fmuladd:
           {
             const ir::Register tmp  = ctx.reg(ir::FAMILY_DWORD);
@@ -1791,7 +1811,7 @@ namespace gbe
           case Intrinsic::lifetime_start:
           case Intrinsic::lifetime_end:
           break;
-#endif /* LLVM_VERSION_MINOR == 2 */
+#endif /* LLVM_VERSION_MINOR >= 2 */
           default: NOT_IMPLEMENTED;
         }
       } else {
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 40c0e62..4bafc0d 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -30,16 +30,31 @@
  * Segovia) the right to use another license for it (MIT here)
  */
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
 #include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
@@ -60,9 +75,10 @@
 #include "llvm/MC/MCSymbol.h"
 #if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
 #include "llvm/Target/TargetData.h"
-#endif
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 2)
+#elif LLVM_VERSION_MINOR == 2
 #include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
 #endif
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index c24e575..3c0d6a4 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -63,18 +63,29 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Config/config.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Function.h"
 #include "llvm/InstrTypes.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #if LLVM_VERSION_MINOR <= 1
 #include "llvm/Support/IRBuilder.h"
-#else
+#elif LLVM_VERSION_MINOR == 2
 #include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
 #endif /* LLVM_VERSION_MINOR <= 1 */
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
@@ -746,9 +757,13 @@ namespace gbe {
   bool Scalarize::runOnFunction(Function& F)
   {
     switch (F.getCallingConv()) {
+#if LLVM_VERSION_MINOR <= 2
     case CallingConv::PTX_Device:
       return false;
     case CallingConv::PTX_Kernel:
+#else
+    case CallingConv::C:
+#endif
       break;
     default: GBE_ASSERTM(false, "Unsupported calling convention");
     }
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 559cde0..788a3dd 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -22,11 +22,22 @@
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/PassManager.h"
 #include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Support/IRReader.h"
+#else
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/SourceMgr.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Assembly/PrintModulePass.h"
@@ -58,7 +69,7 @@ namespace gbe
       o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
 
     // Get the module from its file
-    SMDiagnostic Err;
+    llvm::SMDiagnostic Err;
     std::auto_ptr<Module> M;
     M.reset(ParseIRFile(fileName, Err, c));
     if (M.get() == 0) return false;
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 613b844..97b1f24 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -20,11 +20,11 @@
 #ifndef __GEN_OCL_STDLIB_H__
 #define __GEN_OCL_STDLIB_H__
 
-#define INLINE __attribute__((always_inline)) inline
+#define INLINE inline __attribute__((always_inline))
 #define OVERLOADABLE __attribute__((overloadable))
 #define PURE __attribute__((pure))
 #define CONST __attribute__((const))
-#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline))
+#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL built-in scalar data types
@@ -41,6 +41,8 @@ typedef unsigned int uintptr_t;
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL address space
 /////////////////////////////////////////////////////////////////////////////
+// These are built-ins in LLVM 3.3.
+#if 100*__clang_major__ + __clang_minor__ <= 302
 #define __private __attribute__((address_space(0)))
 #define __global __attribute__((address_space(1)))
 #define __constant __attribute__((address_space(2)))
@@ -50,6 +52,7 @@ typedef unsigned int uintptr_t;
 //#define local __local
 #define constant __constant
 #define private __private
+#endif
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL built-in vector data types
@@ -72,12 +75,20 @@ DEF(float);
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL other built-in data types
 /////////////////////////////////////////////////////////////////////////////
+// FIXME:
+// This is a transitional hack to bypass the LLVM 3.3 built-in types.
+// See the Khronos SPIR specification for handling of these types.
+#define __texture __attribute__((address_space(4)))
 struct _image2d_t;
-typedef __texture struct _image2d_t* image2d_t;
+typedef __texture struct _image2d_t* __image2d_t;
 struct _image3d_t;
-typedef __texture struct _image3d_t* image3d_t;
-typedef uint sampler_t;
-typedef size_t event_t;
+typedef __texture struct _image3d_t* __image3d_t;
+typedef uint __sampler_t;
+typedef size_t __event_t;
+#define image2d_t __image2d_t
+#define image3d_t __image3d_t
+#define sampler_t __sampler_t
+#define event_t __event_t
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL conversions & type casting
 /////////////////////////////////////////////////////////////////////////////
@@ -202,8 +213,8 @@ DEF;
 #undef DEF
 
 #define SDEF(TYPE)                                                              \
-INLINE_OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
 INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
 INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
 SDEF(char);
@@ -212,8 +223,8 @@ SDEF(int);
 SDEF(long);
 #undef SDEF
 #define UDEF(TYPE)                                                              \
-INLINE_OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
 INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
 INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
 UDEF(uchar);
@@ -361,7 +372,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #undef DECL_INTERNAL_WORK_ITEM_FN
 
 #define DECL_PUBLIC_WORK_ITEM_FN(NAME) \
-inline unsigned NAME(unsigned int dim) { \
+INLINE unsigned NAME(unsigned int dim) { \
   if (dim == 0) return __gen_ocl_##NAME##0(); \
   else if (dim == 1) return __gen_ocl_##NAME##1(); \
   else if (dim == 2) return __gen_ocl_##NAME##2(); \
@@ -393,84 +404,84 @@ PURE CONST float __gen_ocl_rndz(float x);
 PURE CONST float __gen_ocl_rnde(float x);
 PURE CONST float __gen_ocl_rndu(float x);
 PURE CONST float __gen_ocl_rndd(float x);
-INLINE OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
-INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+INLINE_OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
+INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
   return __gen_ocl_cos(x * M_PI_F);
 }
-INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
   return __gen_ocl_sin(x * M_PI_F);
 }
-INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
-INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
-INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
-INLINE OVERLOADABLE float native_log(float x) {
+INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+INLINE_OVERLOADABLE float native_log(float x) {
   return native_log2(x) * 0.6931472002f;
 }
-INLINE OVERLOADABLE float native_log10(float x) {
+INLINE_OVERLOADABLE float native_log10(float x) {
   return native_log2(x) * 0.3010299956f;
 }
-INLINE OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
-INLINE OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE OVERLOADABLE int2 ilogb(float2 x) {
+INLINE_OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
+INLINE_OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE_OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE_OVERLOADABLE int2 ilogb(float2 x) {
   return (int2)(ilogb(x.s0), ilogb(x.s1));
 }
-INLINE OVERLOADABLE int4 ilogb(float4 x) {
+INLINE_OVERLOADABLE int4 ilogb(float4 x) {
   return (int4)(ilogb(x.s01), ilogb(x.s23));
 }
-INLINE OVERLOADABLE int8 ilogb(float8 x) {
+INLINE_OVERLOADABLE int8 ilogb(float8 x) {
   return (int8)(ilogb(x.s0123), ilogb(x.s4567));
 }
-INLINE OVERLOADABLE int16 ilogb(float16 x) {
+INLINE_OVERLOADABLE int16 ilogb(float16 x) {
   return (int16)(ilogb(x.s01234567), ilogb(x.s89abcdef));
 }
-INLINE OVERLOADABLE float nan(uint code) {
+INLINE_OVERLOADABLE float nan(uint code) {
   return NAN;
 }
-INLINE OVERLOADABLE float2 nan(uint2 code) {
+INLINE_OVERLOADABLE float2 nan(uint2 code) {
   return (float2)(nan(code.s0), nan(code.s1));
 }
-INLINE OVERLOADABLE float4 nan(uint4 code) {
+INLINE_OVERLOADABLE float4 nan(uint4 code) {
   return (float4)(nan(code.s01), nan(code.s23));
 }
-INLINE OVERLOADABLE float8 nan(uint8 code) {
+INLINE_OVERLOADABLE float8 nan(uint8 code) {
   return (float8)(nan(code.s0123), nan(code.s4567));
 }
-INLINE OVERLOADABLE float16 nan(uint16 code) {
+INLINE_OVERLOADABLE float16 nan(uint16 code) {
   return (float16)(nan(code.s01234567), nan(code.s89abcdef));
 }
-INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
-INLINE OVERLOADABLE float native_tan(float x) {
+INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+INLINE_OVERLOADABLE float native_tan(float x) {
   return native_sin(x) / native_cos(x);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
   return native_tan(x * M_PI_F);
 }
-INLINE OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
-INLINE OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
-INLINE OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
-INLINE OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
+INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
+INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   return __gen_ocl_pow(x, 0.3333333333f);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
   *cosval = native_cos(x);
   return native_sin(x);
 }
-INLINE OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
+INLINE_OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
   return (float2)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval));
 }
-INLINE OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
+INLINE_OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
   return (float4)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval));
 }
-INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
+INLINE_OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
   return (float8)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
@@ -480,7 +491,7 @@ INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
                   __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval));
 }
-INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
+INLINE_OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
   return (float16)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                    __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
                    __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
@@ -498,29 +509,29 @@ INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval
                    __gen_ocl_internal_sincos(x.se, 14 + (float *)cosval),
                    __gen_ocl_internal_sincos(x.sf, 15 + (float *)cosval));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
   return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
   return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
   float y = native_exp(-2 * x);
   return (1 - y) / (1 + y);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
   return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
   return __gen_ocl_internal_asin(x) / M_PI_F;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
   return M_PI_2_F - __gen_ocl_internal_asin(x);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
   return __gen_ocl_internal_acos(x) / M_PI_F;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
   float a = 0, c = 1;
   if (x <= -1) {
     a = - M_PI_2_F;
@@ -534,44 +545,44 @@ INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
   }
   return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
   return native_log(x + native_sqrt(x * x + 1));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
   return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
   return 0.5f * native_sqrt((1 + x) / (1 - x));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
   return x * y < 0 ? -x : x;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
   return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
   return 1 - __gen_ocl_internal_erf(x);
 }
 
 // XXX work-around PTX profile
 #define sqrt native_sqrt
-INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
-INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
-INLINE OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
-INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
+INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
+INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return 2 * __gen_ocl_internal_round(x / 2);
 }
 // TODO use llvm intrinsics definitions
@@ -601,32 +612,32 @@ INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
 #define erf __gen_ocl_internal_erf
 #define erfc __gen_ocl_internal_erfc
 
-INLINE OVERLOADABLE float mad(float a, float b, float c) {
+INLINE_OVERLOADABLE float mad(float a, float b, float c) {
   return a*b+c;
 }
 
-INLINE OVERLOADABLE uint select(uint src0, uint src1, int cond) {
+INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
+INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE int select(int src0, int src1, int cond) {
+INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE int select(int src0, int src1, uint cond) {
+INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE float select(float src0, float src1, int cond) {
+INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE float select(float src0, float src1, uint cond) {
+INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
   return cond ? src1 : src0;
 }
 
 // This will be optimized out by LLVM and will output LLVM select instructions
 #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
   TYPE4 dst; \
   const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
   const TYPE x1 = src1.x; \
@@ -652,13 +663,13 @@ DECL_SELECT4(float4, float, uint4, 0x80000000)
 // Common Functions (see 6.11.4 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
 #define DECL_MIN_MAX_CLAMP(TYPE) \
-INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
+INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
   return a > b ? a : b; \
 } \
-INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
+INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
   return a < b ? a : b; \
 } \
-INLINE OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
   return max(min(v, u), l); \
 }
 DECL_MIN_MAX_CLAMP(float)
@@ -670,35 +681,35 @@ DECL_MIN_MAX_CLAMP(unsigned short)
 DECL_MIN_MAX_CLAMP(unsigned char)
 #undef DECL_MIN_MAX_CLAMP
 
-INLINE OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
-INLINE OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
-INLINE OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
   float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
   return a > b ? x : b > a ? y : max(x, y);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
   float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
   return a < b ? x : b < a ? y : min(x, y);
 }
-INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
-INLINE OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
   return __gen_ocl_internal_fmax(x, y) - y;
 }
-INLINE OVERLOADABLE float fract(float x, float *p) {
+INLINE_OVERLOADABLE float fract(float x, float *p) {
   *p = __gen_ocl_internal_floor(x);
   return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
 }
-INLINE OVERLOADABLE float2 fract(float2 x, float2 *p) {
+INLINE_OVERLOADABLE float2 fract(float2 x, float2 *p) {
   return (float2)(fract(x.s0, (float *)p),
                   fract(x.s1, 1 + (float *)p));
 }
-INLINE OVERLOADABLE float4 fract(float4 x, float4 *p) {
+INLINE_OVERLOADABLE float4 fract(float4 x, float4 *p) {
   return (float4)(fract(x.s0, (float *)p),
                   fract(x.s1, 1 + (float *)p),
                   fract(x.s2, 2 + (float *)p),
                   fract(x.s3, 3 + (float *)p));
 }
-INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
+INLINE_OVERLOADABLE float8 fract(float8 x, float8 *p) {
   return (float8)(fract(x.s0, (float *)p),
                   fract(x.s1, 1 + (float *)p),
                   fract(x.s2, 2 + (float *)p),
@@ -708,7 +719,7 @@ INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
                   fract(x.s6, 6 + (float *)p),
                   fract(x.s7, 7 + (float *)p));
 }
-INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
+INLINE_OVERLOADABLE float16 fract(float16 x, float16 *p) {
   return (float16)(fract(x.s0, (float *)p),
                    fract(x.s1, 1 + (float *)p),
                    fract(x.s2, 2 + (float *)p),
@@ -726,85 +737,85 @@ INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
                    fract(x.se, 14 + (float *)p),
                    fract(x.sf, 15 + (float *)p));
 }
-INLINE OVERLOADABLE float native_divide(float x, float y) { return x/y; }
-INLINE OVERLOADABLE float ldexp(float x, int n) {
+INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+INLINE_OVERLOADABLE float ldexp(float x, int n) {
   return __gen_ocl_pow(2, n) * x;
 }
-INLINE OVERLOADABLE float pown(float x, int n) {
+INLINE_OVERLOADABLE float pown(float x, int n) {
   if (x == 0 && n == 0)
     return 1;
   return powr(x, n);
 }
-INLINE OVERLOADABLE float rootn(float x, int n) {
+INLINE_OVERLOADABLE float rootn(float x, int n) {
   return powr(x, 1.f / n);
 }
 
 /////////////////////////////////////////////////////////////////////////////
 // Geometric functions (see 6.11.5 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-INLINE OVERLOADABLE float dot(float2 p0, float2 p1) {
+INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
   return mad(p0.x,p1.x,p0.y*p1.y);
 }
-INLINE OVERLOADABLE float dot(float3 p0, float3 p1) {
+INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
   return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
 }
-INLINE OVERLOADABLE float dot(float4 p0, float4 p1) {
+INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
   return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
 }
 
-INLINE OVERLOADABLE float dot(float8 p0, float8 p1) {
+INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
   return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
          mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
 }
-INLINE OVERLOADABLE float dot(float16 p0, float16 p1) {
+INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
   return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
          mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
          mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
          mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
 }
 
-INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
-INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
-INLINE OVERLOADABLE float normalize(float x) { return 1.f; }
-INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
-
-INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
-INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; }
-INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
-
-INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
+INLINE_OVERLOADABLE float normalize(float x) { return 1.f; }
+INLINE_OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_normalize(float x) { return 1.f; }
+INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
    return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
 }
-INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
    return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
 }
 
@@ -816,10 +827,10 @@ INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
 // cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
 // Well we do not care, we do not activate TBAA in the compiler
 #define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
-INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
   return *(SPACE TYPE##DIM *) (p + DIM * offset); \
 } \
-INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
   *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
 }
 
@@ -854,22 +865,22 @@ DECL_UNTYPED_RW_ALL(float)
 // Declare functions for vector types which are derived from scalar ones
 /////////////////////////////////////////////////////////////////////////////
 #define DECL_VECTOR_1OP(NAME, TYPE) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
     return (TYPE##2)(NAME(v.x), NAME(v.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
     return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
     return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v.s0123);\
     dst.s4567 = NAME(v.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v.s01234567);\
     dst.s89abcdef = NAME(v.s89abcdef);\
@@ -920,22 +931,22 @@ DECL_VECTOR_1OP(__gen_ocl_internal_erfc, float);
 /////////////////////////////////////////////////////////////////////////////
 
 #define DECL_VECTOR_2OP(NAME, TYPE) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
     return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
     return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
     return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v0.s0123, v1.s0123);\
     dst.s4567 = NAME(v0.s4567, v1.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
@@ -957,22 +968,22 @@ DECL_VECTOR_2OP(__gen_ocl_internal_minmag, float);
 #undef DECL_VECTOR_2OP
 
 #define DECL_VECTOR_2OP(NAME, TYPE, TYPE2) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
     return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
     return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
     return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v0.s0123, v1.s0123);\
     dst.s4567 = NAME(v0.s4567, v1.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
@@ -984,22 +995,22 @@ DECL_VECTOR_2OP(rootn, float, int);
 #undef DECL_VECTOR_2OP
 
 #define DECL_VECTOR_3OP(NAME, TYPE) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
     return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
     return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
     return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\
     dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\
@@ -1010,11 +1021,11 @@ DECL_VECTOR_3OP(mix, float);
 #undef DECL_VECTOR_3OP
 
 // mix requires more variants
-INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
-INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
-INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
-INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
-INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
+INLINE_OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
+INLINE_OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
+INLINE_OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
+INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
+INLINE_OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
 
 // XXX workaround ptx profile
 #define fabs __gen_ocl_internal_fabs
diff --git a/kernels/compiler_clod.cl b/kernels/compiler_clod.cl
index e21d9f5..dba7d6f 100644
--- a/kernels/compiler_clod.cl
+++ b/kernels/compiler_clod.cl
@@ -28,7 +28,7 @@ inline uint pack_fp4(float4 u4) {
 
 #define time 1.f
 
-float f(vec3 o)
+inline float f(vec3 o)
 {
     float a=(sin(o.x)+o.y*.25f)*.35f;
     o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z);
@@ -36,7 +36,7 @@ float f(vec3 o)
 }
 
 // XXX front end does not inline this function
-__attribute((always_inline)) vec3 s(vec3 o,vec3 d)
+inline __attribute((always_inline)) vec3 s(vec3 o,vec3 d)
 {
     float t=0.0f;
     float dt = 0.2f;
diff --git a/kernels/compiler_julia.cl b/kernels/compiler_julia.cl
index 98c5799..21672f6 100644
--- a/kernels/compiler_julia.cl
+++ b/kernels/compiler_julia.cl
@@ -27,7 +27,7 @@ inline uint pack_fp4(float4 u4) {
   dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
 } while (0)
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 {
     float mz2,md2,dist,t;
@@ -74,7 +74,7 @@ float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 }
 
 #if 1
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec3 calcNormal(vec3 p, vec4 c)
 {
     vec4 nz,ndz,dz[4];
diff --git a/kernels/compiler_julia_no_break.cl b/kernels/compiler_julia_no_break.cl
index 1a9be64..5c357b1 100644
--- a/kernels/compiler_julia_no_break.cl
+++ b/kernels/compiler_julia_no_break.cl
@@ -27,7 +27,7 @@ inline uint pack_fp4(float4 u4) {
   dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
 } while (0)
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 {
     float mz2,md2,dist,t;
@@ -75,7 +75,7 @@ float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 }
 
 #if 1
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec3 calcNormal(vec3 p, vec4 c)
 {
     vec4 nz,ndz,dz[4];
diff --git a/kernels/compiler_mandelbrot.cl b/kernels/compiler_mandelbrot.cl
index 42295ab..d15ccd0 100644
--- a/kernels/compiler_mandelbrot.cl
+++ b/kernels/compiler_mandelbrot.cl
@@ -1,8 +1,8 @@
 // Used to ID into the 1D array, so that we can use
 // it effectively as a 2D array
-int ID(int x, int y, int width) { return 4*width*y + x*4; }
-float mapX(float x) { return x*3.25f - 2.f; }
-float mapY(float y) { return y*2.5f - 1.25f; }
+inline int ID(int x, int y, int width) { return 4*width*y + x*4; }
+inline float mapX(float x) { return x*3.25f - 2.f; }
+inline float mapY(float y) { return y*2.5f - 1.25f; }
 
 __kernel void compiler_mandelbrot(__global char *out) {
   int x_dim = get_global_id(0);
diff --git a/kernels/compiler_mandelbrot_alternate.cl b/kernels/compiler_mandelbrot_alternate.cl
index fc99326..ab6fb07 100644
--- a/kernels/compiler_mandelbrot_alternate.cl
+++ b/kernels/compiler_mandelbrot_alternate.cl
@@ -1,6 +1,6 @@
-int offset(int x, int y, int width) { return width*y + x; }
-float mapX(float x) {return x*3.25f - 2.f;}
-float mapY(float y) {return y*2.5f - 1.25f;}
+inline int offset(int x, int y, int width) { return width*y + x; }
+inline float mapX(float x) {return x*3.25f - 2.f;}
+inline float mapY(float y) {return y*2.5f - 1.25f;}
 
 __kernel void compiler_mandelbrot_alternate(__global uint *out,
                                             float rcpWidth,
diff --git a/kernels/compiler_menger_sponge_no_shadow.cl b/kernels/compiler_menger_sponge_no_shadow.cl
index 95469c5..4de6c10 100644
--- a/kernels/compiler_menger_sponge_no_shadow.cl
+++ b/kernels/compiler_menger_sponge_no_shadow.cl
@@ -14,11 +14,11 @@ typedef float4 vec4;
 #define time 1.f
 
 // fmod is not like glsl mod!
-__attribute__((always_inline, overloadable))
+inline __attribute__((always_inline, overloadable))
 float glsl_mod(float x,float y) { return x-y*floor(x/y); }
-__attribute__((always_inline, overloadable))
+inline __attribute__((always_inline, overloadable))
 float2 glsl_mod(float2 a,float2 b) { return (float2)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y)); }
-__attribute__((always_inline, overloadable))
+inline __attribute__((always_inline, overloadable))
 float3 glsl_mod(float3 a,float3 b) { return (float3)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y), glsl_mod(a.z,b.z)); }
 
 inline vec3 reflect(vec3 I, vec3 N) {
@@ -38,10 +38,10 @@ inline uint pack_fp4(float4 u4) {
   dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
 } while (0)
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float maxcomp(vec3 p) { return max(p.x,max(p.y,p.z));}
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float sdBox(vec3 p, vec3 b)
 {
   vec3  di = fabs(p) - b;
@@ -49,7 +49,7 @@ float sdBox(vec3 p, vec3 b)
   return min(mc,length(max(di,0.0f)));
 }
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec4 map(vec3 p)
 {
    float d = sdBox(p,(vec3)(1.0f));
@@ -78,7 +78,7 @@ vec4 map(vec3 p)
 }
 
 // GLSL ES doesn't seem to like loops with conditional break/return...
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec4 intersect( vec3 ro, vec3 rd )
 {
     float t = 0.0f;
diff --git a/kernels/compiler_ribbon.cl b/kernels/compiler_ribbon.cl
index 92375e7..41b446e 100644
--- a/kernels/compiler_ribbon.cl
+++ b/kernels/compiler_ribbon.cl
@@ -27,7 +27,7 @@ inline float ob(vec3 q) {
 inline float o(vec3 q) { return min(oa(q),ob(q)); }
 
 // Get Normal XXX Not inline by LLVM
-__attribute__((always_inline)) vec3 gn(vec3 q) {
+inline __attribute__((always_inline)) vec3 gn(vec3 q) {
  const vec3 fxyy = (vec3)(.01f, 0.f, 0.f);
  const vec3 fyxy = (vec3)(0.f, .01f, 0.f);
  const vec3 fyyx = (vec3)(0.f, 0.f, .01f);
-- 
1.8.1.4