[Beignet] [PATCH 1/2] GBE: Refine bti usage in backend & runtime.

Ruiling Song ruiling.song at intel.com
Tue Jul 29 22:59:29 PDT 2014


Previously, we simply mapped a 2G surface for memory access, which has
an obvious security issue: a user can easily read or write graphics
memory that does not belong to them. To prevent this kind of behaviour,
we bind each surface to a dedicated bti. The hardware provides an
automatic bounds check: an out-of-bound write is ignored, and an
out-of-bound read simply returns zero.
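
As a toy model of the semantics we rely on (illustration only, not
beignet code):

  #include <cstdint>
  #include <cstring>

  // Toy model of a bti-bound surface with HW-style bounds checking:
  // a DWORD access past the end of the surface is dropped on write
  // and returns zero on read.
  struct Surface {
    uint8_t *base;
    uint32_t size;
    uint32_t read(uint32_t offset) const {
      uint32_t v = 0;
      if (offset < size && size - offset >= 4)
        std::memcpy(&v, base + offset, 4);
      return v;                             // out-of-bound read -> 0
    }
    void write(uint32_t offset, uint32_t v) {
      if (offset < size && size - offset >= 4)
        std::memcpy(base + offset, &v, 4);  // out-of-bound write ignored
    }
  };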

The idea behind the patch is that, for each load/store instruction, we
search through the LLVM use-def chain until we find where the address
comes from. The bti is then saved in ir::Instruction and used during
later code generation. In the mixed-pointer case, a load/store will
access more than one bti.
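
A simplified sketch of that traversal (the real implementation is
GenWriter::gatherBTI in llvm_gen_backend.cpp below; the BTI struct,
appendUnique and the argBTI map here are stand-ins for the pass's own
state):

  #include <llvm/IR/Instructions.h>  // Value, User, SelectInst, PHINode, AllocaInst
  #include <cstdint>
  #include <map>
  #include <set>
  #include <vector>

  struct BTI { uint8_t bti[4]; uint8_t count = 0; };  // mirrors ir::BTI

  static void appendUnique(BTI &b, uint8_t v) {
    for (int i = 0; i < b.count; i++)
      if (b.bti[i] == v) return;
    if (b.count < 4) b.bti[b.count++] = v;
  }

  // select/phi fork the search, alloca terminates in private memory,
  // and any other User is peeled through operand 0 (GEP, bitcast, ...)
  // until a pointer argument is reached. argBTI plays the role of the
  // pass's globalPointer map (argument -> assigned bti).
  static void gatherBTI(llvm::Value *pointer,
                        const std::map<const llvm::Value *, uint8_t> &argBTI,
                        BTI &out) {
    std::vector<llvm::Value *> worklist(1, pointer);
    std::set<llvm::Value *> seen;
    while (!worklist.empty()) {
      llvm::Value *p = worklist.back();
      worklist.pop_back();
      if (!seen.insert(p).second)
        continue;  // chain loops back on itself: no new buffer source
      if (llvm::SelectInst *sel = llvm::dyn_cast<llvm::SelectInst>(p)) {
        worklist.push_back(sel->getTrueValue());
        worklist.push_back(sel->getFalseValue());
      } else if (llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(p)) {
        for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i)
          worklist.push_back(phi->getIncomingValue(i));
      } else if (llvm::isa<llvm::AllocaInst>(p)) {
        appendUnique(out, 1 /* BTI_PRIVATE */);
      } else if (llvm::isa<llvm::User>(p)) {
        worklist.push_back(llvm::cast<llvm::User>(p)->getOperand(0));
      } else {
        std::map<const llvm::Value *, uint8_t>::const_iterator it = argBTI.find(p);
        if (it != argBTI.end())
          appendUnique(out, it->second);  // __global kernel argument
      }
    }
  }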

To simplify some code, bti '0' is reserved for the constant address
space and '1' for the private address space. The other btis are
assigned automatically by the backend.
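
For illustration, the reserved indices mirror the defines added to
backend/src/backend/program.h below; the BTIAllocator type here is
hypothetical, standing in for the btiBase counter kept by GenWriter:

  #include <cstdint>

  #define BTI_CONSTANT 0      /* reserved: constant address space */
  #define BTI_PRIVATE  1      /* reserved: private (stack) space  */
  #define BTI_RESERVED_NUM 2  /* first automatically assigned bti */

  // btiBase restarts at BTI_RESERVED_NUM for every kernel function;
  // each __global pointer (or image) argument takes the next index.
  struct BTIAllocator {
    int btiBase = BTI_RESERVED_NUM;
    uint8_t next() { return static_cast<uint8_t>(btiBase++); }
  };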

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/context.cpp            |   10 ++
 backend/src/backend/context.hpp            |    2 +
 backend/src/backend/gen_insn_selection.cpp |  242 ++++++++++++++++++++--------
 backend/src/backend/program.cpp            |   14 +-
 backend/src/backend/program.h              |    9 ++
 backend/src/backend/program.hpp            |    7 +-
 backend/src/gbe_bin_interpreter.cpp        |    1 +
 backend/src/ir/context.cpp                 |    4 +-
 backend/src/ir/context.hpp                 |   10 +-
 backend/src/ir/function.hpp                |    5 +-
 backend/src/ir/image.cpp                   |    9 +-
 backend/src/ir/image.hpp                   |    2 +-
 backend/src/ir/instruction.cpp             |   39 ++++-
 backend/src/ir/instruction.hpp             |   18 ++-
 backend/src/llvm/llvm_gen_backend.cpp      |  158 +++++++++++++++---
 kernels/compiler_mixed_pointer.cl          |   24 +++
 src/cl_command_queue.c                     |    4 +-
 src/cl_command_queue_gen7.c                |    5 +-
 src/cl_driver.h                            |    4 +-
 src/cl_gbe_loader.cpp                      |    5 +
 src/cl_gbe_loader.h                        |    1 +
 src/cl_kernel.c                            |    2 +-
 src/cl_kernel.h                            |    1 +
 src/intel/intel_gpgpu.c                    |   71 ++++----
 utests/CMakeLists.txt                      |    1 +
 utests/compiler_mixed_pointer.cpp          |  120 ++++++++++++++
 26 files changed, 601 insertions(+), 167 deletions(-)
 create mode 100644 kernels/compiler_mixed_pointer.cl
 create mode 100644 utests/compiler_mixed_pointer.cpp

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 831421d..a9d5d4a 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -436,6 +436,12 @@ namespace gbe
   void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
     curbeRegs.insert(std::make_pair(reg, offset));
   }
+  ir::Register Context::getSurfaceBaseReg(unsigned char bti) {
+    map<unsigned char, ir::Register>::iterator iter;
+    iter = btiRegMap.find(bti);
+    GBE_ASSERT(iter != btiRegMap.end());
+    return iter->second;
+  }
 
   void Context::buildArgList(void) {
     kernel->argNum = fn.argNum();
@@ -443,6 +449,8 @@ namespace gbe
       kernel->args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, kernel->argNum);
     else
       kernel->args = NULL;
+    btiRegMap.clear();
+    btiRegMap.insert(std::make_pair(1, ir::ocl::stackbuffer));
     for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
       const auto &arg = fn.getArg(argID);
 
@@ -457,6 +465,8 @@ namespace gbe
         case ir::FunctionArgument::GLOBAL_POINTER:
           kernel->args[argID].type = GBE_ARG_GLOBAL_PTR;
           kernel->args[argID].size = sizeof(void*);
+          kernel->args[argID].bti = arg.bti;
+          btiRegMap.insert(std::make_pair(arg.bti, arg.reg));
           break;
         case ir::FunctionArgument::CONSTANT_POINTER:
           kernel->args[argID].type = GBE_ARG_CONSTANT_PTR;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 24f2873..f8ee4ae 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -99,6 +99,7 @@ namespace gbe
     void deallocateScratchMem(int32_t offset);
     /*! Preallocated curbe register set including special registers. */
     map<ir::Register, uint32_t> curbeRegs;
+    ir::Register getSurfaceBaseReg(unsigned char bti);
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
@@ -139,6 +140,7 @@ namespace gbe
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
+    map<unsigned char, ir::Register> btiRegMap;
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 348aca4..23f99ca 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2714,25 +2714,55 @@ namespace gbe
   /*! Load instruction pattern */
   DECL_PATTERN(LoadInstruction)
   {
+    void readDWord(Selection::Opaque &sel,
+                   vector<GenRegister> &dst,
+                   vector<GenRegister> &dst2,
+                   GenRegister addr,
+                   uint32_t valueNum,
+                   ir::AddressSpace space,
+                   ir::BTI bti) const
+    {
+      for (uint32_t x = 0; x < bti.count; x++) {
+        if(x > 0)
+          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+            dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+
+        GenRegister temp = getRelativeAddress(sel, addr, space, bti.bti[x]);
+        sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
+        if(x > 0) {
+          sel.push();
+            if(sel.isScalarReg(dst[0].reg())) {
+              sel.curr.noMask = 1;
+              sel.curr.execWidth = 1;
+            }
+            for (uint32_t y = 0; y < valueNum; y++)
+              sel.ADD(dst[y], dst[y], dst2[y]);
+          sel.pop();
+        }
+      }
+    }
+
     void emitUntypedRead(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         uint32_t bti) const
+                         ir::BTI bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
+      vector<GenRegister> dst2(valueNum);
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
-      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
+        dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+      readDWord(sel, dst, dst2, addr, valueNum, insn.getAddressSpace(), bti);
     }
 
     void emitDWordGather(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         uint32_t bti) const
+                         ir::BTI bti) const
     {
       using namespace ir;
+      GBE_ASSERT(bti.count == 1);
       const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
       GBE_ASSERT(insn.getValueNum() == 1);
 
@@ -2740,7 +2770,7 @@ namespace gbe
         GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
         sel.push();
           sel.curr.noMask = 1;
-          sel.SAMPLE(&dst, 1, &addr, 1, bti, 0, true, true);
+          sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
         sel.pop();
         return;
       }
@@ -2756,63 +2786,34 @@ namespace gbe
         sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
       sel.pop();
 
-      sel.DWORD_GATHER(dst, addrDW, bti);
+      sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
     }
 
     void emitRead64(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         uint32_t bti) const
+                         ir::BTI bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-
+      GBE_ASSERT(bti.count == 1);
       GenRegister dst[valueNum];
+      GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
-      sel.READ64(addr, dst, valueNum, bti);
+      sel.READ64(tmpAddr, dst, valueNum, bti.bti[0]);
     }
 
-    void emitByteGather(Selection::Opaque &sel,
-                        const ir::LoadInstruction &insn,
+    void readByteAsDWord(Selection::Opaque &sel,
                         const uint32_t elemSize,
                         GenRegister address,
-                        uint32_t bti) const
+                        GenRegister dst,
+                        uint32_t simdWidth,
+                        uint8_t bti) const
     {
       using namespace ir;
-      const uint32_t valueNum = insn.getValueNum();
-      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
-                                 1 : sel.ctx.getSimdWidth();
-      if(valueNum > 1) {
-        vector<GenRegister> dst(valueNum);
-        const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
-
-        if(elemSize == GEN_BYTE_SCATTER_WORD) {
-          for(uint32_t i = 0; i < valueNum; i++)
-            dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
-        } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
-          for(uint32_t i = 0; i < valueNum; i++)
-            dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
-        }
-
-        uint32_t tmpRegNum = typeSize*valueNum / 4;
-        vector<GenRegister> tmp(tmpRegNum);
-        for(uint32_t i = 0; i < tmpRegNum; i++) {
-          tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        }
-
-        sel.UNTYPED_READ(address, tmp.data(), tmpRegNum, bti);
-
-        for(uint32_t i = 0; i < tmpRegNum; i++) {
-          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
-        }
-     } else {
-        GBE_ASSERT(insn.getValueNum() == 1);
-        const GenRegister value = sel.selReg(insn.getValue(0));
-        GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
-
         Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
         GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
         GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
@@ -2836,10 +2837,65 @@ namespace gbe
           sel.SHR(tmpData, tmpData, tmpAddr);
 
           if (elemSize == GEN_BYTE_SCATTER_WORD)
-            sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
           else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-            sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
         sel.pop();
+    }
+
+    void emitByteGather(Selection::Opaque &sel,
+                        const ir::LoadInstruction &insn,
+                        const uint32_t elemSize,
+                        GenRegister address,
+                        ir::BTI bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+                                 1 : sel.ctx.getSimdWidth();
+      RegisterFamily family = getFamily(insn.getValueType());
+
+      if(valueNum > 1) {
+        vector<GenRegister> dst(valueNum);
+        const uint32_t typeSize = getFamilySize(family);
+
+        for(uint32_t i = 0; i < valueNum; i++)
+          dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        vector<GenRegister> tmp2(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        }
+
+        readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+        }
+      } else {
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
+        GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+        GenRegister tmp = value;
+
+        for (int x = 0; x < bti.count; x++) {
+          if (x > 0)
+            tmp = sel.selReg(sel.reg(family, simdWidth == 1), insn.getValueType());
+
+          GenRegister addr = getRelativeAddress(sel, address, insn.getAddressSpace(), bti.bti[x]);
+          readByteAsDWord(sel, elemSize, addr, tmp, simdWidth, bti.bti[x]);
+          if (x > 0) {
+            sel.push();
+              if (simdWidth == 1) {
+                sel.curr.noMask = 1;
+                sel.curr.execWidth = 1;
+              }
+              sel.ADD(value, value, tmp);
+            sel.pop();
+          }
+        }
       }
     }
 
@@ -2855,6 +2911,18 @@ namespace gbe
       sel.INDIRECT_MOVE(dst, src);
     }
 
+    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, ir::AddressSpace space, uint8_t bti) const {
+      if(space == ir::MEM_LOCAL || space == ir::MEM_CONSTANT)
+        return address;
+
+      sel.push();
+        sel.curr.noMask = 1;
+        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
+      sel.pop();
+      return temp;
+    }
+
     INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
       using namespace ir;
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
@@ -2871,24 +2939,32 @@ namespace gbe
         sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
         address = temp;
       }
-      if (insn.getAddressSpace() == MEM_CONSTANT) {
+      BTI bti;
+      if (space == MEM_CONSTANT || space == MEM_LOCAL) {
+        bti.bti[0] = space == MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+        bti.count = 1;
+      } else {
+        bti = insn.getBTI();
+      }
+      if (space == MEM_CONSTANT) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
         // So, byte/short data type, we have to read through data cache.
         if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitRead64(sel, insn, address, 0x2);
+          this->emitRead64(sel, insn, address, bti);
         else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitDWordGather(sel, insn, address, 0x2);
+          this->emitDWordGather(sel, insn, address, bti);
         else {
-          this->emitByteGather(sel, insn, elemSize, address, 0x2);
+          this->emitByteGather(sel, insn, elemSize, address, bti);
+        }
+      } else {
+        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitRead64(sel, insn, address, bti);
+        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitUntypedRead(sel, insn, address, bti);
+        else {
+          this->emitByteGather(sel, insn, elemSize, address, bti);
         }
-      }
-      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
-      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-        this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
-      else {
-        this->emitByteGather(sel, insn, elemSize, address, space == MEM_LOCAL ? 0xfe : 0x01);
       }
       return true;
     }
@@ -2977,7 +3053,6 @@ namespace gbe
     {
       using namespace ir;
       const AddressSpace space = insn.getAddressSpace();
-      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
@@ -2986,12 +3061,29 @@ namespace gbe
         sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
         address = temp;
       }
-      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitWrite64(sel, insn, address, bti);
-      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-        this->emitUntypedWrite(sel, insn, address, bti);
-      else {
-        this->emitByteScatter(sel, insn, elemSize, address, bti);
+      if(space == MEM_LOCAL) {
+        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitWrite64(sel, insn, address, 0xfe);
+        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitUntypedWrite(sel, insn, address,  0xfe);
+        else
+          this->emitByteScatter(sel, insn, elemSize, address, 0xfe);
+      } else {
+        BTI bti = insn.getBTI();
+        for (int x = 0; x < bti.count; x++) {
+          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+          sel.push();
+            sel.curr.noMask = 1;
+            sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti.bti[x]), ir::TYPE_U32)));
+          sel.pop();
+          if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+            this->emitWrite64(sel, insn, temp, bti.bti[x]);
+          else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+            this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
+          else {
+            this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
+          }
+        }
       }
       return true;
     }
@@ -3391,20 +3483,32 @@ namespace gbe
       using namespace ir;
       const AtomicOps atomicOp = insn.getAtomicOpcode();
       const AddressSpace space = insn.getAddressSpace();
-      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const uint32_t srcNum = insn.getSrcNum();
+
       GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
       GenRegister src1 = src0, src2 = src0;
       if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
       if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
-      if(space == MEM_LOCAL && sel.needPatchSLMAddr()){
-        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
-        src0 = temp;
+      if(space == MEM_LOCAL) {
+        if (sel.needPatchSLMAddr()) {
+          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+          sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+          src0 = temp;
+        }
+        sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
+      } else {
+        ir::BTI b = insn.getBTI();
+        for (int x = 0; x < b.count; x++) {
+          sel.push();
+            sel.curr.noMask = 1;
+            GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+            sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
+          sel.pop();
+          sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
+        }
       }
-      sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
       return true;
     }
     DECL_CTOR(AtomicInstruction, 1, 1);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 57ef515..20c1807 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -260,7 +260,7 @@ namespace gbe {
       OUT_UPDATE_SZ(arg.type);
       OUT_UPDATE_SZ(arg.size);
       OUT_UPDATE_SZ(arg.align);
-      OUT_UPDATE_SZ(arg.bufSize);
+      OUT_UPDATE_SZ(arg.bti);
     }
 
     OUT_UPDATE_SZ(patches.size());
@@ -349,7 +349,7 @@ namespace gbe {
       IN_UPDATE_SZ(arg.type);
       IN_UPDATE_SZ(arg.size);
       IN_UPDATE_SZ(arg.align);
-      IN_UPDATE_SZ(arg.bufSize);
+      IN_UPDATE_SZ(arg.bti);
     }
 
     IN_UPDATE_SZ(patch_num);
@@ -465,7 +465,7 @@ namespace gbe {
       outs << spaces_nl << "      type value: "<< arg.type << "\n";
       outs << spaces_nl << "      size: "<< arg.size << "\n";
       outs << spaces_nl << "      align: "<< arg.align << "\n";
-      outs << spaces_nl << "      bufSize: "<< arg.bufSize << "\n";
+      outs << spaces_nl << "      bti: "<< arg.bti << "\n";
     }
 
     outs << spaces_nl << "  Patches Number is " << patches.size() << "\n";
@@ -1016,6 +1016,12 @@ namespace gbe {
     return kernel->getArgSize(argID);
   }
 
+  static uint8_t kernelGetArgBTI(gbe_kernel genKernel, uint32_t argID) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgBTI(argID);
+  }
+
   static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
     if (genKernel == NULL) return 0u;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -1181,6 +1187,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
@@ -1227,6 +1234,7 @@ namespace gbe
       gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
       gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
       gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+      gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
       gbe_kernel_get_arg_type = gbe::kernelGetArgType;
       gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
       gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 2a3841a..330a3de 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -62,6 +62,11 @@ enum gbe_get_arg_info_value {
   GBE_GET_ARG_INFO_INVALID = 0xffffffff
 };
 
+// BTI magic number
+#define BTI_CONSTANT 0
+#define BTI_PRIVATE 1
+#define BTI_RESERVED_NUM 2
+
 /*! Constant buffer values (ie values to setup in the constant buffer) */
 enum gbe_curbe_type {
   GBE_CURBE_LOCAL_ID_X = 0,
@@ -283,6 +288,10 @@ extern gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info;
 typedef uint32_t (gbe_kernel_get_arg_size_cb)(gbe_kernel, uint32_t argID);
 extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
 
+/*! Get the bti of a __global buffer */
+typedef uint8_t (gbe_kernel_get_arg_bti_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti;
+
 /*! Get the type of the given argument */
 typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
 extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index b780c42..a6303b9 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -49,7 +49,7 @@ namespace gbe {
     gbe_arg_type type; //!< Pointer, structure, image, regular value?
     uint32_t size;     //!< Size of the argument
     uint32_t align;    //!< addr alignment of the argument
-    uint32_t bufSize;  //!< Contant buffer size
+    uint8_t bti;      //!< binding table index for __global buffer
     ir::FunctionArgument::InfoFromLLVM info;
   };
 
@@ -91,6 +91,11 @@ namespace gbe {
     INLINE uint32_t getArgSize(uint32_t argID) const {
       return argID >= argNum ? 0u : args[argID].size;
     }
+    /*! Return the bti for a __global buffer */
+    INLINE uint8_t getArgBTI(uint32_t argID) const {
+      return argID >= argNum ? 0u : args[argID].bti;
+    }
+    /*! Return the alignment of the buffer argument */
     INLINE uint32_t getArgAlign(uint32_t argID) const {
       return argID >= argNum ? 0u : args[argID].align;
     }
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index bc032de..2f02b34 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -49,6 +49,7 @@ struct BinInterpCallBackInitializer
     gbe_kernel_get_name = gbe::kernelGetName;
     gbe_kernel_get_arg_type = gbe::kernelGetArgType;
     gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+    gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
     gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
     gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
     gbe_kernel_use_slm = gbe::kernelUseSLM;
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index bfbe825..1528a8d 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -108,10 +108,10 @@ namespace ir {
   }
 
   void Context::input(const std::string &name, FunctionArgument::Type type, Register reg,
-                      FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align) {
+                      FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align, unsigned char bti) {
     GBE_ASSERTM(fn != NULL, "No function currently defined");
     GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
-    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info);
+    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
     fn->args.push_back(arg);
   }
 
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 27ff4e9..8718a86 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -110,7 +110,7 @@ namespace ir {
     LabelIndex label(void);
     /*! Append a new input register for the function */
     void input(const std::string &name, FunctionArgument::Type type, Register reg,
-               FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0);
+               FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0, uint8_t bti = 0);
     /*! Append a new output register for the function */
     void output(Register reg);
     /*! Get the immediate value */
@@ -163,22 +163,22 @@ namespace ir {
 
     /*! LOAD with the destinations directly specified */
     template <typename... Args>
-    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->LOAD(type, index, offset, space, valueNum, dwAligned);
+      this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
     }
 
     /*! STORE with the sources directly specified */
     template <typename... Args>
-    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->STORE(type, index, offset, space, valueNum, dwAligned);
+      this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
     }
 
   protected:
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 2710b17..deb7552 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -176,8 +176,8 @@ namespace ir {
     };
 
     /*! Create a function input argument */
-    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info) :
-      type(type), reg(reg), size(size), align(align), name(name), info(info) { }
+    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info, uint8_t bti) :
+      type(type), reg(reg), size(size), align(align), name(name), info(info), bti(bti) { }
 
     Type type;     //!< Gives the type of argument we have
     Register reg;  //!< Holds the argument
@@ -185,6 +185,7 @@ namespace ir {
     uint32_t align; //!< address alignment for the argument
     const std::string name; //!< Holds the function name for IR output
     InfoFromLLVM info;  //!< Holds the llvm passed info
+    uint8_t bti; //!< binding table index
     GBE_STRUCT(FunctionArgument); // Use custom allocator
   };
 
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index ee80a3d..a9b1563 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -94,8 +94,9 @@ namespace ir {
   }
 
   void ImageSet::getData(struct ImageInfo *imageInfos) const {
+      int id = 0;
       for(auto &it : regMap)
-        imageInfos[it.second->idx - gbe_get_image_base_index()] = *it.second;
+        imageInfos[id++] = *it.second;
   }
 
   ImageSet::~ImageSet() {
@@ -186,7 +187,7 @@ namespace ir {
       IN_UPDATE_SZ(img_info->channelOrderSlot);
       IN_UPDATE_SZ(img_info->dimOrderSlot);
 
-      indexMap.insert(std::make_pair(index, img_info));
+      indexMap.insert(std::make_pair(img_info->idx, img_info));
     }
 
     IN_UPDATE_SZ(magic);
@@ -252,7 +253,7 @@ namespace ir {
     return reg;
   }
 
-  void ImageSet::append(Register imageReg, Context *ctx)
+  void ImageSet::append(Register imageReg, Context *ctx, uint8_t bti)
   {
     ir::FunctionArgument *arg =  ctx->getFunction().getArg(imageReg);
     GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
@@ -261,7 +262,7 @@ namespace ir {
     int32_t id = ctx->getFunction().getArgID(arg);
     struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
     imageInfo->arg_idx = id;
-    imageInfo->idx = regMap.size() + gbe_get_image_base_index();
+    imageInfo->idx = bti;
     imageInfo->wSlot = -1;
     imageInfo->hSlot = -1;
     imageInfo->depthSlot = -1;
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index 82fee56..b31c7da 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -44,7 +44,7 @@ namespace ir {
   {
   public:
     /*! Append an image argument. */
-    void append(Register imageReg, Context *ctx);
+    void append(Register imageReg, Context *ctx, uint8_t bti);
     /*! Append an image info slot. */
     void appendInfo(ImageInfoKey key, uint32_t offset);
     /*! Append an image info register. */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 3006893..23848d3 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -314,6 +314,7 @@ namespace ir {
       AtomicInstruction(AtomicOps atomicOp,
                          Register dst,
                          AddressSpace addrSpace,
+                         BTI bti,
                          Tuple src)
       {
         this->opcode = OP_ATOMIC;
@@ -321,6 +322,7 @@ namespace ir {
         this->dst[0] = dst;
         this->src = src;
         this->addrSpace = addrSpace;
+        this->bti = bti;
         srcNum = 2;
         if((atomicOp == ATOMIC_OP_INC) ||
           (atomicOp == ATOMIC_OP_DEC))
@@ -329,12 +331,14 @@ namespace ir {
           srcNum = 3;
       }
       INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
       INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
       INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       Register dst[1];
       Tuple src;
       AddressSpace addrSpace; //!< Address space
+      BTI bti;               //!< btis that may be accessed
       uint8_t srcNum:2;     //!<Source Number
       AtomicOps atomicOp:6;     //!<Source Number
     };
@@ -400,7 +404,8 @@ namespace ir {
                       Register offset,
                       AddressSpace addrSpace,
                       uint32_t valueNum,
-                      bool dwAligned)
+                      bool dwAligned,
+                      BTI bti)
       {
         GBE_ASSERT(valueNum < 128);
         this->opcode = OP_LOAD;
@@ -410,6 +415,7 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->bti = bti;
       }
       INLINE Register getDst(const Function &fn, uint32_t ID) const {
         GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
@@ -423,6 +429,7 @@ namespace ir {
       INLINE Type getValueType(void) const { return type; }
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
@@ -431,6 +438,7 @@ namespace ir {
       Register offset;        //!< Alias to make it similar to store
       Tuple values;           //!< Values to load
       AddressSpace addrSpace; //!< Where to load
+      BTI bti;
       uint8_t valueNum:7;     //!< Number of values to load
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
     };
@@ -444,7 +452,8 @@ namespace ir {
                        Register offset,
                        AddressSpace addrSpace,
                        uint32_t valueNum,
-                       bool dwAligned)
+                       bool dwAligned,
+                       BTI bti)
       {
         GBE_ASSERT(valueNum < 255);
         this->opcode = OP_STORE;
@@ -454,6 +463,7 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->bti = bti;
       }
       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
         GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
@@ -473,6 +483,7 @@ namespace ir {
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE Type getValueType(void) const { return type; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
@@ -480,6 +491,7 @@ namespace ir {
       Register offset;        //!< First source is the offset where to store
       Tuple values;           //!< Values to store
       AddressSpace addrSpace; //!< Where to store
+      BTI bti;                //!< btis the store may access
       uint8_t valueNum:7;     //!< Number of values to store
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
       Register dst[0];        //!< No destination
@@ -1069,6 +1081,9 @@ namespace ir {
       out << " {" << "%" << this->getSrc(fn, 0) << "}";
       for (uint32_t i = 1; i < srcNum; ++i)
         out << " %" << this->getSrc(fn, i);
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
     }
 
 
@@ -1103,6 +1118,9 @@ namespace ir {
         out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
       out << "}";
       out << " %" << this->getSrc(fn, 0);
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
     }
 
     INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1112,6 +1130,9 @@ namespace ir {
       for (uint32_t i = 0; i < valueNum; ++i)
         out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
       out << "}";
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
     }
 
     INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1183,7 +1204,7 @@ namespace ir {
   return HelperIntrospection<CLASS, RefClass>::value == 1;
 
 #define START_INTROSPECTION(CLASS) \
-  static_assert(sizeof(internal::CLASS) == sizeof(uint64_t), \
+  static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*2), \
                 "Bad instruction size"); \
   static_assert(offsetof(internal::CLASS, opcode) == 0, \
                 "Bad opcode offset"); \
@@ -1427,14 +1448,17 @@ DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
 DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
 DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
@@ -1578,8 +1602,8 @@ DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImag
   }
 
   // For all unary functions with given opcode
-  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Tuple src) {
-    return internal::AtomicInstruction(atomicOp, dst, space, src).convert();
+  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
+    return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
   }
 
   // BRA
@@ -1621,9 +1645,10 @@ DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImag
                    Register offset, \
                    AddressSpace space, \
                    uint32_t valueNum, \
-                   bool dwAligned) \
+                   bool dwAligned, \
+                   BTI bti) \
   { \
-    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned).convert(); \
+    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
   }
 
   DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index ada780f..e4c415e 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -31,9 +31,14 @@
 #include "sys/intrusive_list.hpp"
 
 #include <ostream>
+#define MAX_MIXED_POINTER 4
 
 namespace gbe {
 namespace ir {
+  struct BTI {
+    uint8_t bti[MAX_MIXED_POINTER];
+    uint8_t count;
+  };
 
   /*! All opcodes */
   enum Opcode : uint8_t {
@@ -95,7 +100,7 @@ namespace ir {
   ///////////////////////////////////////////////////////////////////////////
 
   /*! Stores instruction internal data and opcode */
-  class ALIGNED(sizeof(uint64_t)) InstructionBase
+  class ALIGNED(sizeof(uint64_t)*2) InstructionBase
   {
   public:
     /*! Initialize the instruction from a 8 bytes stream */
@@ -109,7 +114,7 @@ namespace ir {
     /*! Get the instruction opcode */
     INLINE Opcode getOpcode(void) const { return opcode; }
   protected:
-    enum { opaqueSize = sizeof(uint64_t)-sizeof(uint8_t) };
+    enum { opaqueSize = sizeof(uint64_t)*2-sizeof(uint8_t) };
     Opcode opcode;               //!< Idendifies the instruction
     char opaque[opaqueSize];     //!< Remainder of it
     GBE_CLASS(InstructionBase);  //!< Use internal allocators
@@ -273,6 +278,7 @@ namespace ir {
     static const uint32_t addressIndex = 0;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
+    BTI getBTI(void) const;
     /*! Return the atomic function code */
     AtomicOps getAtomicOpcode(void) const;
     /*! Return the register that contains the addresses */
@@ -292,6 +298,7 @@ namespace ir {
     Type getValueType(void) const;
     /*! Give the number of values the instruction is storing (srcNum-1) */
     uint32_t getValueNum(void) const;
+    BTI getBTI(void) const;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
     /*! DWORD aligned means untyped read for Gen. That is what matters */
@@ -323,6 +330,7 @@ namespace ir {
     bool isAligned(void) const;
     /*! Return the register that contains the addresses */
     INLINE Register getAddress(void) const { return this->getSrc(0u); }
+    BTI getBTI(void) const;
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       return this->getDst(valueID);
@@ -644,7 +652,7 @@ namespace ir {
   /*! F32TO16.{dstType <- srcType} dst src */
   Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
-  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Tuple src);
+  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
   /*! bra labelIndex */
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
@@ -658,9 +666,9 @@ namespace ir {
   /*! ret */
   Instruction RET(void);
   /*! load.type.space {dst1,...,dst_valueNum} offset value */
-  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned);
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
   /*! store.type.space offset {src1,...,src_valueNum} value */
-  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned);
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
   /*! loadi.type dst value */
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
   /*! sync.params... (see Sync instruction) */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index e6aff27..2ab12a4 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -158,6 +158,7 @@
 #include "ir/value.hpp"
 #include "sys/set.hpp"
 #include "sys/cvar.hpp"
+#include "backend/program.h"
 
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
@@ -449,6 +450,7 @@ namespace gbe
      *  compare instructions we need to invert to decrease branch complexity
      */
     set<const Value*> conditionSet;
+    map<const Value*, int> globalPointer;
     /*!
      *  <phi,phiCopy> node information for later optimization
      */
@@ -463,7 +465,7 @@ namespace gbe
 
     LoopInfo *LI;
     const Module *TheModule;
-
+    int btiBase;
   public:
     static char ID;
     explicit GenWriter(ir::Unit &unit)
@@ -472,7 +474,8 @@ namespace gbe
         ctx(unit),
         regTranslator(ctx),
         LI(0),
-        TheModule(0)
+        TheModule(0),
+        btiBase(BTI_RESERVED_NUM)
     {
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
       pass = PASS_EMIT_REGISTERS;
@@ -503,6 +506,9 @@ namespace gbe
       LI = &getAnalysis<LoopInfo>();
       emitFunction(F);
       phiMap.clear();
+      globalPointer.clear();
+      // Reset for next function
+      btiBase = BTI_RESERVED_NUM;
       return false;
     }
 
@@ -594,10 +600,12 @@ namespace gbe
     void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
     void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
     template <bool isLoad, typename T> void visitLoadOrStore(T &I);
+
+    INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
     // batch vec4/8/16 load/store
     INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                   Value *llvmValue, const ir::Register ptr,
-                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad);
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
   };
 
@@ -1193,7 +1201,7 @@ namespace gbe
           const uint32_t elemSize = getTypeByteSize(unit, elemType);
           const uint32_t elemNum = vectorType->getNumElements();
           //vector's elemType always scalar type
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type));
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type), 0);
 
           ir::Function& fn = ctx.getFunction();
           for(uint32_t i=1; i < elemNum; i++) {
@@ -1208,7 +1216,7 @@ namespace gbe
                     "vector type in the function argument is not supported yet");
         const ir::Register reg = getRegister(I);
         if (type->isPointerTy() == false)
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type));
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
         else {
           PointerType *pointerType = dyn_cast<PointerType>(type);
           Type *pointed = pointerType->getElementType();
@@ -1219,7 +1227,7 @@ namespace gbe
           if (I->hasByValAttr()) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
             const size_t structSize = getTypeByteSize(unit, pointed);
-            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type));
+            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
           }
           // Regular user provided pointer (global, local or constant)
           else {
@@ -1229,18 +1237,20 @@ namespace gbe
             const uint32_t align = getAlignmentByte(unit, pointed);
               switch (addrSpace) {
               case ir::MEM_GLOBAL:
-                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align);
+                globalPointer.insert(std::make_pair(I, btiBase));
+                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
+                btiBase++;
               break;
               case ir::MEM_LOCAL:
-                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align);
+                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, 0xfe);
                 ctx.getFunction().setUseSLM(true);
               break;
               case ir::MEM_CONSTANT:
-                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg,  llvmInfo, ptrSize, align);
+                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg,  llvmInfo, ptrSize, align, 0x2);
               break;
               case ir::IMAGE:
-                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align);
-                ctx.getFunction().getImageSet()->append(reg, &ctx);
+                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0);
+                ctx.getFunction().getImageSet()->append(reg, &ctx, btiBase++);
               break;
               default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
             }
@@ -2489,6 +2499,8 @@ namespace gbe
     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register dst = this->getRegister(&I);
 
+    ir::BTI bti;
+    gatherBTI(*AI, bti);
     vector<ir::Register> src;
     uint32_t srcNum = 0;
     while(AI != AE) {
@@ -2496,7 +2508,7 @@ namespace gbe
       srcNum++;
     }
     const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
-    ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
+    ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
   }
 
   /* append a new sampler. should be called before any reference to
@@ -3170,7 +3182,7 @@ handle_write_image:
   void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                                       Value *llvmValues, const ir::Register ptr,
                                       const ir::AddressSpace addrSpace,
-                                      Type * elemType, bool isLoad) {
+                                      Type * elemType, bool isLoad, ir::BTI bti) {
     const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
     uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
     uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3216,12 +3228,107 @@ handle_write_image:
 
       // Emit the instruction
       if (isLoad)
-        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true);
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
       else
-        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true);
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
     }
   }
 
+  // The idea behind this is to search along the use-def chain and find
+  // all possible sources of the pointer. Then, in later codeGen, we can
+  // emit load/store instructions to the btis gathered.
+  void GenWriter::gatherBTI(Value *pointer, ir::BTI &bti) {
+    typedef map<const Value*, int>::iterator GlobalPtrIter;
+    Value *p;
+    size_t idx = 0;
+    int nBTI = 0;
+    std::vector<Value*> candidates;
+    candidates.push_back(pointer);
+    std::set<Value*> processed;
+    bool needNewBTI = true;
+
+    while (idx < candidates.size()) {
+      bool isPrivate = false;
+      p = candidates[idx];
+
+      while (dyn_cast<User>(p)) {
+
+        if (processed.find(p) == processed.end()) {
+          processed.insert(p);
+        } else {
+          // This use-def chain falls into a loop, so it
+          // does not introduce a new buffer source.
+          needNewBTI = false;
+          break;
+        }
+
+        if (dyn_cast<SelectInst>(p)) {
+          SelectInst *sel = cast<SelectInst>(p);
+          p = sel->getTrueValue();
+          candidates.push_back(sel->getFalseValue());
+          continue;
+        }
+
+        if (dyn_cast<PHINode>(p)) {
+          PHINode* phi = cast<PHINode>(p);
+          int n = phi->getNumIncomingValues();
+          for (int j = 1; j < n; j++)
+            candidates.push_back(phi->getIncomingValue(j));
+          p = phi->getIncomingValue(0);
+          continue;
+        }
+
+        if (dyn_cast<AllocaInst>(p)) {
+          isPrivate = true;
+          break;
+        }
+        p = cast<User>(p)->getOperand(0);
+      }
+
+      if (needNewBTI == false) {
+        // go to next possible pointer source
+        idx++; continue;
+      }
+
+      uint8_t new_bti = 0;
+      if (isPrivate) {
+        new_bti = BTI_PRIVATE;
+      } else {
+        if(isa<Argument>(p) && dyn_cast<Argument>(p)->hasByValAttr()) {
+          // The structure value implementation is not complete yet;
+          // structures are currently treated as push constants, so the
+          // load/store here is not really meaningful.
+          bti.bti[0] = BTI_PRIVATE;
+          bti.count = 1;
+          break;
+        }
+        Type *ty = p->getType();
+        if(ty->getPointerAddressSpace() == 3) {
+          // __local memory
+          new_bti = 0xfe;
+        } else {
+          // __global memory
+          GlobalPtrIter iter = globalPointer.find(p);
+          GBE_ASSERT(iter != globalPointer.end());
+          new_bti = iter->second;
+        }
+      }
+      // avoid duplicate
+      bool bFound = false;
+      for (int j = 0; j < nBTI; j++) {
+        if (bti.bti[j] == new_bti) {
+          bFound = true; break;
+        }
+      }
+      if (bFound == false) {
+        bti.bti[nBTI++] = new_bti;
+        bti.count = nBTI;
+      }
+      idx++;
+    }
+    GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
+  }
+
   extern int OCL_SIMD_WIDTH;
   template <bool isLoad, typename T>
   INLINE void GenWriter::emitLoadOrStore(T &I)
@@ -3233,15 +3340,18 @@ handle_write_image:
     const bool dwAligned = (I.getAlignment() % 4) == 0;
     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register ptr = this->getRegister(llvmPtr);
-
+    ir::BTI binding;
+    if(addrSpace == ir::MEM_GLOBAL || addrSpace == ir::MEM_PRIVATE) {
+      gatherBTI(llvmPtr, binding);
+    }
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
-        ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
+        ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
       else
-        ctx.STORE(type, ptr, addrSpace, dwAligned, values);
+        ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
     }
     // A vector type requires to build a tuple
     else {
@@ -3285,18 +3395,18 @@ handle_write_image:
 
           // Emit the instruction
           if (isLoad)
-            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned);
+            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
           else
-            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned);
+            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
         }
         // Not supported by the hardware. So, we split the message and we use
         // strided loads and stores
         else {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
         }
       }
       else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
           if(regTranslator.isUndefConst(llvmValues, elemID))
@@ -3316,9 +3426,9 @@ handle_write_image:
               ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
           }
           if (isLoad)
-           ctx.LOAD(type, addr, addrSpace, dwAligned, reg);
+           ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
           else
-           ctx.STORE(type, addr, addrSpace, dwAligned, reg);
+           ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
         }
       }
     }
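
The duplicate-avoidance step in gatherBTI() above is worth isolating. Below is a minimal standalone sketch of the same append-if-absent behaviour; the struct name and the add_bti() helper are illustrative, not the backend's actual types:

  #include <assert.h>

  #define MAX_MIXED_POINTER 4   /* assumed limit, matching the assert above */

  struct bti_list {
    unsigned char bti[MAX_MIXED_POINTER];
    int count;
  };

  /* Append new_bti only if it is not already recorded, so a mixed
   * pointer never lists the same surface twice. */
  static void add_bti(struct bti_list *l, unsigned char new_bti)
  {
    for (int j = 0; j < l->count; j++)
      if (l->bti[j] == new_bti)
        return;                  /* duplicate: nothing to do */
    assert(l->count < MAX_MIXED_POINTER);
    l->bti[l->count++] = new_bti;
  }
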
diff --git a/kernels/compiler_mixed_pointer.cl b/kernels/compiler_mixed_pointer.cl
new file mode 100644
index 0000000..3da90be
--- /dev/null
+++ b/kernels/compiler_mixed_pointer.cl
@@ -0,0 +1,24 @@
+
+kernel void compiler_mixed_pointer(__global uint* src1, __global uint *src2, __global uint *dst) {
+  int x = get_global_id(0);
+  global uint * tmp = NULL;
+
+  switch(x) {
+    case 0:
+    case 1:
+    case 4:
+      tmp = src1;
+      break;
+    default:
+      tmp = src2;
+      break;
+  }
+  dst[x] = tmp[x];
+}
+
+kernel void compiler_mixed_pointer1(__global uint* src, __global uint *dst1, __global uint *dst2) {
+  int x = get_global_id(0);
+  global uint * tmp = x < 5 ? dst1 : dst2;
+  tmp[x] = src[x];
+}
+
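
To see why the kernels above need more than one bti on a single access: tmp may point at either buffer depending on the work-item id, and each buffer now sits behind its own bounds-checked surface. The following is a CPU-side analogue only, not the actual generated GPU code; the surface bases and element counts are hypothetical:

  #include <stdint.h>
  #include <stddef.h>

  struct surface { const uint32_t *base; size_t nelem; };

  /* Try each candidate surface; an address that falls inside none of
   * them behaves like the HW out-of-bound read and returns zero. */
  static uint32_t mixed_load(const uint32_t *addr,
                             const struct surface *surf, int nbti)
  {
    for (int i = 0; i < nbti; i++)
      if (addr >= surf[i].base && addr < surf[i].base + surf[i].nelem)
        return *addr;
    return 0;
  }
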
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 842c864..05be801 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -167,9 +167,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
       struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cl_gpgpu_get_cache_ctrl());
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
     } else {
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cl_gpgpu_get_cache_ctrl());
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
     }
   }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 58ecc5a..a4c8af7 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -127,7 +127,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
   if(raw_size == 0)
      return 0;
 
-  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size);
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size, BTI_CONSTANT);
   if (bo == NULL)
     return -1;
   cl_buffer_map(bo, 1);
@@ -255,8 +255,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
    */
   if(cl_driver_get_ver(ctx->drv) == 75)
     stack_sz *= 4;
-
-  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
+  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
 }
 
 static int
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 374813a..461c11e 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -113,7 +113,7 @@ typedef void (cl_gpgpu_sync_cb)(void*);
 extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 
 /* Bind a regular unformatted buffer */
-typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t cchint);
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t size, uint8_t bti);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
 /* bind samplers defined in both kernel and kernel args. */
@@ -158,7 +158,7 @@ extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
 typedef int (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
 extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
 
-typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size, uint8_t bti);
 extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
 
 /* Setup all indirect states */
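
Call sites now pass the buffer size and the kernel-assigned bti instead of a cache-control hint, as the cl_command_queue.c hunk above shows. A hypothetical wrapper (the helper name is made up; gpgpu, k, mem, curbe_offset and arg_index are assumed to be set up as in cl_command_queue_bind_surface()) would look like:

  /* Bind one buffer argument under the new signature: the size lets the
   * HW bounds-check the surface, and the bti selects its binding slot. */
  static void bind_one_arg(cl_gpgpu gpgpu, cl_kernel k, cl_mem mem,
                           uint32_t curbe_offset, uint32_t arg_index)
  {
    cl_gpgpu_bind_buf(gpgpu, mem->bo,
                      curbe_offset,   /* pointer's location in the curbe */
                      0,              /* internal_offset: buffer start */
                      mem->size,      /* surface size for HW bounds check */
                      interp_kernel_get_arg_bti(k->opaque, arg_index));
  }
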
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index c2a61c4..5f2f9ce 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -47,6 +47,7 @@ gbe_kernel_get_code_cb *interp_kernel_get_code = NULL;
 gbe_kernel_get_code_size_cb *interp_kernel_get_code_size = NULL;
 gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num = NULL;
 gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size = NULL;
+gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti = NULL;
 gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type = NULL;
 gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align = NULL;
 gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width = NULL;
@@ -143,6 +144,10 @@ struct GbeLoaderInitializer
     if (interp_kernel_get_arg_size == NULL)
       return false;
 
+    interp_kernel_get_arg_bti = *(gbe_kernel_get_arg_bti_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_bti");
+    if (interp_kernel_get_arg_bti == NULL)
+      return false;
+
     interp_kernel_get_arg_type = *(gbe_kernel_get_arg_type_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_type");
     if (interp_kernel_get_arg_type == NULL)
       return false;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index f855390..d095240 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -47,6 +47,7 @@ extern gbe_kernel_get_code_cb *interp_kernel_get_code;
 extern gbe_kernel_get_code_size_cb *interp_kernel_get_code_size;
 extern gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num;
 extern gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size;
+extern gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti;
 extern gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type;
 extern gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align;
 extern gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width;
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index aad3c04..94b834b 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -204,7 +204,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   k->args[index].mem = mem;
   k->args[index].is_set = 1;
   k->args[index].local_sz = 0;
-
+  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 09362b3..f4ed8d3 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -37,6 +37,7 @@ struct _gbe_kernel;
 typedef struct cl_argument {
   cl_mem mem;           /* For image and regular buffers */
   cl_sampler sampler;   /* For sampler. */
+  unsigned char bti;
   uint32_t local_sz:31; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
 } cl_argument;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 7986ab1..e0c00ea 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -91,7 +91,6 @@ struct intel_gpgpu
 
   unsigned long img_bitmap;              /* image usage bitmap. */
   unsigned int img_index_base;          /* base index for image surface.*/
-  drm_intel_bo *binded_img[max_img_n + 128];  /* all images binded for the call */
 
   unsigned long sampler_bitmap;          /* sampler usage bitmap. */
 
@@ -690,13 +689,13 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
 }
 
 static dri_bo*
-intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size)
+intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
   uint32_t s = size - 1;
   assert(size != 0);
 
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
   memset(ss2, 0, sizeof(gen7_surface_state_t));
   ss2->ss0.surface_type = I965_SURFACE_BUFFER;
   ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
@@ -704,7 +703,7 @@ intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size)
   ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
   ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
   ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
-  heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti * sizeof(gen7_surface_state_t);
 
   if(gpgpu->constant_b.bo)
     dri_bo_unreference(gpgpu->constant_b.bo);
@@ -717,20 +716,20 @@ intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size)
                       I915_GEM_DOMAIN_RENDER,
                       0,
                       gpgpu->aux_offset.surface_heap_offset +
-                      heap->binding_table[2] +
+                      heap->binding_table[bti] +
                       offsetof(gen7_surface_state_t, ss1),
                       gpgpu->constant_b.bo);
   return gpgpu->constant_b.bo;
 }
 
 static dri_bo*
-intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size)
+intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
   uint32_t s = size - 1;
   assert(size != 0);
 
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
   memset(ss2, 0, sizeof(gen7_surface_state_t));
   ss2->ss0.surface_type = I965_SURFACE_BUFFER;
   ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
@@ -742,7 +741,7 @@ intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size)
   ss2->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
   ss2->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
   ss2->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
-  heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti * sizeof(gen7_surface_state_t);
 
   if(gpgpu->constant_b.bo)
     dri_bo_unreference(gpgpu->constant_b.bo);
@@ -755,36 +754,39 @@ intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size)
                       I915_GEM_DOMAIN_RENDER,
                       0,
                       gpgpu->aux_offset.surface_heap_offset +
-                      heap->binding_table[2] +
+                      heap->binding_table[bti] +
                       offsetof(gen7_surface_state_t, ss1),
                       gpgpu->constant_b.bo);
   return gpgpu->constant_b.bo;
 }
 
-
-/* Map address space with two 2GB surfaces. One surface for untyped message and
- * one surface for byte scatters / gathers. Actually the HW does not require two
- * surfaces but Fulsim complains
- */
 static void
-intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
+intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, uint32_t size, unsigned char index)
 {
+  uint32_t s = size - 1;
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0];
-  gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1];
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[index];
   memset(ss0, 0, sizeof(gen7_surface_state_t));
-  memset(ss1, 0, sizeof(gen7_surface_state_t));
-  ss1->ss0.surface_type = ss0->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss1->ss0.surface_format = ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
-  ss1->ss2.width  = ss0->ss2.width  = 127;   /* bits 6:0 of sz */
-  ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
-  ss0->ss3.depth  = 1023; /* bits 30:21 of sz */
-  ss1->ss3.depth  = 1023;  /* bits 30:21 of sz */
-  ss1->ss5.cache_control = ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
-  heap->binding_table[0] = offsetof(surface_heap_t, surface);
-  heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss0->ss2.width  = s & 0x7f;   /* bits 6:0 of sz */
+  ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+  ss0->ss3.depth  = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+  ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+  ss0->ss1.base_addr = buf->offset + internal_offset;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      internal_offset,
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[index] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      buf);
 }
 
+
 static int
 intel_is_surface_array(cl_mem_object_type type)
 {
@@ -863,7 +865,6 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   }
   ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
   intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
-  gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
 
   assert(index < GEN_MAX_SURFACES);
 }
@@ -884,7 +885,6 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
   gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
   memset(ss, 0, sizeof(*ss));
-
   ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
   if (index > 128 + 2 && type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     ss->ss0.surface_type = I965_SURFACE_2D;
@@ -916,20 +916,20 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
   }
   ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
   intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
-  gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
 
   assert(index < GEN_MAX_SURFACES);
 }
 
 static void
 intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
-                     uint32_t internal_offset, uint32_t cchint)
+                     uint32_t internal_offset, uint32_t size, uint8_t bti)
 {
   assert(gpgpu->binded_n < max_buf_n);
   gpgpu->binded_buf[gpgpu->binded_n] = buf;
   gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
   gpgpu->binded_offset[gpgpu->binded_n] = offset;
   gpgpu->binded_n++;
+  intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
 }
 
 static int
@@ -957,11 +957,12 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
   return 0;
 }
 static void
-intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
-  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, cchint);
+
+  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, size, bti);
 }
 
 static void
@@ -1155,7 +1156,6 @@ intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
   gpgpu->ker = kernel;
   intel_gpgpu_build_idrt(gpgpu, kernel);
-  intel_gpgpu_map_address_space(gpgpu);
   dri_bo_unmap(gpgpu->aux_buf.bo);
 }
 
@@ -1378,8 +1378,7 @@ intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint
   }
   memset(bo->virtual, 0, size);
   drm_intel_bo_unmap(bo);
-
-  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, 0);
+  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, 0);
   return 0;
 }
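
The size fields programmed by intel_gpgpu_setup_bti() above deserve a worked example. This minimal sketch (the function name is hypothetical) shows how size - 1 is split across the three surface-state fields, which together cover surfaces up to 2^31 bytes:

  #include <stdint.h>
  #include <assert.h>

  static void encode_surface_size(uint32_t size, uint32_t *width,
                                  uint32_t *height, uint32_t *depth)
  {
    uint32_t s = size - 1;          /* the fields store size - 1 */
    assert(size != 0);
    *width  = s & 0x7f;             /* bits 6:0  */
    *height = (s >> 7) & 0x3fff;    /* bits 20:7 */
    *depth  = (s >> 21) & 0x3ff;    /* bits 30:21 */
    /* e.g. size = 4096: s = 0xfff -> width 127, height 31, depth 0 */
  }
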
 
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 561744d..c6dea8c 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -31,6 +31,7 @@ set (utests_sources
   compiler_insert_to_constant.cpp
   compiler_argument_structure.cpp
   compiler_arith_shift_right.cpp
+  compiler_mixed_pointer.cpp
   compiler_array0.cpp
   compiler_array.cpp
   compiler_array1.cpp
diff --git a/utests/compiler_mixed_pointer.cpp b/utests/compiler_mixed_pointer.cpp
new file mode 100644
index 0000000..20019f0
--- /dev/null
+++ b/utests/compiler_mixed_pointer.cpp
@@ -0,0 +1,120 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src1, int *src2, int *dst) {
+  int * tmp = NULL;
+
+  switch(global_id) {
+    case 0:
+    case 1:
+    case 4:
+      tmp = src1;
+      break;
+    default:
+      tmp = src2;
+      break;
+  }
+  dst[global_id] = tmp[global_id];
+
+}
+static void cpu1(int global_id, int *src, int *dst1, int *dst2) {
+  int * tmp = global_id < 5 ? dst1 : dst2;
+  tmp[global_id] = src[global_id];
+}
+
+void compiler_mixed_pointer(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16], cpu_src1[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mixed_pointer");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run the test (single pass, deterministic data)
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+      cpu_src1[i] = ((int32_t*)buf_data[1])[i] = 65536-i;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_src1, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(2);
+    for (size_t i = 0; i < n; ++i) {
+//      printf(" %d  %d\n", cpu_dst[i], ((int32_t*)buf_data[2])[i]);
+      OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst[i]);
+    }
+    OCL_UNMAP_BUFFER(2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer);
+
+void compiler_mixed_pointer1(void)
+{
+  const size_t n = 16;
+  int cpu_dst1[16], cpu_dst2[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_mixed_pointer", "compiler_mixed_pointer1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run the test (single pass, deterministic data)
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+      cpu_dst1[i] = ((int32_t*)buf_data[1])[i] = 0xff;
+      cpu_dst2[i] = ((int32_t*)buf_data[2])[i] = 0xff;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu1(i, cpu_src, cpu_dst1, cpu_dst2);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (size_t i = 0; i < n; ++i) {
+//      printf(" %d  %d\n", cpu_dst1[i], ((int32_t*)buf_data[1])[i]);
+//      printf(" %d  %d\n", ((int32_t*)buf_data[2])[i], cpu_dst2[i]);
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst1[i]);
+      OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst2[i]);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer1);
+
-- 
1.7.10.4


