[Beignet] [PATCH 1/5] GBE: refactor curbe register allocation.

Zhigang Gong zhigang.gong at intel.com
Sun Sep 13 23:19:32 PDT 2015


The major motivation is to normalize the curbe payload's
allocation and prepare to use liveness information
to avoid unnecessary payload register allocation and avoid
fragmentation when allocating curbe registers. For example,
many one-dimensional kernels don't need
GBE_CURBE_LOCAL_ID_Y/Z. But the previous curbe allocation
occurred before the liveness interval computation, so it
allocated those curbe registers anyway. Although they expire
soon, we still needed to prepare those payloads on the
host side. After this patch, this type of overhead
is easily eliminated.

Another purpose is to eliminate the ugly curbe patch list
handling in the backend. After this patch, the curbe register
handling is much cleaner than before.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/context.cpp            |  14 ----
 backend/src/backend/context.hpp            |  18 ++++-
 backend/src/backend/gen_context.cpp        | 118 ++++++----------------------
 backend/src/backend/gen_context.hpp        |   2 +-
 backend/src/backend/gen_reg_allocation.cpp | 121 ++++++++++++++++++++---------
 backend/src/backend/program.h              |   3 +-
 backend/src/ir/context.cpp                 |   7 +-
 backend/src/ir/context.hpp                 |   3 +-
 backend/src/ir/function.hpp                |  19 ++++-
 backend/src/ir/image.cpp                   |   2 +-
 backend/src/ir/instruction.hpp             |   1 +
 backend/src/ir/profile.cpp                 |  64 +++++++--------
 backend/src/ir/profile.hpp                 |  12 ++-
 backend/src/ir/register.hpp                |  58 ++++++++++++--
 src/cl_command_queue.c                     |   4 +-
 src/cl_command_queue_gen7.c                |  34 ++++----
 src/cl_kernel.c                            |  12 +--
 17 files changed, 266 insertions(+), 226 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 81b284d..a02771a 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -421,20 +421,6 @@ namespace gbe
     return offset;
   }
 
-  uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
-  {
-    int32_t offset = fn.getImageSet()->getInfoOffset(key);
-    if (offset >= 0)
-      return offset + GEN_REG_SIZE;
-    newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
-    std::sort(kernel->patches.begin(), kernel->patches.end());
-
-    offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
-    GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
-    fn.getImageSet()->appendInfo(key, offset);
-    return offset + GEN_REG_SIZE;
-  }
-
   void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
     curbeRegs.insert(std::make_pair(reg, offset));
   }
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 079967d..e1f5a71 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -90,9 +90,6 @@ namespace gbe
     void deallocate(int16_t offset);
     /*! Spilt a block into 2 blocks, for some registers allocate together but  deallocate seperate */
     void splitBlock(int16_t offset, int16_t subOffset);
-    /* allocate a new entry for a specific image's information */
-    /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
-    uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
     int32_t allocateScratchMem(uint32_t size);
     /*! deallocate scratch memory at offset */
@@ -107,6 +104,21 @@ namespace gbe
     uint32_t getMaxLabel(void) const {
       return this->isDWLabel() ? 0xffffffff : 0xffff;
     }
+    /*! get register's payload type. */
+    INLINE void getRegPayloadType(ir::Register reg, gbe_curbe_type &curbeType, int &subType) const {
+      if (reg.value() >= fn.getRegisterFile().regNum()) {
+        curbeType = GBE_GEN_REG;
+        subType = 0;
+        return;
+      }
+      fn.getRegPayloadType(reg, curbeType, subType);
+    }
+    /*! check whether a register is a payload register */
+    INLINE bool isPayloadReg(ir::Register reg) const{
+      if (reg.value() >= fn.getRegisterFile().regNum())
+        return false;
+      return fn.isPayloadReg(reg);
+    }
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 25fdf08..ae02fbe 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -181,9 +181,8 @@ namespace gbe
     GenRegister dst_;
     if (dst.type == GEN_TYPE_UW)
       dst_ = dst;
-    else
-      dst_ = GenRegister::uw16grf(126,0);
-
+    else if (dst.type == GEN_TYPE_UD)
+      dst_ = GenRegister::retype(dst, GEN_TYPE_UW);
     p->push();
       uint32_t execWidth = p->curr.execWidth;
       p->curr.predicate = GEN_PREDICATE_NONE;
@@ -220,26 +219,30 @@ namespace gbe
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
+    // borrow block ip as temporary register as we will
+    // initialize block ip later.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD);
 
     loadLaneID(stackptr);
 
     // We compute the per-lane stack pointer here
     // threadId * perThreadSize + laneId*perLaneSize
     // let private address start from zero
+    //p->MOV(stackptr, GenRegister::immud(0));
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
       p->curr.execWidth = this->simdWidth;
       p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
       p->curr.execWidth = 1;
       if(perThreadSize > 0xffff) {
-        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
-        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
+        p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize));
+        p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
       } else
-        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
+        p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize));
       p->curr.execWidth = this->simdWidth;
-      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+      p->ADD(stackptr, stackptr, tmpReg);
     p->pop();
   }
 
@@ -2203,104 +2206,31 @@ namespace gbe
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
 
-  void GenContext::allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue) {
+  void GenContext::allocCurbeReg(ir::Register reg) {
     uint32_t regSize;
+    gbe_curbe_type curbeType;
+    int subType;
+    this->getRegPayloadType(reg, curbeType, subType);
     regSize = this->ra->getRegSize(reg);
-    insertCurbeReg(reg, newCurbeEntry(value, subValue, regSize));
-  }
-
-  void GenContext::buildPatchList(void) {
-    const uint32_t ptrSize = this->getPointerSize();
-    kernel->curbeSize = 0u;
-    auto &stackUse = dag->getUse(ir::ocl::stackptr);
-
-    // We insert the block IP mask first
-    using namespace ir::ocl;
-    if (!isDWLabel())
-      allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
-    else
-      allocCurbeReg(dwblockip, GBE_CURBE_DW_BLOCK_IP);
-    allocCurbeReg(lid0, GBE_CURBE_LOCAL_ID_X);
-    allocCurbeReg(lid1, GBE_CURBE_LOCAL_ID_Y);
-    allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
-    allocCurbeReg(zero, GBE_CURBE_ZERO);
-    allocCurbeReg(one, GBE_CURBE_ONE);
-    allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
-    if (stackUse.size() != 0)
-      allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
-    // Go over the arguments and find the related patch locations
-    const uint32_t argNum = fn.argNum();
-    for (uint32_t argID = 0u; argID < argNum; ++argID) {
-      const ir::FunctionArgument &arg = fn.getArg(argID);
-      // For pointers and values, we have nothing to do. We just push the values
-      if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
-          arg.type == ir::FunctionArgument::LOCAL_POINTER ||
-          arg.type == ir::FunctionArgument::CONSTANT_POINTER)
-        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, ptrSize, ptrSize));
-      if (arg.type == ir::FunctionArgument::VALUE ||
-          arg.type == ir::FunctionArgument::STRUCTURE ||
-          arg.type == ir::FunctionArgument::IMAGE ||
-          arg.type == ir::FunctionArgument::SAMPLER)
-        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, arg.size));
+    insertCurbeReg(reg, newCurbeEntry(curbeType, subType, regSize));
+    /* Need to patch the image information registers. */
+    if (curbeType == GBE_CURBE_IMAGE_INFO) {
+      std::sort(kernel->patches.begin(), kernel->patches.end());
+      uint32_t offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, subType);
+      fn.getImageSet()->appendInfo(static_cast<ir::ImageInfoKey>(subType), offset);
     }
+  }
 
-    // Go over all the instructions and find the special register we need
-    // to push
-    #define INSERT_REG(SPECIAL_REG, PATCH) \
-    if (reg == ir::ocl::SPECIAL_REG) { \
-      if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
-      allocCurbeReg(reg, GBE_CURBE_##PATCH); \
-    } else
-
-    fn.foreachInstruction([&](ir::Instruction &insn) {
-      const uint32_t srcNum = insn.getSrcNum();
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
-        const ir::Register reg = insn.getSrc(srcID);
-        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
-          if (srcID != 0) continue;
-          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
-          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key(bti, type);
-          const ir::Register imageInfo = insn.getSrc(0);
-          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
-            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            insertCurbeReg(imageInfo, offset);
-          }
-          continue;
-        }
-        if (fn.isSpecialReg(reg) == false) continue;
-        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
-        if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
-        INSERT_REG(lsize0, LOCAL_SIZE_X)
-        INSERT_REG(lsize1, LOCAL_SIZE_Y)
-        INSERT_REG(lsize2, LOCAL_SIZE_Z)
-        INSERT_REG(gsize0, GLOBAL_SIZE_X)
-        INSERT_REG(gsize1, GLOBAL_SIZE_Y)
-        INSERT_REG(gsize2, GLOBAL_SIZE_Z)
-        INSERT_REG(goffset0, GLOBAL_OFFSET_X)
-        INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
-        INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
-        INSERT_REG(workdim, WORK_DIM)
-        INSERT_REG(numgroup0, GROUP_NUM_X)
-        INSERT_REG(numgroup1, GROUP_NUM_Y)
-        INSERT_REG(numgroup2, GROUP_NUM_Z)
-        INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
-        INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
-        do {} while(0);
-      }
-    });
-#undef INSERT_REG
+  void GenContext::buildPatchList() {
 
     // After this point the vector is immutable. Sorting it will make
     // research faster
     std::sort(kernel->patches.begin(), kernel->patches.end());
-
     kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
   }
 
   bool GenContext::emitCode(void) {
     GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
-    buildPatchList();
     sel->select();
     schedulePreRegAllocation(*this, *this->sel);
     if (UNLIKELY(ra->allocate(*this->sel) == false))
@@ -2308,8 +2238,8 @@ namespace gbe
     schedulePostRegAllocation(*this, *this->sel);
     if (OCL_OUTPUT_REG_ALLOC)
       ra->outputAllocation();
-    this->clearFlagRegister();
     this->emitStackPointer();
+    this->clearFlagRegister();
     this->emitSLMOffset();
     this->emitInstructionStream();
     if (this->patchBranches() == false)
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 34f9293..b03097e 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -214,7 +214,7 @@ namespace gbe
       return GBE_NEW(Gen7Encoder, this->simdWidth, 7, deviceID);
     }
     /*! allocate a new curbe register and insert to curbe pool. */
-    void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
+    void allocCurbeReg(ir::Register reg);
 
     virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
 
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 36ad914..c3d5e29 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -188,6 +188,7 @@ namespace gbe
     INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
     INLINE bool vectorCanSpill(SelectionVector *vector);
     INLINE void allocateScratchForSpilled();
+    void allocateCurbePayload(void);
 
     /*! replace specified source/dst register with temporary register and update interval */
     INLINE ir::Register replaceReg(Selection &sel, SelectionInstruction *insn,
@@ -208,6 +209,7 @@ namespace gbe
       return reg;
     }
     /*! Use custom allocator */
+    friend GenRegAllocator;
     GBE_CLASS(Opaque);
   };
 
@@ -223,9 +225,9 @@ namespace gbe
     assert(offset >= GEN_REG_SIZE);
     offset += subOffset;
     RA.insert(std::make_pair(reg, offset));
-    GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
-    this->intervals[reg].minID = 0;
-    this->intervals[reg].maxID = 0;
+    //GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+    //this->intervals[reg].minID = 0;
+    //this->intervals[reg].maxID = 0;
   }
 
   INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) {
@@ -240,22 +242,17 @@ namespace gbe
     for (auto rit = pushMap.rbegin(); rit != pushMap.rend(); ++rit) {
       const uint32_t argID = rit->second.argID;
       const FunctionArgument arg = fn.getArg(argID);
-
       const uint32_t subOffset = rit->second.offset;
       const Register reg = rit->second.getRegister();
+
+      if (intervals[reg].maxID == - INT_MAX)
+        continue;
       auto it = this->ctx.curbeRegs.find(arg.reg);
       assert(it != ctx.curbeRegs.end());
       allocatePayloadReg(reg, it->second, subOffset);
       ctx.splitBlock(it->second, subOffset);
     }
 
-    if (RA.contains(ocl::stackbuffer)) {
-      uint32_t regSize = 0;
-      this->getRegAttrib(ocl::stackptr, regSize);
-      uint32_t offset = this->ctx.allocate(regSize, regSize, 1);
-      RA.insert(std::make_pair(ocl::stackptr, offset));
-    }
-
     // Group and barrier IDs are always allocated by the hardware in r0
     RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
     RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
@@ -263,6 +260,36 @@ namespace gbe
     RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
   }
 
+  template <bool sortStartingPoint>
+  inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
+    if (sortStartingPoint) {
+      if (i0->minID == i1->minID)
+        return (i0->maxID < i1->maxID);
+      return i0->minID < i1->minID;
+    } else {
+      if (i0->maxID == i1->maxID)
+        return (i0->minID < i1->minID);
+      return i0->maxID < i1->maxID;
+    }
+  }
+
+  void GenRegAllocator::Opaque::allocateCurbePayload(void) {
+    vector <GenRegInterval *> payloadInterval;
+    for (auto interval : starting) {
+      if (!ctx.isPayloadReg(interval->reg))
+        continue;
+      if (interval->minID > 0)
+        break;
+      payloadInterval.push_back(interval);
+    }
+    std::sort(payloadInterval.begin(), payloadInterval.end(), cmp<false>);
+    for(auto interval : payloadInterval) {
+      if (interval->maxID < 0)
+        continue;
+      ctx.allocCurbeReg(interval->reg);
+    }
+  }
+
   bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) {
     using namespace ir;
     const ir::Register reg = interval.reg;
@@ -340,7 +367,7 @@ namespace gbe
   }
 
   /*! Will sort vector in decreasing order */
-  inline bool cmp(const SelectionVector *v0, const SelectionVector *v1) {
+  inline bool cmpVec(const SelectionVector *v0, const SelectionVector *v1) {
     return v0->regNum > v1->regNum;
   }
 
@@ -357,7 +384,7 @@ namespace gbe
 
     // Heuristic (really simple...): sort them by the number of registers they
     // contain
-    std::sort(this->vectors.begin(), this->vectors.end(), cmp);
+    std::sort(this->vectors.begin(), this->vectors.end(), cmpVec);
 
     // Insert MOVs when this is required
     for (vectorID = 0; vectorID < vectorNum; ++vectorID) {
@@ -368,19 +395,6 @@ namespace gbe
     }
   }
 
-  template <bool sortStartingPoint>
-  inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
-    if (sortStartingPoint) {
-      if (i0->minID == i1->minID)
-        return (i0->maxID < i1->maxID);
-      return i0->minID < i1->minID;
-    } else {
-      if (i0->maxID == i1->maxID)
-        return (i0->minID < i1->minID);
-      return i0->maxID < i1->maxID;
-    }
-  }
-
   bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
     bool ret = false;
     while (this->expiringID != ending.size()) {
@@ -685,11 +699,11 @@ namespace gbe
     for (uint32_t startID = 0; startID < regNum; ++startID) {
       const GenRegInterval &interval = *this->starting[startID];
       const ir::Register reg = interval.reg;
+
       if (interval.maxID == -INT_MAX)
         continue; // Unused register
       if (RA.contains(reg))
         continue; // already allocated
-
       if (flagBooleans.contains(reg))
         continue;
 
@@ -1001,22 +1015,34 @@ namespace gbe
 
   INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
     using namespace ir;
+
     if (ctx.reservedSpillRegs != 0) {
       reservedReg = ctx.allocate(ctx.reservedSpillRegs * GEN_REG_SIZE, GEN_REG_SIZE);
       reservedReg /= GEN_REG_SIZE;
     } else {
       reservedReg = 0;
     }
-    // schedulePreRegAllocation(ctx, selection);
 
     // Now start the linear scan allocation
-    for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID)
+    for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID) {
       this->intervals.push_back(ir::Register(regID));
-
-    // Allocate the special registers (only those which are actually used)
-    this->allocateSpecialRegs();
-
-    // block IP used to handle the mask in SW is always allocated
+      // Set all payload register's liveness minID to 0.
+      gbe_curbe_type curbeType;
+      int subType;
+      ctx.getRegPayloadType(ir::Register(regID), curbeType, subType);
+      if (curbeType != GBE_GEN_REG) {
+        intervals[regID].minID = 0;
+
+        // zero and one are implicitly used in the initial block.
+        if (curbeType == GBE_CURBE_ONE || curbeType == GBE_CURBE_ZERO)
+          intervals[regID].maxID = 10;
+        // FIXME stack buffer is not used, we may need to remove it in the future.
+        if (curbeType == GBE_CURBE_EXTRA_ARGUMENT && subType == GBE_STACK_BUFFER)
+          intervals[regID].maxID = 1;
+        if (curbeType == GBE_CURBE_BTI_UTIL)
+          intervals[regID].maxID = INT_MAX;
+      }
+    }
 
     // Compute the intervals
     int32_t insnID = 0;
@@ -1143,6 +1169,12 @@ namespace gbe
         break;
     }
 
+    this->allocateCurbePayload();
+    ctx.buildPatchList();
+
+    // Allocate the special registers (only those which are actually used)
+    this->allocateSpecialRegs();
+
     // Allocate all the GRFs now (regular register and boolean that are not in
     // flag registers)
     return this->allocateGRFs(selection);
@@ -1237,9 +1269,24 @@ namespace gbe
   }
 
   uint32_t GenRegAllocator::getRegSize(ir::Register reg) {
-     uint32_t regSize; 
-     this->opaque->getRegAttrib(reg, regSize); 
-     return regSize;
+    uint32_t regSize;
+    gbe_curbe_type curbeType = GBE_GEN_REG;
+    int subType = 0;
+    this->opaque->ctx.getRegPayloadType(reg, curbeType, subType);
+    if (curbeType == GBE_CURBE_IMAGE_INFO)
+      regSize = 4;
+    else if (curbeType == GBE_CURBE_KERNEL_ARGUMENT) {
+      const ir::FunctionArgument &arg = this->opaque->ctx.getFunction().getArg(subType);
+      if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
+          arg.type == ir::FunctionArgument::LOCAL_POINTER  ||
+          arg.type == ir::FunctionArgument::CONSTANT_POINTER)
+        regSize = this->opaque->ctx.getPointerSize();
+      else
+        regSize = arg.size;
+      GBE_ASSERT(arg.reg == reg);
+    } else
+      this->opaque->getRegAttrib(reg, regSize);
+    return regSize;
   }
 
 } /* namespace gbe */
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index af19732..0ba0bd5 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -100,14 +100,13 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
-  GBE_CURBE_SLM_OFFSET,
   GBE_CURBE_BTI_UTIL,
+  GBE_GEN_REG,
 };
 
 /*! Extra arguments use the negative range of sub-values */
 enum gbe_extra_argument {
   GBE_STACK_BUFFER = 0,   /* Give stack location in curbe */
-  GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
 };
 
 typedef struct ImageInfo {
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index 2412fe9..3f52b17 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -93,9 +93,11 @@ namespace ir {
     usedLabels = elem.usedLabels;
   }
 
-  Register Context::reg(RegisterFamily family, bool uniform) {
+  Register Context::reg(RegisterFamily family, bool uniform,
+                        gbe_curbe_type curbeType,
+                        int subType) {
     GBE_ASSERTM(fn != NULL, "No function currently defined");
-    return fn->newRegister(family, uniform);
+    return fn->newRegister(family, uniform, curbeType, subType);
   }
 
   LabelIndex Context::label(void) {
@@ -113,6 +115,7 @@ namespace ir {
     GBE_ASSERTM(fn != NULL, "No function currently defined");
     GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
     FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
+    fn->setRegPayloadType(arg->reg, GBE_CURBE_KERNEL_ARGUMENT, fn->args.size());
     fn->args.push_back(arg);
   }
 
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 54265d0..0f7ded4 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -63,7 +63,8 @@ namespace ir {
     /*! Append a new pushed constant */
     void appendPushedConstant(Register reg, const PushLocation &pushed);
     /*! Create a new register with the given family for the current function */
-    Register reg(RegisterFamily family, bool uniform = false);
+    Register reg(RegisterFamily family, bool uniform = false,
+                 gbe_curbe_type curbeType = GBE_GEN_REG, int subType = 0);
     /*! Create a new immediate value */
     template <typename T> INLINE ImmediateIndex newImmediate(T value) {
       const Immediate imm(value);
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 5d00cca..b5f4ba2 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -273,8 +273,11 @@ namespace ir {
     /*! Get the function profile */
     INLINE Profile getProfile(void) const { return profile; }
     /*! Get a new valid register */
-    INLINE Register newRegister(RegisterFamily family, bool uniform = false) {
-      return this->file.append(family, uniform);
+    INLINE Register newRegister(RegisterFamily family,
+                                bool uniform = false,
+                                gbe_curbe_type curbeType = GBE_GEN_REG,
+                                int subType = 0) {
+      return this->file.append(family, uniform, curbeType, subType);
     }
     /*! Get the function name */
     const std::string &getName(void) const { return name; }
@@ -288,6 +291,18 @@ namespace ir {
     INLINE void setRegisterUniform(Register reg, bool uniform) { file.setUniform(reg, uniform); }
     /*! return true if the specified regsiter is uniform type */
     INLINE bool isUniformRegister(Register reg) { return file.isUniform(reg); }
+    /*! set register as specified payload type */
+    INLINE void setRegPayloadType(Register reg, gbe_curbe_type curbeType, int subType) {
+      file.setPayloadType(reg, curbeType, subType);
+    }
+    /*! get register's payload type. */
+    INLINE void getRegPayloadType(Register reg, gbe_curbe_type &curbeType, int &subType) const {
+      file.getPayloadType(reg, curbeType, subType);
+    }
+    /*! check whether a register is a payload register */
+    INLINE bool isPayloadReg(Register reg) const{
+      return file.isPayloadReg(reg);
+    }
     /*! Get the register family from the register itself */
     INLINE RegisterFamily getRegisterFamily(Register reg) const {
       return this->getRegisterData(reg).family;
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index 8976a68..e006087 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -247,7 +247,7 @@ namespace ir {
     auto it = infoRegMap.find(key.data);
     if (it != infoRegMap.end())
       return it->second;
-    Register reg = ctx->reg(FAMILY_DWORD);
+    Register reg = ctx->reg(FAMILY_DWORD, false, GBE_CURBE_IMAGE_INFO, key.data);
     infoRegMap.insert(std::make_pair(key.data, reg));
     return reg;
   }
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index cf8d839..3f3c655 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -401,6 +401,7 @@ namespace ir {
 
   typedef union _ImageInfoKey{
     _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
+    _ImageInfoKey(int key) : data(key) {};
     struct {
      uint8_t index; /*! the allocated image index */
      uint8_t  type;  /*! the information type */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 37f2d3d..75522eb 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -41,57 +41,53 @@ namespace ir {
         "block_ip",
         "barrier_id", "thread_number", "work_dimension",
         "zero", "one",
-        "retVal", "slm_offset",
+        "retVal",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
         "dwblockip",
-        "lane_id",
-        "invalid",
         "bti_utility"
     };
 
 #if GBE_DEBUG
-#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
-   r = fn.newRegister(FAMILY, UNIFORM); \
+#define DECL_NEW_REG(FAMILY, REG, ...) \
+   r = fn.newRegister(FAMILY, __VA_ARGS__); \
    GBE_ASSERT(r == REG);
 #else
-#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
-   fn.newRegister(FAMILY, UNIFORM);
+#define DECL_NEW_REG(FAMILY, REG, ...) \
+   fn.newRegister(FAMILY, __VA_ARGS__);
 #endif /* GBE_DEBUG */
     static void init(Function &fn) {
       IF_DEBUG(Register r);
-      DECL_NEW_REG(FAMILY_DWORD, lid0, 0);
-      DECL_NEW_REG(FAMILY_DWORD, lid1, 0);
-      DECL_NEW_REG(FAMILY_DWORD, lid2, 0);
+      DECL_NEW_REG(FAMILY_DWORD, lid0, 0, GBE_CURBE_LOCAL_ID_X);
+      DECL_NEW_REG(FAMILY_DWORD, lid1, 0, GBE_CURBE_LOCAL_ID_Y);
+      DECL_NEW_REG(FAMILY_DWORD, lid2, 0, GBE_CURBE_LOCAL_ID_Z);
       DECL_NEW_REG(FAMILY_DWORD, groupid0, 1);
       DECL_NEW_REG(FAMILY_DWORD, groupid1, 1);
       DECL_NEW_REG(FAMILY_DWORD, groupid2, 1);
-      DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1);
-      DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1);
-      DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1);
-      DECL_NEW_REG(FAMILY_DWORD, lsize0, 1);
-      DECL_NEW_REG(FAMILY_DWORD, lsize1, 1);
-      DECL_NEW_REG(FAMILY_DWORD, lsize2, 1);
-      DECL_NEW_REG(FAMILY_DWORD, gsize0, 1);
-      DECL_NEW_REG(FAMILY_DWORD, gsize1, 1);
-      DECL_NEW_REG(FAMILY_DWORD, gsize2, 1);
-      DECL_NEW_REG(FAMILY_DWORD, goffset0, 1);
-      DECL_NEW_REG(FAMILY_DWORD, goffset1, 1);
-      DECL_NEW_REG(FAMILY_DWORD, goffset2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1, GBE_CURBE_GROUP_NUM_X);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1, GBE_CURBE_GROUP_NUM_Y);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1, GBE_CURBE_GROUP_NUM_Z);
+      DECL_NEW_REG(FAMILY_DWORD, lsize0, 1, GBE_CURBE_LOCAL_SIZE_X);
+      DECL_NEW_REG(FAMILY_DWORD, lsize1, 1, GBE_CURBE_LOCAL_SIZE_Y);
+      DECL_NEW_REG(FAMILY_DWORD, lsize2, 1, GBE_CURBE_LOCAL_SIZE_Z);
+      DECL_NEW_REG(FAMILY_DWORD, gsize0, 1, GBE_CURBE_GLOBAL_SIZE_X);
+      DECL_NEW_REG(FAMILY_DWORD, gsize1, 1, GBE_CURBE_GLOBAL_SIZE_Y);
+      DECL_NEW_REG(FAMILY_DWORD, gsize2, 1, GBE_CURBE_GLOBAL_SIZE_Z);
+      DECL_NEW_REG(FAMILY_DWORD, goffset0, 1, GBE_CURBE_GLOBAL_OFFSET_X);
+      DECL_NEW_REG(FAMILY_DWORD, goffset1, 1, GBE_CURBE_GLOBAL_OFFSET_Y);
+      DECL_NEW_REG(FAMILY_DWORD, goffset2, 1, GBE_CURBE_GLOBAL_OFFSET_Z);
       DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
-      DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1);
-      DECL_NEW_REG(FAMILY_WORD,  blockip, 0);
+      DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
+      DECL_NEW_REG(FAMILY_WORD,  blockip, 0, GBE_CURBE_BLOCK_IP);
       DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
-      DECL_NEW_REG(FAMILY_DWORD, threadn, 1);
-      DECL_NEW_REG(FAMILY_DWORD, workdim, 1);
-      DECL_NEW_REG(FAMILY_DWORD, zero, 1);
-      DECL_NEW_REG(FAMILY_DWORD, one, 1);
+      DECL_NEW_REG(FAMILY_DWORD, threadn, 1, GBE_CURBE_THREAD_NUM);
+      DECL_NEW_REG(FAMILY_DWORD, workdim, 1, GBE_CURBE_WORK_DIM);
+      DECL_NEW_REG(FAMILY_DWORD, zero, 1, GBE_CURBE_ZERO);
+      DECL_NEW_REG(FAMILY_DWORD, one, 1, GBE_CURBE_ONE);
       DECL_NEW_REG(FAMILY_WORD, retVal, 1);
-      DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
-      DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
-      DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
-      DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
-      DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
-      DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
+      DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1, GBE_CURBE_PRINTF_BUF_POINTER);
+      DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER);
+      DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP);
+      DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1, GBE_CURBE_BTI_UTIL);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index bf909be..b3f2a21 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -68,13 +68,11 @@ namespace ir {
     static const Register zero = Register(24);     //  scalar register holds zero.
     static const Register one = Register(25);     //  scalar register holds one. 
     static const Register retVal = Register(26);   // helper register to do data flow analysis.
-    static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
-    static const Register printfbptr = Register(28); // printf buffer address .
-    static const Register printfiptr = Register(29); // printf index buffer address.
-    static const Register dwblockip = Register(30);  // blockip
-    static const Register invalid = Register(31);  // used for valid comparation.
-    static const Register btiUtil = Register(32);  // used for mixed pointer as bti utility.
-    static const uint32_t regNum = 33;             // number of special registers
+    static const Register printfbptr = Register(27); // printf buffer address .
+    static const Register printfiptr = Register(28); // printf index buffer address.
+    static const Register dwblockip = Register(29);  // blockip
+    static const Register btiUtil = Register(30);  // used for mixed pointer as bti utility.
+    static const uint32_t regNum = 31;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index d8df7b0..d64304e 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -26,6 +26,7 @@
 
 #include "sys/vector.hpp"
 #include "sys/platform.hpp"
+#include "../backend/program.h"
 
 namespace gbe {
 namespace ir {
@@ -78,21 +79,38 @@ namespace ir {
     ARF_TM
   };
 
+  /*! Register is the index of the register data in the register file. We
+   *  enforce type safety with this class
+   */
+  TYPE_SAFE(Register, uint32_t)
+
   /*! A register can be either a byte, a word, a dword or a qword. We store this
    *  value into a register data (which makes the register file) 
    */
   class RegisterData
   {
   public:
+    struct PayloadRegisterData {
+      gbe_curbe_type  curbeType;
+      int subType;
+    };
+
     /*! Build a register. All fields will be immutable */
     INLINE RegisterData(RegisterFamily family,
-                        bool uniform = false) : family(family), uniform(uniform) {}
+                        bool uniform,
+                        gbe_curbe_type curbeType,
+                        int subType) : family(family), uniform(uniform) {
+      payloadData.curbeType = curbeType;
+      payloadData.subType = subType;
+    }
+
     /*! Copy constructor */
-    INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform) {}
+    INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform), payloadData(other.payloadData) {}
     /*! Copy operator */
     INLINE RegisterData &operator= (const RegisterData &other) {
       this->family = other.family;
       this->uniform = other.uniform;
+      this->payloadData = other.payloadData;
       return *this;
     }
     /*! Nothing really happens here */
@@ -100,18 +118,26 @@ namespace ir {
     RegisterFamily family;            //!< Register size or if it is a flag
     INLINE bool isUniform() const { return uniform; }
     INLINE void setUniform(bool uni) { uniform = uni; }
+    INLINE void setPayloadType(gbe_curbe_type curbeType, int subType) {
+      payloadData.curbeType = curbeType;
+      payloadData.subType = subType;
+    }
+    INLINE void getPayloadType(gbe_curbe_type &curbeType, int &subType) const {
+      curbeType = payloadData.curbeType;
+      subType = payloadData.subType;
+    }
+    INLINE bool isPayloadType(void) const {
+      return payloadData.curbeType != GBE_GEN_REG;
+    }
   private:
     bool uniform;
+    PayloadRegisterData payloadData;
     GBE_CLASS(RegisterData);
   };
 
   /*! Output the register file string in the given stream */
   std::ostream &operator<< (std::ostream &out, const RegisterData &regData);
 
-  /*! Register is the position of the index of the register data in the register
-   *  file. We enforce type safety with this class
-   */
-  TYPE_SAFE(Register, uint32_t)
   INLINE bool operator< (const Register &r0, const Register &r1) {
     return r0.value() < r1.value();
   }
@@ -128,14 +154,18 @@ namespace ir {
   {
   public:
     /*! Return the index of a newly allocated register */
-    INLINE Register append(RegisterFamily family, bool uniform = false) {
+    INLINE Register append(RegisterFamily family,
+                           bool uniform = false,
+                           gbe_curbe_type curbeType = GBE_GEN_REG,
+                           int subType = 0) {
       GBE_ASSERTM((uint64_t)regNum() < MAX_INDEX,
                   "Too many defined registers (only 4G are supported)");
       const uint32_t index = regNum();
-      const RegisterData reg(family, uniform);
+      const RegisterData reg(family, uniform, curbeType, subType);
       regs.push_back(reg);
       return Register(index);
     }
+
     /*! Make a tuple from an array of register */
     Tuple appendArrayTuple(const Register *reg, uint32_t regNum);
     /*! Make a tuple and return the index to the first element of the tuple */
@@ -155,6 +185,18 @@ namespace ir {
     INLINE bool isUniform(Register index) { return regs[index].isUniform(); }
     /*! Set a register to uniform or varying data type*/
     INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
+    /*! Set payload type of a register */
+    INLINE void setPayloadType(Register index, gbe_curbe_type curbeType, int subType) {
+      regs[index].setPayloadType(curbeType, subType);
+    }
+    /*! Get payload type of a register */
+    INLINE void getPayloadType(Register index, gbe_curbe_type &curbeType, int &subType) const {
+      regs[index].getPayloadType(curbeType, subType);
+    }
+    /*! Check whether the register is a payload register */
+    INLINE bool isPayloadReg(Register index) const {
+      return regs[index].isPayloadType();
+    }
     /*! Get the register index from the tuple */
     INLINE Register get(Tuple index, uint32_t which) const {
       return regTuples[index.value() + which];
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4b92311..bd5a003 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -164,11 +164,13 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   uint32_t i;
   enum gbe_arg_type arg_type; /* kind of argument */
   for (i = 0; i < k->arg_n; ++i) {
-    uint32_t offset; // location of the address in the curbe
+    int32_t offset; // location of the address in the curbe
     arg_type = interp_kernel_get_arg_type(k->opaque, i);
     if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
       continue;
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
+    if (offset < 0)
+      continue;
     if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
       struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
       cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 0e60528..8c09615 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -58,14 +58,14 @@ cl_set_varying_payload(const cl_kernel ker,
   if (ip_offset < 0)
     dw_ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_DW_BLOCK_IP, 0);
   assert(ip_offset < 0 || dw_ip_offset < 0);
-  assert(id_offset[0] >= 0 &&
-         id_offset[1] >= 0 &&
-         id_offset[2] >= 0 &&
-         (ip_offset >= 0 || dw_ip_offset >= 0));
-
-  TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
-  TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
-  TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+  assert(ip_offset >= 0 || dw_ip_offset >= 0);
+
+  if (id_offset[0] >= 0)
+    TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+  if (id_offset[1] >= 0)
+    TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+  if (id_offset[2] >= 0)
+    TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
   TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
   /* 0xffff means that the lane is inactivated */
   memset(block_ips, 0xff, sizeof(int16_t)*thread_n*simd_sz);
@@ -75,9 +75,12 @@ cl_set_varying_payload(const cl_kernel ker,
   for (k = 0; k < local_wk_sz[2]; ++k)
   for (j = 0; j < local_wk_sz[1]; ++j)
   for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
-    ids[0][curr] = i;
-    ids[1][curr] = j;
-    ids[2][curr] = k;
+    if (id_offset[0] >= 0)
+      ids[0][curr] = i;
+    if (id_offset[1] >= 0)
+      ids[1][curr] = j;
+    if (id_offset[2] >= 0)
+      ids[2][curr] = k;
     block_ips[curr] = 0;
   }
 
@@ -90,9 +93,12 @@ cl_set_varying_payload(const cl_kernel ker,
     uint16_t *ips  = (uint16_t *) (data + ip_offset);
     uint32_t *dw_ips  = (uint32_t *) (data + dw_ip_offset);
     for (j = 0; j < simd_sz; ++j, ++curr) {
-      ids0[j] = ids[0][curr];
-      ids1[j] = ids[1][curr];
-      ids2[j] = ids[2][curr];
+      if (id_offset[0] >= 0)
+        ids0[j] = ids[0][curr];
+      if (id_offset[1] >= 0)
+        ids1[j] = ids[1][curr];
+      if (id_offset[2] >= 0)
+        ids2[j] = ids[2][curr];
       if (ip_offset >= 0)
         ips[j] = block_ips[curr];
       if (dw_ip_offset >= 0)
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 28d88b6..ccfde7c 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -102,7 +102,7 @@ cl_kernel_add_ref(cl_kernel k)
 LOCAL cl_int
 cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 {
-  uint32_t offset;            /* where to patch */
+  int32_t offset;            /* where to patch */
   enum gbe_arg_type arg_type; /* kind of argument */
   size_t arg_sz;              /* size of the argument */
   cl_mem mem = NULL;          /* for __global, __constant and image arguments */
@@ -154,7 +154,8 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   if (arg_type == GBE_ARG_VALUE) {
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
     assert(offset + sz <= k->curbe_sz);
-    memcpy(k->curbe + offset, value, sz);
+    if (offset >= 0)
+      memcpy(k->curbe + offset, value, sz);
     k->args[index].local_sz = 0;
     k->args[index].is_set = 1;
     k->args[index].mem = NULL;
@@ -179,9 +180,10 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     k->args[index].sampler = sampler;
     cl_set_sampler_arg_slot(k, index, sampler);
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    //assert(arg_sz == 4);
-    assert(offset + 4 <= k->curbe_sz);
-    memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
+    if (offset >= 0) {
+      assert(offset + 4 <= k->curbe_sz);
+      memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
+    }
     return CL_SUCCESS;
   }
 
-- 
1.9.1



More information about the Beignet mailing list