[Beignet] [Beignet 2/3] Put constant buffer of argument to payload.

Tue Apr 16 01:48:17 PDT 2013

Change constant buffer loads to indirect-addressing (ia) moves. The kernel needs
to be recompiled when the ND range is enqueued. For now, if there are not enough
Gen registers for the constant buffer, the compiler asserts.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/context.cpp                    |   70 ++++++++++++++++++++
 backend/src/backend/context.hpp                    |   14 ++++
 backend/src/backend/gen_context.cpp                |   46 ++++++++++++-
 backend/src/backend/gen_context.hpp                |    8 ++-
 backend/src/backend/gen_encoder.cpp                |   10 ++-
 backend/src/backend/gen_encoder.hpp                |    4 ++
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    1 +
 backend/src/backend/gen_insn_selection.cpp         |   30 ++++++++-
 backend/src/backend/gen_insn_selection.hxx         |    2 +
 backend/src/backend/gen_program.cpp                |   14 ++--
 backend/src/backend/gen_program.hpp                |    3 +
 backend/src/backend/gen_register.hpp               |   10 +++
 backend/src/backend/program.cpp                    |   16 +++++
 backend/src/backend/program.h                      |   11 ++-
 backend/src/backend/program.hpp                    |   15 +++++
 src/cl_command_queue.c                             |   19 ++++++
 src/cl_command_queue.h                             |    2 +
 src/cl_command_queue_gen7.c                        |   11 ++-
 src/cl_kernel.c                                    |    9 ++-
 src/cl_mem.c                                       |    1 +
 src/cl_mem.h                                       |    1 +
 21 files changed, 282 insertions(+), 15 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 88f704b..0dd433b 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -288,6 +288,7 @@ namespace gbe
 
   Kernel *Context::compileKernel(void) {
     this->kernel = this->allocateKernel();
+    this->kernel->bCurbeChanged = false;
     this->kernel->simdWidth = this->simdWidth;
     this->buildPatchList();
     this->buildArgList();
@@ -295,12 +296,49 @@ namespace gbe
     this->buildJIPs();
     this->buildStack();
     this->handleSLM();
+    this->buildConstBufs();
     if (this->emitCode() == false) {
       GBE_DELETE(this->kernel);
       this->kernel = NULL;
     }
     return this->kernel;
   }
+
+  void Context::recompileKernel(void) {
+    // Per the original note, the ir::Unit is destroyed once compilation is
+    // done, so nothing on this path may use `unit` or `fn`.
+    if (this->kernel->bCurbeChanged == false) return;
+    this->reallocCurbe();
+    this->patchCBOffsets();
+    this->kernel->bCurbeChanged = false;
+  }
+  
+  /*! Re-reserve curbe space for each constant buffer whose size changed */
+  void Context::reallocCurbe(void) {
+    for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
+      KernelArgument &arg = kernel->args[argID];
+      if (arg.type != GBE_ARG_CONSTANT_PTR || arg.bufSize == arg.allocedSize)
+        continue;
+      // Give back the range reserved for the previous size, if any
+      int32_t offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
+      if (offset >= 0)
+        deallocate(offset+GEN_REG_SIZE);
+      arg.allocedSize = 0;
+      if (arg.bufSize > 0) {
+        newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_CONSTANT_BUFFER+argID, arg.bufSize, 32);
+        arg.allocedSize = arg.bufSize;
+      }
+
+      // Record the new location. Assign instead of insert(): insert() keeps
+      // an entry left by an earlier recompile, so patchCBOffsets() would go
+      // on patching the stale offset.
+      std::sort(kernel->patches.begin(), kernel->patches.end());
+      offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
+      GBE_ASSERT(offset>=0);
+      cbCurbeOffsets[arg.reg] = offset+GEN_REG_SIZE;
+    }
+    kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
+  }
   
   int16_t Context::allocate(int16_t size, int16_t alignment) {
     return partitioner->allocate(size, alignment);
@@ -420,6 +458,10 @@ namespace gbe
       kernel->args = NULL;
     for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
       const auto &arg = fn.getArg(argID);
+      kernel->args[argID].bufSize = 0;
+      kernel->args[argID].allocedSize = 0;
+      // The unit is already destroyed when recompiling, so save the register here
+      kernel->args[argID].reg = arg.reg;
       switch (arg.type) {
         case ir::FunctionArgument::VALUE:
         case ir::FunctionArgument::STRUCTURE:
@@ -547,6 +589,34 @@ namespace gbe
     const bool useSLM = fn.getUseSLM();
     kernel->useSLM = useSLM;
   }
+
+  /*! Map each register derived from a constant pointer argument to that argument's register */
+  void Context::buildConstBufs(void) {
+    const uint32_t argNum = fn.argNum();
+    for (uint32_t argID = 0; argID < argNum; ++argID) {
+      const auto &arg = fn.getArg(argID);
+      if (arg.type != ir::FunctionArgument::CONSTANT_POINTER)
+        continue;
+
+      // A constant pointer argument is its own base register
+      const ir::Register baseReg = arg.reg;
+      constBaseProps.insert(std::make_pair(baseReg, baseReg));
+
+      // A non-load instruction reading a tracked register passes that base on to its destination
+      fn.foreachInstruction([this](const ir::Instruction &insn) {
+        using namespace ir;
+        if (insn.getOpcode() == OP_LOAD) return;
+        const uint32_t srcNum = insn.getSrcNum();
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const ir::Register src = insn.getSrc(srcID);
+          if (!constBaseProps.contains(src)) continue;
+          const ir::Register dst = insn.getDst();
+          constBaseProps.insert(std::make_pair(dst, constBaseProps.find(src)->second));
+          break;
+        }
+      });
+    }
+  }
   
   bool Context::isScalarReg(const ir::Register &reg) const {
     GBE_ASSERT(fn.getProfile() == ir::Profile::PROFILE_OCL);
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 55a63a7..c897769 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -57,6 +57,8 @@ namespace gbe
     virtual ~Context(void);
     /*! Compile the code */
     Kernel *compileKernel(void);
+    void recompileKernel(void);
+    void reallocCurbe(void);
     /*! Tells if the labels is used */
     INLINE bool isLabelUsed(ir::LabelIndex index) const {
       return usedLabels.contains(index);
@@ -82,6 +84,10 @@ namespace gbe
     INLINE bool hasJIP(const ir::Instruction *insn) const {
       return JIPs.find(insn) != JIPs.end();
     }
+    INLINE ir::Register getConstBaseReg(const ir::Register reg) const {
+      GBE_ASSERT(constBaseProps.find(reg) != constBaseProps.end());
+      return constBaseProps.find(reg)->second;
+    }
     /*! Allocate some memory in the register file */
     int16_t allocate(int16_t size, int16_t alignment);
     /*! Deallocate previously allocated memory */
@@ -89,6 +95,8 @@ namespace gbe
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
+    /*! Patch the curbe gen register offset */
+    virtual void patchCBOffsets(void) = 0;
     /*! Allocate a new empty kernel (to be implemented) */
     virtual Kernel *allocateKernel(void) = 0;
     /*! Look if a stack is needed and allocate it */
@@ -105,12 +113,16 @@ namespace gbe
     void buildJIPs(void);
     /*! Configure SLM use if needed */
     void handleSLM(void);
+    /* Build constant buffer map */
+    void buildConstBufs(void);
     /*! Insert a new entry with the given size in the Curbe. Return the offset
      *  of the entry
      */
     void newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
     /*! Provide for each branch and label the label index target */
     typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
+    typedef map<const ir::Register, ir::Register> ConstBasePropMap;
+    typedef map<const ir::Register, uint32_t> CBRegOffsetMap;
     const ir::Unit &unit;                 //!< Unit that contains the kernel
     const ir::Function &fn;               //!< Function to compile
     std::string name;                     //!< Name of the kernel to compile
@@ -120,6 +132,8 @@ namespace gbe
     RegisterFilePartitioner *partitioner; //!< Handle register file partionning
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
+    ConstBasePropMap constBaseProps;      //!< const buffer reg propagation regs
+    CBRegOffsetMap cbCurbeOffsets;          //!< const buffer base ir reg --> curbe reg offset Map
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index b4c9a65..1f51854 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -83,7 +83,20 @@ namespace gbe
       p->patchJMPI(insnID, (targetID-insnID-1) * 2);
     }
   }
-
+  
+  /*! Patch each constant-buffer MOV with its curbe offset, then refresh the kernel's code copy */
+  void GenContext::patchCBOffsets(void) {
+    for (const auto &movePos : cbMovePos) {
+      const auto found = cbCurbeOffsets.find(movePos.first);
+      // A missing entry means no curbe offset was ever recorded for this base register
+      GBE_ASSERT(found != cbCurbeOffsets.end());
+      // +1: presumably the recorded slot is the a0 setup MOV and the indirect MOV follows it
+      p->patchAddrIMM(movePos.second + 1, found->second);
+    }
+    GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
+    std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
+  }
+  
   void GenContext::emitStackPointer(void) {
     using namespace ir;
 
@@ -218,7 +231,38 @@ namespace gbe
     this->branchPos2.push_back(std::make_pair(label, p->store.size()));
     p->JMPI(src);
   }
+  /*! Emit the address computation and indirect MOV(s) for one CB_MOVE selection instruction */
+  void GenContext::emitCBMoveInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    const GenRegister src2 = ra->genReg(insn.src(2));
+    const GenRegister a0 = GenRegister::addr8(0);
 
+    p->ADD(src0, src1, GenRegister::negate(src2));  // src0 = src1 - src2
+    // Record the current stream position; patchCBOffsets() patches the instruction at position + 1
+    this->cbMovePos.push_back(std::make_pair(ir::Register(insn.src(2).value.reg), p->store.size()));
+    p->push();
+      p->curr.execWidth = 8;
+      p->MOV(a0, GenRegister::unpacked_uw(src0.nr, 0));
+      //p->ADD(a0, a0, GenRegister::immuw(0));      
+      p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+    p->pop();
+    // SIMD16: repeat the same sequence for the second quarter (Q2)
+    if (simdWidth == 16) {
+      const GenRegister dst1 = GenRegister::Qn(dst, 1);
+      const GenRegister src0_1 = GenRegister::Qn(GenRegister::unpacked_uw(src0.nr, 0), 1);
+      this->cbMovePos.push_back(std::make_pair(ir::Register(insn.src(2).value.reg), p->store.size()));
+      p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
+        p->MOV(a0, src0_1);
+        //p->ADD(a0, a0, GenRegister::immuw(0));        
+        p->MOV(dst1, GenRegister::indirect(dst1.type, 0, GEN_WIDTH_8));
+      p->pop();
+      } 
+  }
+  
   void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
     p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6af174f..8131b5f 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -55,6 +55,8 @@ namespace gbe
     ~GenContext(void);
     /*! Implements base class */
     virtual bool emitCode(void);
+    /*! Patch the curbe gen register offset */
+    virtual void patchCBOffsets(void);
     /*! Function we emit code for */
     INLINE const ir::Function &getFunction(void) const { return fn; }
     /*! Simd width chosen for the current function */
@@ -80,6 +82,7 @@ namespace gbe
     void emitTernaryInstruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
+    void emitCBMoveInstruction(const SelectionInstruction &insn);
     void emitEotInstruction(const SelectionInstruction &insn);
     void emitNoOpInstruction(const SelectionInstruction &insn);
     void emitWaitInstruction(const SelectionInstruction &insn);
@@ -91,13 +94,14 @@ namespace gbe
     void emitByteScatterInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
-
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
     /*! Store the position of each label instruction in the Gen ISA stream */
-    map<ir::LabelIndex, uint32_t> labelPos;
+    map<ir::LabelIndex, uint32_t> labelPos;    
     /*! Store the Gen instructions to patch */
     vector<std::pair<ir::LabelIndex, uint32_t>> branchPos2;
+    /*! Store the constant buffer ia mov instructions to patch */
+    vector<std::pair<ir::Register, uint32_t>> cbMovePos;
     /*! Encode Gen ISA */
     GenEncoder *p;
     /*! Instruction selection on Gen ISA (pre-register allocation) */
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index ed7c256..53ae17f 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -685,7 +685,15 @@ namespace gbe
     assert(insn.header.opcode == GEN_OPCODE_JMPI);
     this->setSrc1(&insn, GenRegister::immd(jumpDistance));
   }
-
+  
+  void GenEncoder::patchAddrIMM(uint32_t insnID, int32_t offset) {
+    assert(insnID < this->store.size());  // validate before indexing the store
+    GenInstruction &insn = this->store[insnID];
+    assert(insn.header.opcode == GEN_OPCODE_MOV);
+    // Overwrite the immediate offset of src0's indirect address
+    insn.bits2.ia1.src0_indirect_offset = offset;
+  }
+  
   void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1) {
     if (needToSplitCmp(this, src0, src1) == false) {
       GenInstruction *insn = this->next(GEN_OPCODE_CMP);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 7e26f0a..8d66289 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -65,6 +65,7 @@ namespace gbe
   public:
     /*! simdWidth is the default width for the instructions */
     GenEncoder(uint32_t simdWidth, uint32_t gen);
+
     /*! Size of the stack (should be large enough) */
     enum { MAX_STATE_NUM = 16 };
     /*! Push the current instruction state */
@@ -156,6 +157,9 @@ namespace gbe
     /*! Patch JMPI (located at index insnID) with the given jump distance */
     void patchJMPI(uint32_t insnID, int32_t jumpDistance);
 
+    /*! Patch the constant buffer ia mov offset */
+    void patchAddrIMM(uint32_t insnID, int32_t offset);
+
     ////////////////////////////////////////////////////////////////////////
     // Helper functions to encode
     ////////////////////////////////////////////////////////////////////////
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 969ec82..7575dc3 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -4,6 +4,7 @@ DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
+DECL_GEN7_SCHEDULE(CBMove,          20,        2,        2)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
 DECL_GEN7_SCHEDULE(NoOp,            20,        2,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index ecaaeeb..ef83d86 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -435,6 +435,8 @@ namespace gbe
     void LABEL(ir::LabelIndex label);
     /*! Jump indexed instruction */
     void JMPI(Reg src, ir::LabelIndex target);
+    /*!constant buffer mov instruction*/
+    void CB_MOVE(Reg dst, Reg src0, Reg src1, Reg src2);    
     /*! Compare instructions */
     void CMP(uint32_t conditional, Reg src0, Reg src1);
     /*! Select instruction with embedded comparison */
@@ -684,6 +686,14 @@ namespace gbe
     insn->index = uint16_t(index);
   }
 
+  void Selection::Opaque::CB_MOVE(Reg dst, Reg src0, Reg src1, Reg src2) {
+    SelectionInstruction *cbMove = this->appendInsn(SEL_OP_CB_MOVE, 1, 3);  // 1 dst, 3 srcs
+    cbMove->dst(0) = dst;
+    cbMove->src(0) = src0;
+    cbMove->src(1) = src1;
+    cbMove->src(2) = src2;
+  }
+
   void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 0, 2);
     insn->src(0) = src0;
@@ -1614,6 +1624,21 @@ namespace gbe
   /*! Load instruction pattern */
   DECL_PATTERN(LoadInstruction)
   {
+    /*! Lower a single-value load from constant memory to one CB_MOVE */
+    void emitCBMove(Selection::Opaque &sel, const ir::LoadInstruction &insn, GenRegister addr)  const
+    {
+      using namespace ir;
+      GBE_ASSERT(insn.getValueNum() == 1);   //todo: handle vec later
+
+      // Fresh dword register handed to CB_MOVE as its first source
+      GenRegister offset = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      // Base register the context recorded for this address register
+      const Register argReg = sel.ctx.getConstBaseReg(Register(addr.value.reg));
+      const GenRegister baseReg = sel.selReg(argReg, TYPE_U32);
+      const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
+      sel.CB_MOVE(dst, offset, GenRegister::retype(addr, GEN_TYPE_UD), baseReg);
+    }
+
     void emitUntypedRead(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
@@ -1659,10 +1684,13 @@ namespace gbe
       const GenRegister address = sel.selReg(insn.getAddress());
       const AddressSpace space = insn.getAddressSpace();
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+                 insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
                  insn.getAddressSpace() == MEM_LOCAL);
       GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
-      if (insn.isAligned() == true)
+      if(insn.getAddressSpace() == MEM_CONSTANT)
+        this->emitCBMove(sel, insn, address);
+      else if (insn.isAligned() == true)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
         const GenRegister value = sel.selReg(insn.getValue(0));
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 691100b..14ff08c 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -24,6 +24,7 @@ DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
 DECL_SELECTION_IR(MAD, TernaryInstruction)
 DECL_SELECTION_IR(JMPI, JumpInstruction)
 DECL_SELECTION_IR(EOT, EotInstruction)
+DECL_SELECTION_IR(CB_MOVE, CBMoveInstruction)
 DECL_SELECTION_IR(NOP, NoOpInstruction)
 DECL_SELECTION_IR(WAIT, WaitInstruction)
 DECL_SELECTION_IR(MATH, MathInstruction)
@@ -34,3 +35,4 @@ DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
+
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 8a7efdb..4cac5ba 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -37,12 +37,12 @@
 namespace gbe {
 
   GenKernel::GenKernel(const std::string &name) :
-    Kernel(name), insns(NULL), insnNum(0)
+    Kernel(name), insns(NULL), insnNum(0), cxt(NULL)
   {}
-  GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
+  GenKernel::~GenKernel(void) { GBE_DELETE(cxt); GBE_SAFE_DELETE_ARRAY(insns); }
   const char *GenKernel::getCode(void) const { return (const char*) insns; }
   size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
-
+  void GenKernel::recompile(void) { cxt->recompileKernel(); }
   GenProgram::GenProgram(void) {}
   GenProgram::~GenProgram(void) {}
 
@@ -58,7 +58,6 @@ namespace gbe {
   };
 
   Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name) {
-
     // Be careful when the simdWidth is forced by the programmer. We can see it
     // when the function already provides the simd width we need to use (i.e.
     // non zero)
@@ -76,16 +75,17 @@ namespace gbe {
       unit.getFunction(name)->setSimdWidth(simdWidth);
       Context *ctx = GBE_NEW(GenContext, unit, name, limitRegisterPressure);
       kernel = ctx->compileKernel();
-      GBE_DELETE(ctx);
-      if (kernel != NULL)
+      if (kernel != NULL) {
+        ((GenKernel *)kernel)->cxt = (GenContext *)ctx;
         break;
+      }
+      GBE_DELETE(ctx);
     }
 
     // XXX spill must be implemented
     GBE_ASSERTM(kernel != NULL, "Register spilling not supported yet!");
     return kernel;
   }
-
   static gbe_program genProgramNewFromBinary(const char *binary, size_t size) {
     NOT_IMPLEMENTED;
     return NULL;
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index 68b0427..f41eb0c 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -27,6 +27,7 @@
 
 #include "backend/program.h"
 #include "backend/program.hpp"
+#include "backend/gen_context.hpp"
 
 // Gen ISA instruction
 struct GenInstruction;
@@ -44,8 +45,10 @@ namespace gbe
     virtual const char *getCode(void) const;
     /*! Implements base class */
     virtual size_t getCodeSize(void) const;
+    virtual void recompile(void);
     GenInstruction *insns; //!< Instruction stream
     uint32_t insnNum;      //!< Number of instructions
+    GenContext *cxt;
     GBE_CLASS(GenKernel);  //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 92122a6..d772b0d 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -725,6 +725,16 @@ namespace gbe
       return ub16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    /*! UW-typed general register with vertical stride 16, width 8 and horizontal stride 2 */
+    static INLINE GenRegister unpacked_uw(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UW,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
     static INLINE GenRegister mask(uint32_t subnr) {
       return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
     }
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index df0df28..e08cb20 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -229,6 +229,18 @@ namespace gbe {
     return kernel->getUseSLM() ? 1 : 0;
   }
 
+  static void kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) {
+    // A null handle is silently ignored
+    if (genKernel != NULL)
+      ((gbe::Kernel*) genKernel)->setConstBufSize(argID, sz);
+  }
+
+  static void kernelRecompile(gbe_kernel genKernel){
+    // A null handle is silently ignored
+    if (genKernel != NULL)
+      ((gbe::Kernel *) genKernel)->recompile();
+  }
+  
   static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
     return 0u;
   }
@@ -251,6 +263,8 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_recompile_cb *gbe_kernel_recompile = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
 
@@ -275,6 +289,8 @@ namespace gbe
       gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
       gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
       gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+      gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
+      gbe_kernel_recompile = gbe::kernelRecompile;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
       genSetupCallBacks();
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index b90c1df..08db4eb 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -81,7 +81,8 @@ enum gbe_curbe_type {
 
 /*! Extra arguments use the negative range of sub-values */
 enum gbe_extra_argument {
-  GBE_STACK_BUFFER = 0 /* Give stack location in curbe */
+  GBE_STACK_BUFFER = 0, /* Give stack location in curbe */
+  GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
 };
 
 /*! Create a new program from the given source code (zero terminated string) */
@@ -155,6 +156,14 @@ extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
 typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
 extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
 
+/*! Set the const pointer arg size */
+typedef void (gbe_kernel_set_const_buffer_size_cb)(gbe_kernel, uint32_t argID, size_t sz);
+extern gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size;
+
+/*! Recompile the kernel at enqueue time (for now only constant buffers need it) */
+typedef void (gbe_kernel_recompile_cb)(gbe_kernel);
+extern gbe_kernel_recompile_cb *gbe_kernel_recompile;
+
 /*! Get the curbe offset where to put the data. Returns -1 if not required */
 typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
 extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e0f7dba..a8b2615 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -25,6 +25,7 @@
 #ifndef __GBE_PROGRAM_HPP__
 #define __GBE_PROGRAM_HPP__
 
+#include "ir/instruction.hpp"
 #include "backend/program.h"
 #include "sys/hash_map.hpp"
 #include "sys/vector.hpp"
@@ -42,6 +43,9 @@ namespace gbe {
   struct KernelArgument {
     gbe_arg_type type; //!< Pointer, structure, image, regular value?
     uint32_t size;     //!< Size of the argument
+    uint32_t bufSize;  //!< Size of the constant buffer, as last set through setConstBufSize
+    uint32_t allocedSize;  //!< Size currently allocated in the curbe; a realloc is needed when it differs from bufSize
+    ir::Register reg;  //!< IR register of the argument, saved here because the context's unit and fn are destroyed before recompile
   };
 
   /*! Stores the offset where to patch where to patch */
@@ -94,6 +98,16 @@ namespace gbe {
     INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
     /*! Says if SLM is needed for it */
     INLINE bool getUseSLM(void) const { return this->useSLM; }
+    /*! Record the size of a constant buffer argument. Out-of-range IDs and
+     *  non-constant arguments are ignored; a new size marks the curbe changed */
+    void setConstBufSize(uint32_t argID, size_t sz) {
+      if (argID >= argNum || args[argID].type != GBE_ARG_CONSTANT_PTR) return;
+      if (args[argID].bufSize == sz) return;
+      args[argID].bufSize = sz;
+      this->bCurbeChanged = true;
+    }
+    /* recompile for constant buffer alloc */
+    virtual void recompile(void) = 0;
   protected:
     friend class Context;      //!< Owns the kernels
     const std::string name;    //!< Kernel name
@@ -104,6 +118,7 @@ namespace gbe {
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
     uint32_t stackSize;        //!< Stack size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
+    bool     bCurbeChanged;
     GBE_CLASS(Kernel);         //!< Use custom allocators
   };
 
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 37e78b4..8e9fc31 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -127,6 +127,25 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   return CL_SUCCESS;
 }
 
+/* Copy the contents of every constant buffer argument into the curbe image */
+LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
+                                                       char * dst)
+{
+  int i;
+  for(i = 0; i < k->arg_n; i++) {
+    if(gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_CONSTANT_PTR)
+      continue;
+    /* Stay signed: the lookup returns -1 when the curbe has no slot for it */
+    const int32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
+    cl_mem mem = k->args[i].mem;
+    if(offset < 0 || mem == NULL) continue;  /* nothing to upload */
+    cl_buffer_map(mem->bo, 1);
+    memcpy(dst + offset, cl_buffer_get_virtual(mem->bo), mem->size);
+    cl_buffer_unmap(mem->bo);
+  }
+  return CL_SUCCESS;
+}
+
 #if USE_FULSIM
 extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
 extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 1e2bcc1..7c571da 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -69,5 +69,7 @@ extern cl_int cl_command_queue_finish(cl_command_queue);
 /* Bind all the surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
 
+/* Upload the constant buffers into the final curbe */
+extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 3a590bc..0b7b9a9 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -190,6 +190,13 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   size_t thread_n = 0u;
   cl_int err = CL_SUCCESS;
 
+  gbe_kernel_recompile(ker->opaque);
+  ker->curbe_sz= gbe_kernel_get_curbe_size(ker->opaque);
+  if(cst_sz != ker->curbe_sz) {
+    cl_kernel_setup(ker, ker->opaque);
+    cst_sz = ker->curbe_sz;
+  }
+
   /* Setup kernel */
   kernel.name = "KERNEL";
   kernel.grf_blocks = 128;
@@ -224,8 +231,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   if (ker->curbe) {
     assert(cst_sz > 0);
     TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
-      for (i = 0; i < thread_n; ++i)
+    for (i = 0; i < thread_n; ++i) {
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
+        cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
+    }
     TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
     cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
   }
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 356a8a7..ae1b574 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -109,7 +109,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     if (UNLIKELY(value == NULL))
       return CL_INVALID_KERNEL_ARGS;
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    assert(offset + sz <= k->curbe_sz);
+    //assert(offset + sz <= k->curbe_sz);
     memcpy(k->curbe + offset, value, sz);
     k->args[index].local_sz = 0;
     k->args[index].is_set = 1;
@@ -152,6 +152,10 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
      || (arg_type != GBE_ARG_IMAGE && mem->is_image)))
       return CL_INVALID_ARG_VALUE;
+  
+  if(arg_type == GBE_ARG_CONSTANT_PTR) {
+    gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
+  }
   cl_mem_add_ref(mem);
   if (k->args[index].mem)
     cl_mem_delete(k->args[index].mem);
@@ -175,6 +179,9 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   cl_context ctx = k->program->ctx;
   cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
 
+  if(k->bo != NULL)
+    cl_buffer_unreference(k->bo); 
+ 
   /* Allocate the gen code here */
   const uint32_t code_sz = gbe_kernel_get_code_size(opaque);
   const char *code = gbe_kernel_get_code(opaque);
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 3a8cfdd..e6961d1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -79,6 +79,7 @@ cl_mem_allocate(cl_context ctx,
     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
     goto error;
   }
+  mem->size = sz;
 
   /* Append the buffer in the context buffer list */
   pthread_mutex_lock(&ctx->buffer_lock);
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 6992454..db391ee 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -35,6 +35,7 @@ struct _cl_mem {
   uint64_t magic;           /* To identify it as a memory object */
   volatile int ref_n;       /* This object is reference counted */
   cl_buffer bo;             /* Data in GPU memory */
+  size_t size;              /* original request size, not alignment size, used in constant buffer */
   cl_mem prev, next;        /* We chain the memory buffers together */
   cl_context ctx;           /* Context it belongs to */
   cl_mem_flags flags;       /* Flags specified at the creation time */
-- 
1.7.9.5