[Beignet] [PATCH V2] Implement constant buffer based on constant cache.

Ruiling Song ruiling.song at intel.com
Tue Sep 3 23:24:54 PDT 2013


Currently, we simply allocate enough graphics memory as the constant memory space
and bind it to bti 2. Constant cache reads are backed by dword scatter reads.
Unlike other data port messages, the addresses need to be dword aligned,
and they are expressed in units of dwords.

The data in the constant address space is laid out in order: first the global
constants, then the constant buffer kernel arguments.

v2: change function & variable naming to clearly distinguish 'curbe' from 'constant buffer'.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/context.cpp            |   12 +----
 backend/src/backend/gen_insn_selection.cpp |   32 +++++++++++-
 backend/src/backend/gen_reg_allocation.cpp |    1 -
 backend/src/backend/program.h              |    2 -
 backend/src/ir/profile.cpp                 |    2 -
 backend/src/ir/profile.hpp                 |    5 +-
 backend/src/llvm/llvm_gen_backend.cpp      |    9 +---
 src/cl_command_queue.c                     |   18 -------
 src/cl_command_queue.h                     |    3 --
 src/cl_command_queue_gen7.c                |   77 +++++++++++++++++++++++-----
 src/cl_driver.h                            |   11 ++--
 src/cl_driver_defs.c                       |    3 +-
 src/cl_gt_device.h                         |    2 +-
 src/cl_kernel.c                            |   10 ----
 src/intel/intel_driver.c                   |    2 +-
 src/intel/intel_gpgpu.c                    |   60 ++++++++++++++++++----
 16 files changed, 158 insertions(+), 91 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5484869..ac3a243 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -458,15 +458,6 @@ namespace gbe
       }
     });
 #undef INSERT_REG
-    this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int));
-    specialRegs.insert(ir::ocl::constoffst);
-
-    // Insert serialized global constant arrays if used
-    const ir::ConstantSet& constantSet = unit.getConstantSet();
-    if (constantSet.getConstantNum()) {
-      size_t size = constantSet.getDataSize();
-      this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size);
-    }
 
     // Insert the number of threads
     this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
@@ -640,8 +631,7 @@ namespace gbe
         reg == ir::ocl::goffset0  ||
         reg == ir::ocl::goffset1  ||
         reg == ir::ocl::goffset2  ||
-        reg == ir::ocl::workdim   ||
-        reg == ir::ocl::constoffst)
+        reg == ir::ocl::workdim)
       return true;
     return false;
   }
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1e81dac..b2c2798 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2101,6 +2101,23 @@ namespace gbe
       sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
     }
 
+    void emitDWordGather(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GBE_ASSERT(valueNum == 1);
+      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+      // get dword based address
+      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+      sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+
+      sel.DWORD_GATHER(dst, addrDW, bti);
+    }
+
     void emitRead64(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
@@ -2171,8 +2188,19 @@ namespace gbe
       GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.getAddressSpace() == MEM_CONSTANT)
-        this->emitIndirectMove(sel, insn, address);
+      if (insn.getAddressSpace() == MEM_CONSTANT) {
+        // XXX TODO read 64bit constant through constant cache
+        // Per HW Spec, constant cache messages can read at least DWORD data.
+        // So, byte/short data type, we have to read through data cache.
+        if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitRead64(sel, insn, address, 0x2);
+        else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitDWordGather(sel, insn, address, 0x2);
+        else {
+          const GenRegister value = sel.selReg(insn.getValue(0));
+          this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+        }
+      }
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
         this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 0bb75a2..2abfb12 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -573,7 +573,6 @@ namespace gbe
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
     allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
     allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst);
 
     // Group and barrier IDs are always allocated by the hardware in r0
     RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index d20e7af..ff4d157 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -70,8 +70,6 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Y,
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
-  GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
-  GBE_CURBE_GLOBAL_CONSTANT_DATA,
   GBE_CURBE_IMAGE_INFO,
   GBE_CURBE_STACK_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 675018a..927e43d 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,7 +40,6 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "const_curbe_offset",
         "work_dimension",
     };
 
@@ -76,7 +75,6 @@ namespace ir {
       DECL_NEW_REG(FAMILY_WORD, blockip);
       DECL_NEW_REG(FAMILY_DWORD, barrierid);
       DECL_NEW_REG(FAMILY_DWORD, threadn);
-      DECL_NEW_REG(FAMILY_DWORD, constoffst);
       DECL_NEW_REG(FAMILY_DWORD, workdim);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4b0ef5e..c79bc3b 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -63,9 +63,8 @@ namespace ir {
     static const Register blockip = Register(19);  // blockip
     static const Register barrierid = Register(20);// barrierid
     static const Register threadn = Register(21);  // number of threads
-    static const Register constoffst = Register(22); // offset of global constant array's curbe
-    static const Register workdim = Register(23);  // work dimention.
-    static const uint32_t regNum = 24;             // number of special registers
+    static const Register workdim = Register(22);  // work dimention.
+    static const uint32_t regNum = 23;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 12d809d..e747d00 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1243,12 +1243,7 @@ namespace gbe
       ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
       ir::Constant &con = unit.getConstantSet().getConstant(j ++);
       con.setReg(reg.value());
-      if(con.getOffset() != 0) {
-        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
-        ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg);
-      } else {
-        ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst);
-      }
+      ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
     }
 
     // Visit all the instructions and emit the IR registers or the value to
@@ -2407,7 +2402,7 @@ namespace gbe
       const ir::Type type = getType(ctx, elemType);
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
 
-      if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) {
+      if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 9606d6b..2454db6 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -150,24 +150,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   return CL_SUCCESS;
 }
 
-LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
-                                                       char * dst)
-{
-  int i;
-  for(i = 0; i < k->arg_n; i++) {
-    enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-
-    if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
-      uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
-      cl_mem mem = k->args[i].mem;
-      cl_buffer_map(mem->bo, 1);
-      void * addr = cl_buffer_get_virtual(mem->bo);
-      memcpy(dst + offset, addr, mem->size);
-      cl_buffer_unmap(mem->bo);
-    }
-  }
-  return CL_SUCCESS;
-}
 
 #if USE_FULSIM
 extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 135d659..9fe1dd1 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -76,8 +76,5 @@ extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
 
 /* Bind all the image surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
-
-/*update constant buffer to final curbe */
-extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 1d415d4..68630cf 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -76,7 +76,7 @@ cl_set_varying_payload(const cl_kernel ker,
     block_ips[curr] = 0;
   }
 
-  /* Copy them to the constant buffer */
+  /* Copy them to the curbe buffer */
   curr = 0;
   for (i = 0; i < thread_n; ++i, data += cst_sz) {
     uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
@@ -95,6 +95,62 @@ error:
   return err;
 }
 
+static void
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+  /* calculate constant buffer size */
+  int32_t arg;
+  size_t offset;
+  gbe_program prog = ker->program->opaque;
+  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
+  size_t global_const_size = gbe_program_get_global_constant_size(prog);
+  uint32_t constant_buf_size = 0;
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      cl_mem mem = ker->args[arg].mem;
+      constant_buf_size += ALIGN(mem->size, 4);
+    }
+  }
+  if(global_const_size == 0 && constant_buf_size == 0)
+     return;
+
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
+  cl_buffer_map(bo, 1);
+  char * cst_addr = cl_buffer_get_virtual(bo);
+  offset = 0;
+  if (global_const_size > 0) {
+    /* Write the global constant arrays */
+    gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+  }
+  offset += ALIGN(global_const_size, 4);
+
+  if(global_const_size == 0) {
+    /* reserve 4 bytes to get rid of 0 address */
+    offset += 4;
+  }
+
+  /* upload constant buffer argument */
+  int32_t curbe_offset = 0;
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      cl_mem mem = ker->args[arg].mem;
+
+      curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+      assert(curbe_offset >= 0);
+      *(uint32_t *) (ker->curbe + curbe_offset) = offset;
+
+      cl_buffer_map(mem->bo, 1);
+      void * addr = cl_buffer_get_virtual(mem->bo);
+      memcpy(cst_addr + offset, addr, mem->size);
+      cl_buffer_unmap(mem->bo);
+      offset += ALIGN(mem->size, 4);
+    }
+  }
+  cl_buffer_unmap(bo);
+}
+
 /* Will return the total amount of slm used */
 static int32_t
 cl_curbe_fill(cl_kernel ker,
@@ -122,7 +178,6 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
   UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
-  UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
 #undef UPLOAD
 
   /* Write identity for the stack pointer. This is required by the stack pointer
@@ -134,14 +189,6 @@ cl_curbe_fill(cl_kernel ker,
     int32_t i;
     for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
   }
-
-  /* Write global constant arrays */
-  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) {
-    /* Write the global constant arrays */
-    gbe_program prog = ker->program->opaque;
-    gbe_program_get_global_constant_data(prog, ker->curbe + offset);
-  }
-
   /* Handle the various offsets to SLM */
   const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
   int32_t arg, slm_offset = 0;
@@ -220,9 +267,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Compute the number of HW threads we need */
   TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
-  kernel.cst_sz = cst_sz;
+  kernel.curbe_sz = cst_sz;
 
-  /* Curbe step 1: fill the constant buffer data shared by all threads */
+  /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
     kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
     if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
@@ -242,6 +289,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   cl_setup_scratch(gpgpu, ker);
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
+
+  cl_upload_constant_buffer(queue, ker);
+
   cl_gpgpu_states_setup(gpgpu, &kernel);
 
   /* Curbe step 2. Give the localID and upload it to video memory */
@@ -250,10 +300,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
     TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
     for (i = 0; i < thread_n; ++i) {
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
-        cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
     }
     TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
-    cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+    cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
   }
 
   /* Start a new batch buffer */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 0ce03fe..95d6485 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -100,7 +100,7 @@ typedef enum gpu_command_status {
 typedef struct cl_gpgpu_kernel {
   const char *name;        /* kernel name and bo name */
   uint32_t grf_blocks;     /* register blocks kernel wants (in 8 reg blocks) */
-  uint32_t cst_sz;         /* total size of all constants */
+  uint32_t curbe_sz;         /* total size of all curbes */
   cl_buffer bo;            /* kernel code in the proper addr space */
   int32_t barrierID;       /* barrierID for _this_ kernel */
   uint32_t use_slm:1;      /* For gen7 (automatic barrier management) */
@@ -157,9 +157,12 @@ extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
 typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
 extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
 
-/* Fills current constant buffer with data */
-typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
-extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
+/* Fills current curbe buffer with data */
+typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
 
 /* Setup all indirect states */
 typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 7c4c866..ae130fa 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -54,8 +54,9 @@ LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
-LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
 LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
 LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
 LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f58e1fd..db4daa3 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -51,7 +51,7 @@
 .single_fp_config = 0, /* XXX */
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
 .global_mem_size = 4,
-.max_constant_buffer_size = 64 << 10,
+.max_constant_buffer_size = 512 << 10,
 .max_constant_args = 8,
 .error_correction_support = CL_FALSE,
 .host_unified_memory = CL_FALSE,
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 12a08c5..4ba1c11 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -186,16 +186,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 
   mem = *(cl_mem*) value;
 
-  if(arg_type == GBE_ARG_CONSTANT_PTR) {
-    int32_t cbOffset;
-    cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
-    //constant ptr's curbe offset changed, update it
-    if(cbOffset >= 0) {
-      offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-      *((uint32_t *)(k->curbe + offset)) = cbOffset;  //cb offset in curbe
-    }
-  }
-
   cl_mem_add_ref(mem);
   if (k->args[index].mem)
     cl_mem_delete(k->args[index].mem);
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 9959447..ef6e6c3 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -380,7 +380,7 @@ cl_intel_driver_new(cl_context_prop props)
   /* We use the first 2 slots(0,1) for all the bufs.
    * Notify the gbe this base index, thus gbe can avoid conflicts
    * when it allocates slots for images*/
-  gbe_set_image_base_index(2);
+  gbe_set_image_base_index(3);
 exit:
   return driver;
 error:
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 1301b66..6dac89d 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -79,7 +79,7 @@ struct intel_gpgpu
   intel_batchbuffer_t *batch;
   cl_gpgpu_kernel *ker;
   drm_intel_bo *binded_buf[max_buf_n];  /* all buffers binded for the call */
-  uint32_t binded_offset[max_buf_n];    /* their offsets in the constant buffer */
+  uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
   uint32_t binded_n;                    /* number of buffers binded */
 
   unsigned long img_bitmap;              /* image usage bitmap. */
@@ -96,6 +96,7 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } sampler_state_b;
   struct { drm_intel_bo *bo; } perf_b;
   struct { drm_intel_bo *bo; } scratch_b;
+  struct { drm_intel_bo *bo; } constant_b;
 
   uint32_t per_thread_scratch;
   struct {
@@ -138,6 +139,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
   if (gpgpu->scratch_b.bo)
     drm_intel_bo_unreference(gpgpu->scratch_b.bo);
 
+  if(gpgpu->constant_b.bo)
+    drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
   intel_batchbuffer_delete(gpgpu->batch);
   cl_free(gpgpu);
 }
@@ -231,7 +235,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
@@ -319,7 +323,7 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
   intel_gpgpu_select_pipeline(gpgpu);
   intel_gpgpu_set_base_address(gpgpu);
   intel_gpgpu_load_vfe_state(gpgpu);
-  intel_gpgpu_load_constant_buffer(gpgpu);
+  intel_gpgpu_load_curbe_buffer(gpgpu);
   intel_gpgpu_load_idrt(gpgpu);
 
   if (gpgpu->perf_b.bo) {
@@ -391,7 +395,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   /* Binded buffers */
   gpgpu->binded_n = 0;
   gpgpu->img_bitmap = 0;
-  gpgpu->img_index_base = 2;
+  gpgpu->img_index_base = 3;
   gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
 
   /* URB */
@@ -399,12 +403,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->urb.size_cs_entry = size_cs_entry;
   gpgpu->max_threads = max_threads;
 
-  /* Constant buffer */
+  /* Constant URB  buffer */
   if(gpgpu->curbe_b.bo)
     dri_bo_unreference(gpgpu->curbe_b.bo);
   uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
   size_cb = ALIGN(size_cb, 4096);
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
   assert(bo);
   gpgpu->curbe_b.bo = bo;
 
@@ -468,6 +472,39 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
                     obj_bo);
 }
 
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
+{
+  uint32_t s = size - 1;
+  assert(size != 0);
+
+  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+  memset(ss2, 0, sizeof(gen7_surface_state_t));
+  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
+  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
+  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
+  ss2->ss5.cache_control = cc_llc_l3;
+  heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+
+  if(gpgpu->constant_b.bo)
+    dri_bo_unreference(gpgpu->constant_b.bo);
+  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+  assert(gpgpu->constant_b.bo);
+  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+  dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      heap->binding_table[2] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      gpgpu->constant_b.bo);
+  return gpgpu->constant_b.bo;
+}
+
+
 /* Map address space with two 2GB surfaces. One surface for untyped message and
  * one surface for byte scatters / gathers. Actually the HW does not require two
  * surfaces but Fulsim complains
@@ -611,7 +648,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
   desc->desc3.binding_table_entry_count = 0; /* no prefetch */
   desc->desc3.binding_table_pointer = 0;
-  desc->desc4.curbe_read_len = kernel->cst_sz / 32;
+  desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
   desc->desc4.curbe_read_offset = 0;
 
   /* Barriers / SLM are automatically handled on Gen7+ */
@@ -650,7 +687,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 }
 
 static void
-intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
 {
   unsigned char *curbe = NULL;
   cl_gpgpu_kernel *k = gpgpu->ker;
@@ -665,9 +702,9 @@ intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t si
   /* Now put all the relocations for our flat address space */
   for (i = 0; i < k->thread_n; ++i)
     for (j = 0; j < gpgpu->binded_n; ++j) {
-      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = gpgpu->binded_buf[j]->offset;
+      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
       drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
-                              gpgpu->binded_offset[j]+i*k->cst_sz,
+                              gpgpu->binded_offset[j]+i*k->curbe_sz,
                               gpgpu->binded_buf[j],
                               0,
                               I915_GEM_DOMAIN_RENDER,
@@ -925,7 +962,8 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
-  cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
+  cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+  cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
   cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
   cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
   cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
-- 
1.7.9.5



More information about the Beignet mailing list