[Beignet] [PATCH V2] Implement constant buffer based on constant cache.

Zhigang Gong zhigang.gong at linux.intel.com
Wed Sep 4 00:11:57 PDT 2013


LGTM, pushed, thanks.

On Wed, Sep 04, 2013 at 02:24:54PM +0800, Ruiling Song wrote:
> Currently, simply allocate enough graphics memory as constant memory space.
> And bind it to bti 2. Constant cache reads are backed by dword scattered reads.
> Different from other data port messages, the addresses need to be dword
> aligned, and are expressed in units of dwords.
> 
> The constant address space data are placed in order: first global constant,
> then the constant buffer kernel argument.
> 
> v2: change function & variable naming to make the distinction between 'curbe' and 'constant buffer' clear.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/backend/context.cpp            |   12 +----
>  backend/src/backend/gen_insn_selection.cpp |   32 +++++++++++-
>  backend/src/backend/gen_reg_allocation.cpp |    1 -
>  backend/src/backend/program.h              |    2 -
>  backend/src/ir/profile.cpp                 |    2 -
>  backend/src/ir/profile.hpp                 |    5 +-
>  backend/src/llvm/llvm_gen_backend.cpp      |    9 +---
>  src/cl_command_queue.c                     |   18 -------
>  src/cl_command_queue.h                     |    3 --
>  src/cl_command_queue_gen7.c                |   77 +++++++++++++++++++++++-----
>  src/cl_driver.h                            |   11 ++--
>  src/cl_driver_defs.c                       |    3 +-
>  src/cl_gt_device.h                         |    2 +-
>  src/cl_kernel.c                            |   10 ----
>  src/intel/intel_driver.c                   |    2 +-
>  src/intel/intel_gpgpu.c                    |   60 ++++++++++++++++++----
>  16 files changed, 158 insertions(+), 91 deletions(-)
> 
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index 5484869..ac3a243 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -458,15 +458,6 @@ namespace gbe
>        }
>      });
>  #undef INSERT_REG
> -    this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int));
> -    specialRegs.insert(ir::ocl::constoffst);
> -
> -    // Insert serialized global constant arrays if used
> -    const ir::ConstantSet& constantSet = unit.getConstantSet();
> -    if (constantSet.getConstantNum()) {
> -      size_t size = constantSet.getDataSize();
> -      this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size);
> -    }
>  
>      // Insert the number of threads
>      this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
> @@ -640,8 +631,7 @@ namespace gbe
>          reg == ir::ocl::goffset0  ||
>          reg == ir::ocl::goffset1  ||
>          reg == ir::ocl::goffset2  ||
> -        reg == ir::ocl::workdim   ||
> -        reg == ir::ocl::constoffst)
> +        reg == ir::ocl::workdim)
>        return true;
>      return false;
>    }
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 1e81dac..b2c2798 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2101,6 +2101,23 @@ namespace gbe
>        sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
>      }
>  
> +    void emitDWordGather(Selection::Opaque &sel,
> +                         const ir::LoadInstruction &insn,
> +                         GenRegister addr,
> +                         uint32_t bti) const
> +    {
> +      using namespace ir;
> +      const uint32_t valueNum = insn.getValueNum();
> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();
> +      GBE_ASSERT(valueNum == 1);
> +      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
> +      // get dword based address
> +      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +      sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
> +
> +      sel.DWORD_GATHER(dst, addrDW, bti);
> +    }
> +
>      void emitRead64(Selection::Opaque &sel,
>                           const ir::LoadInstruction &insn,
>                           GenRegister addr,
> @@ -2171,8 +2188,19 @@ namespace gbe
>        GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(type);
> -      if (insn.getAddressSpace() == MEM_CONSTANT)
> -        this->emitIndirectMove(sel, insn, address);
> +      if (insn.getAddressSpace() == MEM_CONSTANT) {
> +        // XXX TODO read 64bit constant through constant cache
> +        // Per HW Spec, constant cache messages can read at least DWORD data.
> +        // So, byte/short data type, we have to read through data cache.
> +        if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
> +          this->emitRead64(sel, insn, address, 0x2);
> +        else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
> +          this->emitDWordGather(sel, insn, address, 0x2);
> +        else {
> +          const GenRegister value = sel.selReg(insn.getValue(0));
> +          this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
> +        }
> +      }
>        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
>          this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
>        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 0bb75a2..2abfb12 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -573,7 +573,6 @@ namespace gbe
>      allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
>      allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
>      allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
> -    allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst);
>  
>      // Group and barrier IDs are always allocated by the hardware in r0
>      RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
> index d20e7af..ff4d157 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -70,8 +70,6 @@ enum gbe_curbe_type {
>    GBE_CURBE_GROUP_NUM_Y,
>    GBE_CURBE_GROUP_NUM_Z,
>    GBE_CURBE_WORK_DIM,
> -  GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
> -  GBE_CURBE_GLOBAL_CONSTANT_DATA,
>    GBE_CURBE_IMAGE_INFO,
>    GBE_CURBE_STACK_POINTER,
>    GBE_CURBE_KERNEL_ARGUMENT,
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 675018a..927e43d 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -40,7 +40,6 @@ namespace ir {
>          "stack_pointer",
>          "block_ip",
>          "barrier_id", "thread_number",
> -        "const_curbe_offset",
>          "work_dimension",
>      };
>  
> @@ -76,7 +75,6 @@ namespace ir {
>        DECL_NEW_REG(FAMILY_WORD, blockip);
>        DECL_NEW_REG(FAMILY_DWORD, barrierid);
>        DECL_NEW_REG(FAMILY_DWORD, threadn);
> -      DECL_NEW_REG(FAMILY_DWORD, constoffst);
>        DECL_NEW_REG(FAMILY_DWORD, workdim);
>      }
>  #undef DECL_NEW_REG
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 4b0ef5e..c79bc3b 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -63,9 +63,8 @@ namespace ir {
>      static const Register blockip = Register(19);  // blockip
>      static const Register barrierid = Register(20);// barrierid
>      static const Register threadn = Register(21);  // number of threads
> -    static const Register constoffst = Register(22); // offset of global constant array's curbe
> -    static const Register workdim = Register(23);  // work dimention.
> -    static const uint32_t regNum = 24;             // number of special registers
> +    static const Register workdim = Register(22);  // work dimention.
> +    static const uint32_t regNum = 23;             // number of special registers
>      extern const char *specialRegMean[];           // special register name.
>    } /* namespace ocl */
>  
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 12d809d..e747d00 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -1243,12 +1243,7 @@ namespace gbe
>        ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
>        ir::Constant &con = unit.getConstantSet().getConstant(j ++);
>        con.setReg(reg.value());
> -      if(con.getOffset() != 0) {
> -        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
> -        ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg);
> -      } else {
> -        ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst);
> -      }
> +      ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
>      }
>  
>      // Visit all the instructions and emit the IR registers or the value to
> @@ -2407,7 +2402,7 @@ namespace gbe
>        const ir::Type type = getType(ctx, elemType);
>        const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>  
> -      if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) {
> +      if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
>          // One message is enough here. Nothing special to do
>          if (elemNum <= 4) {
>            // Build the tuple data in the vector
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 9606d6b..2454db6 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -150,24 +150,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
>    return CL_SUCCESS;
>  }
>  
> -LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
> -                                                       char * dst)
> -{
> -  int i;
> -  for(i = 0; i < k->arg_n; i++) {
> -    enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
> -
> -    if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
> -      uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
> -      cl_mem mem = k->args[i].mem;
> -      cl_buffer_map(mem->bo, 1);
> -      void * addr = cl_buffer_get_virtual(mem->bo);
> -      memcpy(dst + offset, addr, mem->size);
> -      cl_buffer_unmap(mem->bo);
> -    }
> -  }
> -  return CL_SUCCESS;
> -}
>  
>  #if USE_FULSIM
>  extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
> diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
> index 135d659..9fe1dd1 100644
> --- a/src/cl_command_queue.h
> +++ b/src/cl_command_queue.h
> @@ -76,8 +76,5 @@ extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
>  
>  /* Bind all the image surfaces in the GPGPU state */
>  extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
> -
> -/*update constant buffer to final curbe */
> -extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
>  #endif /* __CL_COMMAND_QUEUE_H__ */
>  
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 1d415d4..68630cf 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -76,7 +76,7 @@ cl_set_varying_payload(const cl_kernel ker,
>      block_ips[curr] = 0;
>    }
>  
> -  /* Copy them to the constant buffer */
> +  /* Copy them to the curbe buffer */
>    curr = 0;
>    for (i = 0; i < thread_n; ++i, data += cst_sz) {
>      uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
> @@ -95,6 +95,62 @@ error:
>    return err;
>  }
>  
> +static void
> +cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
> +{
> +  /* calculate constant buffer size */
> +  int32_t arg;
> +  size_t offset;
> +  gbe_program prog = ker->program->opaque;
> +  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
> +  size_t global_const_size = gbe_program_get_global_constant_size(prog);
> +  uint32_t constant_buf_size = 0;
> +  for (arg = 0; arg < arg_n; ++arg) {
> +    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
> +    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
> +      cl_mem mem = ker->args[arg].mem;
> +      constant_buf_size += ALIGN(mem->size, 4);
> +    }
> +  }
> +  if(global_const_size == 0 && constant_buf_size == 0)
> +     return;
> +
> +  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
> +  cl_buffer_map(bo, 1);
> +  char * cst_addr = cl_buffer_get_virtual(bo);
> +  offset = 0;
> +  if (global_const_size > 0) {
> +    /* Write the global constant arrays */
> +    gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
> +  }
> +  offset += ALIGN(global_const_size, 4);
> +
> +  if(global_const_size == 0) {
> +    /* reserve 4 bytes to get rid of 0 address */
> +    offset += 4;
> +  }
> +
> +  /* upload constant buffer argument */
> +  int32_t curbe_offset = 0;
> +  for (arg = 0; arg < arg_n; ++arg) {
> +    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
> +    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
> +      cl_mem mem = ker->args[arg].mem;
> +
> +      curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
> +      assert(curbe_offset >= 0);
> +      *(uint32_t *) (ker->curbe + curbe_offset) = offset;
> +
> +      cl_buffer_map(mem->bo, 1);
> +      void * addr = cl_buffer_get_virtual(mem->bo);
> +      memcpy(cst_addr + offset, addr, mem->size);
> +      cl_buffer_unmap(mem->bo);
> +      offset += ALIGN(mem->size, 4);
> +    }
> +  }
> +  cl_buffer_unmap(bo);
> +}
> +
>  /* Will return the total amount of slm used */
>  static int32_t
>  cl_curbe_fill(cl_kernel ker,
> @@ -122,7 +178,6 @@ cl_curbe_fill(cl_kernel ker,
>    UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
>    UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
>    UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
> -  UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
>  #undef UPLOAD
>  
>    /* Write identity for the stack pointer. This is required by the stack pointer
> @@ -134,14 +189,6 @@ cl_curbe_fill(cl_kernel ker,
>      int32_t i;
>      for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
>    }
> -
> -  /* Write global constant arrays */
> -  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) {
> -    /* Write the global constant arrays */
> -    gbe_program prog = ker->program->opaque;
> -    gbe_program_get_global_constant_data(prog, ker->curbe + offset);
> -  }
> -
>    /* Handle the various offsets to SLM */
>    const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
>    int32_t arg, slm_offset = 0;
> @@ -220,9 +267,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    /* Compute the number of HW threads we need */
>    TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
>    kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
> -  kernel.cst_sz = cst_sz;
> +  kernel.curbe_sz = cst_sz;
>  
> -  /* Curbe step 1: fill the constant buffer data shared by all threads */
> +  /* Curbe step 1: fill the constant urb buffer data shared by all threads */
>    if (ker->curbe) {
>      kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
>      if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
> @@ -242,6 +289,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    cl_setup_scratch(gpgpu, ker);
>    /* Bind a stack if needed */
>    cl_bind_stack(gpgpu, ker);
> +
> +  cl_upload_constant_buffer(queue, ker);
> +
>    cl_gpgpu_states_setup(gpgpu, &kernel);
>  
>    /* Curbe step 2. Give the localID and upload it to video memory */
> @@ -250,10 +300,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>      TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
>      for (i = 0; i < thread_n; ++i) {
>          memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
> -        cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
>      }
>      TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
> -    cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
> +    cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
>    }
>  
>    /* Start a new batch buffer */
> diff --git a/src/cl_driver.h b/src/cl_driver.h
> index 0ce03fe..95d6485 100644
> --- a/src/cl_driver.h
> +++ b/src/cl_driver.h
> @@ -100,7 +100,7 @@ typedef enum gpu_command_status {
>  typedef struct cl_gpgpu_kernel {
>    const char *name;        /* kernel name and bo name */
>    uint32_t grf_blocks;     /* register blocks kernel wants (in 8 reg blocks) */
> -  uint32_t cst_sz;         /* total size of all constants */
> +  uint32_t curbe_sz;         /* total size of all curbes */
>    cl_buffer bo;            /* kernel code in the proper addr space */
>    int32_t barrierID;       /* barrierID for _this_ kernel */
>    uint32_t use_slm:1;      /* For gen7 (automatic barrier management) */
> @@ -157,9 +157,12 @@ extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
>  typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
>  extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
>  
> -/* Fills current constant buffer with data */
> -typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
> -extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
> +/* Fills current curbe buffer with data */
> +typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
> +extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
> +
> +typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
> +extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
>  
>  /* Setup all indirect states */
>  typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
> index 7c4c866..ae130fa 100644
> --- a/src/cl_driver_defs.c
> +++ b/src/cl_driver_defs.c
> @@ -54,8 +54,9 @@ LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
>  LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
>  LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
>  LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
> +LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
>  LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
> -LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
> +LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
>  LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
>  LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
>  LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
> diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
> index f58e1fd..db4daa3 100644
> --- a/src/cl_gt_device.h
> +++ b/src/cl_gt_device.h
> @@ -51,7 +51,7 @@
>  .single_fp_config = 0, /* XXX */
>  .global_mem_cache_type = CL_READ_WRITE_CACHE,
>  .global_mem_size = 4,
> -.max_constant_buffer_size = 64 << 10,
> +.max_constant_buffer_size = 512 << 10,
>  .max_constant_args = 8,
>  .error_correction_support = CL_FALSE,
>  .host_unified_memory = CL_FALSE,
> diff --git a/src/cl_kernel.c b/src/cl_kernel.c
> index 12a08c5..4ba1c11 100644
> --- a/src/cl_kernel.c
> +++ b/src/cl_kernel.c
> @@ -186,16 +186,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
>  
>    mem = *(cl_mem*) value;
>  
> -  if(arg_type == GBE_ARG_CONSTANT_PTR) {
> -    int32_t cbOffset;
> -    cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
> -    //constant ptr's curbe offset changed, update it
> -    if(cbOffset >= 0) {
> -      offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
> -      *((uint32_t *)(k->curbe + offset)) = cbOffset;  //cb offset in curbe
> -    }
> -  }
> -
>    cl_mem_add_ref(mem);
>    if (k->args[index].mem)
>      cl_mem_delete(k->args[index].mem);
> diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
> index 9959447..ef6e6c3 100644
> --- a/src/intel/intel_driver.c
> +++ b/src/intel/intel_driver.c
> @@ -380,7 +380,7 @@ cl_intel_driver_new(cl_context_prop props)
>    /* We use the first 2 slots(0,1) for all the bufs.
>     * Notify the gbe this base index, thus gbe can avoid conflicts
>     * when it allocates slots for images*/
> -  gbe_set_image_base_index(2);
> +  gbe_set_image_base_index(3);
>  exit:
>    return driver;
>  error:
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index 1301b66..6dac89d 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -79,7 +79,7 @@ struct intel_gpgpu
>    intel_batchbuffer_t *batch;
>    cl_gpgpu_kernel *ker;
>    drm_intel_bo *binded_buf[max_buf_n];  /* all buffers binded for the call */
> -  uint32_t binded_offset[max_buf_n];    /* their offsets in the constant buffer */
> +  uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
>    uint32_t binded_n;                    /* number of buffers binded */
>  
>    unsigned long img_bitmap;              /* image usage bitmap. */
> @@ -96,6 +96,7 @@ struct intel_gpgpu
>    struct { drm_intel_bo *bo; } sampler_state_b;
>    struct { drm_intel_bo *bo; } perf_b;
>    struct { drm_intel_bo *bo; } scratch_b;
> +  struct { drm_intel_bo *bo; } constant_b;
>  
>    uint32_t per_thread_scratch;
>    struct {
> @@ -138,6 +139,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
>    if (gpgpu->scratch_b.bo)
>      drm_intel_bo_unreference(gpgpu->scratch_b.bo);
>  
> +  if(gpgpu->constant_b.bo)
> +    drm_intel_bo_unreference(gpgpu->constant_b.bo);
> +
>    intel_batchbuffer_delete(gpgpu->batch);
>    cl_free(gpgpu);
>  }
> @@ -231,7 +235,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
>  }
>  
>  static void
> -intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
> +intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
>  {
>    BEGIN_BATCH(gpgpu->batch, 4);
>    OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
> @@ -319,7 +323,7 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
>    intel_gpgpu_select_pipeline(gpgpu);
>    intel_gpgpu_set_base_address(gpgpu);
>    intel_gpgpu_load_vfe_state(gpgpu);
> -  intel_gpgpu_load_constant_buffer(gpgpu);
> +  intel_gpgpu_load_curbe_buffer(gpgpu);
>    intel_gpgpu_load_idrt(gpgpu);
>  
>    if (gpgpu->perf_b.bo) {
> @@ -391,7 +395,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
>    /* Binded buffers */
>    gpgpu->binded_n = 0;
>    gpgpu->img_bitmap = 0;
> -  gpgpu->img_index_base = 2;
> +  gpgpu->img_index_base = 3;
>    gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
>  
>    /* URB */
> @@ -399,12 +403,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
>    gpgpu->urb.size_cs_entry = size_cs_entry;
>    gpgpu->max_threads = max_threads;
>  
> -  /* Constant buffer */
> +  /* Constant URB  buffer */
>    if(gpgpu->curbe_b.bo)
>      dri_bo_unreference(gpgpu->curbe_b.bo);
>    uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
>    size_cb = ALIGN(size_cb, 4096);
> -  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
> +  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
>    assert(bo);
>    gpgpu->curbe_b.bo = bo;
>  
> @@ -468,6 +472,39 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
>                      obj_bo);
>  }
>  
> +static dri_bo*
> +intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
> +{
> +  uint32_t s = size - 1;
> +  assert(size != 0);
> +
> +  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
> +  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
> +  memset(ss2, 0, sizeof(gen7_surface_state_t));
> +  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
> +  ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
> +  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
> +  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
> +  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
> +  ss2->ss5.cache_control = cc_llc_l3;
> +  heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
> +
> +  if(gpgpu->constant_b.bo)
> +    dri_bo_unreference(gpgpu->constant_b.bo);
> +  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
> +  assert(gpgpu->constant_b.bo);
> +  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
> +  dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
> +                      I915_GEM_DOMAIN_RENDER,
> +                      I915_GEM_DOMAIN_RENDER,
> +                      0,
> +                      heap->binding_table[2] +
> +                      offsetof(gen7_surface_state_t, ss1),
> +                      gpgpu->constant_b.bo);
> +  return gpgpu->constant_b.bo;
> +}
> +
> +
>  /* Map address space with two 2GB surfaces. One surface for untyped message and
>   * one surface for byte scatters / gathers. Actually the HW does not require two
>   * surfaces but Fulsim complains
> @@ -611,7 +648,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
>    desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
>    desc->desc3.binding_table_entry_count = 0; /* no prefetch */
>    desc->desc3.binding_table_pointer = 0;
> -  desc->desc4.curbe_read_len = kernel->cst_sz / 32;
> +  desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
>    desc->desc4.curbe_read_offset = 0;
>  
>    /* Barriers / SLM are automatically handled on Gen7+ */
> @@ -650,7 +687,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
>  }
>  
>  static void
> -intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
> +intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
>  {
>    unsigned char *curbe = NULL;
>    cl_gpgpu_kernel *k = gpgpu->ker;
> @@ -665,9 +702,9 @@ intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t si
>    /* Now put all the relocations for our flat address space */
>    for (i = 0; i < k->thread_n; ++i)
>      for (j = 0; j < gpgpu->binded_n; ++j) {
> -      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = gpgpu->binded_buf[j]->offset;
> +      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
>        drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
> -                              gpgpu->binded_offset[j]+i*k->cst_sz,
> +                              gpgpu->binded_offset[j]+i*k->curbe_sz,
>                                gpgpu->binded_buf[j],
>                                0,
>                                I915_GEM_DOMAIN_RENDER,
> @@ -925,7 +962,8 @@ intel_set_gpgpu_callbacks(void)
>    cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
>    cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
>    cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
> -  cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
> +  cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
> +  cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
>    cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
>    cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
>    cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list