[Beignet] [PATCH 1/2] enable scratch memory allocation and read/write

Tue Aug 6 22:03:46 PDT 2013

My suggestion here is to separate the scratch OBlock read/write out and
submit the
Dword Read/Write part first, as OBlock read/write is not used right now,
right?

On Wed, Aug 7, 2013 at 12:53 PM, zhigang gong <zhigang.gong at gmail.com>wrote:

>
>
>
> On Wed, Aug 7, 2013 at 10:05 AM, Ruiling Song <ruiling.song at intel.com>wrote:
>
>> There are two modes of scratch RW, OBlock, HBlock.
>> HBlock was documented as scratch block write/read in HW spec.
>>
>  I guess you mean Dword Block here when you are talking about Hblock,
> right?
>  The only two channel modes supported for scratch read/write are OWord and
> Dword.
>
> While the OBlock was documented as OWord block read/write.
>> I enabled both of them, but only used HBlock RW in later spill patch.
>>
>> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
>> ---
>>  backend/src/backend/context.cpp           |   18 ++++++++
>>  backend/src/backend/context.hpp           |    3 ++
>>  backend/src/backend/gen/gen_mesa_disasm.c |   43 ++++++++++++++---
>>  backend/src/backend/gen_context.cpp       |   71
>> +++++++++++++++++++++++++++++
>>  backend/src/backend/gen_context.hpp       |    6 ++-
>>  backend/src/backend/gen_defs.hpp          |   25 ++++++++++
>>  backend/src/backend/gen_encoder.cpp       |   67
>> ++++++++++++++++++++++++++-
>>  backend/src/backend/gen_encoder.hpp       |    8 ++++
>>  backend/src/backend/program.cpp           |    8 ++++
>>  backend/src/backend/program.h             |    4 ++
>>  backend/src/backend/program.hpp           |    3 ++
>>  src/cl_command_queue_gen7.c               |    9 ++++
>>  src/cl_driver.h                           |    4 ++
>>  src/cl_driver_defs.c                      |    1 +
>>  src/intel/intel_gpgpu.c                   |   53 ++++++++++++++++-----
>>  15 files changed, 303 insertions(+), 20 deletions(-)
>>
>> diff --git a/backend/src/backend/context.cpp
>> b/backend/src/backend/context.cpp
>> index 48160de..5484869 100644
>> --- a/backend/src/backend/context.cpp
>> +++ b/backend/src/backend/context.cpp
>> @@ -268,6 +268,15 @@ namespace gbe
>>      }
>>    }
>>
>> +  static int
>> +  alignScratchSize(int size){
>> +    int i = 0;
>> +
>> +    for(; i < size; i+=1024)
>> +      ;
>> +
>> +    return i;
>> +  }
>>
>  How about just returning size * 1024 here? It would also be better as a
> macro or an inline function.
>
>>
>>  ///////////////////////////////////////////////////////////////////////////
>>    // Generic Context (shared by the simulator and the HW context)
>>
>>  ///////////////////////////////////////////////////////////////////////////
>> @@ -284,6 +293,7 @@ namespace gbe
>>        this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
>>      else
>>        this->simdWidth = fn.getSimdWidth();
>> +    this->scratchOffset = 0;
>>    }
>>
>>    Context::~Context(void) {
>> @@ -306,6 +316,8 @@ namespace gbe
>>        this->kernel = NULL;
>>      }
>>      if(this->kernel != NULL)
>> +      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
>> +    if(this->kernel != NULL)
>
>        this->kernel->ctx = this;
>>
>   How about just writing it this way:
>          if (this->kernel != NULL) {
>            this->kernel->scratchSize =
> alignScratchSize(this->scratchOffset);
>            this->kernel->ctx = this;
>          }
>
>      return this->kernel;
>>    }
>> @@ -337,6 +349,12 @@ namespace gbe
>>      return offset + GEN_REG_SIZE;
>>    }
>>
>> +  uint32_t Context::allocateScratchMem(uint32_t size) {
>> +    uint32_t offset = scratchOffset;
>> +    scratchOffset += size;
>> +    return offset;
>> +  }
>> +
>>    void Context::buildStack(void) {
>>      const auto &stackUse = dag->getUse(ir::ocl::stackptr);
>>      if (stackUse.size() == 0)  // no stack is used if stackptr is unused
>> diff --git a/backend/src/backend/context.hpp
>> b/backend/src/backend/context.hpp
>> index c205388..50c0e70 100644
>> --- a/backend/src/backend/context.hpp
>> +++ b/backend/src/backend/context.hpp
>> @@ -91,6 +91,8 @@ namespace gbe
>>      /* allocate a new entry for a specific image's information */
>>      /*! Get (search or allocate if fail to find one) image info
>> curbeOffset.*/
>>      uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
>> +    /*! allocate size scratch memory and return start address */
>> +    uint32_t allocateScratchMem(uint32_t size);
>>    protected:
>>      /*! Build the instruction stream. Return false if failed */
>>      virtual bool emitCode(void) = 0;
>> @@ -126,6 +128,7 @@ namespace gbe
>>      set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
>>      JIPMap JIPs;                          //!< Where to jump all
>> labels/branches
>>      uint32_t simdWidth;                   //!< Number of lanes per HW
>> threads
>> +    uint32_t scratchOffset;               //!< scratch slot for next
>> scratch memory request
>>      GBE_CLASS(Context);                   //!< Use custom allocators
>>    };
>>
>> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
>> b/backend/src/backend/gen/gen_mesa_disasm.c
>> index ca8ca37..dd4f485 100644
>> --- a/backend/src/backend/gen/gen_mesa_disasm.c
>> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
>> @@ -373,6 +373,28 @@ static const char *data_port_data_cache_category[] =
>> {
>>    "scratch",
>>  };
>>
>> +static const char *data_port_scratch_block_size[] = {
>> +  "1 HWord",
>> +  "2 HWord",
>> +  "Reserve",
>> +  "4 HWord",
>> +};
>>
>
>   Using "n HWord" is a little confusing. They are just n SIMD8 registers
> that have HWord alignment
>   in scratch memory, and the register data type is specified by the
> channel data type.
>
> +
>> +static const char *data_port_scratch_invalidate[] = {
>> +  "no invalidate",
>> +  "invalidate cache line",
>> +};
>> +
>> +static const char *data_port_scratch_data_type[] = {
>> +  "Oword",
>> +  "Dword",
>> +};
>> +
>> +static const char *data_port_scratch_msg_type[] = {
>> +  "Scratch Read",
>> +  "Scratch Write",
>> +};
>> +
>>  static const char *data_port_data_cache_msg_type[] = {
>>    [0] = "OWord Block Read",
>>    [1] = "Unaligned OWord Block Read",
>> @@ -1155,12 +1177,21 @@ int gen_disasm (FILE *file, const void
>> *opaque_insn)
>>                  inst->bits3.sampler_gen7.simd_mode);
>>          break;
>>        case GEN_SFID_DATAPORT_DATA_CACHE:
>> -        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
>> -                inst->bits3.gen7_untyped_rw.bti,
>> -                inst->bits3.gen7_untyped_rw.rgba,
>> -
>>  data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
>> -
>>  data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
>> -
>>  data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
>> +        if(inst->bits3.gen7_untyped_rw.category == 0) {
>> +          format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
>> +                  inst->bits3.gen7_untyped_rw.bti,
>> +                  inst->bits3.gen7_untyped_rw.rgba,
>> +
>>  data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
>> +
>>  data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
>> +
>>  data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
>> +        } else {
>> +          format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
>> +                  inst->bits3.gen7_scratch_rw.offset,
>> +
>>  data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
>> +
>>  data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
>> +
>>  data_port_scratch_data_type[inst->bits3.gen7_scratch_rw.data_type],
>> +
>>  data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
>> +        }
>>          break;
>>        case GEN_SFID_MESSAGE_GATEWAY:
>>          format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
>> diff --git a/backend/src/backend/gen_context.cpp
>> b/backend/src/backend/gen_context.cpp
>> index 97a9527..e0e4a87 100644
>> --- a/backend/src/backend/gen_context.cpp
>> +++ b/backend/src/backend/gen_context.cpp
>> @@ -562,6 +562,77 @@ namespace gbe
>>      p->pop();
>>    }
>>
>> +  void GenContext::scratchWriteOWord(const GenRegister header, uint32_t
>> offset, uint32_t reg_num, uint32_t reg_type) {
>> +    p->push();
>> +    uint32_t simdWidth = p->curr.execWidth;
>> +    const uint32_t nr = header.nr;
>> +    p->curr.predicate = GEN_PREDICATE_NONE;
>> +    p->curr.noMask = 1;
>> +
>> +    p->curr.execWidth = 8;
>> +    p->MOV(header, GenRegister::ud8grf(0,0));
>> +
>> +    p->curr.execWidth = 1;
>> +    p->MOV(GenRegister::ud1grf(nr, 2), GenRegister::immud(offset/16));
>> +    p->pop();
>> +
>> +    int size = typeSize(reg_type)*simdWidth;
>> +    p->push();
>> +    p->SCRATCH_WRITE_OWORD(header, size, reg_num);
>> +    p->pop();
>> +  }
>> +
>> +  void GenContext::scratchReadOWord(const GenRegister dst, const
>> GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type) {
>> +    p->push();
>> +    uint32_t simdWidth = p->curr.execWidth;
>> +    const uint32_t nr = header.nr;
>> +    p->curr.predicate = GEN_PREDICATE_NONE;
>> +    p->curr.noMask = 1;
>> +
>> +    p->curr.execWidth = 8;
>> +    p->MOV(header, GenRegister::ud8grf(0,0));
>> +
>> +    p->curr.execWidth = 1;
>> +    p->MOV(GenRegister::ud1grf(nr, 2), GenRegister::immud(offset/16));
>> +    p->pop();
>> +
>> +    int size = typeSize(reg_type)*simdWidth;
>> +    p->push();
>> +    p->SCRATCH_READ_OWORD(dst, header, size, reg_num);
>> +    p->pop();
>> +  }
>> +
>> +  void GenContext::scratchWriteHWord(const GenRegister header, uint32_t
>> offset, uint32_t reg_num, uint32_t reg_type) {
>>
>  I think you should use scratchWriteDWord here, right?
>
>> +    p->push();
>> +    uint32_t simdWidth = p->curr.execWidth;
>> +    p->curr.predicate = GEN_PREDICATE_NONE;
>> +    p->curr.noMask = 1;
>> +
>> +    p->curr.execWidth = 8;
>> +    p->MOV(header, GenRegister::ud8grf(0,0));
>> +    p->pop();
>> +
>> +    int size = typeSize(reg_type)*simdWidth;
>> +    p->push();
>> +    p->SCRATCH_WRITE_HWORD(header, offset/32, size, reg_num);
>> +    p->pop();
>> +  }
>> +
>> +  void GenContext::scratchReadHWord(const GenRegister dst, const
>> GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type) {
>> +    p->push();
>> +    uint32_t simdWidth = p->curr.execWidth;
>> +    p->curr.predicate = GEN_PREDICATE_NONE;
>> +    p->curr.noMask = 1;
>> +    p->curr.execWidth = 8;
>> +    p->MOV(header, GenRegister::ud8grf(0,0));
>> +    p->pop();
>> +
>> +    int size = typeSize(reg_type)*simdWidth;
>> +    p->push();
>> +    p->SCRATCH_READ_HWORD(dst, header, offset/32, size, reg_num);
>> +    p->pop();
>> +  }
>> +
>
>    void GenContext::emitTypedWriteInstruction(const SelectionInstruction
>> &insn) {
>>      const GenRegister header =
>> GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
>>      const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
>> diff --git a/backend/src/backend/gen_context.hpp
>> b/backend/src/backend/gen_context.hpp
>> index dc5dc45..3b78342 100644
>> --- a/backend/src/backend/gen_context.hpp
>> +++ b/backend/src/backend/gen_context.hpp
>> @@ -41,6 +41,7 @@ namespace gbe
>>    class Selection;            // Performs instruction selection
>>    class SelectionInstruction; // Pre-RA Gen instruction
>>    class SelectionReg;         // Pre-RA Gen register
>> +  class GenRegister;
>>
>>    /*! Context is the helper structure to build the Gen ISA or simulation
>> code
>>     *  from GenIR
>> @@ -108,7 +109,10 @@ namespace gbe
>>      void emitSampleInstruction(const SelectionInstruction &insn);
>>      void emitTypedWriteInstruction(const SelectionInstruction &insn);
>>      void emitGetImageInfoInstruction(const SelectionInstruction &insn);
>> -
>> +    void scratchWriteOWord(const GenRegister header, uint32_t offset,
>> uint32_t reg_num, uint32_t reg_type);
>> +    void scratchReadOWord(const GenRegister dst, const GenRegister
>> header, uint32_t offset, uint32_t reg_num, uint32_t reg_type);
>> +    void scratchWriteHWord(const GenRegister header, uint32_t offset,
>> uint32_t reg_num, uint32_t reg_type);
>> +    void scratchReadHWord(const GenRegister dst, const GenRegister
>> header, uint32_t offset, uint32_t reg_num, uint32_t reg_type);
>>      /*! Implements base class */
>>      virtual Kernel *allocateKernel(void);
>>      /*! Store the position of each label instruction in the Gen ISA
>> stream */
>> diff --git a/backend/src/backend/gen_defs.hpp
>> b/backend/src/backend/gen_defs.hpp
>> index 5b15e30..00e61c9 100644
>> --- a/backend/src/backend/gen_defs.hpp
>> +++ b/backend/src/backend/gen_defs.hpp
>> @@ -319,6 +319,15 @@ enum GenMessageTarget {
>>  #define GEN_BYTE_SCATTER          12//1100: Byte Scattered Write
>>  #define GEN_UNTYPED_WRITE         13//1101: Untyped Surface Write
>>
>> +/* Data port data cache scratch messages*/
>> +#define GEN_SCRATCH_READ          0
>> +#define GEN_SCRATCH_WRITE         1
>> +#define GEN_SCRATCH_DATA_OWORD    0
>> +#define GEN_SCRATCH_DATA_DWORD    1
>> +#define GEN_SCRATCH_BLOCK_SIZE_1  0
>> +#define GEN_SCRATCH_BLOCK_SIZE_2  1
>> +#define GEN_SCRATCH_BLOCK_SIZE_4  3
>> +
>>  /* Data port render cache Message Type*/
>>  #define GEN_MBLOCK_READ           4  //0100: Media Block Read
>>  #define GEN_TYPED_READ            5  //0101: Typed Surface Read
>> @@ -765,6 +774,22 @@ struct GenInstruction
>>        uint32_t end_of_thread:1;
>>      } gen7_byte_rw;
>>
>> +    /*! Data port Scratch Read/ write */
>> +    struct {
>> +      uint32_t offset:12;
>> +      uint32_t block_size:2;
>> +      uint32_t ignored0:1;
>> +      uint32_t invalidate_after_read:1;
>> +      uint32_t data_type:1;
>> +      uint32_t msg_type:1;
>> +      uint32_t category:1;
>> +      uint32_t header_present:1;
>> +      uint32_t response_length:5;
>> +      uint32_t msg_length:4;
>> +      uint32_t pad2:2;
>> +      uint32_t end_of_thread:1;
>> +    } gen7_scratch_rw;
>> +
>>      /*! Data port OBlock read / write */
>>      struct {
>>        uint32_t bti:8;
>> diff --git a/backend/src/backend/gen_encoder.cpp
>> b/backend/src/backend/gen_encoder.cpp
>> index 64b5bd1..afb193f 100644
>> --- a/backend/src/backend/gen_encoder.cpp
>> +++ b/backend/src/backend/gen_encoder.cpp
>> @@ -147,7 +147,7 @@ namespace gbe
>>      else
>>        NOT_SUPPORTED;
>>    }
>> -#if 0
>> +#if 1
>>    static void setOBlockRW(GenEncoder *p,
>>                            GenInstruction *insn,
>>                            uint32_t bti,
>> @@ -1136,6 +1136,71 @@ namespace gbe
>>       this->setSrc0(insn, msg);
>>       setTypedWriteMessage(this, insn, bti, msg_type, msg_length,
>> header_present);
>>    }
>>
>  I don't think you can reuse the setOBlockRW for the scratch buffer.
>
>
>> +  static void setScratchMessage(GenEncoder *p,
>> +                                   GenInstruction *insn,
>> +                                   uint32_t offset,
>> +                                   uint32_t block_size,
>> +                                   uint32_t data_type,
>> +                                   uint32_t msg_type,
>> +                                   uint32_t msg_length,
>> +                                   uint32_t response_length)
>> +  {
>> +     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
>> +     setMessageDescriptor(p, insn, sfid, msg_length, response_length,
>> true);
>> +     insn->bits3.gen7_scratch_rw.block_size = block_size;
>> +     insn->bits3.gen7_scratch_rw.msg_type = msg_type;
>> +     insn->bits3.gen7_scratch_rw.data_type = data_type;
>> +     insn->bits3.gen7_scratch_rw.offset = offset;
>> +     insn->bits3.gen7_scratch_rw.category = 1;
>> +  }
>> +
>> +  void GenEncoder::SCRATCH_WRITE_HWORD(GenRegister msg, uint32_t offset,
>> uint32_t size, uint32_t src_num)
>> +  {
>> +     assert(src_num == 1 || src_num ==2);
>> +     uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 :
>> GEN_SCRATCH_BLOCK_SIZE_2;
>> +     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>> +     this->setHeader(insn);
>> +     this->setDst(insn, GenRegister::retype(GenRegister::null(),
>> GEN_TYPE_UD));
>> +     this->setSrc0(insn, msg);
>> +     this->setSrc1(insn, GenRegister::immud(0));
>> +     // here src_num means register that will be write out: in terms of
>> 32byte register number
>> +     setScratchMessage(this, insn, offset, block_size,
>> GEN_SCRATCH_DATA_DWORD, GEN_SCRATCH_WRITE, src_num+1, 0);
>> +  }
>> +
>> +  void GenEncoder::SCRATCH_READ_HWORD(GenRegister dst, GenRegister src,
>> uint32_t offset, uint32_t size, uint32_t dst_num)
>> +  {
>> +     assert(dst_num == 1 || dst_num ==2);
>> +     uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 :
>> GEN_SCRATCH_BLOCK_SIZE_2;
>> +     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>> +     this->setHeader(insn);
>> +     this->setDst(insn, dst);
>> +     this->setSrc0(insn, src);
>> +     this->setSrc1(insn, GenRegister::immud(0));
>> +      // here dst_num is the register that will be write-back: in terms
>> of 32byte register
>> +     setScratchMessage(this, insn, offset, block_size,
>> GEN_SCRATCH_DATA_DWORD, GEN_SCRATCH_READ, 1, dst_num);
>> +  }
>> +
>> +  void GenEncoder::SCRATCH_WRITE_OWORD(GenRegister msg, uint32_t size,
>> uint32_t src_num)
>> +  {
>> +     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>> +     this->setHeader(insn);
>> +     this->setDst(insn, GenRegister::retype(GenRegister::null(),
>> GEN_TYPE_UD));
>> +     this->setSrc0(insn, msg);
>> +     this->setSrc1(insn, GenRegister::immud(0));
>> +     // here src_num means registers that will be write out: in terms of
>> 32byte register number
>> +     setOBlockRW(this, insn, 255, size/16, GEN_OBLOCK_WRITE, src_num+1,
>> 0);
>> +  }
>>
>  I'm confused here: you prepared a message header for a Scratch OWord
>  read/write, but then you send an OWord Block Read
>  message. I don't think it's going to work as desired. If you check "OWord
> Block Read/Write" in the spec, you will find
>  a restriction there:
>    the only surface type allowed is SURFTYPE_BUFFER.
>  And here, the surface is a scratch buffer.
>
>> +
>> +  void GenEncoder::SCRATCH_READ_OWORD(GenRegister dst, GenRegister src,
>> uint32_t size, uint32_t dst_num)
>> +  {
>> +     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>> +     this->setHeader(insn);
>> +     this->setDst(insn, dst);
>> +     this->setSrc0(insn, src);
>> +     this->setSrc1(insn, GenRegister::immud(0));
>> +      // here dst_num is the register that will be write-back: in terms
>> of 32byte register
>> +     setOBlockRW(this, insn, 255, size/16, GEN_OBLOCK_READ, 1, dst_num);
>> +  }
>>
>>    void GenEncoder::EOT(uint32_t msg) {
>>      GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>> diff --git a/backend/src/backend/gen_encoder.hpp
>> b/backend/src/backend/gen_encoder.hpp
>> index 083bd8c..66c4f25 100644
>> --- a/backend/src/backend/gen_encoder.hpp
>> +++ b/backend/src/backend/gen_encoder.hpp
>> @@ -155,6 +155,14 @@ namespace gbe
>>      void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti,
>> uint32_t elemSize);
>>      /*! Byte scatter (for unaligned bytes, shorts and ints) */
>>      void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
>> +    /*! for scratch memory oblock read */
>> +    void SCRATCH_READ_OWORD(GenRegister msg, GenRegister dst, uint32_t
>> size, uint32_t dst_num);
>> +    /*! for scratch memory oblock write */
>> +    void SCRATCH_WRITE_OWORD(GenRegister msg, uint32_t size, uint32_t
>> src_num);
>> +    /*! for scratch memory hblock read */
>> +    void SCRATCH_READ_HWORD(GenRegister msg, GenRegister dst, uint32_t
>> offset, uint32_t size, uint32_t dst_num);
>> +    /*! for scratch memory hblock write */
>> +    void SCRATCH_WRITE_HWORD(GenRegister msg, uint32_t offset, uint32_t
>> size, uint32_t src_num);
>>      /*! Send instruction for the sampler */
>>      void SAMPLE(GenRegister dest,
>>                  GenRegister msg,
>> diff --git a/backend/src/backend/program.cpp
>> b/backend/src/backend/program.cpp
>> index 26c22f3..35d3a7c 100644
>> --- a/backend/src/backend/program.cpp
>> +++ b/backend/src/backend/program.cpp
>> @@ -374,6 +374,12 @@ namespace gbe {
>>      return kernel->getStackSize();
>>    }
>>
>> +  static int32_t kernelGetScratchSize(gbe_kernel genKernel) {
>> +    if (genKernel == NULL) return 0;
>> +    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
>> +    return kernel->getScratchSize();
>> +  }
>> +
>>    static int32_t kernelUseSLM(gbe_kernel genKernel) {
>>      if (genKernel == NULL) return 0;
>>      const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
>> @@ -443,6 +449,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb
>> *gbe_kernel_get_simd_width = NULL
>>  GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb
>> *gbe_kernel_get_curbe_offset = NULL;
>>  GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb
>> *gbe_kernel_get_curbe_size = NULL;
>>  GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb
>> *gbe_kernel_get_stack_size = NULL;
>> +GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb
>> *gbe_kernel_get_scratch_size = NULL;
>>  GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb
>> *gbe_kernel_set_const_buffer_size = NULL;
>>  GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb
>> *gbe_kernel_get_required_work_group_size = NULL;
>>  GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
>> @@ -476,6 +483,7 @@ namespace gbe
>>        gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
>>        gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
>>        gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
>> +      gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
>>        gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
>>        gbe_kernel_get_required_work_group_size =
>> gbe::kernelGetRequiredWorkGroupSize;
>>        gbe_kernel_use_slm = gbe::kernelUseSLM;
>> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
>> index f36bfbf..d20e7af 100644
>> --- a/backend/src/backend/program.h
>> +++ b/backend/src/backend/program.h
>> @@ -198,6 +198,10 @@ extern gbe_kernel_get_curbe_size_cb
>> *gbe_kernel_get_curbe_size;
>>  typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
>>  extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
>>
>> +/*! Get the scratch size (zero if no scratch is required) */
>> +typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
>> +extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
>> +
>>  /*! Get the curbe offset where to put the data. Returns -1 if not
>> required */
>>  typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum
>> gbe_curbe_type type, uint32_t sub_type);
>>  extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
>> diff --git a/backend/src/backend/program.hpp
>> b/backend/src/backend/program.hpp
>> index 2d67310..83aaab8 100644
>> --- a/backend/src/backend/program.hpp
>> +++ b/backend/src/backend/program.hpp
>> @@ -96,6 +96,8 @@ namespace gbe {
>>      INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
>>      /*! Return the size of the stack (zero if none) */
>>      INLINE uint32_t getStackSize(void) const { return this->stackSize; }
>> +    /*! Return the size of the scratch memory needed (zero if none) */
>> +    INLINE uint32_t getScratchSize(void) const { return
>> this->scratchSize; }
>>      /*! Get the SIMD width for the kernel */
>>      INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
>>      /*! Says if SLM is needed for it */
>> @@ -135,6 +137,7 @@ namespace gbe {
>>      uint32_t curbeSize;        //!< Size of the data to push
>>      uint32_t simdWidth;        //!< SIMD size for the kernel (lane
>> number)
>>      uint32_t stackSize;        //!< Stack size (may be 0 if unused)
>> +    uint32_t scratchSize;      //!< Scratch memory size (may be 0 if
>> unused)
>>      bool useSLM;               //!< SLM requires a special HW config
>>      Context *ctx;              //!< Save context after compiler to alloc
>> constant buffer curbe
>>      ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
>> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
>> index 8933213..e58433f 100644
>> --- a/src/cl_command_queue_gen7.c
>> +++ b/src/cl_command_queue_gen7.c
>> @@ -183,6 +183,14 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
>>    cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
>>  }
>>
>> +static void
>> +cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
>> +{
>> +  int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
>> +
>> +  cl_gpgpu_set_scratch(gpgpu, scratch_sz);
>> +}
>> +
>>  LOCAL cl_int
>>  cl_command_queue_ND_range_gen7(cl_command_queue queue,
>>                                 cl_kernel ker,
>> @@ -231,6 +239,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>>    /* Bind all samplers */
>>    cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
>>
>> +  cl_setup_scratch(gpgpu, ker);
>>    /* Bind a stack if needed */
>>    cl_bind_stack(gpgpu, ker);
>>    cl_gpgpu_states_setup(gpgpu, &kernel);
>> diff --git a/src/cl_driver.h b/src/cl_driver.h
>> index 212beb3..673985d 100644
>> --- a/src/cl_driver.h
>> +++ b/src/cl_driver.h
>> @@ -135,6 +135,10 @@ extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
>>  typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t
>> size, uint32_t cchint);
>>  extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
>>
>> +/* Setup scratch */
>> +typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t
>> per_thread_size);
>> +extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
>> +
>>  /* Configure internal state */
>>  typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads,
>> uint32_t size_cs_entry);
>>  extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
>> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
>> index 4952288..9aa926e 100644
>> --- a/src/cl_driver_defs.c
>> +++ b/src/cl_driver_defs.c
>> @@ -50,6 +50,7 @@ LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
>>  LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
>>  LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
>>  LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
>> +LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
>>  LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
>>  LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
>>  LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
>> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
>> index 2791fbe..b7434c4 100644
>> --- a/src/intel/intel_gpgpu.c
>> +++ b/src/intel/intel_gpgpu.c
>> @@ -89,7 +89,9 @@ struct intel_gpgpu
>>    struct { drm_intel_bo *bo; } curbe_b;
>>    struct { drm_intel_bo *bo; } sampler_state_b;
>>    struct { drm_intel_bo *bo; } perf_b;
>> +  struct { drm_intel_bo *bo; } scratch_b;
>>
>> +  uint32_t per_thread_scratch;
>>    struct {
>>      uint32_t num_cs_entries;
>>      uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
>> @@ -127,6 +129,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
>>      drm_intel_bo_unreference(gpgpu->perf_b.bo);
>>    if (gpgpu->stack_b.bo)
>>      drm_intel_bo_unreference(gpgpu->stack_b.bo);
>> +  if (gpgpu->scratch_b.bo)
>> +    drm_intel_bo_unreference(gpgpu->scratch_b.bo);
>> +
>>    intel_batchbuffer_delete(gpgpu->batch);
>>    cl_free(gpgpu);
>>  }
>> @@ -199,18 +204,23 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
>>    BEGIN_BATCH(gpgpu->batch, 8);
>>    OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
>>
>> -  gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
>> -    intel_batchbuffer_alloc_space(gpgpu->batch,0);
>> -
>> -  memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
>> -  vfe->vfe1.gpgpu_mode = 1;
>> -  vfe->vfe1.bypass_gateway_ctl = 1;
>> -  vfe->vfe1.reset_gateway_timer = 1;
>> -  vfe->vfe1.max_threads = gpgpu->max_threads - 1;
>> -  vfe->vfe1.urb_entries = 64;
>> -  vfe->vfe3.curbe_size = 480;
>> -  vfe->vfe4.scoreboard_mask = 0;
>> -  intel_batchbuffer_alloc_space(gpgpu->batch,
>> sizeof(gen6_vfe_state_inline_t));
>> +  if(gpgpu->per_thread_scratch > 0) {
>> +    OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
>> +              I915_GEM_DOMAIN_RENDER,
>> +              I915_GEM_DOMAIN_RENDER,
>> +              gpgpu->per_thread_scratch/1024 - 1);
>> +  }
>> +  else {
>> +    OUT_BATCH(gpgpu->batch, 0);
>> +  }
>> +  /* max_thread | urb entries | (reset_gateway|bypass_gate_way |
>> gpgpu_mode) */
>> +  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 <<
>> 8) | 0xc4);
>> +  OUT_BATCH(gpgpu->batch, 0);
>> +  /* curbe_size */
>> +  OUT_BATCH(gpgpu->batch, 480);
>> +  OUT_BATCH(gpgpu->batch, 0);
>> +  OUT_BATCH(gpgpu->batch, 0);
>> +  OUT_BATCH(gpgpu->batch, 0);
>>    ADVANCE_BATCH(gpgpu->batch);
>>  }
>>
>> @@ -434,6 +444,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
>>    if (gpgpu->stack_b.bo)
>>      dri_bo_unreference(gpgpu->stack_b.bo);
>>    gpgpu->stack_b.bo = NULL;
>> +
>>  }
>>
>  Don't need to add the above blank line.
>
>>
>>  static void
>> @@ -537,6 +548,23 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu,
>> drm_intel_bo *buf, uint32_t offset, u
>>  }
>>
>>  static void
>> +intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
>> +{
>> +  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
>> +  drm_intel_bo* old = gpgpu->scratch_b.bo;
>> +  uint32_t total = per_thread_size * gpgpu->max_threads;
>> +
>> +  gpgpu->per_thread_scratch = per_thread_size;
>> +
>> +  if(old && old->size < total) {
>> +    drm_intel_bo_unreference(old);
>> +    old = NULL;
>> +  }
>> +
>> +  if(!old)
>> +    gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO",
>> total, 4096);
>> +}
>> +static void
>>  intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t
>> size, uint32_t cchint)
>>  {
>>    drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
>> @@ -823,5 +851,6 @@ intel_set_gpgpu_callbacks(void)
>>    cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
>>    cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
>>    cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *)
>> intel_gpgpu_bind_sampler;
>> +  cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *)
>> intel_gpgpu_set_scratch;
>>  }
>>
>
>  I'm wondering whether there is a way to test the scratch read/write directly.
>  Maybe it's not so straightforward, as it's only
>  used when spill/unspill occurs.  And if a kernel triggers spill/unspill,
> it means the kernel is not very simple and can't
>  be a proper unit test case for scratch read/write. Any good ideas?
>
>  - Zhigang
>
>>  --
>> 1.7.9.5
>>
>> _______________________________________________
>> Beignet mailing list
>> Beignet at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20130807/d469de85/attachment-0001.html>