<div dir="ltr"> <div class="gmail_extra"> <div class="gmail_quote">On Wed, Aug 7, 2013 at 10:05 AM, Ruiling Song <<a href="mailto:ruiling.song@intel.com" target="_blank">ruiling.song@intel.com</a>> wrote: <blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">There are two modes of scratch RW, OBlock, HBlock. HBlock was documented as scratch block write/read in HW spec. </blockquote><div style> I guess you mean Dword Block here when you are talking about Hblock, right?</div><div style> The only two channel mode supported for scratch read/write are OWord or Dword.</div> <div style> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> While the OBlock was documented as OWord block read/write. I enabled both of them, but only used HBlock RW in later spill patch. Signed-off-by: Ruiling Song <<a href="mailto:ruiling.song@intel.com">ruiling.song@intel.com</a>> --- backend/src/backend/context.cpp | 18 ++++++++ backend/src/backend/context.hpp | 3 ++ backend/src/backend/gen/gen_mesa_disasm.c | 43 ++++++++++++++--- backend/src/backend/gen_context.cpp | 71 +++++++++++++++++++++++++++++ backend/src/backend/gen_context.hpp | 6 ++- backend/src/backend/gen_defs.hpp | 25 ++++++++++ backend/src/backend/gen_encoder.cpp | 67 ++++++++++++++++++++++++++- backend/src/backend/gen_encoder.hpp | 8 ++++ backend/src/backend/program.cpp | 8 ++++ backend/src/backend/program.h | 4 ++ backend/src/backend/program.hpp | 3 ++ src/cl_command_queue_gen7.c | 9 ++++ src/cl_driver.h | 4 ++ src/cl_driver_defs.c | 1 + src/intel/intel_gpgpu.c | 53 ++++++++++++++++----- 15 files changed, 303 insertions(+), 20 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 48160de..5484869 100644 --- a/backend/src/backend/context.cpp +++ 
b/backend/src/backend/context.cpp @@ -268,6 +268,15 @@ namespace gbe } } + static int + alignScratchSize(int size){ + int i = 0; + + for(; i < size; i+=1024) + ; + + return i; + } </blockquote><div style> How about just return size * 1024 here, and it'd better to be a macro or inline function? </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> /////////////////////////////////////////////////////////////////////////// // Generic Context (shared by the simulator and the HW context) /////////////////////////////////////////////////////////////////////////// @@ -284,6 +293,7 @@ namespace gbe this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH); else this->simdWidth = fn.getSimdWidth(); + this->scratchOffset = 0; } Context::~Context(void) { @@ -306,6 +316,8 @@ namespace gbe this->kernel = NULL; } if(this->kernel != NULL) + this->kernel->scratchSize = alignScratchSize(this->scratchOffset); + if(this->kernel != NULL; </blockquote><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> this->kernel->ctx = this; </blockquote><div style> How about just write it this way</div><div style> if (this->kernel != NULL) {</div><div style> this->kernel->scratchSize = alignScratchSize(this->scratchOffset);</div> <div style> this->kernel->ctx = this;</div><div style><div> } </div><div> </div></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> return this->kernel; } @@ -337,6 +349,12 @@ namespace gbe return offset + GEN_REG_SIZE; } + uint32_t Context::allocateScratchMem(uint32_t size) { + uint32_t offset = scratchOffset; + scratchOffset += size; + return offset; + } + void Context::buildStack(void) { const auto &stackUse = 
dag->getUse(ir::ocl::stackptr); if (stackUse.size() == 0) // no stack is used if stackptr is unused diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp index c205388..50c0e70 100644 --- a/backend/src/backend/context.hpp +++ b/backend/src/backend/context.hpp @@ -91,6 +91,8 @@ namespace gbe /* allocate a new entry for a specific image's information */ /*! Get (search or allocate if fail to find one) image info curbeOffset.*/ uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size); + /*! allocate size scratch memory and return start address */ + uint32_t allocateScratchMem(uint32_t size); protected: /*! Build the instruction stream. Return false if failed */ virtual bool emitCode(void) = 0; @@ -126,6 +128,7 @@ namespace gbe set<ir::LabelIndex> usedLabels; //!< Set of all used labels JIPMap JIPs; //!< Where to jump all labels/branches uint32_t simdWidth; //!< Number of lanes per HW threads + uint32_t scratchOffset; //!< scratch slot for next scratch memory request GBE_CLASS(Context); //!< Use custom allocators }; diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index ca8ca37..dd4f485 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -373,6 +373,28 @@ static const char *data_port_data_cache_category[] = { "scratch", }; +static const char *data_port_scratch_block_size[] = { + "1 HWord", + "2 HWord", + "Reserve", + "4 HWord", +}; </blockquote><div> </div><div style> Use "n HWord" is a little confused. 
They are just n simd8 registers and have HWord alignment</div><div style> in scratch memory, and the register data type is specified by the channel data type.</div> <div style> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> + +static const char *data_port_scratch_invalidate[] = { + "no invalidate", + "invalidate cache line", +}; + +static const char *data_port_scratch_data_type[] = { + "Oword", + "Dword", +}; + +static const char *data_port_scratch_msg_type[] = { + "Scratch Read", + "Scratch Write", +}; + static const char *data_port_data_cache_msg_type[] = { [0] = "OWord Block Read", [1] = "Unaligned OWord Block Read", @@ -1155,12 +1177,21 @@ int gen_disasm (FILE *file, const void *opaque_insn) inst->bits3.sampler_gen7.simd_mode); break; case GEN_SFID_DATAPORT_DATA_CACHE: - format (file, " (bti: %d, rgba: %d, %s, %s, %s)", - inst->bits3.gen7_untyped_rw.bti, - inst->bits3.gen7_untyped_rw.rgba, - data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode], - data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category], - data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]); + if(inst->bits3.gen7_untyped_rw.category == 0) { + format (file, " (bti: %d, rgba: %d, %s, %s, %s)", + inst->bits3.gen7_untyped_rw.bti, + inst->bits3.gen7_untyped_rw.rgba, + data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode], + data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category], + data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]); + } else { + format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)", + inst->bits3.gen7_scratch_rw.offset, + data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size], + data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read], + data_port_scratch_data_type[inst->bits3.gen7_scratch_rw.data_type], +
data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]); + } break; case GEN_SFID_MESSAGE_GATEWAY: format (file, " (subfunc: %s, notify: %d, ackreq: %d)", diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 97a9527..e0e4a87 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -562,6 +562,77 @@ namespace gbe p->pop(); } + void GenContext::scratchWriteOWord(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type) { + p->push(); + uint32_t simdWidth = p->curr.execWidth; + const uint32_t nr = <a href="http://header.nr" target="_blank">header.nr</a>; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + + p->curr.execWidth = 8; + p->MOV(header, GenRegister::ud8grf(0,0)); + + p->curr.execWidth = 1; + p->MOV(GenRegister::ud1grf(nr, 2), GenRegister::immud(offset/16)); + p->pop(); + + int size = typeSize(reg_type)*simdWidth; + p->push(); + p->SCRATCH_WRITE_OWORD(header, size, reg_num); + p->pop(); + } + + void GenContext::scratchReadOWord(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type) { + p->push(); + uint32_t simdWidth = p->curr.execWidth; + const uint32_t nr = <a href="http://header.nr" target="_blank">header.nr</a>; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + + p->curr.execWidth = 8; + p->MOV(header, GenRegister::ud8grf(0,0)); + + p->curr.execWidth = 1; + p->MOV(GenRegister::ud1grf(nr, 2), GenRegister::immud(offset/16)); + p->pop(); + + int size = typeSize(reg_type)*simdWidth; + p->push(); + p->SCRATCH_READ_OWORD(dst, header, size, reg_num); + p->pop(); + } + + void GenContext::scratchWriteHWord(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type) { </blockquote><div style> I think you should use scratchWriteDWord here? right? 
</div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> + p->push(); + uint32_t simdWidth = p->curr.execWidth; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + + p->curr.execWidth = 8; + p->MOV(header, GenRegister::ud8grf(0,0)); + p->pop(); + + int size = typeSize(reg_type)*simdWidth; + p->push(); + p->SCRATCH_WRITE_HWORD(header, offset/32, size, reg_num); + p->pop(); + } + + void GenContext::scratchReadHWord(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type) { + p->push(); + uint32_t simdWidth = p->curr.execWidth; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 8; + p->MOV(header, GenRegister::ud8grf(0,0)); + p->pop(); + + int size = typeSize(reg_type)*simdWidth; + p->push(); + p->SCRATCH_READ_HWORD(dst, header, offset/32, size, reg_num); + p->pop(); + } +</blockquote><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) { const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem)); diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index dc5dc45..3b78342 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -41,6 +41,7 @@ namespace gbe class Selection; // Performs instruction selection class SelectionInstruction; // Pre-RA Gen instruction class SelectionReg; // Pre-RA Gen register + class GenRegister; /*! 
Context is the helper structure to build the Gen ISA or simulation code * from GenIR @@ -108,7 +109,10 @@ namespace gbe void emitSampleInstruction(const SelectionInstruction &insn); void emitTypedWriteInstruction(const SelectionInstruction &insn); void emitGetImageInfoInstruction(const SelectionInstruction &insn); - + void scratchWriteOWord(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type); + void scratchReadOWord(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type); + void scratchWriteHWord(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type); + void scratchReadHWord(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type); /*! Implements base class */ virtual Kernel *allocateKernel(void); /*! Store the position of each label instruction in the Gen ISA stream */ diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index 5b15e30..00e61c9 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -319,6 +319,15 @@ enum GenMessageTarget { #define GEN_BYTE_SCATTER 12//1100: Byte Scattered Write #define GEN_UNTYPED_WRITE 13//1101: Untyped Surface Write +/* Data port data cache scratch messages*/ +#define GEN_SCRATCH_READ 0 +#define GEN_SCRATCH_WRITE 1 +#define GEN_SCRATCH_DATA_OWORD 0 +#define GEN_SCRATCH_DATA_DWORD 1 +#define GEN_SCRATCH_BLOCK_SIZE_1 0 +#define GEN_SCRATCH_BLOCK_SIZE_2 1 +#define GEN_SCRATCH_BLOCK_SIZE_4 3 + /* Data port render cache Message Type*/ #define GEN_MBLOCK_READ 4 //0100: Media Block Read #define GEN_TYPED_READ 5 //0101: Typed Surface Read @@ -765,6 +774,22 @@ struct GenInstruction uint32_t end_of_thread:1; } gen7_byte_rw; + /*! 
Data port Scratch Read/ write */ + struct { + uint32_t offset:12; + uint32_t block_size:2; + uint32_t ignored0:1; + uint32_t invalidate_after_read:1; + uint32_t data_type:1; + uint32_t msg_type:1; + uint32_t category:1; + uint32_t header_present:1; + uint32_t response_length:5; + uint32_t msg_length:4; + uint32_t pad2:2; + uint32_t end_of_thread:1; + } gen7_scratch_rw; + /*! Data port OBlock read / write */ struct { uint32_t bti:8; diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 64b5bd1..afb193f 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -147,7 +147,7 @@ namespace gbe else NOT_SUPPORTED; } -#if 0 +#if 1 static void setOBlockRW(GenEncoder *p, GenInstruction *insn, uint32_t bti, @@ -1136,6 +1136,71 @@ namespace gbe this->setSrc0(insn, msg); setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present); } </blockquote><div style> I don't think you can reuse the setOBlockRW for the scratch buffer.</div><div style> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> + static void setScratchMessage(GenEncoder *p, + GenInstruction *insn, + uint32_t offset, + uint32_t block_size, + uint32_t data_type, + uint32_t msg_type, + uint32_t msg_length, + uint32_t response_length) + { + const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE; + setMessageDescriptor(p, insn, sfid, msg_length, response_length, true); + insn->bits3.gen7_scratch_rw.block_size = block_size; + insn->bits3.gen7_scratch_rw.msg_type = msg_type; + insn->bits3.gen7_scratch_rw.data_type = data_type; + insn->bits3.gen7_scratch_rw.offset = offset; + insn->bits3.gen7_scratch_rw.category = 1; + } + + void GenEncoder::SCRATCH_WRITE_HWORD(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num) + { + assert(src_num == 1 || src_num ==2); + uint32_t block_size = src_num == 1 ? 
GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2; + GenInstruction *insn = this->next(GEN_OPCODE_SEND); + this->setHeader(insn); + this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); + this->setSrc0(insn, msg); + this->setSrc1(insn, GenRegister::immud(0)); + // here src_num means register that will be write out: in terms of 32byte register number + setScratchMessage(this, insn, offset, block_size, GEN_SCRATCH_DATA_DWORD, GEN_SCRATCH_WRITE, src_num+1, 0); + } + + void GenEncoder::SCRATCH_READ_HWORD(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num) + { + assert(dst_num == 1 || dst_num ==2); + uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2; + GenInstruction *insn = this->next(GEN_OPCODE_SEND); + this->setHeader(insn); + this->setDst(insn, dst); + this->setSrc0(insn, src); + this->setSrc1(insn, GenRegister::immud(0)); + // here dst_num is the register that will be write-back: in terms of 32byte register + setScratchMessage(this, insn, offset, block_size, GEN_SCRATCH_DATA_DWORD, GEN_SCRATCH_READ, 1, dst_num); + } + + void GenEncoder::SCRATCH_WRITE_OWORD(GenRegister msg, uint32_t size, uint32_t src_num) + { + GenInstruction *insn = this->next(GEN_OPCODE_SEND); + this->setHeader(insn); + this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); + this->setSrc0(insn, msg); + this->setSrc1(insn, GenRegister::immud(0)); + // here src_num means registers that will be write out: in terms of 32byte register number + setOBlockRW(this, insn, 255, size/16, GEN_OBLOCK_WRITE, src_num+1, 0); + } </blockquote><div style> I'm confused here, you prepared a message header for Scratch OWord read/write, but then you send a OWord Block Read</div> message. I don't think it's going to work as desired. 
You can check the "OWord Block Read/Write" in the spec, you can find</div><div class="gmail_quote"> a restriction there:</div><div class="gmail_quote"> the only surface type allowed is SURFTYPE_BUFFER.</div> <div class="gmail_quote"> And here, the surface is a scratch buffer. <blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> + + void GenEncoder::SCRATCH_READ_OWORD(GenRegister dst, GenRegister src, uint32_t size, uint32_t dst_num) + { + GenInstruction *insn = this->next(GEN_OPCODE_SEND); + this->setHeader(insn); + this->setDst(insn, dst); + this->setSrc0(insn, src); + this->setSrc1(insn, GenRegister::immud(0)); + // here dst_num is the register that will be write-back: in terms of 32byte register + setOBlockRW(this, insn, 255, size/16, GEN_OBLOCK_READ, 1, dst_num); + } void GenEncoder::EOT(uint32_t msg) { GenInstruction *insn = this->next(GEN_OPCODE_SEND); diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index 083bd8c..66c4f25 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -155,6 +155,14 @@ namespace gbe void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize); /*! Byte scatter (for unaligned bytes, shorts and ints) */ void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize); + /*! for scratch memory oblock read */ + void SCRATCH_READ_OWORD(GenRegister msg, GenRegister dst, uint32_t size, uint32_t dst_num); + /*! for scratch memory oblock write */ + void SCRATCH_WRITE_OWORD(GenRegister msg, uint32_t size, uint32_t src_num); + /*! for scratch memory hblock read */ + void SCRATCH_READ_HWORD(GenRegister msg, GenRegister dst, uint32_t offset, uint32_t size, uint32_t dst_num); + /*! for scratch memory hblock write */ + void SCRATCH_WRITE_HWORD(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num); /*! 
Send instruction for the sampler */ void SAMPLE(GenRegister dest, GenRegister msg, diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 26c22f3..35d3a7c 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -374,6 +374,12 @@ namespace gbe { return kernel->getStackSize(); } + static int32_t kernelGetScratchSize(gbe_kernel genKernel) { + if (genKernel == NULL) return 0; + const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel; + return kernel->getScratchSize(); + } + static int32_t kernelUseSLM(gbe_kernel genKernel) { if (genKernel == NULL) return 0; const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel; @@ -443,6 +449,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL; +GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL; @@ -476,6 +483,7 @@ namespace gbe gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset; gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize; gbe_kernel_get_stack_size = gbe::kernelGetStackSize; + gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize; gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize; gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize; gbe_kernel_use_slm = gbe::kernelUseSLM; diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index f36bfbf..d20e7af 100644 --- a/backend/src/backend/program.h +++ 
b/backend/src/backend/program.h @@ -198,6 +198,10 @@ extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size; typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel); extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size; +/*! Get the scratch size (zero if no scratch is required) */ +typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel); +extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size; + /*! Get the curbe offset where to put the data. Returns -1 if not required */ typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type); extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset; diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp index 2d67310..83aaab8 100644 --- a/backend/src/backend/program.hpp +++ b/backend/src/backend/program.hpp @@ -96,6 +96,8 @@ namespace gbe { INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; } /*! Return the size of the stack (zero if none) */ INLINE uint32_t getStackSize(void) const { return this->stackSize; } + /*! Return the size of the scratch memory needed (zero if none) */ + INLINE uint32_t getScratchSize(void) const { return this->scratchSize; } /*! Get the SIMD width for the kernel */ INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; } /*! Says if SLM is needed for it */ @@ -135,6 +137,7 @@ namespace gbe { uint32_t curbeSize; //!< Size of the data to push uint32_t simdWidth; //!< SIMD size for the kernel (lane number) uint32_t stackSize; //!< Stack size (may be 0 if unused) + uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused) bool useSLM; //!< SLM requires a special HW config Context *ctx; //!< Save context after compiler to alloc constant buffer curbe ir::SamplerSet *samplerSet;//!< Copy from the corresponding function. 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 8933213..e58433f 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -183,6 +183,14 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3); } +static void +cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker) +{ + int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque); + + cl_gpgpu_set_scratch(gpgpu, scratch_sz); +} + LOCAL cl_int cl_command_queue_ND_range_gen7(cl_command_queue queue, cl_kernel ker, @@ -231,6 +239,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, /* Bind all samplers */ cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz); + cl_setup_scratch(gpgpu, ker); /* Bind a stack if needed */ cl_bind_stack(gpgpu, ker); cl_gpgpu_states_setup(gpgpu, &kernel); diff --git a/src/cl_driver.h b/src/cl_driver.h index 212beb3..673985d 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -135,6 +135,10 @@ extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image; typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint); extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack; +/* Setup scratch */ +typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size); +extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch; + /* Configure internal state */ typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry); extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init; diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index 4952288..9aa926e 100644 --- a/src/cl_driver_defs.c +++ b/src/cl_driver_defs.c @@ -50,6 +50,7 @@ LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL; LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL; LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL; LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL; +LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL; LOCAL cl_gpgpu_bind_image_cb 
*cl_gpgpu_bind_image = NULL; LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL; LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL; diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 2791fbe..b7434c4 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -89,7 +89,9 @@ struct intel_gpgpu struct { drm_intel_bo *bo; } curbe_b; struct { drm_intel_bo *bo; } sampler_state_b; struct { drm_intel_bo *bo; } perf_b; + struct { drm_intel_bo *bo; } scratch_b; + uint32_t per_thread_scratch; struct { uint32_t num_cs_entries; uint32_t size_cs_entry; /* size of one entry in 512bit elements */ @@ -127,6 +129,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu) drm_intel_bo_unreference(gpgpu-><a href="http://perf_b.bo" target="_blank">perf_b.bo</a>); if (gpgpu-><a href="http://stack_b.bo" target="_blank">stack_b.bo</a>) drm_intel_bo_unreference(gpgpu-><a href="http://stack_b.bo" target="_blank">stack_b.bo</a>); + if (gpgpu-><a href="http://scratch_b.bo" target="_blank">scratch_b.bo</a>) + drm_intel_bo_unreference(gpgpu-><a href="http://scratch_b.bo" target="_blank">scratch_b.bo</a>); + intel_batchbuffer_delete(gpgpu->batch); cl_free(gpgpu); } @@ -199,18 +204,23 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu) BEGIN_BATCH(gpgpu->batch, 8); OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2)); - gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*) - intel_batchbuffer_alloc_space(gpgpu->batch,0); - - memset(vfe, 0, sizeof(struct gen6_vfe_state_inline)); - vfe->vfe1.gpgpu_mode = 1; - vfe->vfe1.bypass_gateway_ctl = 1; - vfe->vfe1.reset_gateway_timer = 1; - vfe->vfe1.max_threads = gpgpu->max_threads - 1; - vfe->vfe1.urb_entries = 64; - vfe->vfe3.curbe_size = 480; - vfe->vfe4.scoreboard_mask = 0; - intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_vfe_state_inline_t)); + if(gpgpu->per_thread_scratch > 0) { + OUT_RELOC(gpgpu->batch, gpgpu-><a href="http://scratch_b.bo" target="_blank">scratch_b.bo</a>, + 
I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + gpgpu->per_thread_scratch/1024 - 1); + } + else { + OUT_BATCH(gpgpu->batch, 0); + } + /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */ + OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 << 8) | 0xc4); + OUT_BATCH(gpgpu->batch, 0); + /* curbe_size */ + OUT_BATCH(gpgpu->batch, 480); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, 0); ADVANCE_BATCH(gpgpu->batch); } @@ -434,6 +444,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, if (gpgpu-><a href="http://stack_b.bo" target="_blank">stack_b.bo</a>) dri_bo_unreference(gpgpu-><a href="http://stack_b.bo" target="_blank">stack_b.bo</a>); gpgpu-><a href="http://stack_b.bo" target="_blank">stack_b.bo</a> = NULL; + } </blockquote><div style> Don't need to add the above blank line.</div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> static void @@ -537,6 +548,23 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, u } static void +intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size) +{ + drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; + drm_intel_bo* old = gpgpu-><a href="http://scratch_b.bo" target="_blank">scratch_b.bo</a>; + uint32_t total = per_thread_size * gpgpu->max_threads; + + gpgpu->per_thread_scratch = per_thread_size; + + if(old && old->size < total) { + drm_intel_bo_unreference(old); + old = NULL; + } + + if(!old) + gpgpu-><a href="http://scratch_b.bo" target="_blank">scratch_b.bo</a> = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096); +} +static void intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint) { drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; @@ -823,5 +851,6 @@ intel_set_gpgpu_callbacks(void) cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush; 
cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker; cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler; + cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch; } </blockquote><div> </div><div style> I'm wondering whether there is a way to test the scratch read/write directly. Maybe it's not so straightforward, as it's only</div><div style> used when spill/unspill occurs. And if a kernel triggers spill/unspill, it means the kernel is not very simple and can't</div> <div style> be a proper unit test case for scratch read/write. Any good ideas?</div><div style> </div><div style> - Zhigang</div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"> -- 1.7.9.5 _______________________________________________ Beignet mailing list <a href="mailto:Beignet@lists.freedesktop.org">Beignet@lists.freedesktop.org</a> <a href="http://lists.freedesktop.org/mailman/listinfo/beignet" target="_blank">http://lists.freedesktop.org/mailman/listinfo/beignet</a> </blockquote></div> </div></div>