[Beignet] [Beignet 2/3] Put constant buffer of argument to payload.
Yang Rong
rong.r.yang at intel.com
Tue Apr 16 01:48:17 PDT 2013
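Change constant-buffer loads into indirect-addressing (ia) moves. The kernel needs to be recompiled when the ND range is enqueued.
For now, if there are not enough Gen registers for the constant buffer, the compiler will assert.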
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
backend/src/backend/context.cpp | 70 ++++++++++++++++++++
backend/src/backend/context.hpp | 14 ++++
backend/src/backend/gen_context.cpp | 46 ++++++++++++-
backend/src/backend/gen_context.hpp | 8 ++-
backend/src/backend/gen_encoder.cpp | 10 ++-
backend/src/backend/gen_encoder.hpp | 4 ++
.../src/backend/gen_insn_gen7_schedule_info.hxx | 1 +
backend/src/backend/gen_insn_selection.cpp | 30 ++++++++-
backend/src/backend/gen_insn_selection.hxx | 2 +
backend/src/backend/gen_program.cpp | 14 ++--
backend/src/backend/gen_program.hpp | 3 +
backend/src/backend/gen_register.hpp | 10 +++
backend/src/backend/program.cpp | 16 +++++
backend/src/backend/program.h | 11 ++-
backend/src/backend/program.hpp | 15 +++++
src/cl_command_queue.c | 19 ++++++
src/cl_command_queue.h | 2 +
src/cl_command_queue_gen7.c | 11 ++-
src/cl_kernel.c | 9 ++-
src/cl_mem.c | 1 +
src/cl_mem.h | 1 +
21 files changed, 282 insertions(+), 15 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 88f704b..0dd433b 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -288,6 +288,7 @@ namespace gbe
Kernel *Context::compileKernel(void) {
this->kernel = this->allocateKernel();
+ this->kernel->bCurbeChanged = false;
this->kernel->simdWidth = this->simdWidth;
this->buildPatchList();
this->buildArgList();
@@ -295,12 +296,49 @@ namespace gbe
this->buildJIPs();
this->buildStack();
this->handleSLM();
+ this->buildConstBufs();
if (this->emitCode() == false) {
GBE_DELETE(this->kernel);
this->kernel = NULL;
}
return this->kernel;
}
+
+ void Context::recompileKernel(void) {
+ //the context's unit has been destroyed after compilation, so unit and fn can't be used during recompile
+ if(this->kernel->bCurbeChanged) {
+ this->reallocCurbe();
+ this->patchCBOffsets();
+ this->kernel->bCurbeChanged = false;
+ }
+ }
+
+ void Context::reallocCurbe(void) {
+ for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
+ if(kernel->args[argID].type != GBE_ARG_CONSTANT_PTR)
+ continue;
+ if(kernel->args[argID].bufSize == kernel->args[argID].allocedSize)
+ continue;
+
+ //free previous
+ int32_t offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
+ if(offset >= 0)
+ deallocate(offset+GEN_REG_SIZE);
+ kernel->args[argID].allocedSize = 0;
+
+ if(kernel->args[argID].bufSize > 0) {
+ newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_CONSTANT_BUFFER+argID, kernel->args[argID].bufSize, 32);
+ kernel->args[argID].allocedSize = kernel->args[argID].bufSize;
+ }
+
+ //add to cbCurbeOffsets
+ std::sort(kernel->patches.begin(), kernel->patches.end());
+ offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
+ GBE_ASSERT(offset>=0);
+ cbCurbeOffsets.insert(std::make_pair(kernel->args[argID].reg, offset+GEN_REG_SIZE));
+ }
+ kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
+ }
int16_t Context::allocate(int16_t size, int16_t alignment) {
return partitioner->allocate(size, alignment);
@@ -420,6 +458,10 @@ namespace gbe
kernel->args = NULL;
for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
const auto &arg = fn.getArg(argID);
+ kernel->args[argID].bufSize = 0;
+ kernel->args[argID].allocedSize = 0;
+ //during recompile the unit has already been destroyed, so record the Register here
+ kernel->args[argID].reg = arg.reg;
switch (arg.type) {
case ir::FunctionArgument::VALUE:
case ir::FunctionArgument::STRUCTURE:
@@ -547,6 +589,34 @@ namespace gbe
const bool useSLM = fn.getUseSLM();
kernel->useSLM = useSLM;
}
+
+ void Context::buildConstBufs(void) {
+ uint32_t argNum = fn.argNum();
+ for (uint32_t argID = 0; argID < argNum; ++argID) {
+ const auto &arg = fn.getArg(argID);
+
+ if(arg.type != ir::FunctionArgument::CONSTANT_POINTER)
+ continue;
+
+ ir::Register srcReg = arg.reg;
+ constBaseProps.insert(std::make_pair(srcReg, srcReg));
+
+ //build the const Propagation registers
+ fn.foreachInstruction([this](const ir::Instruction &insn) {
+ using namespace ir;
+ if(insn.getOpcode() == OP_LOAD) return;
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const ir::Register reg = insn.getSrc(srcID);
+ if((constBaseProps.contains(reg) != false)) {
+ const ir::Register dstReg = insn.getDst();
+ constBaseProps.insert(std::make_pair(dstReg, constBaseProps.find(reg)->second));
+ break;
+ }
+ }
+ });
+ }
+ }
bool Context::isScalarReg(const ir::Register &reg) const {
GBE_ASSERT(fn.getProfile() == ir::Profile::PROFILE_OCL);
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 55a63a7..c897769 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -57,6 +57,8 @@ namespace gbe
virtual ~Context(void);
/*! Compile the code */
Kernel *compileKernel(void);
+ void recompileKernel(void);
+ void reallocCurbe(void);
/*! Tells if the labels is used */
INLINE bool isLabelUsed(ir::LabelIndex index) const {
return usedLabels.contains(index);
@@ -82,6 +84,10 @@ namespace gbe
INLINE bool hasJIP(const ir::Instruction *insn) const {
return JIPs.find(insn) != JIPs.end();
}
+ INLINE ir::Register getConstBaseReg(const ir::Register reg) const {
+ GBE_ASSERT(constBaseProps.find(reg) != constBaseProps.end());
+ return constBaseProps.find(reg)->second;
+ }
/*! Allocate some memory in the register file */
int16_t allocate(int16_t size, int16_t alignment);
/*! Deallocate previously allocated memory */
@@ -89,6 +95,8 @@ namespace gbe
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
+ /*! Patch the curbe gen register offset */
+ virtual void patchCBOffsets(void) = 0;
/*! Allocate a new empty kernel (to be implemented) */
virtual Kernel *allocateKernel(void) = 0;
/*! Look if a stack is needed and allocate it */
@@ -105,12 +113,16 @@ namespace gbe
void buildJIPs(void);
/*! Configure SLM use if needed */
void handleSLM(void);
+ /* Build constant buffer map */
+ void buildConstBufs(void);
/*! Insert a new entry with the given size in the Curbe. Return the offset
* of the entry
*/
void newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
/*! Provide for each branch and label the label index target */
typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
+ typedef map<const ir::Register, ir::Register> ConstBasePropMap;
+ typedef map<const ir::Register, uint32_t> CBRegOffsetMap;
const ir::Unit &unit; //!< Unit that contains the kernel
const ir::Function &fn; //!< Function to compile
std::string name; //!< Name of the kernel to compile
@@ -120,6 +132,8 @@ namespace gbe
RegisterFilePartitioner *partitioner; //!< Handle register file partionning
set<ir::LabelIndex> usedLabels; //!< Set of all used labels
JIPMap JIPs; //!< Where to jump all labels/branches
+ ConstBasePropMap constBaseProps; //!< const buffer reg propagation regs
+ CBRegOffsetMap cbCurbeOffsets; //!< const buffer base ir reg --> curbe reg offset Map
uint32_t simdWidth; //!< Number of lanes per HW threads
GBE_CLASS(Context); //!< Use custom allocators
};
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index b4c9a65..1f51854 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -83,7 +83,20 @@ namespace gbe
p->patchJMPI(insnID, (targetID-insnID-1) * 2);
}
}
-
+
+ void GenContext::patchCBOffsets(void) {
+ using namespace ir;
+ for (auto pair : cbMovePos) {
+ const Register reg = pair.first;
+ const int32_t insnID = pair.second;
+ const int32_t offset = cbCurbeOffsets.find(reg)->second;
+ p->patchAddrIMM(insnID+1, offset);
+ }
+
+ GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
+ std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
+ }
+
void GenContext::emitStackPointer(void) {
using namespace ir;
@@ -218,7 +231,38 @@ namespace gbe
this->branchPos2.push_back(std::make_pair(label, p->store.size()));
p->JMPI(src);
}
+
+ void GenContext::emitCBMoveInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src0 = ra->genReg(insn.src(0));
+ const GenRegister src1 = ra->genReg(insn.src(1));
+ const GenRegister src2 = ra->genReg(insn.src(2));
+ const GenRegister a0 = GenRegister::addr8(0);
+ p->ADD(src0, src1, GenRegister::negate(src2));
+
+ this->cbMovePos.push_back(std::make_pair(ir::Register(insn.src(2).value.reg), p->store.size()));
+ p->push();
+ p->curr.execWidth = 8;
+ p->MOV(a0, GenRegister::unpacked_uw(src0.nr, 0));
+ //p->ADD(a0, a0, GenRegister::immuw(0));
+ p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+ p->pop();
+
+ if (simdWidth == 16) {
+ const GenRegister dst1 = GenRegister::Qn(dst, 1);
+ const GenRegister src0_1 = GenRegister::Qn(GenRegister::unpacked_uw(src0.nr, 0), 1);
+ this->cbMovePos.push_back(std::make_pair(ir::Register(insn.src(2).value.reg), p->store.size()));
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(a0, src0_1);
+ //p->ADD(a0, a0, GenRegister::immuw(0));
+ p->MOV(dst1, GenRegister::indirect(dst1.type, 0, GEN_WIDTH_8));
+ p->pop();
+ }
+ }
+
void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6af174f..8131b5f 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -55,6 +55,8 @@ namespace gbe
~GenContext(void);
/*! Implements base class */
virtual bool emitCode(void);
+ /*! Patch the curbe gen register offset */
+ virtual void patchCBOffsets(void);
/*! Function we emit code for */
INLINE const ir::Function &getFunction(void) const { return fn; }
/*! Simd width chosen for the current function */
@@ -80,6 +82,7 @@ namespace gbe
void emitTernaryInstruction(const SelectionInstruction &insn);
void emitCompareInstruction(const SelectionInstruction &insn);
void emitJumpInstruction(const SelectionInstruction &insn);
+ void emitCBMoveInstruction(const SelectionInstruction &insn);
void emitEotInstruction(const SelectionInstruction &insn);
void emitNoOpInstruction(const SelectionInstruction &insn);
void emitWaitInstruction(const SelectionInstruction &insn);
@@ -91,13 +94,14 @@ namespace gbe
void emitByteScatterInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
-
/*! Implements base class */
virtual Kernel *allocateKernel(void);
/*! Store the position of each label instruction in the Gen ISA stream */
- map<ir::LabelIndex, uint32_t> labelPos;
+ map<ir::LabelIndex, uint32_t> labelPos;
/*! Store the Gen instructions to patch */
vector<std::pair<ir::LabelIndex, uint32_t>> branchPos2;
+ /*! Store the constant buffer ia mov instructions to patch */
+ vector<std::pair<ir::Register, uint32_t>> cbMovePos;
/*! Encode Gen ISA */
GenEncoder *p;
/*! Instruction selection on Gen ISA (pre-register allocation) */
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index ed7c256..53ae17f 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -685,7 +685,15 @@ namespace gbe
assert(insn.header.opcode == GEN_OPCODE_JMPI);
this->setSrc1(&insn, GenRegister::immd(jumpDistance));
}
-
+
+ void GenEncoder::patchAddrIMM(uint32_t insnID, int32_t offset) {
+ GenInstruction &insn = this->store[insnID];
+ assert(insnID < this->store.size());
+ assert(insn.header.opcode == GEN_OPCODE_MOV);
+ insn.bits2.ia1.src0_indirect_offset = offset;
+ //this->setSrc1(&insn, GenRegister::immuw(offset));
+ }
+
void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1) {
if (needToSplitCmp(this, src0, src1) == false) {
GenInstruction *insn = this->next(GEN_OPCODE_CMP);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 7e26f0a..8d66289 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -65,6 +65,7 @@ namespace gbe
public:
/*! simdWidth is the default width for the instructions */
GenEncoder(uint32_t simdWidth, uint32_t gen);
+
/*! Size of the stack (should be large enough) */
enum { MAX_STATE_NUM = 16 };
/*! Push the current instruction state */
@@ -156,6 +157,9 @@ namespace gbe
/*! Patch JMPI (located at index insnID) with the given jump distance */
void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+ /*! Patch the constant buffer ia mov offset */
+ void patchAddrIMM(uint32_t insnID, int32_t offset);
+
////////////////////////////////////////////////////////////////////////
// Helper functions to encode
////////////////////////////////////////////////////////////////////////
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 969ec82..7575dc3 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -4,6 +4,7 @@ DECL_GEN7_SCHEDULE(Unary, 20, 4, 2)
DECL_GEN7_SCHEDULE(Binary, 20, 4, 2)
DECL_GEN7_SCHEDULE(Ternary, 20, 4, 2)
DECL_GEN7_SCHEDULE(Compare, 20, 4, 2)
+DECL_GEN7_SCHEDULE(CBMove, 20, 2, 2)
DECL_GEN7_SCHEDULE(Jump, 14, 1, 1)
DECL_GEN7_SCHEDULE(Eot, 20, 1, 1)
DECL_GEN7_SCHEDULE(NoOp, 20, 2, 2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index ecaaeeb..ef83d86 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -435,6 +435,8 @@ namespace gbe
void LABEL(ir::LabelIndex label);
/*! Jump indexed instruction */
void JMPI(Reg src, ir::LabelIndex target);
+ /*!constant buffer mov instruction*/
+ void CB_MOVE(Reg dst, Reg src0, Reg src1, Reg src2);
/*! Compare instructions */
void CMP(uint32_t conditional, Reg src0, Reg src1);
/*! Select instruction with embedded comparison */
@@ -684,6 +686,14 @@ namespace gbe
insn->index = uint16_t(index);
}
+ void Selection::Opaque::CB_MOVE(Reg dst, Reg src0, Reg src1, Reg src2) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_CB_MOVE, 1, 3);
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->src(2) = src2;
+ insn->dst(0) = dst;
+ }
+
void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 0, 2);
insn->src(0) = src0;
@@ -1614,6 +1624,21 @@ namespace gbe
/*! Load instruction pattern */
DECL_PATTERN(LoadInstruction)
{
+ void emitCBMove(Selection::Opaque &sel, const ir::LoadInstruction &insn, GenRegister addr) const
+ {
+ using namespace ir;
+ GBE_ASSERT(insn.getValueNum() == 1); //todo: handle vec later
+
+ GenRegister offset = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ const GenRegister baseReg = sel.selReg(sel.ctx.getConstBaseReg(Register(addr.value.reg)), TYPE_U32);
+ //const uint32_t valueNum = insn.getValueNum();
+ //GenRegister dst[valueNum];
+ //for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
+
+ sel.CB_MOVE(dst, offset, GenRegister::retype(addr, GEN_TYPE_UD), baseReg);
+ }
+
void emitUntypedRead(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
@@ -1659,10 +1684,13 @@ namespace gbe
const GenRegister address = sel.selReg(insn.getAddress());
const AddressSpace space = insn.getAddressSpace();
GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+ insn.getAddressSpace() == MEM_CONSTANT ||
insn.getAddressSpace() == MEM_PRIVATE ||
insn.getAddressSpace() == MEM_LOCAL);
GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
- if (insn.isAligned() == true)
+ if(insn.getAddressSpace() == MEM_CONSTANT)
+ this->emitCBMove(sel, insn, address);
+ else if (insn.isAligned() == true)
this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else {
const GenRegister value = sel.selReg(insn.getValue(0));
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 691100b..14ff08c 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -24,6 +24,7 @@ DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
DECL_SELECTION_IR(MAD, TernaryInstruction)
DECL_SELECTION_IR(JMPI, JumpInstruction)
DECL_SELECTION_IR(EOT, EotInstruction)
+DECL_SELECTION_IR(CB_MOVE, CBMoveInstruction)
DECL_SELECTION_IR(NOP, NoOpInstruction)
DECL_SELECTION_IR(WAIT, WaitInstruction)
DECL_SELECTION_IR(MATH, MathInstruction)
@@ -34,3 +35,4 @@ DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
+
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 8a7efdb..4cac5ba 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -37,12 +37,12 @@
namespace gbe {
GenKernel::GenKernel(const std::string &name) :
- Kernel(name), insns(NULL), insnNum(0)
+ Kernel(name), insns(NULL), insnNum(0), cxt(NULL)
{}
- GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
+ GenKernel::~GenKernel(void) { GBE_DELETE(cxt); GBE_SAFE_DELETE_ARRAY(insns); }
const char *GenKernel::getCode(void) const { return (const char*) insns; }
size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
-
+ void GenKernel::recompile(void) { cxt->recompileKernel(); }
GenProgram::GenProgram(void) {}
GenProgram::~GenProgram(void) {}
@@ -58,7 +58,6 @@ namespace gbe {
};
Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name) {
-
// Be careful when the simdWidth is forced by the programmer. We can see it
// when the function already provides the simd width we need to use (i.e.
// non zero)
@@ -76,16 +75,17 @@ namespace gbe {
unit.getFunction(name)->setSimdWidth(simdWidth);
Context *ctx = GBE_NEW(GenContext, unit, name, limitRegisterPressure);
kernel = ctx->compileKernel();
- GBE_DELETE(ctx);
- if (kernel != NULL)
+ if (kernel != NULL) {
+ ((GenKernel *)kernel)->cxt = (GenContext *)ctx;
break;
+ }
+ GBE_DELETE(ctx);
}
// XXX spill must be implemented
GBE_ASSERTM(kernel != NULL, "Register spilling not supported yet!");
return kernel;
}
-
static gbe_program genProgramNewFromBinary(const char *binary, size_t size) {
NOT_IMPLEMENTED;
return NULL;
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index 68b0427..f41eb0c 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -27,6 +27,7 @@
#include "backend/program.h"
#include "backend/program.hpp"
+#include "backend/gen_context.hpp"
// Gen ISA instruction
struct GenInstruction;
@@ -44,8 +45,10 @@ namespace gbe
virtual const char *getCode(void) const;
/*! Implements base class */
virtual size_t getCodeSize(void) const;
+ virtual void recompile(void);
GenInstruction *insns; //!< Instruction stream
uint32_t insnNum; //!< Number of instructions
+ GenContext *cxt;
GBE_CLASS(GenKernel); //!< Use custom allocators
};
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 92122a6..d772b0d 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -725,6 +725,16 @@ namespace gbe
return ub16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
}
+ static INLINE GenRegister unpacked_uw(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UW,
+ GEN_VERTICAL_STRIDE_16,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
static INLINE GenRegister mask(uint32_t subnr) {
return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
}
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index df0df28..e08cb20 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -229,6 +229,18 @@ namespace gbe {
return kernel->getUseSLM() ? 1 : 0;
}
+ static void kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) {
+ if (genKernel == NULL) return;
+ gbe::Kernel *kernel = (gbe::Kernel*) genKernel;
+ kernel->setConstBufSize(argID, sz);
+ }
+
+ static void kernelRecompile(gbe_kernel genKernel){
+ if(genKernel == NULL) return;
+ gbe::Kernel *kernel = (gbe::Kernel *) genKernel;
+ kernel->recompile();
+ }
+
static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
return 0u;
}
@@ -251,6 +263,8 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_recompile_cb *gbe_kernel_recompile = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
@@ -275,6 +289,8 @@ namespace gbe
gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+ gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
+ gbe_kernel_recompile = gbe::kernelRecompile;
gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
gbe_kernel_use_slm = gbe::kernelUseSLM;
genSetupCallBacks();
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index b90c1df..08db4eb 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -81,7 +81,8 @@ enum gbe_curbe_type {
/*! Extra arguments use the negative range of sub-values */
enum gbe_extra_argument {
- GBE_STACK_BUFFER = 0 /* Give stack location in curbe */
+ GBE_STACK_BUFFER = 0, /* Give stack location in curbe */
+ GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
};
/*! Create a new program from the given source code (zero terminated string) */
@@ -155,6 +156,14 @@ extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
+/*! Set the const pointer arg size */
+typedef void (gbe_kernel_set_const_buffer_size_cb)(gbe_kernel, uint32_t argID, size_t sz);
+extern gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size;
+
+/*! Recompile the kernel at enqueue time; only used for constant buffers for now */
+typedef void (gbe_kernel_recompile_cb)(gbe_kernel);
+extern gbe_kernel_recompile_cb *gbe_kernel_recompile;
+
/*! Get the curbe offset where to put the data. Returns -1 if not required */
typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e0f7dba..a8b2615 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -25,6 +25,7 @@
#ifndef __GBE_PROGRAM_HPP__
#define __GBE_PROGRAM_HPP__
+#include "ir/instruction.hpp"
#include "backend/program.h"
#include "sys/hash_map.hpp"
#include "sys/vector.hpp"
@@ -42,6 +43,9 @@ namespace gbe {
struct KernelArgument {
gbe_arg_type type; //!< Pointer, structure, image, regular value?
uint32_t size; //!< Size of the argument
+ uint32_t bufSize; //!< Size of constant buffer
+ uint32_t allocedSize; //!< Size of alloced cb, used to check whether need realloc cb
+ ir::Register reg; //!< because of some member of context like unit and fn destoryed when recompiler, save reg for it
};
/*! Stores the offset where to patch where to patch */
@@ -94,6 +98,16 @@ namespace gbe {
INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
/*! Says if SLM is needed for it */
INLINE bool getUseSLM(void) const { return this->useSLM; }
+ void setConstBufSize(uint32_t argID, size_t sz) {
+ if(argID >= argNum) return;
+ if(args[argID].type != GBE_ARG_CONSTANT_PTR) return;
+ if(args[argID].bufSize != sz) {
+ this->bCurbeChanged = true;
+ args[argID].bufSize = sz;
+ }
+ }
+ /* recompile for constant buffer alloc */
+ virtual void recompile(void) = 0;
protected:
friend class Context; //!< Owns the kernels
const std::string name; //!< Kernel name
@@ -104,6 +118,7 @@ namespace gbe {
uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
uint32_t stackSize; //!< Stack size (may be 0 if unused)
bool useSLM; //!< SLM requires a special HW config
+ bool bCurbeChanged;
GBE_CLASS(Kernel); //!< Use custom allocators
};
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 37e78b4..8e9fc31 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -127,6 +127,25 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
return CL_SUCCESS;
}
+LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
+ char * dst)
+{
+ int i;
+ for(i = 0; i < k->arg_n; i++) {
+ enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
+
+ if(arg_type == GBE_ARG_CONSTANT_PTR) {
+ uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
+ cl_mem mem = k->args[i].mem;
+ cl_buffer_map(mem->bo, 1);
+ void * addr = cl_buffer_get_virtual(mem->bo);
+ memcpy(dst + offset, addr, mem->size);
+ cl_buffer_unmap(mem->bo);
+ }
+ }
+ return CL_SUCCESS;
+}
+
#if USE_FULSIM
extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 1e2bcc1..7c571da 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -69,5 +69,7 @@ extern cl_int cl_command_queue_finish(cl_command_queue);
/* Bind all the surfaces in the GPGPU state */
extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
+/* Upload the constant buffers to the final curbe */
+extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
#endif /* __CL_COMMAND_QUEUE_H__ */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 3a590bc..0b7b9a9 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -190,6 +190,13 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
size_t thread_n = 0u;
cl_int err = CL_SUCCESS;
+ gbe_kernel_recompile(ker->opaque);
+ ker->curbe_sz= gbe_kernel_get_curbe_size(ker->opaque);
+ if(cst_sz != ker->curbe_sz) {
+ cl_kernel_setup(ker, ker->opaque);
+ cst_sz = ker->curbe_sz;
+ }
+
/* Setup kernel */
kernel.name = "KERNEL";
kernel.grf_blocks = 128;
@@ -224,8 +231,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
if (ker->curbe) {
assert(cst_sz > 0);
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
- for (i = 0; i < thread_n; ++i)
+ for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
+ cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
+ }
TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
}
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 356a8a7..ae1b574 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -109,7 +109,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
if (UNLIKELY(value == NULL))
return CL_INVALID_KERNEL_ARGS;
offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- assert(offset + sz <= k->curbe_sz);
+ //assert(offset + sz <= k->curbe_sz);
memcpy(k->curbe + offset, value, sz);
k->args[index].local_sz = 0;
k->args[index].is_set = 1;
@@ -152,6 +152,10 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
|| (arg_type != GBE_ARG_IMAGE && mem->is_image)))
return CL_INVALID_ARG_VALUE;
+
+ if(arg_type == GBE_ARG_CONSTANT_PTR) {
+ gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
+ }
cl_mem_add_ref(mem);
if (k->args[index].mem)
cl_mem_delete(k->args[index].mem);
@@ -175,6 +179,9 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
cl_context ctx = k->program->ctx;
cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
+ if(k->bo != NULL)
+ cl_buffer_unreference(k->bo);
+
/* Allocate the gen code here */
const uint32_t code_sz = gbe_kernel_get_code_size(opaque);
const char *code = gbe_kernel_get_code(opaque);
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 3a8cfdd..e6961d1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -79,6 +79,7 @@ cl_mem_allocate(cl_context ctx,
err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
goto error;
}
+ mem->size = sz;
/* Append the buffer in the context buffer list */
pthread_mutex_lock(&ctx->buffer_lock);
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 6992454..db391ee 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -35,6 +35,7 @@ struct _cl_mem {
uint64_t magic; /* To identify it as a memory object */
volatile int ref_n; /* This object is reference counted */
cl_buffer bo; /* Data in GPU memory */
+ size_t size; /* original request size, not alignment size, used in constant buffer */
cl_mem prev, next; /* We chain the memory buffers together */
cl_context ctx; /* Context it belongs to */
cl_mem_flags flags; /* Flags specified at the creation time */
--
1.7.9.5
More information about the Beignet
mailing list