[Beignet] [PATCH] GBE: fixed the stack allocation.
Zhigang Gong
zhigang.gong at intel.com
Thu Jan 16 18:48:31 PST 2014
Yongjia wrote a test case that hit the previous 1KB limitation. I took a
look at the stack-pointer-related code and found that the implementation
does not comply with the OpenCL spec.
According to the OpenCL spec, section 6.9:
d. Variable length arrays and structures with flexible (or unsized) arrays are not supported.
Thus every local variable's size must be a compile-time constant, so we
can manipulate the stack pointer more easily: there is no need to do the
alignment calculation at runtime, and we can compute the exact stack size
and then allocate the stack on demand. I still put a limitation there,
which is 64KB.
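Since every alloca size is now known at compile time, the alignment of the
stack pointer can also be folded into a single constant add instead of the
old runtime add-and-mask pair. A minimal standalone sketch of that idea
(hypothetical helper names, not the actual backend code):

    #include <cstdint>

    // Round offset up to the next multiple of align (align is a power of two).
    static uint32_t alignUp(uint32_t offset, uint32_t align) {
      return (offset + (align - 1)) & ~(align - 1);
    }

    // With a compile-time stack offset, the padding needed to align the next
    // allocation is itself a constant, so it can be emitted as one ADD.
    static uint32_t alignmentStep(uint32_t stackOffset, uint32_t align) {
      return alignUp(stackOffset, align) - stackOffset;
    }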
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/context.cpp | 7 +++-
backend/src/ir/function.cpp | 2 +-
backend/src/ir/function.hpp | 5 +++
backend/src/llvm/llvm_gen_backend.cpp | 61 +++++++++++----------------------
4 files changed, 32 insertions(+), 43 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 7a80dc8..2125bd1 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -389,7 +389,12 @@ namespace gbe
return;
// Be sure that the stack pointer is set
GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
- this->kernel->stackSize = 1*KB; // XXX compute that in a better way
+ uint32_t stackSize = 1*KB;
+ while (stackSize < fn.getStackSize()) {
+ stackSize <<= 1;
+ GBE_ASSERT(stackSize <= 64*KB);
+ }
+ this->kernel->stackSize = stackSize;
}
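For example, a kernel whose exact stack usage is 1500 bytes per lane gets a
2KB stack: 1KB is too small, so the size doubles once, and 2KB is within the
64KB cap. The same rounding as a standalone sketch (hypothetical name,
mirroring the loop in the hunk above):

    #include <cstdint>
    #include <cassert>

    // Round the required stack size up to the next power of two,
    // starting at 1KB and asserting the 64KB limit.
    static uint32_t roundStackSize(uint32_t required) {
      uint32_t stackSize = 1024;            // 1*KB
      while (stackSize < required) {
        stackSize <<= 1;                    // double until it fits
        assert(stackSize <= 64 * 1024);     // hard 64KB limit
      }
      return stackSize;
    }
    // roundStackSize(1500) == 2048, roundStackSize(100) == 1024.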
uint32_t Context::newCurbeEntry(gbe_curbe_type value,
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 4273f66..71dcc1f 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -43,7 +43,7 @@ namespace ir {
///////////////////////////////////////////////////////////////////////////
Function::Function(const std::string &name, const Unit &unit, Profile profile) :
- name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0)
+ name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0)
{
initProfile(*this);
samplerSet = GBE_NEW(SamplerSet);
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 11e268b..2468e73 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -314,6 +314,10 @@ namespace ir {
void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; }
/*! Get required work group size. */
const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;}
+ /*! Get stack size. */
+ INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
+ /*! Push stack size. */
+ INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
private:
friend class Context; //!< Can freely modify a function
std::string name; //!< Function name
@@ -330,6 +334,7 @@ namespace ir {
uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise
bool useSLM; //!< Is SLM required?
uint32_t slmSize; //!< local variable size inside kernel function
+ uint32_t stackSize; //!< stack size for private memory.
SamplerSet *samplerSet; //!< samplers used in this function.
ImageSet* imageSet; //!< Image set in this function's arguments..
size_t compileWgSize[3]; //!< required work group size specified by
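The two new accessors are intended to be used as a pair while lowering
allocas: push any alignment padding first, then the allocation size, so that
getStackSize() always holds the running compile-time offset. A hedged sketch
of the intended call pattern (simplified, hypothetical local names, using
alignUp from the sketch above):

    // Hypothetical call pattern during alloca lowering:
    uint32_t offset = fn.getStackSize();    // current compile-time offset
    uint32_t step   = alignUp(offset, align) - offset;
    fn.pushStackSize(step);                 // account for alignment padding
    fn.pushStackSize(elementSize);          // account for the allocation itself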
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index b0555aa..ebead77 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2757,32 +2757,23 @@ namespace gbe
Value *src = I.getOperand(0);
Type *elemType = I.getType()->getElementType();
ir::ImmediateIndex immIndex;
- bool needMultiply = true;
+ uint32_t elementSize = getTypeByteSize(unit, elemType);
// Be aware, we manipulate pointers
if (ctx.getPointerSize() == ir::POINTER_32_BITS)
- immIndex = ctx.newImmediate(uint32_t(getTypeByteSize(unit, elemType)));
+ immIndex = ctx.newImmediate(uint32_t(elementSize));
else
- immIndex = ctx.newImmediate(uint64_t(getTypeByteSize(unit, elemType)));
+ immIndex = ctx.newImmediate(uint64_t(elementSize));
// OK, we try to see if we know compile time the size we need to allocate
- if (I.isArrayAllocation() == false) // one element allocated only
- needMultiply = false;
- else {
+ if (I.isArrayAllocation() == true) {
Constant *CPV = dyn_cast<Constant>(src);
- if (CPV) {
- const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
- ir::Immediate imm = ctx.getImmediate(immIndex);
- imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
- ctx.setImmediate(immIndex, imm);
- needMultiply = false;
- } else {
- // Brutal but cheap way to get arrays aligned on 4 bytes: we just align
- // the element on 4 bytes!
- ir::Immediate imm = ctx.getImmediate(immIndex);
- imm.data.u64 = ALIGN(imm.data.u64, 4);
- ctx.setImmediate(immIndex, imm);
- }
+ GBE_ASSERT(CPV);
+ const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
+ ir::Immediate imm = ctx.getImmediate(immIndex);
+ imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
+ elementSize *= elemNum;
+ ctx.setImmediate(immIndex, imm);
}
// Now emit the stream of instructions to get the allocated pointer
@@ -2797,32 +2788,20 @@ namespace gbe
// align the stack pointer according to data alignment
if(align > 1) {
- // (ptr + (align-1)) & ~(align-1)
- ir::ImmediateIndex immAlign;
- immAlign = ctx.newIntegerImmediate(align-1, ir::TYPE_U32);
- ir::Register alignReg = ctx.reg(ctx.getPointerFamily());
- ctx.LOADI(ir::TYPE_S32, alignReg, immAlign);
- ctx.ADD(ir::TYPE_U32, stack, stack, alignReg);
-
- alignReg = ctx.reg(ctx.getPointerFamily());
- immAlign = ctx.newIntegerImmediate(~(align-1), ir::TYPE_U32);
- ctx.LOADI(ir::TYPE_S32, alignReg, immAlign);
- ctx.AND(ir::TYPE_U32, stack, stack, alignReg);
+ uint32_t prevStackPtr = ctx.getFunction().getStackSize();
+ uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
+ ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+ ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
+ ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
+ ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+ ctx.getFunction().pushStackSize(step);
}
// Set the destination register properly
ctx.MOV(imm.type, dst, stack);
- // Easy case, we just increment the stack pointer
- if (needMultiply == false) {
- ctx.LOADI(imm.type, reg, immIndex);
- ctx.ADD(imm.type, stack, stack, reg);
- }
- // Harder case (variable length array) that requires a multiply
- else {
- ctx.LOADI(imm.type, reg, immIndex);
- ctx.MUL(imm.type, reg, this->getRegister(src), reg);
- ctx.ADD(imm.type, stack, stack, reg);
- }
+ ctx.LOADI(imm.type, reg, immIndex);
+ ctx.ADD(imm.type, stack, stack, reg);
+ ctx.getFunction().pushStackSize(elementSize);
}
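With the variable-length-array path gone, every alloca now lowers to the
same constant-step pattern, and the two pushStackSize calls keep the
function's stackSize in sync with the emitted pointer arithmetic. Roughly
(comments only, summarizing the code above, not verbatim IR):

    // Per alloca, the emitted sequence is now:
    //   ADD   stack, stack, step   // only when alignment padding is needed
    //   MOV   dst,   stack         // dst points at the new allocation
    //   LOADI reg,   size          // size is a compile-time constant
    //   ADD   stack, stack, reg    // bump the stack pointer
    // and fn.stackSize grows by step + size, so after codegen
    // fn.getStackSize() is the exact per-lane stack requirement.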
static INLINE Value *getLoadOrStoreValue(LoadInst &I) {
--
1.7.9.5