[Beignet] [PATCH] GBE: fixed the stack allocation.
Zhigang Gong
zhigang.gong at intel.com
Thu Jan 16 18:48:31 PST 2014
Yongjia wrote a test case that hit the previous 1KB limitation. I took a
look at the stack-pointer-related code and found that the implementation
does not comply with the OpenCL spec.
According to the OpenCL spec, section 6.9:
d. Variable length arrays and structures with flexible (or unsized) arrays are not supported.
Thus every local variable's size must be a compile-time constant, so we
can manipulate the stack pointer more easily: there is no need to do the
alignment calculation at runtime, and we can compute the exact stack size
and then allocate the stack on demand. I still put a limitation there,
which is 64KB.
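Since every alloca size is now known at compile time, the alignment of the
stack pointer can also be folded into a single constant add instead of the
old runtime add-and-mask pair. A minimal standalone sketch of that idea
(hypothetical helper names, not the actual backend code):

    #include <cstdint>

    // Round offset up to the next multiple of align (align is a power of two).
    static uint32_t alignUp(uint32_t offset, uint32_t align) {
      return (offset + (align - 1)) & ~(align - 1);
    }

    // With a compile-time stack offset, the padding needed to align the next
    // allocation is itself a constant, so it can be emitted as one ADD.
    static uint32_t alignmentStep(uint32_t stackOffset, uint32_t align) {
      return alignUp(stackOffset, align) - stackOffset;
    }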
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/context.cpp | 7 +++-
backend/src/ir/function.cpp | 2 +-
backend/src/ir/function.hpp | 5 +++
backend/src/llvm/llvm_gen_backend.cpp | 61 +++++++++++----------------------
4 files changed, 32 insertions(+), 43 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 7a80dc8..2125bd1 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -389,7 +389,12 @@ namespace gbe
return;
// Be sure that the stack pointer is set
GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
- this->kernel->stackSize = 1*KB; // XXX compute that in a better way
+ uint32_t stackSize = 1*KB;
+ while (stackSize < fn.getStackSize()) {
+ stackSize <<= 1;
+ GBE_ASSERT(stackSize <= 64*KB);
+ }
+ this->kernel->stackSize = stackSize;
}
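For example, a kernel whose exact stack usage is 1500 bytes per lane gets a
2KB stack: 1KB is too small, so the size doubles once, and 2KB is within the
64KB cap. The same rounding as a standalone sketch (hypothetical name,
mirroring the loop in the hunk above):

    #include <cstdint>
    #include <cassert>

    // Round the required stack size up to the next power of two,
    // starting at 1KB and asserting the 64KB limit.
    static uint32_t roundStackSize(uint32_t required) {
      uint32_t stackSize = 1024;            // 1*KB
      while (stackSize < required) {
        stackSize <<= 1;                    // double until it fits
        assert(stackSize <= 64 * 1024);     // hard 64KB limit
      }
      return stackSize;
    }
    // roundStackSize(1500) == 2048, roundStackSize(100) == 1024.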
uint32_t Context::newCurbeEntry(gbe_curbe_type value,
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 4273f66..71dcc1f 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -43,7 +43,7 @@ namespace ir {
///////////////////////////////////////////////////////////////////////////
Function::Function(const std::string &name, const Unit &unit, Profile profile) :
- name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0)
+ name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0)
{
initProfile(*this);
samplerSet = GBE_NEW(SamplerSet);
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 11e268b..2468e73 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -314,6 +314,10 @@ namespace ir {
void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; }
/*! Get required work group size. */
const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;}
+ /*! Get stack size. */
+ INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
+ /*! Push stack size. */
+ INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
private:
friend class Context; //!< Can freely modify a function
std::string name; //!< Function name
@@ -330,6 +334,7 @@ namespace ir {
uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise
bool useSLM; //!< Is SLM required?
uint32_t slmSize; //!< local variable size inside kernel function
+ uint32_t stackSize; //!< stack size for private memory.
SamplerSet *samplerSet; //!< samplers used in this function.
ImageSet* imageSet; //!< Image set in this function's arguments..
size_t compileWgSize[3]; //!< required work group size specified by
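The two new accessors are intended to be used as a pair while lowering
allocas: push any alignment padding first, then the allocation size, so that
getStackSize() always holds the running compile-time offset. A hedged sketch
of the intended call pattern (simplified, hypothetical local names, using
alignUp from the sketch above):

    // Hypothetical call pattern during alloca lowering:
    uint32_t offset = fn.getStackSize();    // current compile-time offset
    uint32_t step   = alignUp(offset, align) - offset;
    fn.pushStackSize(step);                 // account for alignment padding
    fn.pushStackSize(elementSize);          // account for the allocation itself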
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index b0555aa..ebead77 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2757,32 +2757,23 @@ namespace gbe
Value *src = I.getOperand(0);
Type *elemType = I.getType()->getElementType();
ir::ImmediateIndex immIndex;
- bool needMultiply = true;
+ uint32_t elementSize = getTypeByteSize(unit, elemType);
// Be aware, we manipulate pointers
if (ctx.getPointerSize() == ir::POINTER_32_BITS)
- immIndex = ctx.newImmediate(uint32_t(getTypeByteSize(unit, elemType)));
+ immIndex = ctx.newImmediate(uint32_t(elementSize));
else
- immIndex = ctx.newImmediate(uint64_t(getTypeByteSize(unit, elemType)));
+ immIndex = ctx.newImmediate(uint64_t(elementSize));
// OK, we try to see if we know compile time the size we need to allocate
- if (I.isArrayAllocation() == false) // one element allocated only
- needMultiply = false;
- else {
+ if (I.isArrayAllocation() == true) {
Constant *CPV = dyn_cast<Constant>(src);
- if (CPV) {
- const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
- ir::Immediate imm = ctx.getImmediate(immIndex);
- imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
- ctx.setImmediate(immIndex, imm);
- needMultiply = false;
- } else {
- // Brutal but cheap way to get arrays aligned on 4 bytes: we just align
- // the element on 4 bytes!
- ir::Immediate imm = ctx.getImmediate(immIndex);
- imm.data.u64 = ALIGN(imm.data.u64, 4);
- ctx.setImmediate(immIndex, imm);
- }
+ GBE_ASSERT(CPV);
+ const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
+ ir::Immediate imm = ctx.getImmediate(immIndex);
+ imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
+ elementSize *= elemNum;
+ ctx.setImmediate(immIndex, imm);
}
// Now emit the stream of instructions to get the allocated pointer
@@ -2797,32 +2788,20 @@ namespace gbe
// align the stack pointer according to data alignment
if(align > 1) {
- // (ptr + (align-1)) & ~(align-1)
- ir::ImmediateIndex immAlign;
- immAlign = ctx.newIntegerImmediate(align-1, ir::TYPE_U32);
- ir::Register alignReg = ctx.reg(ctx.getPointerFamily());
- ctx.LOADI(ir::TYPE_S32, alignReg, immAlign);
- ctx.ADD(ir::TYPE_U32, stack, stack, alignReg);
-
- alignReg = ctx.reg(ctx.getPointerFamily());
- immAlign = ctx.newIntegerImmediate(~(align-1), ir::TYPE_U32);
- ctx.LOADI(ir::TYPE_S32, alignReg, immAlign);
- ctx.AND(ir::TYPE_U32, stack, stack, alignReg);
+ uint32_t prevStackPtr = ctx.getFunction().getStackSize();
+ uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
+ ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+ ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
+ ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
+ ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+ ctx.getFunction().pushStackSize(step);
}
// Set the destination register properly
ctx.MOV(imm.type, dst, stack);
- // Easy case, we just increment the stack pointer
- if (needMultiply == false) {
- ctx.LOADI(imm.type, reg, immIndex);
- ctx.ADD(imm.type, stack, stack, reg);
- }
- // Harder case (variable length array) that requires a multiply
- else {
- ctx.LOADI(imm.type, reg, immIndex);
- ctx.MUL(imm.type, reg, this->getRegister(src), reg);
- ctx.ADD(imm.type, stack, stack, reg);
- }
+ ctx.LOADI(imm.type, reg, immIndex);
+ ctx.ADD(imm.type, stack, stack, reg);
+ ctx.getFunction().pushStackSize(elementSize);
}
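With the variable-length-array path gone, every alloca now lowers to the
same constant-step pattern, and the two pushStackSize calls keep the
function's stackSize in sync with the emitted pointer arithmetic. Roughly
(comments only, summarizing the code above, not verbatim IR):

    // Per alloca, the emitted sequence is now:
    //   ADD   stack, stack, step   // only when alignment padding is needed
    //   MOV   dst,   stack         // dst points at the new allocation
    //   LOADI reg,   size          // size is a compile-time constant
    //   ADD   stack, stack, reg    // bump the stack pointer
    // and fn.stackSize grows by step + size, so after codegen
    // fn.getStackSize() is the exact per-lane stack requirement.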
static INLINE Value *getLoadOrStoreValue(LoadInst &I) {
--
1.7.9.5