[Beignet] [PATCH] GBE: remove stacksize 64KB limitation.
Yang Rong
rong.r.yang at intel.com
Sun Feb 14 06:42:16 UTC 2016
If the stack size is larger than 64KB, the formula for calculating the
stackptr should change from "threadId * perThreadSize + laneId*perLaneSize" to
"(threadId * simdWidth + laneId)*perLaneSize", to avoid a Dword * Dword multiply.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
backend/src/backend/context.cpp | 2 +-
backend/src/backend/gen75_context.cpp | 33 +++++++++++++--------------
backend/src/backend/gen_context.cpp | 42 +++++++++++++++--------------------
backend/src/backend/gen_context.hpp | 3 ++-
4 files changed, 36 insertions(+), 44 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5adeabc..0991786 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -398,7 +398,7 @@ namespace gbe
uint32_t stackSize = 128;
while (stackSize < fn.getStackSize()) {
stackSize *= 3;
- GBE_ASSERT(stackSize <= 64*KB);
+ //GBE_ASSERT(stackSize <= 64*KB);
}
this->kernel->stackSize = stackSize;
}
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index fa8b029..37063d7 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -66,37 +66,34 @@ namespace gbe
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
- const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
-
- loadLaneID(stackptr);
+ // borrow block ip as temporary register as we will
+ // initialize block ip latter.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
// We compute the per-lane stack pointer here
- // private address start from zero
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
//p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
- p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
- p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
- p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
- p->curr.execWidth = this->simdWidth;
- p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
- p->curr.execWidth = 1;
- p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
- p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
- if(perThreadSize > 0xffff) {
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
- } else
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+ p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+ p->SHR(stackptr, stackptr, GenRegister::immud(7));
+ p->SHL(tmpReg, tmpReg, GenRegister::immud(2));
+ p->ADD(tmpReg, tmpReg, stackptr); //threadId
+
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+ loadLaneID(stackptr);
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+ p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + laneId)*perLaneSize
p->pop();
}
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0ea0dd0..99190d3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -148,33 +148,33 @@ namespace gbe
}
/* Get proper block ip register according to current label width. */
- static GenRegister getBlockIP(GenContext &ctx) {
+ GenRegister GenContext::getBlockIP(void) {
GenRegister blockip;
- if (!ctx.isDWLabel())
- blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+ if (!isDWLabel())
+ blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
else
- blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
+ blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
return blockip;
}
/* Set current block ip register to a specified constant label value. */
- static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) {
- if (!ctx.isDWLabel())
- ctx.p->MOV(blockip, GenRegister::immuw(label));
+ void GenContext::setBlockIP(GenRegister blockip, uint32_t label) {
+ if (!isDWLabel())
+ p->MOV(blockip, GenRegister::immuw(label));
else
- ctx.p->MOV(blockip, GenRegister::immud(label));
+ p->MOV(blockip, GenRegister::immud(label));
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
- const GenRegister blockip = getBlockIP(*this);
+ const GenRegister blockip = getBlockIP();
p->push();
p->curr.noMask = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- setBlockIP(*this, blockip, getMaxLabel());
+ setBlockIP(blockip, getMaxLabel());
p->curr.noMask = 0;
- setBlockIP(*this, blockip, 0);
+ setBlockIP(blockip, 0);
p->curr.execWidth = 1;
if (ra->isAllocated(ir::ocl::zero))
p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), GenRegister::immuw(0));
@@ -219,7 +219,6 @@ namespace gbe
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
- const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
@@ -228,28 +227,23 @@ namespace gbe
const GenRegister stackptr = ra->genReg(selStatckPtr);
// borrow block ip as temporary register as we will
// initialize block ip latter.
- const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD);
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
loadLaneID(stackptr);
// We compute the per-lane stack pointer here
- // threadId * perThreadSize + laneId*perLaneSize
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
// let private address start from zero
//p->MOV(stackptr, GenRegister::immud(0));
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
p->curr.execWidth = this->simdWidth;
- p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
- p->curr.execWidth = 1;
- if(perThreadSize > 0xffff) {
- p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize));
- p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
- } else
- p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize));
- p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, tmpReg);
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+ p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + laneId)*perLaneSize
p->pop();
}
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 22ec0ea..25cce85 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -110,7 +110,8 @@ namespace gbe
}
void loadLaneID(GenRegister dst);
-
+ GenRegister getBlockIP(void);
+ void setBlockIP(GenRegister blockip, uint32_t label);
void collectShifter(GenRegister dest, GenRegister src);
void loadTopHalf(GenRegister dest, GenRegister src);
void storeTopHalf(GenRegister dest, GenRegister src);
--
2.1.4
More information about the Beignet
mailing list