[Beignet] [Patch V2 1/2] Use NP2 stack size to avoid cache line conflict.
Yang Rong
rong.r.yang at intel.com
Tue Jun 23 18:58:25 PDT 2015
The L3 cache line size is 64B, so compute the stack size starting from 64B and multiply by 3 at each step, which keeps the per-lane size a non-power-of-2 multiple of the cache line.
Gen hardware before GEN8 only supports D * W multiplication, so when calculating the per-lane and per-thread stack addresses we must make sure every multiplier fits in 16 bits.
V2: start the stack size calculation from 128B instead, because long16 needs 128B alignment.
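
For reference, a minimal host-side sketch of the V2 size selection added to context.cpp below; pickStackSize is an illustrative name, not a Beignet function, and the loop simply mirrors the hunk in the diff:

  #include <cassert>
  #include <cstdint>

  // Illustrative only: mirrors the loop added to context.cpp.
  uint32_t pickStackSize(uint32_t requested) {
    uint32_t stackSize = 128;          // 128B seed for long16 alignment (V2)
    while (stackSize < requested) {
      stackSize *= 3;                  // 128, 384, 1152, 3456, 10368, 31104, ...
      assert(stackSize <= 64 * 1024);  // same bound as GBE_ASSERT(stackSize <= 64*KB)
    }
    // After the first step the result is a non-power-of-2 multiple of the 64B
    // cache line, so lane stacks no longer all alias onto the same cache sets.
    return stackSize;
  }
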
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
backend/src/backend/context.cpp | 4 ++--
backend/src/backend/gen75_context.cpp | 13 ++++++-------
backend/src/backend/gen_context.cpp | 13 ++++++-------
3 files changed, 14 insertions(+), 16 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 0dc60b7..b8dfa8c 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -400,9 +400,9 @@ namespace gbe
return;
// Be sure that the stack pointer is set
// GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
- uint32_t stackSize = 1*KB;
+ uint32_t stackSize = 128;
while (stackSize < fn.getStackSize()) {
- stackSize <<= 1;
+ stackSize *= 3;
GBE_ASSERT(stackSize <= 64*KB);
}
this->kernel->stackSize = stackSize;
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index caf7043..b9dfb18 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -74,12 +74,7 @@ namespace gbe
const uint32_t perLaneSize = kernel->getStackSize();
const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
- GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
- GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
- // Use shifts rather than muls which are limited to 32x16 bit sources
- const uint32_t perLaneShift = logi2(perLaneSize);
- const uint32_t perThreadShift = logi2(perThreadSize);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
@@ -95,11 +90,15 @@ namespace gbe
p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
p->curr.execWidth = this->simdWidth;
- p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+ p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
p->curr.execWidth = 1;
p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
- p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+ if(perThreadSize > 0xffff) {
+ p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
+ p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
+ } else
+ p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
p->curr.execWidth = this->simdWidth;
p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
p->pop();
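
The two MULs above replace the old shifts because pre-GEN8 MUL only takes D * W sources, so every immediate multiplier has to fit in 16 bits. A minimal sketch of the same offset math in plain C++, assuming laneId and threadId stand in for stackptr and the value assembled in r126.0 (names are illustrative, not Beignet identifiers):

  #include <cassert>
  #include <cstdint>

  uint32_t perLaneOffset(uint32_t laneId, uint32_t perLaneSize) {
    assert(perLaneSize <= 0xffff);                  // fits the 16-bit W source
    return laneId * perLaneSize;                    // single MUL in the kernel
  }

  uint32_t perThreadOffset(uint32_t threadId, uint32_t perLaneSize,
                           uint32_t simdWidth) {
    const uint32_t perThreadSize = perLaneSize * simdWidth;
    if (perThreadSize > 0xffff)                     // immediate would overflow W
      return (threadId * perLaneSize) * simdWidth;  // split into two D * W MULs
    return threadId * perThreadSize;                // single MUL
  }

The gen_context.cpp hunk below applies the same split on the non-Haswell path.
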
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 43d14d2..db27377 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -182,12 +182,7 @@ namespace gbe
const uint32_t perLaneSize = kernel->getStackSize();
const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
- GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
- GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
- // Use shifts rather than muls which are limited to 32x16 bit sources
- const uint32_t perLaneShift = logi2(perLaneSize);
- const uint32_t perThreadShift = logi2(perThreadSize);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
@@ -201,9 +196,13 @@ namespace gbe
p->curr.predicate = GEN_PREDICATE_NONE;
p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
p->curr.execWidth = this->simdWidth;
- p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+ p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
p->curr.execWidth = 1;
- p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+ if(perThreadSize > 0xffff) {
+ p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
+ p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
+ } else
+ p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
p->curr.execWidth = this->simdWidth;
p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
p->pop();
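
For concreteness: with the V2 size progression, perLaneSize = 3456 at SIMD16 gives perThreadSize = 55296, which still fits the 16-bit immediate and takes the single-MUL path, while perLaneSize = 10368 at SIMD8 gives 82944 and falls into the two-MUL branch. (Example values only; they just follow from the 128 * 3^n sequence.)
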
--
1.8.3.2