[Beignet] [PATCH] GBE: remove stacksize 64KB limitation.
Song, Ruiling
ruiling.song at intel.com
Mon Feb 15 06:29:55 UTC 2016
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Yang Rong
> Sent: Sunday, February 14, 2016 2:42 PM
> To: beignet at lists.freedesktop.org
> Cc: Yang, Rong R <rong.r.yang at intel.com>
> Subject: [Beignet] [PATCH] GBE: remove stacksize 64KB limitation.
>
> If the stack size is larger than 64KB, the formula for calculating the
> stackptr should change from "threadId * perThreadSize + laneId * perLaneSize"
> to "(threadId * simdWidth + laneId) * perLaneSize", to avoid a DWord * DWord
> multiplication.
>
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/backend/context.cpp       |  2 +-
>  backend/src/backend/gen75_context.cpp | 33 +++++++++++++--------------
>  backend/src/backend/gen_context.cpp   | 42 +++++++++++++++--------------------
>  backend/src/backend/gen_context.hpp   |  3 ++-
>  4 files changed, 36 insertions(+), 44 deletions(-)
>
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index 5adeabc..0991786 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -398,7 +398,7 @@ namespace gbe
> uint32_t stackSize = 128;
> while (stackSize < fn.getStackSize()) {
> stackSize *= 3;
> - GBE_ASSERT(stackSize <= 64*KB);
> + //GBE_ASSERT(stackSize <= 64*KB);
> }
> this->kernel->stackSize = stackSize;
> }
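With the assert dropped, the rounding in this hunk simply keeps tripling from 128 bytes; a standalone sketch of that behaviour (mine, purely for illustration):

  #include <cstdint>
  #include <cstdio>

  // Mirrors the loop above: round the required stack size up to 128 * 3^n,
  // now without the 64KB cap.
  static uint32_t roundStackSize(uint32_t required) {
    uint32_t stackSize = 128;
    while (stackSize < required)
      stackSize *= 3;
    return stackSize;
  }

  int main() {
    printf("%u\n", roundStackSize(100 * 1024));   // prints 279936, i.e. > 64KB is now allowed
    return 0;
  }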
> diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
> index fa8b029..37063d7 100644
> --- a/backend/src/backend/gen75_context.cpp
> +++ b/backend/src/backend/gen75_context.cpp
> @@ -66,37 +66,34 @@ namespace gbe
>
> // Check that everything is consistent in the kernel code
> const uint32_t perLaneSize = kernel->getStackSize();
> - const uint32_t perThreadSize = perLaneSize * this->simdWidth;
> GBE_ASSERT(perLaneSize > 0);
>
> const GenRegister selStatckPtr = this->simdWidth == 8 ?
> GenRegister::ud8grf(ir::ocl::stackptr) :
> GenRegister::ud16grf(ir::ocl::stackptr);
> const GenRegister stackptr = ra->genReg(selStatckPtr);
> -
> - loadLaneID(stackptr);
> + // borrow block ip as a temporary register as we will
> + // initialize block ip later.
> + const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
>
> // We compute the per-lane stack pointer here
> - // private address start from zero
> + // threadId * perThreadSize + laneId*perLaneSize or
> + // (threadId * simdWidth + laneId)*perLaneSize
> p->push();
> p->curr.execWidth = 1;
> p->curr.predicate = GEN_PREDICATE_NONE;
> //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
> - p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
> - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
> - p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
> - p->curr.execWidth = this->simdWidth;
> - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
> - p->curr.execWidth = 1;
> - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
> - p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
> - if(perThreadSize > 0xffff) {
> - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
> - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
> - } else
> - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
> + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
> + p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
> + p->SHR(stackptr, stackptr, GenRegister::immud(7));
> + p->SHL(tmpReg, tmpReg, GenRegister::immud(2));
> + p->ADD(tmpReg, tmpReg, stackptr); //threadId
> +
> + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
> p->curr.execWidth = this->simdWidth;
> - p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
> + loadLaneID(stackptr);
> + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
> + p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + laneId)*perLaneSize
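To make the new sequence easier to follow, here is a rough scalar model of what it computes; the helper below is purely illustrative and only mirrors the instructions above (r0_5 stands for the DWord read from r0.5):

  #include <cstdint>
  #include <vector>

  // Illustrative model of the emitted code above, not actual GBE code.
  std::vector<uint32_t> stackPtrPerLane(uint32_t r0_5, uint32_t simdWidth,
                                        uint32_t perLaneSize) {
    uint32_t tid = r0_5 & 0x7f;              // AND tmpReg, r0.5, 0x7f
    uint32_t hi  = (r0_5 & 0x180) >> 7;      // AND + SHR into stackptr
    tid = (tid << 2) + hi;                   // SHL + ADD -> per-thread index
    tid *= simdWidth;                        // MUL tmpReg, tmpReg, simdWidth
    std::vector<uint32_t> stackptr(simdWidth);
    for (uint32_t lane = 0; lane < simdWidth; ++lane)
      stackptr[lane] = (tid + lane) * perLaneSize;   // loadLaneID + ADD + MUL
    return stackptr;
  }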
According to the hardware spec:
For IVB and HSW, when both src0 and src1 are of type D or UD, only the low 16 bits of each element of src1 are used. The accumulator maintains full 48-bit precision.
So it looks like you should place (threadId * simdWidth + laneId) at src1; a rough sketch of the issue is below.
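A toy model of the restriction quoted above (my own illustration, not hardware-verified), showing why the operand order matters once perLaneSize can exceed 64KB:

  #include <cstdint>
  #include <cstdio>

  // Toy model: for DWord x DWord MUL on IVB/HSW, only the low 16 bits
  // of src1 take part in the multiply (per the spec text quoted above).
  static uint32_t mulDxD(uint32_t src0, uint32_t src1) {
    return src0 * (src1 & 0xffff);
  }

  int main() {
    uint32_t perLaneSize = 128 * 1024;   // > 64KB, the case this patch enables
    uint32_t index = 37;                 // threadId * simdWidth + laneId, always < 64K
    printf("%u\n", mulDxD(index, perLaneSize));   // 0: perLaneSize truncated in src1
    printf("%u\n", mulDxD(perLaneSize, index));   // 4849664: small operand in src1 is fine
    return 0;
  }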
Have you tried this on IVB or HSW?
Thanks!
Ruiling