[Beignet] [PATCH] GBE: if PointerFamily is FAMILY_QWORD, chv and bxt need special handling.
Yang Rong
rong.r.yang at intel.com
Thu Jan 5 09:11:34 UTC 2017
To move a UD value into a UQ register, the UD must first be moved into an
unpacked UD region, and then the unpacked UD moved into the UQ.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
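For clarity, a small host-side sketch of the ordering issue (plain C++, purely
illustrative, not Beignet code): the 64-bit destination aliases the register
space that still holds the 32-bit sources, so the in-place widening must
process the upper lanes (Q2) before the lower ones (Q1), otherwise the low
half's destination would clobber the high half's inputs. On CHV/BXT the copy
additionally goes through a stride-2 "unpacked UD" view of the destination,
as the patch does.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const int simdWidth = 16;
  // Model two consecutive GRFs holding 16 packed UD lane offsets.
  uint32_t grf[simdWidth * 2] = {};
  for (int i = 0; i < simdWidth; ++i)
    grf[i] = 100 + i;

  // Widen in place to 16 UQ lanes, upper half first, so the lower half's
  // packed sources are still intact when they are read.
  for (int i = simdWidth - 1; i >= 0; --i) {
    uint64_t widened = grf[i];               // read before the slot is reused
    std::memcpy(&grf[2 * i], &widened, 8);   // write the 64-bit lane
  }

  for (int i = 0; i < simdWidth; ++i) {
    uint64_t lane;
    std::memcpy(&lane, &grf[2 * i], 8);
    printf("lane %2d: %llu\n", i, (unsigned long long)lane);
  }
  return 0;
}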
backend/src/backend/gen8_context.cpp | 61 +++++++++++++++++++++++++++++++++++
backend/src/backend/gen8_context.hpp | 1 +
backend/src/backend/gen9_context.cpp | 62 ++++++++++++++++++++++++++++++++++++
backend/src/backend/gen9_context.hpp | 1 +
4 files changed, 125 insertions(+)
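The per-lane private stack offset computed by the new emitStackPointer()
follows the same formula as the generic path: (threadId * simdWidth + laneId)
* perLaneSize, with threadId taken from r0.5 & 0x1ff. A tiny hypothetical C++
sketch of that arithmetic (illustration only, the helper name is made up):

#include <cstdint>
#include <cstdio>

static uint32_t laneStackOffset(uint32_t threadId, uint32_t laneId,
                                uint32_t simdWidth, uint32_t perLaneSize) {
  // threadId * simdWidth + laneId must stay below 64K because the generated
  // code computes it with UW (16-bit) arithmetic; the multiply by
  // perLaneSize then widens it to the full stack pointer offset.
  return (threadId * simdWidth + laneId) * perLaneSize;
}

int main() {
  // e.g. hardware thread 3, lane 5, SIMD16, 1KB of private stack per lane
  printf("%u\n", laneStackOffset(3, 5, 16, 1024)); // prints 54272
  return 0;
}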
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index eede52c..2bb8ad1 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1393,6 +1393,67 @@ namespace gbe
p->pop();
}
+ void ChvContext::emitStackPointer(void) {
+ using namespace ir;
+
+ // Only emit stack pointer computation if we use a stack
+ if (kernel->getStackSize() == 0)
+ return;
+
+ // Check that everything is consistent in the kernel code
+ const uint32_t perLaneSize = kernel->getStackSize();
+ GBE_ASSERT(perLaneSize > 0);
+
+ const GenRegister selStatckPtr = this->simdWidth == 8 ?
+ GenRegister::ud8grf(ir::ocl::stackptr) :
+ GenRegister::ud16grf(ir::ocl::stackptr);
+ const GenRegister stackptr = ra->genReg(selStatckPtr);
+ // borrow block ip as a temporary register as we will
+ // initialize block ip later.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+ loadLaneID(stackptr);
+
+ // We compute the per-lane stack pointer here
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
+ // let private address start from zero
+ //p->MOV(stackptr, GenRegister::immud(0));
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
+ p->curr.execWidth = this->simdWidth;
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+ p->curr.execWidth = 1;
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+ p->curr.execWidth = this->simdWidth;
+ p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+ if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+ const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+ GenRegister::ul8grf(ir::ocl::stackptr) :
+ GenRegister::ul16grf(ir::ocl::stackptr);
+ GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+ GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+ int simdWidth = p->curr.execWidth;
+ if (simdWidth == 16) {
+ // we need to do the second quarter first, because the dst type is QW
+ // while the src is DW. If we did the first quarter first, the 1st
+ // quarter's dst would overwrite the 2nd quarter's src.
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+ p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+ }
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(sp, stackptr);
+ p->MOV(stackptr2, sp);
+ }
+ p->pop();
+ }
+
/* Init value according to WORKGROUP OP
* Emit assert is invalid combination operation - datatype */
static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index d715cbc..6b75540 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -125,6 +125,7 @@ namespace gbe
virtual void newSelection(void);
virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+ virtual void emitStackPointer(void);
};
}
#endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 8976ede..483b2c3 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -22,6 +22,7 @@
#include "backend/gen9_context.hpp"
#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_program.hpp"
namespace gbe
{
@@ -170,6 +171,67 @@ namespace gbe
p->pop();
}
+ void BxtContext::emitStackPointer(void) {
+ using namespace ir;
+
+ // Only emit stack pointer computation if we use a stack
+ if (kernel->getStackSize() == 0)
+ return;
+
+ // Check that everything is consistent in the kernel code
+ const uint32_t perLaneSize = kernel->getStackSize();
+ GBE_ASSERT(perLaneSize > 0);
+
+ const GenRegister selStatckPtr = this->simdWidth == 8 ?
+ GenRegister::ud8grf(ir::ocl::stackptr) :
+ GenRegister::ud16grf(ir::ocl::stackptr);
+ const GenRegister stackptr = ra->genReg(selStatckPtr);
+ // borrow block ip as a temporary register as we will
+ // initialize block ip later.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+ loadLaneID(stackptr);
+
+ // We compute the per-lane stack pointer here
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
+ // let private address start from zero
+ //p->MOV(stackptr, GenRegister::immud(0));
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
+ p->curr.execWidth = this->simdWidth;
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+ p->curr.execWidth = 1;
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+ p->curr.execWidth = this->simdWidth;
+ p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+ if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+ const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+ GenRegister::ul8grf(ir::ocl::stackptr) :
+ GenRegister::ul16grf(ir::ocl::stackptr);
+ GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+ GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+ int simdWidth = p->curr.execWidth;
+ if (simdWidth == 16) {
+ // we need to do the second quarter first, because the dst type is QW
+ // while the src is DW. If we did the first quarter first, the 1st
+ // quarter's dst would overwrite the 2nd quarter's src.
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+ p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+ }
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(sp, stackptr);
+ p->MOV(stackptr2, sp);
+ }
+ p->pop();
+ }
+
void KblContext::newSelection(void) {
this->sel = GBE_NEW(SelectionKbl, *this);
}
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 2f24b56..9977e9a 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -67,6 +67,7 @@ namespace gbe
virtual void newSelection(void);
virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+ virtual void emitStackPointer(void);
};
/* This class is used to implement the kabylake
specific logic for context. */
--
2.7.4