[Beignet] [PATCH 09/10 OpenCL-2.0] Use forward message to handle work group functions.
junyan.he at inbox.com
junyan.he at inbox.com
Wed Apr 22 20:26:42 PDT 2015
From: Junyan He <junyan.he at linux.intel.com>
We will use forward message and n0.2 notification to
sync all threads.
Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
backend/src/backend/gen_context.cpp | 334 +++++++++++++++++++++++++++++++++++
backend/src/backend/gen_context.hpp | 1 +
2 files changed, 335 insertions(+)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index a9663d7..97c81f4 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1680,6 +1680,340 @@ namespace gbe
p->ATOMIC(dst, function, src, bti, insn.srcNum);
}
+ static void workgroupOpBetweenThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
+ uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+ p->SEL_CMP(GEN_CONDITIONAL_LE, msgData, threadData, msgData);
+ }
+
+ p->pop();
+ }
+
+ static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
+ uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ /* Setting the init value here. */
+ if (wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+ GenRegister::retype(threadData, theVal.type);
+ if (theVal.type == GEN_TYPE_UD) {
+ p->MOV(threadData, GenRegister::immud(0xFFFFFFFF));
+ }
+ }
+
+ if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+ // TODO: theVal may be scalar.
+ GenRegister v = theVal;
+ v.vstride = GEN_VERTICAL_STRIDE_0;
+ v.width = GEN_WIDTH_1;
+ v.hstride = GEN_HORIZONTAL_STRIDE_0;
+ for (uint32_t i = 0; i < simd; i++) {
+ p->SEL_CMP(GEN_CONDITIONAL_LE, threadData, threadData, v);
+ v.subnr += typeSize(theVal.type);
+ if (v.subnr == 32) {
+ v.subnr = 0;
+ v.nr++;
+ }
+ }
+ }
+
+ p->MOV(msgData, threadData);
+ p->pop();
+ }
+
+ void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ GenRegister nextThreadID = ra->genReg(insn.src(1));
+ const GenRegister theVal = ra->genReg(insn.src(0));
+ GenRegister threadid = ra->genReg(GenRegister::ud1grf(ir::ocl::threadid));
+ GenRegister msgData = GenRegister::retype(nextThreadID, dst.type); // The data forward.
+ msgData.vstride = GEN_VERTICAL_STRIDE_0;
+ msgData.width = GEN_WIDTH_1;
+ msgData.hstride = GEN_HORIZONTAL_STRIDE_0;
+ GenRegister threadData =
+ GenRegister::retype(GenRegister::offset(nextThreadID, 0, 24), dst.type); // Res within thread.
+ threadData.vstride = GEN_VERTICAL_STRIDE_0;
+ threadData.width = GEN_WIDTH_1;
+ threadData.hstride = GEN_HORIZONTAL_STRIDE_0;
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+ GenRegister flag_save = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 8), GEN_TYPE_UW);
+ flag_save.vstride = GEN_VERTICAL_STRIDE_0;
+ flag_save.width = GEN_WIDTH_1;
+ flag_save.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+ p->push(); { /* First, so something within thread. */
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ /* Do some calculation within each thread. */
+ workgroupOpInThread(msgData, theVal, threadData, simd, wg_op, p);
+ } p->pop();
+
+ p->push(); { /* We begin from threadid 0. */
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_EQ, threadid, GenRegister::immud(0x0));
+
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.inversePredicate = 1;
+ p->MOV(flag_save, GenRegister::immuw(0x0));
+ p->curr.inversePredicate = 0;
+ p->MOV(flag_save, GenRegister::immuw(0xffff));
+
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(flagReg, flag_save);
+ } p->pop();
+
+ p->push(); {
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.inversePredicate = 1;
+ p->IF(GenRegister::immuw(6)); /* Not the first thread, wait for msg first. */
+ p->WAIT(2);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ENDIF(GenRegister::immuw(2));
+ /* Do something when get the msg. */
+ workgroupOpBetweenThread(msgData, theVal, threadData, simd, wg_op, p);
+
+ /* Restore the flag. */
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(flagReg, flag_save);
+ } p->pop();
+
+ p->push(); { /* then send msg. */
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 1;
+ GenRegister offLen = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 20), GEN_TYPE_UD);
+ offLen.vstride = GEN_VERTICAL_STRIDE_0;
+ offLen.width = GEN_WIDTH_1;
+ offLen.hstride = GEN_HORIZONTAL_STRIDE_0;
+ uint32_t szEnc = typeSize(theVal.type) >> 1;
+ if (szEnc == 4) {
+ szEnc = 3;
+ }
+ p->MOV(offLen, GenRegister::immud((szEnc << 8) | (nextThreadID.nr << 21)));
+
+ GenRegister tidEuid = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 16), GEN_TYPE_UD);
+ tidEuid.vstride = GEN_VERTICAL_STRIDE_0;
+ tidEuid.width = GEN_WIDTH_1;
+ tidEuid.hstride = GEN_HORIZONTAL_STRIDE_0;
+ p->SHL(tidEuid, tidEuid, GenRegister::immud(16));
+
+ p->curr.execWidth = 8;
+ p->FWD_GATEWAY_MSG(nextThreadID, 2);
+ } p->pop();
+
+ p->push(); { /* If we are first thread, wait last one to notify us. */
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->IF(GenRegister::immuw(6));
+ p->WAIT(2);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ENDIF(GenRegister::immuw(2));
+ } p->pop();
+
+ /* Broadcast the result. */
+ if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+ p->push(); {
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.inversePredicate = 1;
+ p->IF(GenRegister::immuw(6)); /* Not the first thread, wait for msg first. */
+ p->WAIT(2);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ENDIF(GenRegister::immuw(2));
+ /* Do something when get the msg. */
+ p->curr.execWidth = simd;
+ p->MOV(dst, msgData);
+
+ p->curr.execWidth = 8;
+ p->FWD_GATEWAY_MSG(nextThreadID, 2);
+
+ p->curr.execWidth = 1;
+ p->curr.inversePredicate = 0;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->IF(GenRegister::immuw(6));
+ p->WAIT(2);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ENDIF(GenRegister::immuw(2));
+ } p->pop();
+ }
+ }
+
+#if 0
+ void GenContext::emitWGBroadcastInstruction(const SelectionInstruction &insn) {
+ GenRegister dimX, dimY, dimZ;
+ GenRegister lid0, lid1, lid2;
+ int dim = insn.srcNum - 2;
+ if (p->curr.execWidth == 16) {
+ lid0 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid0));
+ lid1 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid1));
+ lid2 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid2));
+ } else {
+ lid0 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid0));
+ lid1 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid1));
+ lid2 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid2));
+ }
+
+ p->push(); { /* First, is the specified LocalID belong to this thread ? */
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ dimX = ra->genReg(insn.src(1));
+ p->CMP(GEN_CONDITIONAL_EQ, dimX, lid0);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ if (dim >= 2) {
+ dimY = ra->genReg(insn.src(2));
+ p->CMP(GEN_CONDITIONAL_EQ, dimY, lid1);
+ }
+ if (dim == 3) {
+ dimZ = ra->genReg(insn.src(3));
+ p->CMP(GEN_CONDITIONAL_EQ, dimY, lid2);
+ }
+ } p->pop();
+
+ GenRegister res = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 8), GEN_TYPE_UW);
+ GenRegister fbl = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 4), GEN_TYPE_UD);
+ p->push(); {
+ res.vstride = GEN_VERTICAL_STRIDE_0;
+ res.width = GEN_WIDTH_1;
+ res.hstride = GEN_HORIZONTAL_STRIDE_0;
+ fbl.vstride = GEN_VERTICAL_STRIDE_0;
+ fbl.width = GEN_WIDTH_1;
+ fbl.hstride = GEN_HORIZONTAL_STRIDE_0;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(res, flagReg);
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ /* if (x == get_local_id(0) && y == get_local_id(1) && z == get_local_id(2)) in this thread. */
+ p->CMP(GEN_CONDITIONAL_NEQ, res, GenRegister::immuw(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.inversePredicate = 1;
+ p->MOV(GenRegister::retype(fbl, GEN_TYPE_UW), GenRegister::immuw(0x0));
+ p->curr.inversePredicate = 0;
+ p->MOV(GenRegister::retype(fbl, GEN_TYPE_UW), GenRegister::immuw(0xffff));
+
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(flagReg, GenRegister::retype(fbl, GEN_TYPE_UW));
+
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.inversePredicate = 1;
+ p->IF(GenRegister::immuw(6));
+ p->WAIT(2);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ENDIF(GenRegister::immuw(2));
+ } p->pop();
+
+
+ p->push(); { /* Fill all the workitems in the same thread with the value. */
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.execWidth = 1;
+ p->MOV(GenRegister::addr1(0), GenRegister::immuw(theVal.nr*32 + theVal.subnr));
+ if (theVal.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ p->FBL(fbl, res);
+ p->MUL(fbl, fbl, GenRegister::immud(typeSize(theVal.type)));
+ p->ADD(GenRegister::addr1(0), GenRegister::addr1(0), GenRegister::retype(fbl, GEN_TYPE_UW));
+ }
+
+ GenRegister v = GenRegister::to_indirectNx1(theVal, theVal.nr*32 + theVal.subnr, 0);
+ v.vstride = GEN_VERTICAL_STRIDE_0;
+ v.width = GEN_WIDTH_1;
+ v.hstride = GEN_HORIZONTAL_STRIDE_0;
+ if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->push();
+ p->curr.execWidth = 1;
+ p->MOV(dst, v);
+ p->pop();
+ } else {
+ if (simd == 16) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->MOV(dst, v);
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenRegister::Qn(dst, 1), v);
+ p->pop();
+ } else {
+ p->push();
+ p->curr.execWidth = 8;
+ p->MOV(dst, v);
+ p->pop();
+ }
+ }
+
+ GenRegister data = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 0), GEN_TYPE_UD);
+ data.vstride = GEN_VERTICAL_STRIDE_0;
+ data.width = GEN_WIDTH_1;
+ data.hstride = GEN_HORIZONTAL_STRIDE_0;
+ p->MOV(data, v);
+ } p->pop();
+
+ p->push(); { /* Fill all the workitems with the value from forward MSG. */
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.inversePredicate = 1;
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ GenRegister data = GenRegister::retype(nextThreadID, dst.type);
+ data.vstride = GEN_VERTICAL_STRIDE_0;
+ data.width = GEN_WIDTH_1;
+ data.hstride = GEN_HORIZONTAL_STRIDE_0;
+ p->MOV(dst, data);
+ } p->pop();
+
+ p->push(); {/* Then we forward the value to the other threads. */
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 1;
+ GenRegister offLen = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 20), GEN_TYPE_UD);
+ offLen.vstride = GEN_VERTICAL_STRIDE_0;
+ offLen.width = GEN_WIDTH_1;
+ offLen.hstride = GEN_HORIZONTAL_STRIDE_0;
+ uint32_t szEnc = typeSize(theVal.type) >> 1;
+ if (szEnc == 4) {
+ szEnc = 3;
+ }
+ p->MOV(offLen, GenRegister::immud((szEnc << 8) | (nextThreadID.nr << 21)));
+
+ GenRegister tidEuid = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 16), GEN_TYPE_UD);
+ tidEuid.vstride = GEN_VERTICAL_STRIDE_0;
+ tidEuid.width = GEN_WIDTH_1;
+ tidEuid.hstride = GEN_HORIZONTAL_STRIDE_0;
+ p->SHL(tidEuid, tidEuid, GenRegister::immud(16));
+
+ p->curr.execWidth = 8;
+ p->FWD_GATEWAY_MSG(nextThreadID, 2);
+ } p->pop();
+
+ p->push(); {
+ /* If we are first thread, wait last one to notify us. */
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->IF(GenRegister::immuw(6));
+ p->WAIT(2);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ENDIF(GenRegister::immuw(2));
+ } p->pop();
+ }
+#endif
+
void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
GenRegister src = ra->genReg(insn.src(0));
if(sel->isScalarReg(src.reg()))
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6ca88db..95d336e 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -167,6 +167,7 @@ namespace gbe
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
virtual void emitI64MULInstruction(const SelectionInstruction &insn);
virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+ void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
--
1.7.9.5
More information about the Beignet
mailing list