[Beignet] [PATCH 09/10 OpenCL-2.0] Use forward message to handle work group functions.

junyan.he at inbox.com junyan.he at inbox.com
Wed Apr 22 20:26:42 PDT 2015


From: Junyan He <junyan.he at linux.intel.com>

We will use forward message and n0.2 notification to
sync all threads.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp |  334 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen_context.hpp |    1 +
 2 files changed, 335 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index a9663d7..97c81f4 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1680,6 +1680,340 @@ namespace gbe
     p->ATOMIC(dst, function, src, bti, insn.srcNum);
   }
 
+  static void workgroupOpBetweenThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
+                                       uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+
+    if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+      p->SEL_CMP(GEN_CONDITIONAL_LE, msgData, threadData, msgData);
+    }
+
+    p->pop();
+  }
+
+  static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
+                                  uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+
+    /* Setting the init value here. */
+    if (wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+      GenRegister::retype(threadData, theVal.type);
+      if (theVal.type == GEN_TYPE_UD) {
+        p->MOV(threadData, GenRegister::immud(0xFFFFFFFF));
+      }
+    }
+
+    if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+      // TODO: theVal may be scalar.
+      GenRegister v = theVal;
+      v.vstride = GEN_VERTICAL_STRIDE_0;
+      v.width = GEN_WIDTH_1;
+      v.hstride = GEN_HORIZONTAL_STRIDE_0;
+      for (uint32_t i = 0; i < simd; i++) {
+        p->SEL_CMP(GEN_CONDITIONAL_LE, threadData, threadData, v);
+        v.subnr += typeSize(theVal.type);
+        if (v.subnr == 32) {
+          v.subnr = 0;
+          v.nr++;
+        }
+      }
+    }
+
+    p->MOV(msgData, threadData);
+    p->pop();
+  }
+
+  void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    GenRegister nextThreadID = ra->genReg(insn.src(1));
+    const GenRegister theVal = ra->genReg(insn.src(0));
+    GenRegister threadid = ra->genReg(GenRegister::ud1grf(ir::ocl::threadid));
+    GenRegister msgData = GenRegister::retype(nextThreadID, dst.type); // The data forward.
+    msgData.vstride = GEN_VERTICAL_STRIDE_0;
+    msgData.width = GEN_WIDTH_1;
+    msgData.hstride = GEN_HORIZONTAL_STRIDE_0;
+    GenRegister threadData =
+      GenRegister::retype(GenRegister::offset(nextThreadID, 0, 24), dst.type); // Res within thread.
+    threadData.vstride = GEN_VERTICAL_STRIDE_0;
+    threadData.width = GEN_WIDTH_1;
+    threadData.hstride = GEN_HORIZONTAL_STRIDE_0;
+    uint32_t wg_op = insn.extra.workgroupOp;
+    uint32_t simd = p->curr.execWidth;
+    GenRegister flag_save = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 8), GEN_TYPE_UW);
+    flag_save.vstride = GEN_VERTICAL_STRIDE_0;
+    flag_save.width = GEN_WIDTH_1;
+    flag_save.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+    p->push(); { /* First, so something within thread. */
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      /* Do some calculation within each thread. */
+      workgroupOpInThread(msgData, theVal, threadData, simd, wg_op, p);
+    } p->pop();
+
+    p->push(); { /* We begin from threadid 0. */
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.execWidth = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_EQ, threadid, GenRegister::immud(0x0));
+
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.inversePredicate = 1;
+      p->MOV(flag_save, GenRegister::immuw(0x0));
+      p->curr.inversePredicate = 0;
+      p->MOV(flag_save, GenRegister::immuw(0xffff));
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(flagReg, flag_save);
+    } p->pop();
+
+    p->push(); {
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.noMask = 1;
+      p->curr.execWidth = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.inversePredicate = 1;
+      p->IF(GenRegister::immuw(6)); /* Not the first thread, wait for msg first. */
+      p->WAIT(2);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->ENDIF(GenRegister::immuw(2));
+      /* Do something when get the msg. */
+      workgroupOpBetweenThread(msgData, theVal, threadData, simd, wg_op, p);
+
+      /* Restore the flag. */
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(flagReg, flag_save);
+    } p->pop();
+
+    p->push(); { /* then send msg. */
+      p->curr.noMask = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.execWidth = 1;
+      GenRegister offLen = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 20), GEN_TYPE_UD);
+      offLen.vstride = GEN_VERTICAL_STRIDE_0;
+      offLen.width = GEN_WIDTH_1;
+      offLen.hstride = GEN_HORIZONTAL_STRIDE_0;
+      uint32_t szEnc = typeSize(theVal.type) >> 1;
+      if (szEnc == 4) {
+        szEnc = 3;
+      }
+      p->MOV(offLen, GenRegister::immud((szEnc << 8) | (nextThreadID.nr << 21)));
+
+      GenRegister tidEuid = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 16), GEN_TYPE_UD);
+      tidEuid.vstride = GEN_VERTICAL_STRIDE_0;
+      tidEuid.width = GEN_WIDTH_1;
+      tidEuid.hstride = GEN_HORIZONTAL_STRIDE_0;
+      p->SHL(tidEuid, tidEuid, GenRegister::immud(16));
+
+      p->curr.execWidth = 8;
+      p->FWD_GATEWAY_MSG(nextThreadID, 2);
+    } p->pop();
+
+    p->push(); { /* If we are first thread, wait last one to notify us. */
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->IF(GenRegister::immuw(6));
+      p->WAIT(2);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->ENDIF(GenRegister::immuw(2));
+    } p->pop();
+
+    /* Broadcast the result. */
+    if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) {
+      p->push(); {
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.noMask = 1;
+        p->curr.execWidth = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->curr.inversePredicate = 1;
+        p->IF(GenRegister::immuw(6)); /* Not the first thread, wait for msg first. */
+        p->WAIT(2);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->ENDIF(GenRegister::immuw(2));
+        /* Do something when get the msg. */
+        p->curr.execWidth = simd;
+        p->MOV(dst, msgData);
+
+        p->curr.execWidth = 8;
+        p->FWD_GATEWAY_MSG(nextThreadID, 2);
+
+        p->curr.execWidth = 1;
+        p->curr.inversePredicate = 0;
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->IF(GenRegister::immuw(6));
+        p->WAIT(2);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->ENDIF(GenRegister::immuw(2));
+      } p->pop();
+    }
+  }
+
+#if 0
+  void GenContext::emitWGBroadcastInstruction(const SelectionInstruction &insn) {
+    GenRegister dimX, dimY, dimZ;
+    GenRegister lid0, lid1, lid2;
+    int dim = insn.srcNum - 2;
+    if (p->curr.execWidth == 16) {
+      lid0 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid0));
+      lid1 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid1));
+      lid2 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid2));
+    } else {
+      lid0 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid0));
+      lid1 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid1));
+      lid2 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid2));
+    }
+
+    p->push(); { /* First, is the specified LocalID belong to this thread ? */
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      dimX = ra->genReg(insn.src(1));
+      p->CMP(GEN_CONDITIONAL_EQ, dimX, lid0);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      if (dim >= 2) {
+        dimY = ra->genReg(insn.src(2));
+        p->CMP(GEN_CONDITIONAL_EQ, dimY, lid1);
+      }
+      if (dim == 3) {
+        dimZ = ra->genReg(insn.src(3));
+        p->CMP(GEN_CONDITIONAL_EQ, dimY, lid2);
+      }
+    } p->pop();
+
+    GenRegister res = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 8), GEN_TYPE_UW);
+    GenRegister fbl = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 4), GEN_TYPE_UD);
+    p->push(); {
+      res.vstride = GEN_VERTICAL_STRIDE_0;
+      res.width = GEN_WIDTH_1;
+      res.hstride = GEN_HORIZONTAL_STRIDE_0;
+      fbl.vstride = GEN_VERTICAL_STRIDE_0;
+      fbl.width = GEN_WIDTH_1;
+      fbl.hstride = GEN_HORIZONTAL_STRIDE_0;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.execWidth = 1;
+      p->MOV(res, flagReg);
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      /* if (x == get_local_id(0) && y == get_local_id(1) && z == get_local_id(2)) in this thread. */
+      p->CMP(GEN_CONDITIONAL_NEQ, res, GenRegister::immuw(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.inversePredicate = 1;
+      p->MOV(GenRegister::retype(fbl, GEN_TYPE_UW), GenRegister::immuw(0x0));
+      p->curr.inversePredicate = 0;
+      p->MOV(GenRegister::retype(fbl, GEN_TYPE_UW), GenRegister::immuw(0xffff));
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(flagReg, GenRegister::retype(fbl, GEN_TYPE_UW));
+
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.inversePredicate = 1;
+      p->IF(GenRegister::immuw(6));
+      p->WAIT(2);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->ENDIF(GenRegister::immuw(2));
+    } p->pop();
+
+
+    p->push(); {  /* Fill all the workitems in the same thread with the value. */
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.noMask = 1;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.execWidth = 1;
+      p->MOV(GenRegister::addr1(0), GenRegister::immuw(theVal.nr*32 + theVal.subnr));
+      if (theVal.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        p->FBL(fbl, res);
+        p->MUL(fbl, fbl, GenRegister::immud(typeSize(theVal.type)));
+        p->ADD(GenRegister::addr1(0), GenRegister::addr1(0), GenRegister::retype(fbl, GEN_TYPE_UW));
+      }
+
+      GenRegister v = GenRegister::to_indirectNx1(theVal, theVal.nr*32 + theVal.subnr, 0);
+      v.vstride = GEN_VERTICAL_STRIDE_0;
+      v.width = GEN_WIDTH_1;
+      v.hstride = GEN_HORIZONTAL_STRIDE_0;
+      if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
+        p->push();
+        p->curr.execWidth = 1;
+        p->MOV(dst, v);
+        p->pop();
+      } else {
+        if (simd == 16) {
+          p->push();
+          p->curr.execWidth = 8;
+          p->MOV(dst, v);
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(dst, 1), v);
+          p->pop();
+        } else {
+          p->push();
+          p->curr.execWidth = 8;
+          p->MOV(dst, v);
+          p->pop();
+        }
+      }
+
+      GenRegister data = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 0), GEN_TYPE_UD);
+      data.vstride = GEN_VERTICAL_STRIDE_0;
+      data.width = GEN_WIDTH_1;
+      data.hstride = GEN_HORIZONTAL_STRIDE_0;
+      p->MOV(data, v);
+    } p->pop();
+
+    p->push(); {  /* Fill all the workitems with the value from forward MSG. */
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.inversePredicate = 1;
+      p->curr.noMask = 1;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      GenRegister data = GenRegister::retype(nextThreadID, dst.type);
+      data.vstride = GEN_VERTICAL_STRIDE_0;
+      data.width = GEN_WIDTH_1;
+      data.hstride = GEN_HORIZONTAL_STRIDE_0;
+      p->MOV(dst, data);
+    } p->pop();
+
+    p->push(); {/* Then we forward the value to the other threads. */
+      p->curr.noMask = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.execWidth = 1;
+      GenRegister offLen = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 20), GEN_TYPE_UD);
+      offLen.vstride = GEN_VERTICAL_STRIDE_0;
+      offLen.width = GEN_WIDTH_1;
+      offLen.hstride = GEN_HORIZONTAL_STRIDE_0;
+      uint32_t szEnc = typeSize(theVal.type) >> 1;
+      if (szEnc == 4) {
+        szEnc = 3;
+      }
+      p->MOV(offLen, GenRegister::immud((szEnc << 8) | (nextThreadID.nr << 21)));
+
+      GenRegister tidEuid = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 16), GEN_TYPE_UD);
+      tidEuid.vstride = GEN_VERTICAL_STRIDE_0;
+      tidEuid.width = GEN_WIDTH_1;
+      tidEuid.hstride = GEN_HORIZONTAL_STRIDE_0;
+      p->SHL(tidEuid, tidEuid, GenRegister::immud(16));
+
+      p->curr.execWidth = 8;
+      p->FWD_GATEWAY_MSG(nextThreadID, 2);
+    } p->pop();
+
+    p->push(); {
+      /* If we are first thread, wait last one to notify us. */
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->IF(GenRegister::immuw(6));
+      p->WAIT(2);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->ENDIF(GenRegister::immuw(2));
+    } p->pop();
+  }
+#endif
+
   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
     GenRegister src = ra->genReg(insn.src(0));
     if(sel->isScalarReg(src.reg()))
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6ca88db..95d336e 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -167,6 +167,7 @@ namespace gbe
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
     virtual void emitI64MULInstruction(const SelectionInstruction &insn);
     virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+    void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
 
-- 
1.7.9.5





More information about the Beignet mailing list