[Beignet] [PATCH] Experimental workgroup reduce add thread communication
Grigore Lupescu
grigore.lupescu at intel.com
Mon Jan 11 08:57:33 PST 2016
RESULTS (discussion and explanations in the code comments below):
A. thread communication (master)                    => 1.608 Msum/S, 408 ms
B. thread communication (master) + add4             => 1.730 Msum/S, 378 ms
C. (this patch) SLM/barrier + add4, 8 reads/thread  => 2.495 Msum/S, ~262 ms
D. (this patch) SLM/barrier + add4, 4 reads/thread  => 3.813 Msum/S
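The scheme in this patch: each thread first reduces its own slice of workitems
to a partial sum (using 4-wide ADDs), writes that partial sum to SLM at offset
threadID*4, and after a barrier every thread reads the partial sums back from
SLM and accumulates the final result on its own, so no thread-to-thread message
passing is needed. A rough host-side C++ model of that scheme (illustrative
only, not the generated GEN code; the per-thread slice layout is assumed, and
the real code currently reads a fixed 8 SLM slots per thread):

#include <numeric>
#include <vector>

/* Host-side model of the SLM/barrier reduce-add scheme:
 * phase 1: each "thread" reduces its slice and stores the partial sum
 *          at slm[threadID] (threadID * 4 bytes in the real SLM);
 * barrier: all partial sums become visible to every thread;
 * phase 2: each thread reads every partial sum and accumulates the
 *          final result independently, without further communication. */
static std::vector<float>
wgReduceAddModel(const std::vector<std::vector<float>> &slices) {
  const size_t threadNum = slices.size();
  std::vector<float> slm(threadNum, 0.0f);

  /* phase 1: per-thread partial sums written to "SLM" */
  for (size_t tid = 0; tid < threadNum; tid++)
    slm[tid] = std::accumulate(slices[tid].begin(), slices[tid].end(), 0.0f);

  /* ---- barrier ---- */

  /* phase 2: every thread re-reads all partial sums and adds them up */
  std::vector<float> result(threadNum, 0.0f);
  for (size_t tid = 0; tid < threadNum; tid++)
    for (size_t other = 0; other < threadNum; other++)
      result[tid] += slm[other];

  return result;
}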
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/backend/gen_context.cpp | 39 +++++++++----
backend/src/backend/gen_insn_selection.cpp | 90 ++++++++++++++++++++++++++++--
2 files changed, 112 insertions(+), 17 deletions(-)
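For the "add4" part (the WORKGROUP_OP_REDUCE_ADD branch in gen_context.cpp
below): the per-thread accumulator is folded in two phases, first with 4-wide
vector ADDs that collapse the SIMD lanes into 4 partial sums, then with 4
scalar ADDs into threadData. A scalar C++ equivalent of that pattern
(illustrative only; register regions and strides are omitted, simd is assumed
to be a multiple of 4):

#include <cstdint>

/* Scalar model of the two-phase "add4" fold over one thread's lanes. */
static float reduceLanesAdd4(const float *lanes, uint32_t simd) {
  /* phase 1: 4-wide ADDs fold each group of 4 lanes onto the first group */
  float partial[4] = { lanes[0], lanes[1], lanes[2], lanes[3] };
  for (uint32_t i = 1; i < simd / 4; i++)
    for (uint32_t j = 0; j < 4; j++)  /* one 4-wide ADD in the real code */
      partial[j] += lanes[i * 4 + j];

  /* phase 2: 4 scalar ADDs accumulate the partial sums into threadData */
  float threadData = 0.0f;
  for (uint32_t j = 0; j < 4; j++)
    threadData += partial[j];
  return threadData;
}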
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0ea0dd0..3fcc8ce 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2943,21 +2943,32 @@ namespace gbe
}
}
}
- } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) {
+ }
+ else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) {
+ tmp.hstride = GEN_HORIZONTAL_STRIDE_1;
+ tmp.vstride = GEN_VERTICAL_STRIDE_4;
+ tmp.width = GEN_WIDTH_4;
+
GBE_ASSERT(tmp.type == theVal.type);
- GenRegister v = GenRegister::toUniform(tmp, theVal.type);
- for (uint32_t i = 0; i < simd; i++) {
- p->ADD(threadData, threadData, v);
- v.subnr += typeSize(theVal.type);
- if (v.subnr == 32) {
- v.subnr = 0;
- v.nr++;
- }
+ GenRegister partialSum = tmp;
+
+ /* fold the SIMD lanes: 4-wide ADDs into 4 partial sums, then scalar ADDs */
+ for (uint32_t i = 1; i < simd/4; i++) {
+ tmp = GenRegister::suboffset(tmp, 4);
+ p->push();
+ p->curr.execWidth = 4;
+ p->ADD(partialSum, partialSum, tmp);
+ p->pop();
}
- }
+ for (uint32_t i = 0; i < 4; i++) {
+ partialSum.width = GEN_WIDTH_1;
+ p->ADD(threadData, threadData, partialSum);
+ partialSum = GenRegister::suboffset(partialSum, 1);
+ }
+ }
p->pop();
- }
+ }
#define SEND_RESULT_MSG() \
do { \
@@ -3028,6 +3039,8 @@ do { \
workgroupOpInThread(msgData, theVal, threadData, tmp, simd, wg_op, p);
} p->pop();
+/* thread-communication code below deactivated (breaks the other OPs) - EXPERIMENTAL, REDUCE_ADD only */
+#if 0
/* If we are the only one thread, no need to send msg, just broadcast the result.*/
p->push(); {
p->curr.predicate = GEN_PREDICATE_NONE;
@@ -3123,7 +3136,6 @@ do { \
p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT(2);
p->patchJMPI(jip, (p->n_instruction() - jip), 0);
-
/* Do something when get the msg. */
p->curr.execWidth = simd;
p->MOV(dst, msgData);
@@ -3143,6 +3155,9 @@ do { \
p->patchJMPI(jip, (p->n_instruction() - jip), 0);
} p->pop();
}
+#endif
+
+ p->MOV(dst, threadData);
if (oneThreadJip >=0)
p->patchJMPI(oneThreadJip, (p->n_instruction() - oneThreadJip), 0);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 001a3c5..3fa03a3 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6042,9 +6042,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.curr.noMask = 1;
sel.curr.execWidth = 8;
- sel.MOV(tmp, sr0_0);
+ sel.MOV(tmp, sel.selReg(ocl::threadid, ir::TYPE_U32));
- sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(2));
+ sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(4));
sel.ADD(addr, addr, GenRegister::immud(slmAddr));
sel.push(); {
@@ -6086,7 +6086,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.curr.execWidth = 8;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
- sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(1));
+ sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(0));
/* Wrap the next thread id. */
sel.push(); {
@@ -6192,7 +6192,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
if (workGroupOp == WORKGROUP_OP_BROADCAST) {
return emitWGBroadcast(sel, insn);
- } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {
+ } else if (workGroupOp >= WORKGROUP_OP_REDUCE_MIN && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {
const uint32_t slmAddr = insn.getSlmAddr();
/* First, we create the TheadID/localID map, in order to get which thread hold the next 16 workitems. */
@@ -6223,7 +6223,87 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.curr.subFlag = 1;
sel.WORKGROUP_OP(workGroupOp, dst, src, nextThreadID, threadID, threadNum, tmp);
} sel.pop();
- } else {
+ }
+ else if (workGroupOp == WORKGROUP_OP_REDUCE_ADD) { /* EXPERIMENTAL */
+
+ const Type type = insn.getType();
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+ const uint32_t srcNum = insn.getSrcNum();
+
+ GBE_ASSERT(srcNum == 3);
+ GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
+ GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
+
+ GenRegister threadID = sel.selReg(ocl::threadid, ir::TYPE_U32);
+ GenRegister threadNum = sel.selReg(ocl::threadn, ir::TYPE_U32);
+ GenRegister tmp = GenRegister::retype(
+ sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ GenRegister src = sel.selReg(insn.getSrc(2), type);
+
+ GenRegister nextThreadID = sel.selReg(sel.reg(FAMILY_WORD), type);
+ GenRegister result = sel.selReg(sel.reg(FAMILY_WORD), type);
+ GenRegister addr = GenRegister::retype(
+ sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32), GEN_TYPE_UD);
+ GenRegister data_in = sel.selReg(sel.reg(FAMILY_WORD), type);
+
+ vector<GenRegister> fakeTemps;
+ fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_WORD), type));
+ fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_WORD), type));
+
+ const uint32_t slmAddr = insn.getSlmAddr();
+
+ /* reduce this thread's individual slice of workitems (e.g. workitems 0..15) */
+ sel.WORKGROUP_OP(workGroupOp, result, src,
+ nextThreadID, threadID, threadNum, tmp);
+
+ /* write the partial result to SLM at an offset based on threadID */
+ sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32),
+ GenRegister::immud(4));
+ sel.ADD(addr, addr, GenRegister::immud(slmAddr));
+
+ /****** TODO (1) OPTIMIZE STORAGE - maybe the data can be stored
+ * more efficiently or more compactly?
+ */
+ sel.UNTYPED_WRITE(addr,
+ &result, 1, GenRegister::immw(0xfe), fakeTemps);
+
+ /* barrier: wait until all threads have computed and written their results */
+ /****** TODO (2) OPTIMIZE BARRIER - not sure all flags are required.
+ * Are there other ways to ensure the data has been written to SLM? */
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+ /* read data from SLM and compute */
+ sel.MOV(addr, GenRegister::immud(slmAddr));
+ sel.MOV(dst, GenRegister::immud(0));
+
+ /****** TODO (3) OPTIMIZE SLM - the SLM read below is inefficient:
+ * each thread, regardless of the others, reads exactly 8 results
+ * from SLM - a high performance penalty. Possible solutions:
+ * 1. Use threadnum and IF conditions. What is the penalty of the IF?
+ * 2. Read multiple elements per UNTYPED_READ? Sync the reading threads?
+ * 3. Use TYPED_READ? Other methods to read SLM?
+ *
+ * RESULTS
+ * A. thread communication (master) => 1.608 Msum/S, 408 ms
+ * B. thread communication (master) + add4 => 1.730 Msum/S, 378 ms
+ * C. (this patch) SLM/barrier + add4, 8 reads/thread => 2.495 Msum/S, ~262 ms
+ * D. (this patch) SLM/barrier + add4, 4 reads/thread => 3.813 Msum/S
+ *
+ * DEFAULT implementation below: 8 results read from SLM per thread.
+ * NOTE: after the SLM read each thread holds its own copy of the
+ * partial results computed by the other threads, so every thread can
+ * compute the final result in parallel without further communication.
+ */
+ for (int i = 0; i < 8; i++) {
+ sel.UNTYPED_READ(addr,
+ &data_in, 1, GenRegister::immw(0xFE), fakeTemps);
+ sel.ADD(addr, addr, GenRegister::immud(4));
+ sel.ADD(dst, dst, data_in);
+ }
+ }
+ else {
GBE_ASSERT(0);
}
--
2.1.4