[Beignet] [PATCH] Improved SLM access in workgroup reduce

Grigore Lupescu grigore.lupescu at intel.com
Tue Feb 16 14:33:11 UTC 2016


Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 backend/src/backend/gen_context.cpp        | 64 ++++++++++++++++++++----------
 backend/src/backend/gen_insn_selection.cpp | 33 +++++++++++----
 backend/src/llvm/llvm_gen_backend.cpp      | 13 +++++-
 3 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 601a4ff..4b57634 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3000,8 +3000,8 @@ do { \
  * allocated work-items. SIMD16=> 16 work-items allocated for each thread
  * 2. Each thread will write the computed reduce OP result in SLM memory
  * based on the threadId
- * 3. After a memory fence for sync each thread will read the SLM memory based
- * on a loop using the thread number value (threadN)
+ * 3. After a memory fence, each thread will read the SLM region in chunks
+ * of 4 elements, using a loop based on the thread count value (threadN)
  * 4. At the end each thread has the final value computed individually
  */
   void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
@@ -3024,18 +3024,17 @@ do { \
     GenRegister result = GenRegister::offset(dst, 0, 16);
     result = GenRegister::toUniform(result, dst.type);
 
-    /* FIXME need continuous GRF allocation */
-    GenRegister msg = GenRegister::retype(ra->genReg(insn.src(4)), dst.type);
-    GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(5)), GEN_TYPE_UD);
+    /* Use the contiguous GRF allocation set up by insn selection */
+    GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
+    GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
     GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
     GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
 
-    GenRegister threadN = GenRegister::retype(tmp, GEN_TYPE_UD);
-
-    /* Do some calculation within each thread. */
+    /* Do some calculation within each thread */
     workgroupOpInThread(msg, theVal, threadData, tmp, simd, wg_op, p);
 
-    p->curr.execWidth = 16;
+    /* Store thread count for future use on read/write to SLM */
+    GenRegister threadN = GenRegister::retype(tmp, GEN_TYPE_UD);
     p->MOV(threadN, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
 
     /* All threads write the partial results to SLM memory */
@@ -3045,29 +3044,54 @@ do { \
     p->ADD(msgAddr, msgAddr, msgSlmOff);
     p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
 
-    /* Wait for write to complete in workgroup */
-    p->FENCE(tmp);
-
-    /* Initialiaze partialData register, it will hold the final result */
+    /* Init partialData register, it will hold the final result */
     initValue(p, partialData, wg_op);
 
-    /* Perform a loop based on thread count */
+    /* Thread 0 will write extra elements for future reads in chunks of 4 */
+    p->push();{
+      p->curr.noMask = 1;
+      p->curr.flag = 0;
+      p->curr.subFlag = 1;
+      p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.execWidth = 8;
+      p->MOV(msgData.offset(msgData, 0), partialData);
+      p->MOV(msgData.offset(msgData, 1), partialData);
+      p->MOV(msgData.offset(msgData, 2), partialData);
+      p->MUL(msgAddr, threadN, GenRegister::immd(0x4));
+      p->ADD(msgAddr, msgAddr, msgSlmOff);
+      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 3);
+    } p->pop();
+
+    /* Round threadN up to the nearest multiple of 4, as required for
+     * reading in chunks of 4 elements from SLM */
+    p->ADD(threadN, threadN, GenRegister::immd(0x3));
+    p->SHR(threadN, threadN, GenRegister::immd(0x2));
+    p->SHL(threadN, threadN, GenRegister::immd(0x2));
+
+    /* Wait for all writes to complete in work-group */
+    p->FENCE(tmp);
+
+    /* Perform a loop, based on thread count (which is now a multiple of 4) */
     p->push();{
       jip0 = p->n_instruction();
 
       p->curr.execWidth = 8;
       p->curr.predicate = GEN_PREDICATE_NONE;
 
-      /* TODO can be optimized further with larger SLM reads (4 elem) */
-      p->ADD(threadN, threadN, GenRegister::immd(-1));
+      /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+      p->ADD(threadN, threadN, GenRegister::immd(-4));
       p->MUL(msgAddr, threadN, GenRegister::immd(0x4));
       p->ADD(msgAddr, msgAddr, msgSlmOff);
-      p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+      p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 4);
 
-      /* Perform OP after read, partialData will hold the result */
-      workgroupOpBetweenThread(partialData, msgData, wg_op, p);
+      /* Perform operation, process 4 elements, partialData will hold result */
+      workgroupOpBetweenThread(partialData, msgData.offset(msgData, 0), wg_op, p);
+      workgroupOpBetweenThread(partialData, msgData.offset(msgData, 1), wg_op, p);
+      workgroupOpBetweenThread(partialData, msgData.offset(msgData, 2), wg_op, p);
+      workgroupOpBetweenThread(partialData, msgData.offset(msgData, 3), wg_op, p);
 
-      /* While threadN is not 0, redo cycle of read SLM / update value */
+      /* While threadN is not 0, cycle read SLM / update value */
       p->curr.noMask = 1;
       p->curr.flag = 0;
       p->curr.subFlag = 1;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 0ad3f33..b6313ff 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -687,7 +687,8 @@ namespace gbe
     /*! Work Group Operations */
     void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister data,
                       GenRegister threadId, GenRegister threadN,
-                      GenRegister tmp, GenRegister msg, GenRegister slmOff);
+                      GenRegister tmp, GenRegister slmOff, vector<GenRegister> msg,
+                      uint32_t msgSizeReq);
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
        as much as possible. */
@@ -1913,19 +1914,32 @@ namespace gbe
                                        GenRegister threadId,
                                        GenRegister threadN,
                                        GenRegister tmp,
-                                       GenRegister msg,
-                                       GenRegister slmOff) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2, 6);
+                                       GenRegister slmOff,
+                                       vector<GenRegister> msg,
+                                       uint32_t msgSizeReq = 6)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 5);
+    SelectionVector *vector = this->appendVector();
+
+    /* allocate contiguous GRF registers for READ/WRITE to SLM */
+    GBE_ASSERT(msg.size() >= msgSizeReq);
+    vector->regNum = msg.size();
+    vector->offsetID = 0;
+    vector->reg = &insn->dst(2);
+    vector->isSrc = 0;
+
     insn->extra.workgroupOp = wg_op;
+
     insn->dst(0) = dst;
     insn->dst(1) = tmp;
+    for(uint32_t i = 0; i < msg.size(); i++)
+      insn->dst(2 + i) = msg[i];
 
     insn->src(0) = src;
     insn->src(1) = data;
     insn->src(2) = threadId;
     insn->src(3) = threadN;
-    insn->src(4) = msg;
-    insn->src(5) = slmOff;
+    insn->src(4) = slmOff;
   }
 
   // Boiler plate to initialize the selection library at c++ pre-main
@@ -6146,13 +6160,16 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
       GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), type);
       GenRegister data = sel.selReg(sel.reg(FAMILY_WORD), type);
-      GenRegister msg = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
       GenRegister slmOff = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
 
+      vector<GenRegister> msg;
+      for(uint32_t i = 0; i < 6; i++)
+        msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+
       /* compute individual slice of workitems, (e.g. 0->16 workitems) */
       sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
       sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId,
-                       threadN, tmp, msg, slmOff);
+                       threadN, tmp, slmOff, msg);
       return true;
     }
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 886b6f3..c04d423 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3817,13 +3817,22 @@ namespace gbe
     }
 
     if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_REDUCE_ADD && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
-      /* Because we can not know the thread ID and the EUID for every physical
+      /* 1. For thread SLM based communication (default):
+       * Threads will use SLM to write partial results computed individually
+         and then read the whole set. Because the read is done in chunks of 4,
+         extra padding is required.
+
+       * 2. For thread message based communication:
+       * Because we can not know the thread ID and the EUID for every physical
          thead which the work items execute on before the run time. We need to
          sync the thread execution order when using work group functions. We
          create the workitems/threadID map table in slm.
+
          When we come to here, the global thread local vars should have all been
          allocated, so it's safe for us to steal a piece of SLM for this usage. */
-      uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one subslice.
+
+      // at most 64 threads for one subslice, plus 4 padding entries for reads in chunks of 4
+      uint32_t mapSize = sizeof(uint32_t) * (64 + 4);
       f.setUseSLM(true);
       uint32_t oldSlm = f.getSLMSize();
       f.setSLMSize(oldSlm + mapSize);
-- 
2.5.0



More information about the Beignet mailing list