[Beignet] [PATCH V2 08/17] Backend: Code refactoring, additional comments on implementation

Grigore Lupescu grigore.lupescu at intel.com
Mon Apr 11 14:37:58 UTC 2016


From: Grigore Lupescu <grigore.lupescu at intel.com>
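
The workgroup ops handled by this code (ALL, ANY, REDUCE, SCAN
INCLUSIVE/EXCLUSIVE, BROADCAST) correspond to the OpenCL 2.0 work-group
built-ins. As an illustration only (not part of this patch, and assuming a
CL2.0-capable toolchain), a minimal kernel exercising a few of them could
look like:

  /* illustrative kernel, build with -cl-std=CL2.0 */
  kernel void wg_ops_example(global const int *in, global int *out)
  {
    const size_t gid = get_global_id(0);
    const int v = in[gid];

    /* REDUCE: every work-item gets the sum over its work-group */
    const int sum = work_group_reduce_add(v);

    /* SCAN INCLUSIVE: work-item i gets the running minimum up to i */
    const int run_min = work_group_scan_inclusive_min(v);

    /* BROADCAST: every work-item gets the value held by local id 0 */
    const int first = work_group_broadcast(v, 0);

    out[gid] = sum + run_min + first;
  }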

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 backend/src/backend/gen_context.cpp        | 222 ++++++++++++++++-------------
 backend/src/backend/gen_insn_selection.cpp | 101 +++++++------
 2 files changed, 177 insertions(+), 146 deletions(-)
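
Reviewer note (kept below the '---' so it stays out of the commit message):
a rough, host-side C sketch of the two-level scheme the new comment in
gen_context.cpp describes, shown for the REDUCE_ADD case. Names such as
slm, threadId and THREADN are placeholders for this sketch; the real code
operates on Gen registers and SEND messages to shared local memory.

  #include <stdio.h>

  #define SIMD    16   /* work-items handled per hardware thread */
  #define THREADN  8   /* hardware threads per work-group        */

  int main(void)
  {
    int input[THREADN][SIMD];
    int slm[THREADN];                 /* one partial result per thread */

    for (int t = 0; t < THREADN; t++)
      for (int l = 0; l < SIMD; l++)
        input[t][l] = t * SIMD + l;

    /* steps 1-2: each thread folds its own lanes and writes the partial
       result into the slot indexed by its thread id */
    for (int t = 0; t < THREADN; t++) {
      int partial = 0;                /* init value for ADD */
      for (int l = 0; l < SIMD; l++)
        partial += input[t][l];
      slm[t] = partial;
    }

    /* (barrier) */

    /* steps 3-4: every thread reads the partials back in chunks of 4
       and folds them into its own final value */
    for (int t = 0; t < THREADN; t++) {
      int result = 0;
      for (int i = 0; i < THREADN; i += 4)
        for (int k = 0; k < 4 && i + k < THREADN; k++)
          result += slm[i + k];
      printf("thread %d final = %d\n", t, result);
    }
    return 0;
  }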

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 38bd0f2..6219b6c 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2351,45 +2351,9 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
-  static void workgroupOp(GenRegister dst,
-                         GenRegister src1,
-                         GenRegister src2,
-                         uint32_t wg_op,
-                         GenEncoder *p)
-  {
-    // REDUCE
-    if (wg_op == ir::WORKGROUP_OP_ANY)
-      p->OR(dst, src1, src2);
-    else if (wg_op == ir::WORKGROUP_OP_ALL)
-      p->AND(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
-      p->ADD(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
-      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
-      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
-    // INCLUSIVE
-    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
-      p->ADD(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
-      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
-      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
-    // EXCLUSIVE
-    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
-      p->ADD(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
-      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
-    else
-      GBE_ASSERT(0);
-  }
-
-  static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+  /* Init value according to WORKGROUP OP
+   * Emit assert if the combination of operation and datatype is invalid */
+  static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
   {
 
     if (wg_op == ir::WORKGROUP_OP_ALL)
@@ -2464,8 +2428,53 @@ namespace gbe
       GBE_ASSERT(0);
   }
 
-  static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData,
-                                   GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+  /* Perform WORKGROUP OP on 2 input elements (registers) */
+  static void wgOpPerform(GenRegister dst,
+                         GenRegister src1,
+                         GenRegister src2,
+                         uint32_t wg_op,
+                         GenEncoder *p)
+  {
+    /* perform OP REDUCE on 2 elements */
+    if (wg_op == ir::WORKGROUP_OP_ANY)
+      p->OR(dst, src1, src2);
+    else if (wg_op == ir::WORKGROUP_OP_ALL)
+      p->AND(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    /* perform OP SCAN INCLUSIVE on 2 elements */
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    /* perform OP SCAN EXCLUSIVE on 2 elements */
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    else
+      GBE_ASSERT(0);
+  }
+
+  static void wgOpPerformThread(GenRegister threadDst,
+                                  GenRegister inputVal,
+                                  GenRegister threadExchangeData,
+                                   GenRegister resultVal,
+                                   uint32_t simd,
+                                   uint32_t wg_op,
+                                   GenEncoder *p)
+  {
    p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.noMask = 1;
@@ -2485,7 +2494,7 @@ namespace gbe
    /* init thread data to min/max/null values */
    p->push(); {
      p->curr.execWidth = simd;
-     initValue(p, threadExchangeData, wg_op);
+     wgOpInitValue(p, threadExchangeData, wg_op);
      p->MOV(resultVal, inputVal);
    } p->pop();
 
@@ -2550,17 +2559,17 @@ namespace gbe
          wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
          wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
          wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
-       workgroupOp(result[0], result[0], input[i], wg_op, p);
+       wgOpPerform(result[0], result[0], input[i], wg_op, p);
 
      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
          wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
          wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
-       workgroupOp(result[i], result[i - 1], input[i], wg_op, p);
+       wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
 
      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
          wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
          wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-       workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p);
+       wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
 
      else
        GBE_ASSERT(0);
@@ -2594,11 +2603,11 @@ namespace gbe
    {
      p->curr.execWidth = 1;
      /* set result[0] to min/max/null */
-     initValue(p, result[0], wg_op);
+     wgOpInitValue(p, result[0], wg_op);
 
      p->curr.execWidth = 16;
      /* value exchanged with other threads */
-     workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+     wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
      /* partial result thread */
      p->MOV(threadDst, resultVal);
    }
@@ -2607,14 +2616,19 @@ namespace gbe
  }
 
 /**
- * Basic idea:
- * 1. All the threads firstly calculate the max/min/add value for the
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. All the threads first compute the workgroup op value for the
  * allocated work-items. SIMD16=> 16 work-items allocated for each thread
- * 2. Each thread will write the computed reduce OP result in SLM memory
- * based on the threadId
- * 3. After a memory fence, each thread will read in chunks of 4 elements,
- * the SLM region, using a loop based on the thread count value (threadN)
- * 4. At the end each thread has the final value computed individually
+ * 2. Each thread writes its partial result to shared local memory, indexed by threadId
+ * 3. After a barrier, each thread reads back the shared local memory region
+ * in chunks of 1-4 elements, using a loop based on the thread count (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is driven by the chunked reads. If threads read in chunks of 4 elements,
+ * performance improves 2-3x compared to reading 1 element at a time.
  */
   void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
     const GenRegister dst = ra->genReg(insn.dst(0));
@@ -2630,10 +2644,10 @@ namespace gbe
     uint32_t simd = p->curr.execWidth;
     int32_t jip0, jip1;
 
-    /* Masked elements should be properly set to init value */
+    /* masked elements should be properly set to init value */
     p->push(); {
       p->curr.noMask = 1;
-      initValue(p, tmp, wg_op);
+      wgOpInitValue(p, tmp, wg_op);
       p->curr.noMask = 0;
       p->MOV(tmp, theVal);
       p->curr.noMask = 1;
@@ -2642,43 +2656,41 @@ namespace gbe
 
     threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
 
-    /* Use of continuous GRF allocation from insn selection */
+    /* use of continuous GRF allocation from insn selection */
     GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
     GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
     GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
     GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
 
-    /* Do some calculation within each thread */
-    workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+    /* do some calculation within each thread */
+    wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
 
     p->curr.execWidth = 16;
     p->MOV(theVal, dst);
     threadData = GenRegister::toUniform(threadData, dst.type);
 
-    /* Store thread count for future use on read/write to SLM */
+    /* store thread count for future use on read/write to SLM */
     if (wg_op == ir::WORKGROUP_OP_ANY ||
-        wg_op == ir::WORKGROUP_OP_ALL ||
-        wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
-        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
-        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      wg_op == ir::WORKGROUP_OP_ALL ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
     {
-        //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
-        threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
-        p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+      threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+      p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
     }
     else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
-        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
-        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
-        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
-        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
-        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
     {
-        //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
-        threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
-        p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+      threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+      p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
     }
 
-    /* All threads write the partial results to SLM memory */
+    /* all threads write the partial results to SLM memory */
     if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
     {
       GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
@@ -2700,10 +2712,10 @@ namespace gbe
       p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
     }
 
-    /* Init partialData register, it will hold the final result */
-    initValue(p, partialData, wg_op);
+    /* init partialData register, it will hold the final result */
+    wgOpInitValue(p, partialData, wg_op);
 
-    /* Add call to barrier */
+    /* add call to barrier */
     p->push();
       p->curr.execWidth = 8;
       p->curr.physicalFlag = 0;
@@ -2714,11 +2726,11 @@ namespace gbe
       p->WAIT();
     p->pop();
 
-    /* Perform a loop, based on thread count (which is now multiple of 4) */
+    /* perform a loop, based on thread count (which is now multiple of 4) */
     p->push();{
       jip0 = p->n_instruction();
 
-      /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+      /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
       if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
       {
         p->curr.execWidth = 8;
@@ -2734,8 +2746,8 @@ namespace gbe
         msgDataH.hstride = 2;
         p->MOV(msgDataL, msgDataH);
 
-        /* Perform operation, partialData will hold result */
-        workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+        /* perform operation, partialData will hold result */
+        wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
       }
       else
       {
@@ -2746,11 +2758,11 @@ namespace gbe
         p->ADD(msgAddr, msgAddr, msgSlmOff);
         p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
 
-        /* Perform operation, partialData will hold result */
-        workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+        /* perform operation, partialData will hold result */
+        wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
       }
 
-      /* While threadN is not 0, cycle read SLM / update value */
+      /* while threadN is not 0, cycle read SLM / update value */
       p->curr.noMask = 1;
       p->curr.flag = 0;
       p->curr.subFlag = 1;
@@ -2762,22 +2774,25 @@ namespace gbe
     } p->pop();
 
     if(wg_op == ir::WORKGROUP_OP_ANY ||
-            wg_op == ir::WORKGROUP_OP_ALL ||
-            wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
-            wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
-            wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      wg_op == ir::WORKGROUP_OP_ALL ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
     {
-        /* Save result to final register location dst */
-        p->curr.execWidth = 16;
-        p->MOV(dst, partialData);
+      /* save result to final register location dst */
+      p->curr.execWidth = 16;
+      p->MOV(dst, partialData);
     }
-    else {
-        /* Save result to final register location dst */
-        p->curr.execWidth = 16;
+    else
+    {
+      /* save result to final register location dst */
+      p->curr.execWidth = 16;
+
       if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
           || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
         p->ADD(dst, dst, partialData);
-      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+        || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
       {
         p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
@@ -2790,7 +2805,8 @@ namespace gbe
                        dst.offset(dst, 3, 0), partialData);
         }
       }
-      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+        || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
       {
         p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
@@ -2807,11 +2823,11 @@ namespace gbe
 
     /* corner cases for threads 0 */
     if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
-            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
-            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
-            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
-            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
-            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
     {
       p->push();{
         p->curr.flag = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 07bdef8..152054e 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6446,66 +6446,75 @@ namespace gbe
   /*! WorkGroup instruction pattern */
   DECL_PATTERN(WorkGroupInstruction)
   {
-    /* SLM bassed communication between threads, most of the logic bellow */
+    /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+     * Shared local memory based communication between threads;
+     * prepare registers and the SLM offset for the workgroup op,
+     * the algorithm logic itself is emitted in gen context. */
     INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
     {
       using namespace ir;
-      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
 
+      GBE_ASSERT(insn.getSrcNum() == 3);
+      GBE_ASSERT(insn.getSrc(0) == ocl::threadn);
+      GBE_ASSERT(insn.getSrc(1) == ocl::threadid);
+
+      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
       const Type type = insn.getType();
       GenRegister dst = sel.selReg(insn.getDst(0), type);
       GenRegister src = sel.selReg(insn.getSrc(2), type);
-      GenRegister threadId = sel.selReg(ocl::threadid, ir::TYPE_U32);
-      GenRegister threadN = sel.selReg(ocl::threadn, ir::TYPE_U32);
-      const uint32_t srcNum = insn.getSrcNum();
-
-      GBE_ASSERT(srcNum == 3);
-      GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
-      GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
       GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
       GenRegister data = sel.selReg(sel.reg(FAMILY_QWORD), type);
-      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U32);
+      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32);
+      GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32);
+      GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32);
 
+      /* Allocate registers for message sending
+       * (read/write to shared local memory) */
       vector<GenRegister> msg;
       for(uint32_t i = 0; i < 6; i++)
-        msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+        msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
 
-      /* insert a barrier to make sure all the var we are interested in
+      /* Insert a barrier to make sure all the var we are interested in
          have been assigned the final value. */
-      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+                  sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
 
-      /* compute individual slice of workitems, (e.g. 0->16 workitems) */
+      /* Pass the shared local memory offset */
       sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
 
-      /* barrier for syn prior to workgroup */
-      sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg);
+      /* Perform workgroup op */
+      sel.WORKGROUP_OP(workGroupOp, dst, src, data,
+                       localThreadID, localThreadNUM, tmp, slmOff, msg);
 
       return true;
     }
 
-    INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const {
-      /*  1. BARRIER    Ensure all the threads have set the correct value for the var which will be broadcasted.
-          2. CMP IDs    Compare the local IDs with the specified ones in the function call.
-          3. STORE         Use flag to control the store of the var. Only the specified item will execute the store.
-          4. BARRIER    Ensure the specified value has been stored.
-          5. LOAD          Load the stored value to all the dst value, the dst of all the items will have same value,
-          so broadcasted.       */
+    /* WORKGROUP OP: BROADCAST
+     * 1. BARRIER    Ensure all the threads have set the correct value for the var to be broadcast.
+     * 2. CMP IDs    Compare the local IDs with the ones specified in the function call.
+     * 3. STORE      Use a flag to control the store of the var; only the specified item executes the store.
+     * 4. BARRIER    Ensure the specified value has been stored.
+     * 5. LOAD       Load the stored value into dst for all items, so every item's dst
+     *               holds the same value, i.e. it is broadcast. */
+    INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
+    {
       using namespace ir;
+
+      const uint32_t srcNum = insn.getSrcNum();
+      GBE_ASSERT(srcNum >= 2);
+
       const Type type = insn.getType();
       const GenRegister src = sel.selReg(insn.getSrc(0), type);
       const GenRegister dst = sel.selReg(insn.getDst(0), type);
-      const uint32_t srcNum = insn.getSrcNum();
       const uint32_t slmAddr = insn.getSlmAddr();
-      GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+      GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
       vector<GenRegister> fakeTemps;
       fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
       fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
 
-      GBE_ASSERT(srcNum >= 2);
       GenRegister coords[3];
-      for (uint32_t i = 1; i < srcNum; i++) {
+      for (uint32_t i = 1; i < srcNum; i++)
         coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD);
-      }
 
       sel.push(); {
         sel.curr.predicate = GEN_PREDICATE_NONE;
@@ -6515,7 +6524,8 @@ namespace gbe
 
       /* insert a barrier to make sure all the var we are interested in
          have been assigned the final value. */
-      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+                  sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
 
       sel.push(); {
         sel.curr.flag = 0;
@@ -6524,24 +6534,27 @@ namespace gbe
         sel.curr.noMask = 1;
         GenRegister lid0, lid1, lid2;
         uint32_t dim = srcNum - 1;
-        lid0 = GenRegister::retype(sel.selReg(ir::ocl::lid0, TYPE_U32), GEN_TYPE_UD);
-        lid1 = GenRegister::retype(sel.selReg(ir::ocl::lid1, TYPE_U32), GEN_TYPE_UD);
-        lid2 = GenRegister::retype(sel.selReg(ir::ocl::lid2, TYPE_U32), GEN_TYPE_UD);
+        lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD);
+        lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD);
+        lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD);
 
-        sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+        sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0,
+                GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
         sel.curr.predicate = GEN_PREDICATE_NORMAL;
         if (dim >= 2)
-          sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+          sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1,
+                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
         if (dim >= 3)
-          sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+          sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2,
+                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
 
-        /* write to SLM for BYTE/WORD/DWORD types */
+        /* write to shared local memory for BYTE/WORD/DWORD types */
         if (typeSize(src.type) <= 4) {
           GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
           GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD);
           sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps);
         }
-        /* write to SLM for QWORD types */
+        /* write to shared local memory for QWORD types */
         else if (typeSize(src.type) == 8) {
           sel.push(); {
           /* arrange data in QWORD */
@@ -6556,8 +6569,9 @@ namespace gbe
           /* unpack into 2 DWORD */
           sel.UNPACK_LONG(srcQW, src);
 
-          /* perform write to SLM */
-          sel.UNTYPED_WRITE(_addr, srcVec.data(), 2, GenRegister::immw(0xfe), fakeTemps);
+          /* emit write through SEND */
+          sel.UNTYPED_WRITE(_addr, srcVec.data(), 2,
+                            GenRegister::immw(0xfe), fakeTemps);
           }sel.pop();
         }
         else
@@ -6565,15 +6579,16 @@ namespace gbe
 
       } sel.pop();
       /* make sure the slm var have the valid value now */
-      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+                  sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
 
-      /* read from SLM for BYTE/WORD/DWORD types */
+      /* read from shared local memory for BYTE/WORD/DWORD types */
       if (typeSize(src.type) <= 4) {
         GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
         GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD);
         sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps);
       }
-      /* read from SLM for QWORD types */
+      /* read from shared local memory for QWORD types */
       else if (typeSize(src.type) == 8) {
         GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
         vector<GenRegister> _dst;
@@ -6582,7 +6597,7 @@ namespace gbe
         GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL);
 
         sel.push(); {
-        /* read from SLM */
+        /* emit read through SEND */
         sel.curr.execWidth = 8;
         sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps);
 
-- 
2.5.0