[Beignet] [PATCH 1/2] Backend: Copy workgroup emit function to gen8
Xiuli Pan
xiuli.pan at intel.com
Mon May 16 01:42:10 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
The long type is not supported before Gen8, so make a Gen8 copy of the
workgroup emit function that can be modified in future patches.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen8_context.cpp | 528 +++++++++++++++++++++++++++++++++++
backend/src/backend/gen8_context.hpp | 2 +
backend/src/backend/gen_context.hpp | 2 +-
3 files changed, 531 insertions(+), 1 deletion(-)
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index e5ccc0f..477b22b 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1317,4 +1317,532 @@ namespace gbe
p->pop();
}
+ /* Set the init (identity) value according to the WORKGROUP OP;
+ * assert on an invalid operation/datatype combination. */
+ static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ {
+
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ {
+ if (dataReg.type == GEN_TYPE_D
+ || dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
+ else if(dataReg.type == GEN_TYPE_L ||
+ dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_ANY
+ || wg_op == ir::WORKGROUP_OP_REDUCE_ADD
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x0));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(dataReg, GenRegister::immf(0x0));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x0));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x80000000));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ /* unsupported operation */
+ else
+ GBE_ASSERT(0);
+ }
+
+ /* Perform WORKGROUP OP on 2 input elements (registers) */
+ static void wgOpPerform(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ /* perform OP REDUCE on 2 elements */
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(dst, src1, src2);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN INCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN EXCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ else
+ GBE_ASSERT(0);
+ }
+
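+ /* Reduce/scan the SIMD lanes of a single thread.
+  * Summary of what the code below computes (descriptive only, derived from the code):
+  *   REDUCE:    threadDst = threadExchangeData = OP over all lanes
+  *   INCLUSIVE: threadDst = per-lane prefix results, threadExchangeData = last lane's prefix
+  *   EXCLUSIVE: threadDst = per-lane prefix results shifted by one,
+  *              threadExchangeData = OP over all lanes
+  * ALL/ANY are handled with flag-based SIMD all/any predication instead. */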
+ static void wgOpPerformThread(GenRegister threadDst,
+ GenRegister inputVal,
+ GenRegister threadExchangeData,
+ GenRegister resultVal,
+ uint32_t simd,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ /* setting the type */
+ resultVal = GenRegister::retype(resultVal, inputVal.type);
+ threadDst = GenRegister::retype(threadDst, inputVal.type);
+ threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
+
+ vector<GenRegister> input;
+ vector<GenRegister> result;
+
+ /* for workgroup all and any we can use simd_all/any for each thread */
+ if (wg_op == ir::WORKGROUP_OP_ALL || wg_op == ir::WORKGROUP_OP_ANY) {
+ GenRegister constZero = GenRegister::immuw(0);
+ GenRegister flag01 = GenRegister::flag(0, 1);
+
+ p->push();
+ {
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = simd;
+ p->MOV(resultVal, GenRegister::immud(1));
+ p->curr.execWidth = 1;
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->MOV(flag01, GenRegister::immw(-1));
+ else
+ p->MOV(flag01, constZero);
+
+ p->curr.execWidth = simd;
+ p->curr.noMask = 0;
+
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_NEQ, inputVal, constZero);
+
+ if (p->curr.execWidth == 16)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else if (p->curr.execWidth == 8)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else
+ NOT_IMPLEMENTED;
+ p->SEL(threadDst, resultVal, constZero);
+ p->SEL(threadExchangeData, resultVal, constZero);
+ }
+ p->pop();
+ } else {
+ if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->MOV(threadExchangeData, inputVal);
+ p->pop();
+ return;
+ }
+
+ /* init thread data to min/max/null values */
+ p->push(); {
+ p->curr.execWidth = simd;
+ wgOpInitValue(p, threadExchangeData, wg_op);
+ p->MOV(resultVal, inputVal);
+ } p->pop();
+
+ GenRegister resultValSingle = resultVal;
+ resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ resultValSingle.width = GEN_WIDTH_1;
+
+ GenRegister inputValSingle = inputVal;
+ inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ inputValSingle.width = GEN_WIDTH_1;
+
+
+ /* build arrays of per-lane registers for easy access */
+ for(uint32_t i = 0; i < simd; i++){
+ /* record the current result/input lane registers */
+ result.push_back(resultValSingle);
+ input.push_back(inputValSingle);
+
+ /* move to next position */
+ resultValSingle.subnr += typeSize(resultValSingle.type);
+ if (resultValSingle.subnr == 32) {
+ resultValSingle.subnr = 0;
+ resultValSingle.nr++;
+ }
+ /* move to next position */
+ inputValSingle.subnr += typeSize(inputValSingle.type);
+ if (inputValSingle.subnr == 32) {
+ inputValSingle.subnr = 0;
+ inputValSingle.nr++;
+ }
+ }
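+ /* Example of the mapping above (assuming the register starts at sub-register 0):
+  * for GEN_TYPE_D (4 bytes) at SIMD16, lanes 0-7 map to byte offsets 0,4,...,28
+  * of the first GRF and lanes 8-15 to the same offsets of the next GRF. */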
+
+ uint32_t start_i = 0;
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
+ p->MOV(result[0], input[0]);
+ start_i = 1;
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ p->MOV(result[1], input[0]);
+ start_i = 2;
+ }
+
+ /* serial reduce/scan across this thread's SIMD lanes */
+ for (uint32_t i = start_i; i < simd; i++)
+ {
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wgOpPerform(result[0], result[0], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
+
+ else
+ GBE_ASSERT(0);
+ }
+ }
+
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[0]);
+ /* partial result thread */
+ p->MOV(threadDst, result[0]);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[simd - 1]);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 1;
+ /* set result[0] to min/max/null */
+ wgOpInitValue(p, result[0], wg_op);
+
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+
+ p->pop();
+ }
+
+/**
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. Each thread first performs the workgroup op on its allocated work-items
+ * (SIMD16 => 16 work-items per thread)
+ * 2. Each thread writes its partial result to shared local memory, indexed by threadId
+ * 3. After a barrier, each thread reads the shared local memory region in chunks
+ * of 1-4 elements, using a loop based on the thread count (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is dominated by the chunked reads: reading in chunks of 4 elements
+ * is 2-3x faster than reading 1 element at a time.
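+ *
+ * Rough data-flow sketch (illustrative pseudo-code only, not the actual emitted code):
+ *   partial = OP over this thread's SIMD lanes;           // wgOpPerformThread
+ *   SLM[slmOff + threadId * sizeof(T)] = partial;         // UNTYPED_WRITE
+ *   barrier();
+ *   acc = identity(OP);                                   // wgOpInitValue
+ *   for (n = loopCount; n > 0; n--)                       // loopCount: threadN for
+ *     acc = OP(acc, SLM[slmOff + (n-1) * sizeof(T)]);     // ALL/ANY/REDUCE, threadId for scans
+ *   dst = acc;              // ALL/ANY/REDUCE
+ *   dst = OP(dst, acc);     // scans: per-lane prefix combined with lower threads' totals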
+ */
+ void Gen8Context::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
+ GenRegister threadData = ra->genReg(insn.src(3));
+ GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
+ GenRegister threadId = ra->genReg(insn.src(0));
+ GenRegister threadLoop = ra->genReg(insn.src(1));
+ GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ GenRegister localBarrier = ra->genReg(insn.src(5));
+
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+ int32_t jip0, jip1;
+
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
+ threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
+
+ /* relies on contiguous GRF allocation from instruction selection */
+ GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
+ GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+ GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
+ GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+
+ p->curr.execWidth = 16;
+ p->MOV(theVal, dst);
+ threadData = GenRegister::toUniform(threadData, dst.type);
+
+ /* store the SLM loop count: thread count for reduce ops, thread id for scan ops */
+ if (wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ }
+
+ /* all threads write the partial results to SLM memory */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
+ GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
+ p->MOV(msgData.offset(msgData, 0), threadDataL);
+ p->MOV(msgData.offset(msgData, 1), threadDataH);
+
+ p->curr.execWidth = 8;
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->MOV(msgData, threadData);
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+ }
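+ /* Resulting SLM layout (from the address math above): thread i's partial
+  * result occupies one slot at msgSlmOff + i*4 (i*8 for 64-bit types). */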
+
+ /* init the partialData register; it will accumulate the cross-thread result */
+ wgOpInitValue(p, partialData, wg_op);
+
+ /* barrier: wait until all threads have written their partial results to SLM */
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+ p->BARRIER(localBarrier);
+ p->curr.execWidth = 1;
+ p->WAIT();
+ p->pop();
+
+ /* perform a loop based on the thread count (which is now a multiple of 4) */
+ p->push();{
+ jip0 = p->n_instruction();
+
+ /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 2);
+
+ GenRegister msgDataL = msgData.retype(msgData.offset(msgData, 0, 4), GEN_TYPE_D);
+ GenRegister msgDataH = msgData.retype(msgData.offset(msgData, 1, 4), GEN_TYPE_D);
+ msgDataL.hstride = 2;
+ msgDataH.hstride = 2;
+ p->MOV(msgDataL, msgDataH);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+
+ /* while threadLoop is not 0, keep reading SLM and updating the value */
+ p->curr.noMask = 1;
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ jip1 = p->n_instruction();
+ p->JMPI(GenRegister::immud(0));
+ p->patchJMPI(jip1, jip0 - jip1, 0);
+ } p->pop();
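+ /* For REDUCE/ALL/ANY the loop above visited every thread's slot (threadLoop
+  * started at threadN); for scans it visited only lower-numbered threads
+  * (threadLoop started at threadId), and thread 0 is patched separately below. */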
+
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
+ }
+ else
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, dst, partialData);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ }
+
+ /* corner case for thread 0 */
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->push();{
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+ p->curr.execWidth = 16;
+ p->MOV(dst, theVal);
+ } p->pop();
+ }
+ }
+
}
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 2e6eae5..771e20b 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -76,6 +76,8 @@ namespace gbe
virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
+ virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+
static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
protected:
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 47713da..ebc55e6 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -180,7 +180,7 @@ namespace gbe
virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
void emitCalcTimestampInstruction(const SelectionInstruction &insn);
void emitStoreProfilingInstruction(const SelectionInstruction &insn);
- void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+ virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
void emitPrintfInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
--
2.7.4