[Beignet] [PATCH 09/10] Backend: Full support workgroup reduce, scan inc/exc on DWORD and below datatypes

Xiuli Pan xiuli.pan at intel.com
Wed Apr 6 07:21:03 UTC 2016


On Thu, Mar 31, 2016 at 06:28:33PM +0300, grigore.lupescu at intel.com wrote:
> From: Grigore Lupescu <grigore.lupescu at intel.com>
> 
> Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
> ---
>  backend/src/backend/gen_context.cpp | 329 ++++++++++++++++++++----------------
>  1 file changed, 180 insertions(+), 149 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index d7f2479..31232dd 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -2346,38 +2346,39 @@ namespace gbe
>      p->TYPED_WRITE(header, true, bti);
>    }
>  
> -  static void workgroupOpBetweenThread(GenRegister partialRes,
> -                                       GenRegister value,
> -                                       uint32_t wg_op,
> -                                       GenEncoder *p)
> +  static void workgroupOp(GenRegister dst,
> +                         GenRegister src1,
> +                         GenRegister src2,
> +                         uint32_t wg_op,
> +                         GenEncoder *p)
>    {
>      // REDUCE
>      if (wg_op == ir::WORKGROUP_OP_ANY)
> -      p->OR(partialRes, partialRes, value);
> +      p->OR(dst, src1, src2);
>      else if (wg_op == ir::WORKGROUP_OP_ALL)
> -      p->AND(partialRes, partialRes, value);
> +      p->AND(dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
> -      p->ADD(partialRes, partialRes, value);
> +      p->ADD(dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
> -      p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
> +      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
> -      p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
> +      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
>  
>      // INCLUSIVE
>      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
> -      p->ADD(partialRes, partialRes, value);
> +      p->ADD(dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
> -      p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
> +      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
> -      p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
> +      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
>  
>      // EXCLUSIVE
>      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
> -      p->ADD(partialRes, partialRes, value);
> +      p->ADD(dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
> -      p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
> +      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
>      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
> -      p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
> +      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
>  
>      else
>        GBE_ASSERT(0);
> @@ -2459,139 +2460,147 @@ namespace gbe
>        GBE_ASSERT(0);
>    }
>  
> -  static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
> -                                  GenRegister tmp, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
> -    p->push();
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    p->curr.noMask = 1;
> -    p->curr.execWidth = 1;
> -
> -    /* Setting the init value here. */
> -    msgData = GenRegister::retype(msgData, theVal.type);
> -    threadData = GenRegister::retype(threadData, theVal.type);
> -    initValue(p, threadData, wg_op);
> -
> -    if (theVal.hstride != GEN_HORIZONTAL_STRIDE_0) {
> -      /* We need to set the value out of dispatch mask to MAX. */
> -      tmp = GenRegister::retype(tmp, theVal.type);
> -      p->push();
> -      p->curr.predicate = GEN_PREDICATE_NONE;
> -      p->curr.noMask = 1;
> -      p->curr.execWidth = simd;
> -      initValue(p, tmp, wg_op);
> -      p->curr.noMask = 0;
> -      p->MOV(tmp, theVal);
> -      p->pop();
> -    }
> -    else {
> -      if (wg_op == ir::WORKGROUP_OP_ANY)
> -        p->OR(threadData, threadData, theVal);
> -      else if (wg_op == ir::WORKGROUP_OP_ALL)
> -        p->AND(threadData, threadData, theVal);
> -      else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
> -        p->ADD(threadData, threadData, theVal);
> -      else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
> -        p->SEL_CMP(GEN_CONDITIONAL_LE, threadData, threadData, theVal);
> -      else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
> -        p->SEL_CMP(GEN_CONDITIONAL_GE, threadData, threadData, theVal);
> -      p->pop();
> -      return;
> -    }
> -
> -    GBE_ASSERT(tmp.type == theVal.type);
> -    GenRegister partialRes = tmp;
> -    GenRegister finalRes = tmp;
> -
> -    if(wg_op >= ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
> -        wg_op <= ir::WORKGROUP_OP_EXCLUSIVE_MAX)
> -
> -    for (uint32_t i = 1; i < simd; i++)
> -    {
> -      p->push(); {
> -      p->curr.execWidth = 1;
> -
> -      tmp.hstride = GEN_HORIZONTAL_STRIDE_0;
> -      tmp.vstride = GEN_VERTICAL_STRIDE_0;
> -      tmp.width = GEN_WIDTH_1;
> -      tmp.subnr += typeSize(tmp.type);
> -      if (tmp.subnr == 32) {
> -        tmp.subnr = 0;
> -        tmp.nr++;
> -      }
> -
> -      // REDUCE
> -      if (wg_op == ir::WORKGROUP_OP_ANY)
> -        p->OR(partialRes, partialRes, tmp);
> -      else if (wg_op == ir::WORKGROUP_OP_ALL)
> -        p->AND(partialRes, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
> -        p->ADD(partialRes, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
> -        p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
> -        p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, tmp);
> -
> -      // INCLUSIVE
> -      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
> -        p->ADD(tmp, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
> -        p->SEL_CMP(GEN_CONDITIONAL_LE, tmp, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
> -        p->SEL_CMP(GEN_CONDITIONAL_GE, tmp, partialRes, tmp);
> -
> -      // EXCLUSIVE
> -      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
> -        p->ADD(tmp, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
> -        p->SEL_CMP(GEN_CONDITIONAL_LE, tmp, partialRes, tmp);
> -      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
> -        p->SEL_CMP(GEN_CONDITIONAL_GE, tmp, partialRes, tmp);
> -
> -      else
> -        GBE_ASSERT(0);
> -
> -      if(wg_op >= ir::WORKGROUP_OP_INCLUSIVE_ADD)
> -        partialRes = tmp;
> -      } p->pop();
> -    }
> -
> -    p->curr.execWidth = 16;
> -    if(wg_op < ir::WORKGROUP_OP_INCLUSIVE_ADD){
> -      p->MOV(threadData, partialRes);
> -      p->MOV(msgData, finalRes);
> -    }
> -    else if(wg_op >= ir::WORKGROUP_OP_INCLUSIVE_ADD
> -        && wg_op < ir::WORKGROUP_OP_EXCLUSIVE_ADD)
> -    {
> -      p->MOV(threadData, tmp);
> -      p->MOV(msgData, finalRes);
> -    }
> -    else {
> -        p->MOV(threadData, tmp);
> -
> -        p->curr.execWidth = 16;
> -        initValue(p, msgData, wg_op);
> -
> -        p->push(); {
> -        p->curr.execWidth = 8;
> -        finalRes = finalRes.suboffset(finalRes, 0);
> -        msgData = msgData.suboffset(msgData, 1);
> -        p->MOV(msgData, finalRes); } p->pop();
> -
> -        p->push(); {
> -        p->curr.execWidth = 1;
> -        finalRes = finalRes.suboffset(finalRes, 7);
> -        msgData = msgData.suboffset(msgData, 7);
> -        p->MOV(msgData, finalRes); } p->pop();
> -
> -        p->push(); {
> -        p->curr.execWidth = 8;
> -        finalRes = finalRes.suboffset(finalRes, 1);
> -        msgData = msgData.suboffset(msgData, 1);
> -        p->MOV(msgData, finalRes); } p->pop();
> -    }
> -    p->pop();
> -}
> +  static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData,
> +                                   GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
> +   p->push();
> +   p->curr.predicate = GEN_PREDICATE_NONE;
> +   p->curr.noMask = 1;
> +   p->curr.execWidth = 1;
> +
> +   /* setting the type */
> +   resultVal = GenRegister::retype(resultVal, inputVal.type);
> +   threadDst = GenRegister::retype(threadDst, inputVal.type);
> +   threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
> +
> +   if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
> +     p->MOV(threadExchangeData, inputVal);
> +     p->pop();
> +     return;
> +   }
> +
> +   /* init thread data to min/max/null values */
> +   p->push(); {
> +     p->curr.execWidth = simd;
> +     initValue(p, threadExchangeData, wg_op);
> +     p->MOV(resultVal, inputVal);
> +   } p->pop();
> +
> +   GenRegister resultValSingle = resultVal;
> +   resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
> +   resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
> +   resultValSingle.width = GEN_WIDTH_1;
> +
> +   GenRegister inputValSingle = inputVal;
> +   inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
> +   inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
> +   inputValSingle.width = GEN_WIDTH_1;
> +
> +   vector<GenRegister> input;
> +   vector<GenRegister> result;
> +
> +   /* make an array of registers for easy accesing */
> +   for(uint32_t i = 0; i < simd; i++){
> +     /* add all resultVal offset reg positions from list */
> +     result.push_back(resultValSingle);
> +     input.push_back(inputValSingle);
> +
> +     /* move to next position */
> +     resultValSingle.subnr += typeSize(resultValSingle.type);
> +     if (resultValSingle.subnr == 32) {
> +         resultValSingle.subnr = 0;
> +         resultValSingle.nr++;
> +     }
> +     /* move to next position */
> +     inputValSingle.subnr += typeSize(inputValSingle.type);
> +     if (inputValSingle.subnr == 32) {
> +         inputValSingle.subnr = 0;
> +         inputValSingle.nr++;
> +     }
> +   }
> +
> +   uint32_t start_i = 0;
> +   if(wg_op == ir::WORKGROUP_OP_ANY ||
> +       wg_op == ir::WORKGROUP_OP_ALL ||
> +       wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
> +       wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
> +       wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
> +       wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
> +       wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
> +       wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
> +     p->MOV(result[0], input[0]);
> +     start_i = 1;
> +   }
> +
> +   else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
> +       wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
> +       wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
> +     p->MOV(result[1], input[0]);
> +     start_i = 2;
> +   }
> +
> +   /* algorithm workgroup */
> +   for (uint32_t i = start_i; i < simd; i++)
> +   {
> +     if(wg_op == ir::WORKGROUP_OP_ANY ||
> +         wg_op == ir::WORKGROUP_OP_ALL ||
> +         wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
> +         wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
> +         wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
> +       workgroupOp(result[0], result[0], input[i], wg_op, p);
> +
> +     else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
> +         wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
> +         wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
> +       workgroupOp(result[i], result[i - 1], input[i], wg_op, p);
> +
> +     else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
> +         wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
> +         wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
> +       workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p);
> +
> +     else
> +       GBE_ASSERT(0);
> +   }

I think the above for loop should be optimized into SIMD instructions. However, if we simply use a SIMD16 op, then workgroups whose size is not a multiple of the SIMD width will produce wrong results. This causes the workgroup_all/workgroup_any utests to fail.

> +
> +   if(wg_op == ir::WORKGROUP_OP_ANY ||
> +       wg_op == ir::WORKGROUP_OP_ALL ||
> +       wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
> +       wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
> +       wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
> +   {
> +     p->curr.execWidth = 16;
> +     /* value exchanged with other threads */
> +     p->MOV(threadExchangeData, result[0]);
> +     /* partial result thread */
> +     p->MOV(threadDst, result[0]);
> +   }
> +   else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
> +       wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
> +       wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
> +   {
> +     p->curr.execWidth = 16;
> +     /* value exchanged with other threads */
> +     p->MOV(threadExchangeData, result[simd - 1]);
> +     /* partial result thread */
> +     p->MOV(threadDst, resultVal);
> +   }
> +   else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
> +       wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
> +       wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
> +   {
> +     p->curr.execWidth = 1;
> +     /* set result[0] to min/max/null */
> +     initValue(p, result[0], wg_op);
> +
> +     p->curr.execWidth = 16;
> +     /* value exchanged with other threads */
> +     workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
> +     /* partial result thread */
> +     p->MOV(threadDst, resultVal);
> +   }
> +
> +   p->pop();
> + }
>  
>  /**
>   * Basic idea:
> @@ -2626,6 +2635,9 @@ namespace gbe
>  
>      /* Do some calculation within each thread */
>      workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
> +
> +    p->curr.execWidth = 16;
> +    p->MOV(theVal, dst);
>      threadData = GenRegister::toUniform(threadData, dst.type);
>  
>      /* Store thread count for future use on read/write to SLM */
> @@ -2690,7 +2702,7 @@ namespace gbe
>        p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
>  
>        /* Perform operation, process 4 elements, partialData will hold result */
> -      workgroupOpBetweenThread(partialData, msgData.offset(msgData, 0), wg_op, p);
> +      workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
>  
>        /* While threadN is not 0, cycle read SLM / update value */
>        p->curr.noMask = 1;
> @@ -2726,6 +2738,25 @@ namespace gbe
>            || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
>          p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
>      }
> +
> +    /* corner cases for threads 0 */
> +    if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
> +            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
> +            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
> +            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
> +            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
> +            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
> +    {
> +      p->push();{
> +        p->curr.flag = 0;
> +        p->curr.subFlag = 1;
> +        p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +
> +        p->curr.execWidth = 16;
> +        p->MOV(dst, theVal);
> +      } p->pop();
> +    }
>    }
>  
>    void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
> -- 
> 2.5.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list