[Beignet] [PATCH 09/10] Backend: Full support for workgroup reduce, scan inc/exc on DWORD and below datatypes
grigore.lupescu at intel.com
grigore.lupescu at intel.com
Thu Mar 31 15:28:33 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/backend/gen_context.cpp | 329 ++++++++++++++++++++----------------
1 file changed, 180 insertions(+), 149 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index d7f2479..31232dd 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2346,38 +2346,39 @@ namespace gbe
p->TYPED_WRITE(header, true, bti);
}
- static void workgroupOpBetweenThread(GenRegister partialRes,
- GenRegister value,
- uint32_t wg_op,
- GenEncoder *p)
+ static void workgroupOp(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
{
// REDUCE
if (wg_op == ir::WORKGROUP_OP_ANY)
- p->OR(partialRes, partialRes, value);
+ p->OR(dst, src1, src2);
else if (wg_op == ir::WORKGROUP_OP_ALL)
- p->AND(partialRes, partialRes, value);
+ p->AND(dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
- p->ADD(partialRes, partialRes, value);
+ p->ADD(dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
// INCLUSIVE
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
- p->ADD(partialRes, partialRes, value);
+ p->ADD(dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
// EXCLUSIVE
else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
- p->ADD(partialRes, partialRes, value);
+ p->ADD(dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
else
GBE_ASSERT(0);
@@ -2459,139 +2460,147 @@ namespace gbe
GBE_ASSERT(0);
}
- static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
- GenRegister tmp, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
- p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
-
- /* Setting the init value here. */
- msgData = GenRegister::retype(msgData, theVal.type);
- threadData = GenRegister::retype(threadData, theVal.type);
- initValue(p, threadData, wg_op);
-
- if (theVal.hstride != GEN_HORIZONTAL_STRIDE_0) {
- /* We need to set the value out of dispatch mask to MAX. */
- tmp = GenRegister::retype(tmp, theVal.type);
- p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->curr.execWidth = simd;
- initValue(p, tmp, wg_op);
- p->curr.noMask = 0;
- p->MOV(tmp, theVal);
- p->pop();
- }
- else {
- if (wg_op == ir::WORKGROUP_OP_ANY)
- p->OR(threadData, threadData, theVal);
- else if (wg_op == ir::WORKGROUP_OP_ALL)
- p->AND(threadData, threadData, theVal);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
- p->ADD(threadData, threadData, theVal);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, threadData, threadData, theVal);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, threadData, threadData, theVal);
- p->pop();
- return;
- }
-
- GBE_ASSERT(tmp.type == theVal.type);
- GenRegister partialRes = tmp;
- GenRegister finalRes = tmp;
-
- if(wg_op >= ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
- wg_op <= ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-
- for (uint32_t i = 1; i < simd; i++)
- {
- p->push(); {
- p->curr.execWidth = 1;
-
- tmp.hstride = GEN_HORIZONTAL_STRIDE_0;
- tmp.vstride = GEN_VERTICAL_STRIDE_0;
- tmp.width = GEN_WIDTH_1;
- tmp.subnr += typeSize(tmp.type);
- if (tmp.subnr == 32) {
- tmp.subnr = 0;
- tmp.nr++;
- }
-
- // REDUCE
- if (wg_op == ir::WORKGROUP_OP_ANY)
- p->OR(partialRes, partialRes, tmp);
- else if (wg_op == ir::WORKGROUP_OP_ALL)
- p->AND(partialRes, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
- p->ADD(partialRes, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, tmp);
-
- // INCLUSIVE
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
- p->ADD(tmp, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, tmp, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, tmp, partialRes, tmp);
-
- // EXCLUSIVE
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
- p->ADD(tmp, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, tmp, partialRes, tmp);
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, tmp, partialRes, tmp);
-
- else
- GBE_ASSERT(0);
-
- if(wg_op >= ir::WORKGROUP_OP_INCLUSIVE_ADD)
- partialRes = tmp;
- } p->pop();
- }
-
- p->curr.execWidth = 16;
- if(wg_op < ir::WORKGROUP_OP_INCLUSIVE_ADD){
- p->MOV(threadData, partialRes);
- p->MOV(msgData, finalRes);
- }
- else if(wg_op >= ir::WORKGROUP_OP_INCLUSIVE_ADD
- && wg_op < ir::WORKGROUP_OP_EXCLUSIVE_ADD)
- {
- p->MOV(threadData, tmp);
- p->MOV(msgData, finalRes);
- }
- else {
- p->MOV(threadData, tmp);
-
- p->curr.execWidth = 16;
- initValue(p, msgData, wg_op);
-
- p->push(); {
- p->curr.execWidth = 8;
- finalRes = finalRes.suboffset(finalRes, 0);
- msgData = msgData.suboffset(msgData, 1);
- p->MOV(msgData, finalRes); } p->pop();
-
- p->push(); {
- p->curr.execWidth = 1;
- finalRes = finalRes.suboffset(finalRes, 7);
- msgData = msgData.suboffset(msgData, 7);
- p->MOV(msgData, finalRes); } p->pop();
-
- p->push(); {
- p->curr.execWidth = 8;
- finalRes = finalRes.suboffset(finalRes, 1);
- msgData = msgData.suboffset(msgData, 1);
- p->MOV(msgData, finalRes); } p->pop();
- }
- p->pop();
-}
+ static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData,
+ GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ /* setting the type */
+ resultVal = GenRegister::retype(resultVal, inputVal.type);
+ threadDst = GenRegister::retype(threadDst, inputVal.type);
+ threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
+
+ if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->MOV(threadExchangeData, inputVal);
+ p->pop();
+ return;
+ }
+
+ /* init thread data to min/max/null values */
+ p->push(); {
+ p->curr.execWidth = simd;
+ initValue(p, threadExchangeData, wg_op);
+ p->MOV(resultVal, inputVal);
+ } p->pop();
+
+ GenRegister resultValSingle = resultVal;
+ resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ resultValSingle.width = GEN_WIDTH_1;
+
+ GenRegister inputValSingle = inputVal;
+ inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ inputValSingle.width = GEN_WIDTH_1;
+
+ vector<GenRegister> input;
+ vector<GenRegister> result;
+
+ /* make an array of registers for easy accessing */
+ for(uint32_t i = 0; i < simd; i++){
+ /* append the current resultVal and inputVal lane registers to the lists */
+ result.push_back(resultValSingle);
+ input.push_back(inputValSingle);
+
+ /* move to next position */
+ resultValSingle.subnr += typeSize(resultValSingle.type);
+ if (resultValSingle.subnr == 32) {
+ resultValSingle.subnr = 0;
+ resultValSingle.nr++;
+ }
+ /* move to next position */
+ inputValSingle.subnr += typeSize(inputValSingle.type);
+ if (inputValSingle.subnr == 32) {
+ inputValSingle.subnr = 0;
+ inputValSingle.nr++;
+ }
+ }
+
+ uint32_t start_i = 0;
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
+ p->MOV(result[0], input[0]);
+ start_i = 1;
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ p->MOV(result[1], input[0]);
+ start_i = 2;
+ }
+
+ /* apply the workgroup op lane by lane, starting at start_i */
+ for (uint32_t i = start_i; i < simd; i++)
+ {
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ workgroupOp(result[0], result[0], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ workgroupOp(result[i], result[i - 1], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p);
+
+ else
+ GBE_ASSERT(0);
+ }
+
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[0]);
+ /* partial result thread */
+ p->MOV(threadDst, result[0]);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[simd - 1]);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 1;
+ /* set result[0] to min/max/null */
+ initValue(p, result[0], wg_op);
+
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+
+ p->pop();
+ }
/**
* Basic idea:
@@ -2626,6 +2635,9 @@ namespace gbe
/* Do some calculation within each thread */
workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+
+ p->curr.execWidth = 16;
+ p->MOV(theVal, dst);
threadData = GenRegister::toUniform(threadData, dst.type);
/* Store thread count for future use on read/write to SLM */
@@ -2690,7 +2702,7 @@ namespace gbe
p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
/* Perform operation, process 4 elements, partialData will hold result */
- workgroupOpBetweenThread(partialData, msgData.offset(msgData, 0), wg_op, p);
+ workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
/* While threadN is not 0, cycle read SLM / update value */
p->curr.noMask = 1;
@@ -2726,6 +2738,25 @@ namespace gbe
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
}
+
+ /* corner case for thread 0 */
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->push();{
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+ p->curr.execWidth = 16;
+ p->MOV(dst, theVal);
+ } p->pop();
+ }
}
void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
--
2.5.0
More information about the Beignet
mailing list