[Beignet] [PATCH 08/10] Backend: Add workgroup function support for predicates and inclusive/exclusive scans
grigore.lupescu at intel.com
Thu Mar 31 15:28:25 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>

Implement the remaining work_group_* built-ins in the Gen backend: the
predicate functions (any/all) and the inclusive/exclusive scan variants
of add/min/max, alongside the existing reductions. Per-thread partial
results are combined across threads through SLM.

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/backend/gen_context.cpp | 385 ++++++++++++++++++++---------
backend/src/backend/gen_insn_selection.cpp | 19 +-
backend/src/llvm/llvm_gen_backend.cpp | 18 +-
3 files changed, 286 insertions(+), 136 deletions(-)
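
For context, these are the OpenCL 2.0 work-group built-ins this patch
wires up; a minimal kernel exercising the new predicate and scan paths
could look like the following (illustrative usage only, not part of the
patch):

    __kernel void wg_ops(__global const int *in, __global int *out) {
      int v = in[get_global_id(0)];
      int any_pos = work_group_any(v > 0);             /* predicate */
      int all_pos = work_group_all(v > 0);             /* predicate */
      int sum     = work_group_reduce_add(v);          /* reduction */
      int run_min = work_group_scan_inclusive_min(v);  /* inclusive scan */
      int pre_max = work_group_scan_exclusive_max(v);  /* exclusive scan */
      out[get_global_id(0)] = any_pos + all_pos + sum + run_min + pre_max;
    }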
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 84df912..d7f2479 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2351,45 +2351,112 @@ namespace gbe
uint32_t wg_op,
GenEncoder *p)
{
- if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN
- || wg_op == ir::WORKGROUP_OP_REDUCE_MAX) {
- uint32_t cond;
- if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- cond = GEN_CONDITIONAL_LE;
- else
- cond = GEN_CONDITIONAL_GE;
- p->SEL_CMP(cond, partialRes, partialRes, value);
- } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) {
+ // REDUCE
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(partialRes, partialRes, value);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(partialRes, partialRes, value);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
p->ADD(partialRes, partialRes, value);
- }
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
+
+ // INCLUSIVE
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(partialRes, partialRes, value);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
+
+ // EXCLUSIVE
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(partialRes, partialRes, value);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, value);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, value);
+
+ else
+ GBE_ASSERT(0);
}
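
workgroupOpBetweenThread() above is one combining step between a partial
result and a new value; stripped of the Gen encoder, the dispatch is just
the following (a C sketch with illustrative names, integer-typed for
brevity):

    enum wg_kind { WG_ANY, WG_ALL, WG_ADD, WG_MIN, WG_MAX };

    /* one combining step; the Gen code maps these to OR/AND/ADD/SEL_CMP */
    static int combine(enum wg_kind op, int a, int b) {
      switch (op) {
      case WG_ANY: return a | b;          /* OR */
      case WG_ALL: return a & b;          /* AND */
      case WG_ADD: return a + b;          /* ADD */
      case WG_MIN: return a < b ? a : b;  /* SEL_CMP with LE */
      case WG_MAX: return a > b ? a : b;  /* SEL_CMP with GE */
      }
      return a;                           /* unreachable: GBE_ASSERT(0) */
    }

Note that the reduce, inclusive and exclusive branches in the patch all
select the same instruction per op kind; they differ only in where the
result is written and how it is shifted later.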
- static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op) {
- if (dataReg.type == GEN_TYPE_UD) {
- if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
- || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) {
+ static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ {
+
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ {
+ if (dataReg.type == GEN_TYPE_D
+ || dataReg.type == GEN_TYPE_UD
+ || dataReg.type == GEN_TYPE_F)
+ p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
+ else if(dataReg.type == GEN_TYPE_L ||
+ dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_ANY
+ || wg_op == ir::WORKGROUP_OP_REDUCE_ADD
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x0));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(dataReg, GenRegister::immf(0x0));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x0));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF));
+ else if (dataReg.type == GEN_TYPE_UD)
p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF));
- } else {
- GBE_ASSERT(wg_op == ir::WORKGROUP_OP_REDUCE_MAX || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
- || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_REDUCE_ADD
- || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD);
- p->MOV(dataReg, GenRegister::immud(0));
- }
- } else if (dataReg.type == GEN_TYPE_F) {
- if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
- || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) {
- p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000)); // inf
- } else if (wg_op == ir::WORKGROUP_OP_REDUCE_MAX || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
- || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
- p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000)); // -inf
- } else {
- GBE_ASSERT(wg_op == ir::WORKGROUP_OP_REDUCE_ADD || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
- || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD);
- p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x0));
- }
- } else {
- GBE_ASSERT(0);
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000)); /* +inf */
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x80000000));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000)); /* -inf */
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
}
+
+ else
+ GBE_ASSERT(0); /* unsupported operation */
}
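
initValue() seeds each accumulator with the identity element of its
operation, i.e. the value that leaves any x unchanged when combined with
it; in portable C the same table reads as below (a sketch using
limits.h/math.h names in place of the raw Gen immediates above):

    #include <limits.h>
    #include <math.h>

    /* identity elements: combine(identity, x) == x for every x */
    static const int   add_id_s32 = 0;
    static const int   min_id_s32 = INT_MAX;    /* 0x7FFFFFFF above */
    static const int   max_id_s32 = INT_MIN;    /* 0x80000000 above */
    static const float min_id_f32 = INFINITY;   /* bits 0x7F800000 */
    static const float max_id_f32 = -INFINITY;  /* bits 0xFF800000 */
    static const int   all_id     = ~0;         /* AND identity */
    static const int   any_id     = 0;          /* OR identity */

This is also why the float MAX accumulator must start at -inf
(0xFF800000), as the pre-patch code did.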
static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData,
@@ -2400,6 +2467,7 @@ namespace gbe
p->curr.execWidth = 1;
/* Setting the init value here. */
+ msgData = GenRegister::retype(msgData, theVal.type);
threadData = GenRegister::retype(threadData, theVal.type);
initValue(p, threadData, wg_op);
@@ -2415,52 +2483,112 @@ namespace gbe
p->MOV(tmp, theVal);
p->pop();
}
+ else {
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(threadData, threadData, theVal);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(threadData, threadData, theVal);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(threadData, threadData, theVal);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, threadData, threadData, theVal);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, threadData, threadData, theVal);
+ else
+ GBE_ASSERT(0); /* scan ops with uniform input not handled here */
+ p->pop();
+ return;
+ }
- if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_REDUCE_MAX) {
- uint32_t cond;
- if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- cond = GEN_CONDITIONAL_LE;
- else
- cond = GEN_CONDITIONAL_GE;
+ GBE_ASSERT(tmp.type == theVal.type);
+ GenRegister partialRes = tmp;
+ GenRegister finalRes = tmp;
- if (theVal.hstride == GEN_HORIZONTAL_STRIDE_0) { // an uniform value.
- p->SEL_CMP(cond, threadData, threadData, theVal);
- } else {
- GBE_ASSERT(tmp.type == theVal.type);
- GenRegister v = GenRegister::toUniform(tmp, theVal.type);
- for (uint32_t i = 0; i < simd; i++) {
- p->SEL_CMP(cond, threadData, threadData, v);
- v.subnr += typeSize(theVal.type);
- if (v.subnr == 32) {
- v.subnr = 0;
- v.nr++;
- }
- }
+ /* Serially combine the SIMD lanes of tmp: accumulate for the
+ reductions, produce a running value for the inclusive/exclusive ops */
+ for (uint32_t i = 1; i < simd; i++)
+ {
+ p->push(); {
+ p->curr.execWidth = 1;
+
+ tmp.hstride = GEN_HORIZONTAL_STRIDE_0;
+ tmp.vstride = GEN_VERTICAL_STRIDE_0;
+ tmp.width = GEN_WIDTH_1;
+ tmp.subnr += typeSize(tmp.type);
+ if (tmp.subnr == 32) {
+ tmp.subnr = 0;
+ tmp.nr++;
}
+
+ // REDUCE
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(partialRes, partialRes, tmp);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(partialRes, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(partialRes, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, partialRes, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, partialRes, partialRes, tmp);
+
+ // INCLUSIVE
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(tmp, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, tmp, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, tmp, partialRes, tmp);
+
+ // EXCLUSIVE
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(tmp, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, tmp, partialRes, tmp);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, tmp, partialRes, tmp);
+
+ else
+ GBE_ASSERT(0);
+
+ if(wg_op >= ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ partialRes = tmp;
+ } p->pop();
+ }
+
+ p->curr.execWidth = 16;
+ if(wg_op < ir::WORKGROUP_OP_INCLUSIVE_ADD){
+ p->MOV(threadData, partialRes);
+ p->MOV(msgData, finalRes);
}
- else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){
- tmp.hstride = GEN_HORIZONTAL_STRIDE_1;
- tmp.vstride = GEN_VERTICAL_STRIDE_4;
- tmp.width = 2;
+ else if(wg_op >= ir::WORKGROUP_OP_INCLUSIVE_ADD
+ && wg_op < ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ {
+ p->MOV(threadData, tmp);
+ p->MOV(msgData, finalRes);
+ }
+ else {
+ p->MOV(threadData, tmp);
- GBE_ASSERT(tmp.type == theVal.type);
- GenRegister partialSum = tmp;
+ p->curr.execWidth = 16;
+ initValue(p, msgData, wg_op);
- /* adjust offset, compute add with ADD4/ADD */
- for (uint32_t i = 1; i < simd/4; i++){
p->push(); {
- tmp = tmp.suboffset(tmp, 4);
- p->curr.execWidth = 4;
- p->ADD(partialSum, partialSum, tmp);
- } p->pop();
- }
+ p->curr.execWidth = 8;
+ finalRes = finalRes.suboffset(finalRes, 0);
+ msgData = msgData.suboffset(msgData, 1);
+ p->MOV(msgData, finalRes); } p->pop();
- for (uint32_t i = 0; i < 4; i++){
p->push(); {
- p->ADD(threadData, threadData, partialSum);
- partialSum = partialSum.suboffset(partialSum, 1);
- } p->pop();
- }
+ p->curr.execWidth = 1;
+ finalRes = finalRes.suboffset(finalRes, 7);
+ msgData = msgData.suboffset(msgData, 7);
+ p->MOV(msgData, finalRes); } p->pop();
+
+ p->push(); {
+ p->curr.execWidth = 8;
+ finalRes = finalRes.suboffset(finalRes, 1);
+ msgData = msgData.suboffset(msgData, 1);
+ p->MOV(msgData, finalRes); } p->pop();
}
p->pop();
}
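
Inside workgroupOpInThread() the lane loop above walks tmp with
execWidth 1, one SIMD lane per iteration; the data flow is a plain serial
reduce/scan, roughly as follows (a C sketch over an array of lane values;
'+' stands for whichever op was selected):

    /* reduce: only 'partial' matters; inclusive scan: lane[i] becomes
     * the combination of lane[0..i]; exclusive results are the inclusive
     * ones shifted one lane right, with the identity value in lane 0 */
    void scan_lanes(int lane[], int simd, int *thread_total) {
      int partial = lane[0];
      for (int i = 1; i < simd; i++) {
        partial = partial + lane[i];  /* combining step */
        lane[i] = partial;            /* running (inclusive) value */
      }
      *thread_total = partial;        /* per-thread result for SLM stage */
    }

The suboffset MOV sequence at the end of the function is the exclusive
shift: the final results are copied into the destination displaced by one
lane, with the identity value left in lane 0.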
@@ -2478,22 +2606,17 @@ namespace gbe
void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister tmp = ra->genReg(insn.dst(1));
-
- const GenRegister theVal = ra->genReg(insn.src(0));
- GenRegister threadData = ra->genReg(insn.src(1));
+ const GenRegister theVal = ra->genReg(insn.src(2));
+ GenRegister threadData = ra->genReg(insn.src(3));
GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
- GenRegister threadId = ra->genReg(insn.src(2));
- GenRegister threadNum = ra->genReg(insn.src(3));
-
- threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
- threadNum = GenRegister::toUniform(threadNum, GEN_TYPE_UD);
+ GenRegister threadId = ra->genReg(insn.src(0));
+ GenRegister threadLoop = ra->genReg(insn.src(1));
uint32_t wg_op = insn.extra.workgroupOp;
uint32_t simd = p->curr.execWidth;
int32_t jip0, jip1;
- GenRegister result = GenRegister::offset(dst, 0, 16);
- result = GenRegister::toUniform(result, dst.type);
+ threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
/* Use of continuous GRF allocation from insn selection */
GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
@@ -2502,12 +2625,41 @@ namespace gbe
GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
/* Do some calculation within each thread */
- workgroupOpInThread(msg, theVal, threadData, tmp, simd, wg_op, p);
+ workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
threadData = GenRegister::toUniform(threadData, dst.type);
/* Store thread count for future use on read/write to SLM */
- GenRegister threadN = GenRegister::retype(tmp, GEN_TYPE_UD);
- p->MOV(threadN, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ if (wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ }
+
+ /* TODO implement cross-thread communication for 64-bit and double types */
+ if(dst.type == GEN_TYPE_UL ||
+ dst.type == GEN_TYPE_L ||
+ dst.type == GEN_TYPE_DF_IMM)
+ {
+ p->curr.execWidth = 16;
+ p->MOV(dst, threadData);
+ return;
+ }
/* All threads write the partial results to SLM memory */
p->curr.execWidth = 8;
@@ -2519,30 +2671,10 @@ namespace gbe
/* Init partialData register, it will hold the final result */
initValue(p, partialData, wg_op);
- /* Thread 0 will write extra elements for future reads in chunks of 4 */
- p->push();{
- p->curr.noMask = 1;
- p->curr.flag = 0;
- p->curr.subFlag = 1;
- p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
- p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.execWidth = 8;
- p->MOV(msgData.offset(msgData, 0), partialData);
- p->MOV(msgData.offset(msgData, 1), partialData);
- p->MOV(msgData.offset(msgData, 2), partialData);
- p->MUL(msgAddr, threadN, GenRegister::immd(0x4));
- p->ADD(msgAddr, msgAddr, msgSlmOff);
- p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 3);
- } p->pop();
-
- /* Round threadN to nearest upper number divisible with 4 required for
- * reading in chunks of 4 elements from SLM */
- p->ADD(threadN, threadN, GenRegister::immd(0x3));
- p->SHR(threadN, threadN, GenRegister::immd(0x2));
- p->SHL(threadN, threadN, GenRegister::immd(0x2));
-
- /* Wait for all writes to complete in work-group */
- p->FENCE(tmp);
+ /* Wait for all writes to complete in work-group; the dependent MOV
+ after each fence stalls until the fence result lands */
+ p->FENCE(msgData);
+ p->MOV(msgData, msgData);
+ p->FENCE(msgData);
+ p->MOV(msgData, msgData);
- /* Perform a loop, based on thread count (which is now multiple of 4) */
+ /* Loop over the per-thread partial results stored in SLM */
p->push();{
@@ -2552,31 +2684,48 @@ namespace gbe
p->curr.predicate = GEN_PREDICATE_NONE;
- /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ /* Read one partial result per iteration from SLM */
- p->ADD(threadN, threadN, GenRegister::immd(-4));
- p->MUL(msgAddr, threadN, GenRegister::immd(0x4));
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
p->ADD(msgAddr, msgAddr, msgSlmOff);
- p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 4);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
- /* Perform operation, process 4 elements, partialData will hold result */
+ /* Perform operation on the element read, accumulate into partialData */
workgroupOpBetweenThread(partialData, msgData.offset(msgData, 0), wg_op, p);
- workgroupOpBetweenThread(partialData, msgData.offset(msgData, 1), wg_op, p);
- workgroupOpBetweenThread(partialData, msgData.offset(msgData, 2), wg_op, p);
- workgroupOpBetweenThread(partialData, msgData.offset(msgData, 3), wg_op, p);
- /* While threadN is not 0, cycle read SLM / update value */
+ /* While threadLoop is not 0, cycle: read SLM / update value */
p->curr.noMask = 1;
p->curr.flag = 0;
p->curr.subFlag = 1;
- p->CMP(GEN_CONDITIONAL_G, threadN, GenRegister::immd(0x0));
+ p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
jip1 = p->n_instruction();
p->JMPI(GenRegister::immud(0));
p->patchJMPI(jip1, jip0 - jip1, 0);
} p->pop();
- /* Save result to final register location dst */
- p->curr.execWidth = 16;
- p->MOV(dst, partialData);
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ /* Save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
+ }
+ else {
+ /* Save result to final register location dst */
+ p->curr.execWidth = 16;
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, dst, partialData);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ }
}
void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
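
The cross-thread stage just emitted has every thread publish its
threadData to SLM, then loop over the slots and fold them into
partialData; the only difference between reduce and scan is the loop
bound, which the threadn vs. threadid selection above sets up. As a plain
C model (slm indexing and names are illustrative; '+' stands for the
selected op):

    /* each thread t has already stored slm[t] = thread_total and fenced */
    int cross_thread(const int slm[], int threadn, int threadid,
                     int is_scan, int identity) {
      int partial = identity;
      /* reduce: fold all threads; scan: fold only threads before this one */
      int threadLoop = is_scan ? threadid : threadn;
      while (threadLoop > 0) {
        threadLoop -= 1;                   /* ADD(threadLoop, -1) */
        partial = partial + slm[threadLoop]; /* UNTYPED_READ + combine */
      }
      /* reduce: dst = partial; scan: dst = dst (+) partial */
      return partial;
    }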
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 024c3e7..12a0cf4 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2069,10 +2069,10 @@ namespace gbe
for(uint32_t i = 0; i < msg.size(); i++)
insn->dst(2 + i) = msg[i];
- insn->src(0) = src;
- insn->src(1) = data;
- insn->src(2) = threadId;
- insn->src(3) = threadN;
+ insn->src(0) = threadId;
+ insn->src(1) = threadN;
+ insn->src(2) = src;
+ insn->src(3) = data;
insn->src(4) = slmOff;
}
@@ -6463,7 +6463,7 @@ namespace gbe
GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), type);
- GenRegister data = sel.selReg(sel.reg(FAMILY_WORD), type);
+ GenRegister data = sel.selReg(sel.reg(FAMILY_DWORD), type);
GenRegister slmOff = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
vector<GenRegister> msg;
@@ -6472,8 +6472,11 @@ namespace gbe
/* compute individual slice of workitems, (e.g. 0->16 workitems) */
sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
- sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId,
- threadN, tmp, slmOff, msg);
+
+ /* barrier to sync all threads prior to the workgroup op */
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg);
+
return true;
}
@@ -6551,7 +6554,7 @@ namespace gbe
if (workGroupOp == WORKGROUP_OP_BROADCAST){
return emitWGBroadcast(sel, insn);
}
- else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX){
+ else if (workGroupOp >= WORKGROUP_OP_ANY && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX){
return emitWGReduce(sel, insn);
}
else
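
The widened range check here, like the >=/<= comparisons in
gen_context.cpp, relies on the WORKGROUP_OP enumerators being contiguous
and ordered; the order these comparisons assume is (reconstructed from
the checks themselves, not quoted from the ir headers):

    /* assumed enumerator order behind the >=/<= range checks */
    enum assumed_workgroup_op_order {
      OP_ANY, OP_ALL, OP_BROADCAST,  /* BROADCAST dispatched by equality */
      OP_REDUCE_ADD, OP_REDUCE_MIN, OP_REDUCE_MAX,
      OP_INCLUSIVE_ADD, OP_INCLUSIVE_MIN, OP_INCLUSIVE_MAX,
      OP_EXCLUSIVE_ADD, OP_EXCLUSIVE_MIN, OP_EXCLUSIVE_MAX,
    };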
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 951fa0e..e0140f7 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3967,18 +3967,12 @@ namespace gbe
GBE_ASSERT(f.getwgBroadcastSLM() >= 0);
}
- if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_REDUCE_ADD && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ else if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_ANY && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
- /* 1. For thread SLM based communication (default):
- * Threads will use SLM to write partial results computed individually
- and then read the whole set. Because the read is done in chunks of 4
- extra padding is required.
+ /* For thread SLM based communication (default): threads write partial
+ results to SLM individually, then read back and combine the whole set,
+ one element per read.
- * 2. For thread message based communication:
- * Because we can not know the thread ID and the EUID for every physical
- thead which the work items execute on before the run time. We need to
- sync the thread execution order when using work group functions. We
- create the workitems/threadID map table in slm.
-
When we come to here, the global thread local vars should have all been
allocated, so it's safe for us to steal a piece of SLM for this usage. */
@@ -3997,9 +3991,12 @@ namespace gbe
if (opcode == ir::WORKGROUP_OP_ALL || opcode == ir::WORKGROUP_OP_ANY) {
GBE_ASSERT(getType(ctx, (*AI)->getType()) == ir::TYPE_S32);
- const ir::Register src = this->getRegister(*(AI++));
- const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1);
- ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1, ir::TYPE_S32);
+ ir::Register src[3];
+ src[0] = ir::ocl::threadn;
+ src[1] = ir::ocl::threadid;
+ src[2] = this->getRegister(*(AI++));
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
+ ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), getRegister(&I), srcTuple, 3, ir::TYPE_S32);
} else if (opcode == ir::WORKGROUP_OP_BROADCAST) {
int argNum = CS.arg_size();
ir::Register src[argNum];
--
2.5.0