[Beignet] [PATCH V2 08/17] Backend: Code refactoring, additional comments on implementation
Grigore Lupescu
grigore.lupescu at intel.com
Mon Apr 11 14:37:58 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
backend/src/backend/gen_context.cpp | 222 ++++++++++++++++-------------
backend/src/backend/gen_insn_selection.cpp | 101 +++++++------
2 files changed, 177 insertions(+), 146 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 38bd0f2..6219b6c 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2351,45 +2351,9 @@ namespace gbe
p->TYPED_WRITE(header, true, bti);
}
- static void workgroupOp(GenRegister dst,
- GenRegister src1,
- GenRegister src2,
- uint32_t wg_op,
- GenEncoder *p)
- {
- // REDUCE
- if (wg_op == ir::WORKGROUP_OP_ANY)
- p->OR(dst, src1, src2);
- else if (wg_op == ir::WORKGROUP_OP_ALL)
- p->AND(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
- p->ADD(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
- // INCLUSIVE
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
- p->ADD(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
- // EXCLUSIVE
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
- p->ADD(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
- else
- GBE_ASSERT(0);
- }
-
- static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ /* Init value according to WORKGROUP OP
+ * Emit assert if the operation - datatype combination is invalid */
+ static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
{
if (wg_op == ir::WORKGROUP_OP_ALL)
@@ -2464,8 +2428,53 @@ namespace gbe
GBE_ASSERT(0);
}
- static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData,
- GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+ /* Perform WORKGROUP OP on 2 input elements (registers) */
+ static void wgOpPerform(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ /* perform OP REDUCE on 2 elements */
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(dst, src1, src2);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN INCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN EXCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ else
+ GBE_ASSERT(0);
+ }
+
+ static void wgOpPerformThread(GenRegister threadDst,
+ GenRegister inputVal,
+ GenRegister threadExchangeData,
+ GenRegister resultVal,
+ uint32_t simd,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
@@ -2485,7 +2494,7 @@ namespace gbe
/* init thread data to min/max/null values */
p->push(); {
p->curr.execWidth = simd;
- initValue(p, threadExchangeData, wg_op);
+ wgOpInitValue(p, threadExchangeData, wg_op);
p->MOV(resultVal, inputVal);
} p->pop();
@@ -2550,17 +2559,17 @@ namespace gbe
wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- workgroupOp(result[0], result[0], input[i], wg_op, p);
+ wgOpPerform(result[0], result[0], input[i], wg_op, p);
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
- workgroupOp(result[i], result[i - 1], input[i], wg_op, p);
+ wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
- workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p);
+ wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
else
GBE_ASSERT(0);
@@ -2594,11 +2603,11 @@ namespace gbe
{
p->curr.execWidth = 1;
/* set result[0] to min/max/null */
- initValue(p, result[0], wg_op);
+ wgOpInitValue(p, result[0], wg_op);
p->curr.execWidth = 16;
/* value exchanged with other threads */
- workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
/* partial result thread */
p->MOV(threadDst, resultVal);
}
@@ -2607,14 +2616,19 @@ namespace gbe
}
/**
- * Basic idea:
- * 1. All the threads firstly calculate the max/min/add value for the
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. All the threads first compute the workgroup op value for the
* allocated work-items. SIMD16=> 16 work-items allocated for each thread
- * 2. Each thread will write the computed reduce OP result in SLM memory
- * based on the threadId
- * 3. After a memory fence, each thread will read in chunks of 4 elements,
- * the SLM region, using a loop based on the thread count value (threadN)
- * 4. At the end each thread has the final value computed individually
+ * 2. Each thread writes the partial result in shared local memory using threadId
+ * 3. After a barrier, each thread will read in chunks of 1-4 elements,
+ * the shared local memory region, using a loop based on the thread num value (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance depends on the read chunk size. If threads read in chunks of 4 elements,
+ * performance increases 2-3x compared to chunks of 1 element.
*/
void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
const GenRegister dst = ra->genReg(insn.dst(0));
@@ -2630,10 +2644,10 @@ namespace gbe
uint32_t simd = p->curr.execWidth;
int32_t jip0, jip1;
- /* Masked elements should be properly set to init value */
+ /* masked elements should be properly set to init value */
p->push(); {
p->curr.noMask = 1;
- initValue(p, tmp, wg_op);
+ wgOpInitValue(p, tmp, wg_op);
p->curr.noMask = 0;
p->MOV(tmp, theVal);
p->curr.noMask = 1;
@@ -2642,43 +2656,41 @@ namespace gbe
threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
- /* Use of continuous GRF allocation from insn selection */
+ /* use of continuous GRF allocation from insn selection */
GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
- /* Do some calculation within each thread */
- workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
p->curr.execWidth = 16;
p->MOV(theVal, dst);
threadData = GenRegister::toUniform(threadData, dst.type);
- /* Store thread count for future use on read/write to SLM */
+ /* store thread count for future use on read/write to SLM */
if (wg_op == ir::WORKGROUP_OP_ANY ||
- wg_op == ir::WORKGROUP_OP_ALL ||
- wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
{
- //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
- threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
- p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
}
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
- //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
- threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
- p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
}
- /* All threads write the partial results to SLM memory */
+ /* all threads write the partial results to SLM memory */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
{
GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
@@ -2700,10 +2712,10 @@ namespace gbe
p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
}
- /* Init partialData register, it will hold the final result */
- initValue(p, partialData, wg_op);
+ /* init partialData register, it will hold the final result */
+ wgOpInitValue(p, partialData, wg_op);
- /* Add call to barrier */
+ /* add call to barrier */
p->push();
p->curr.execWidth = 8;
p->curr.physicalFlag = 0;
@@ -2714,11 +2726,11 @@ namespace gbe
p->WAIT();
p->pop();
- /* Perform a loop, based on thread count (which is now multiple of 4) */
+ /* perform a loop, based on thread count (which is now multiple of 4) */
p->push();{
jip0 = p->n_instruction();
- /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
{
p->curr.execWidth = 8;
@@ -2734,8 +2746,8 @@ namespace gbe
msgDataH.hstride = 2;
p->MOV(msgDataL, msgDataH);
- /* Perform operation, partialData will hold result */
- workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
}
else
{
@@ -2746,11 +2758,11 @@ namespace gbe
p->ADD(msgAddr, msgAddr, msgSlmOff);
p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
- /* Perform operation, partialData will hold result */
- workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
}
- /* While threadN is not 0, cycle read SLM / update value */
+ /* while threadN is not 0, cycle read SLM / update value */
p->curr.noMask = 1;
p->curr.flag = 0;
p->curr.subFlag = 1;
@@ -2762,22 +2774,25 @@ namespace gbe
} p->pop();
if(wg_op == ir::WORKGROUP_OP_ANY ||
- wg_op == ir::WORKGROUP_OP_ALL ||
- wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
{
- /* Save result to final register location dst */
- p->curr.execWidth = 16;
- p->MOV(dst, partialData);
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
}
- else {
- /* Save result to final register location dst */
- p->curr.execWidth = 16;
+ else
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+
if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
p->ADD(dst, dst, partialData);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
{
p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
/* workaround QW datatype on CMP */
@@ -2790,7 +2805,8 @@ namespace gbe
dst.offset(dst, 3, 0), partialData);
}
}
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
/* workaround QW datatype on CMP */
@@ -2807,11 +2823,11 @@ namespace gbe
/* corner cases for threads 0 */
if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
p->push();{
p->curr.flag = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 07bdef8..152054e 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6446,66 +6446,75 @@ namespace gbe
/*! WorkGroup instruction pattern */
DECL_PATTERN(WorkGroupInstruction)
{
- /* SLM bassed communication between threads, most of the logic bellow */
+ /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ * Shared local memory based communication between threads;
+ * prepare for the workgroup op in gen context.
+ * Algorithm logic is in gen context. */
INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
{
using namespace ir;
- const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
+ GBE_ASSERT(insn.getSrcNum() == 3);
+ GBE_ASSERT(insn.getSrc(0) == ocl::threadn);
+ GBE_ASSERT(insn.getSrc(1) == ocl::threadid);
+
+ const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
const Type type = insn.getType();
GenRegister dst = sel.selReg(insn.getDst(0), type);
GenRegister src = sel.selReg(insn.getSrc(2), type);
- GenRegister threadId = sel.selReg(ocl::threadid, ir::TYPE_U32);
- GenRegister threadN = sel.selReg(ocl::threadn, ir::TYPE_U32);
- const uint32_t srcNum = insn.getSrcNum();
-
- GBE_ASSERT(srcNum == 3);
- GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
- GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
GenRegister data = sel.selReg(sel.reg(FAMILY_QWORD), type);
- GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U32);
+ GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32);
+ GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32);
+ GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32);
+ /* Allocate registers for message sending
+ * (read/write to shared local memory) */
vector<GenRegister> msg;
for(uint32_t i = 0; i < 6; i++)
- msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+ msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
- /* insert a barrier to make sure all the var we are interested in
+ /* Insert a barrier to make sure all the variables we are interested in
have been assigned the final value. */
- sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
- /* compute individual slice of workitems, (e.g. 0->16 workitems) */
+ /* Pass the shared local memory offset */
sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
- /* barrier for syn prior to workgroup */
- sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg);
+ /* Perform workgroup op */
+ sel.WORKGROUP_OP(workGroupOp, dst, src, data,
+ localThreadID, localThreadNUM, tmp, slmOff, msg);
return true;
}
- INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const {
- /* 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted.
- 2. CMP IDs Compare the local IDs with the specified ones in the function call.
- 3. STORE Use flag to control the store of the var. Only the specified item will execute the store.
- 4. BARRIER Ensure the specified value has been stored.
- 5. LOAD Load the stored value to all the dst value, the dst of all the items will have same value,
- so broadcasted. */
+ /* WORKGROUP OP: BROADCAST
+ * 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted.
+ 2. CMP IDs Compare the local IDs with the specified ones in the function call.
+ 3. STORE Use flag to control the store of the var. Only the specified item will execute the store.
+ 4. BARRIER Ensure the specified value has been stored.
+ 5. LOAD Load the stored value to all the dst value, the dst of all the items will have same value,
+ so broadcasted. */
+ INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
+ {
using namespace ir;
+
+ const uint32_t srcNum = insn.getSrcNum();
+ GBE_ASSERT(srcNum >= 2);
+
const Type type = insn.getType();
const GenRegister src = sel.selReg(insn.getSrc(0), type);
const GenRegister dst = sel.selReg(insn.getDst(0), type);
- const uint32_t srcNum = insn.getSrcNum();
const uint32_t slmAddr = insn.getSlmAddr();
- GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
vector<GenRegister> fakeTemps;
fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
- GBE_ASSERT(srcNum >= 2);
GenRegister coords[3];
- for (uint32_t i = 1; i < srcNum; i++) {
+ for (uint32_t i = 1; i < srcNum; i++)
coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD);
- }
sel.push(); {
sel.curr.predicate = GEN_PREDICATE_NONE;
@@ -6515,7 +6524,8 @@ namespace gbe
/* insert a barrier to make sure all the var we are interested in
have been assigned the final value. */
- sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
sel.push(); {
sel.curr.flag = 0;
@@ -6524,24 +6534,27 @@ namespace gbe
sel.curr.noMask = 1;
GenRegister lid0, lid1, lid2;
uint32_t dim = srcNum - 1;
- lid0 = GenRegister::retype(sel.selReg(ir::ocl::lid0, TYPE_U32), GEN_TYPE_UD);
- lid1 = GenRegister::retype(sel.selReg(ir::ocl::lid1, TYPE_U32), GEN_TYPE_UD);
- lid2 = GenRegister::retype(sel.selReg(ir::ocl::lid2, TYPE_U32), GEN_TYPE_UD);
+ lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD);
+ lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD);
+ lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD);
- sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
sel.curr.predicate = GEN_PREDICATE_NORMAL;
if (dim >= 2)
- sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
if (dim >= 3)
- sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
- /* write to SLM for BYTE/WORD/DWORD types */
+ /* write to shared local memory for BYTE/WORD/DWORD types */
if (typeSize(src.type) <= 4) {
GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD);
sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps);
}
- /* write to SLM for QWORD types */
+ /* write to shared local memory for QWORD types */
else if (typeSize(src.type) == 8) {
sel.push(); {
/* arrange data in QWORD */
@@ -6556,8 +6569,9 @@ namespace gbe
/* unpack into 2 DWORD */
sel.UNPACK_LONG(srcQW, src);
- /* perform write to SLM */
- sel.UNTYPED_WRITE(_addr, srcVec.data(), 2, GenRegister::immw(0xfe), fakeTemps);
+ /* emit write through SEND */
+ sel.UNTYPED_WRITE(_addr, srcVec.data(), 2,
+ GenRegister::immw(0xfe), fakeTemps);
}sel.pop();
}
else
@@ -6565,15 +6579,16 @@ namespace gbe
} sel.pop();
/* make sure the slm var have the valid value now */
- sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
- /* read from SLM for BYTE/WORD/DWORD types */
+ /* read from shared local memory for BYTE/WORD/DWORD types */
if (typeSize(src.type) <= 4) {
GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD);
sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps);
}
- /* read from SLM for QWORD types */
+ /* read from shared local memory for QWORD types */
else if (typeSize(src.type) == 8) {
GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
vector<GenRegister> _dst;
@@ -6582,7 +6597,7 @@ namespace gbe
GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL);
sel.push(); {
- /* read from SLM */
+ /* emit read through SEND */
sel.curr.execWidth = 8;
sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps);
--
2.5.0
More information about the Beignet
mailing list