[Beignet] [PATCH 1/2] Backend: Copy workgroup emit function to gen8
Xiuli Pan
xiuli.pan at intel.com
Mon May 16 01:42:10 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
The long type is not supported before Gen8, so make a Gen8 copy of the
workgroup emit function that can be modified in future patches.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen8_context.cpp | 528 +++++++++++++++++++++++++++++++++++
backend/src/backend/gen8_context.hpp | 2 +
backend/src/backend/gen_context.hpp | 2 +-
3 files changed, 531 insertions(+), 1 deletion(-)
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index e5ccc0f..477b22b 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1317,4 +1317,532 @@ namespace gbe
p->pop();
}
+ /* Set the init (identity) value according to the WORKGROUP OP;
+ * assert on an invalid operation/datatype combination. */
+ static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ {
+
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ {
+ if (dataReg.type == GEN_TYPE_D
+ || dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
+ else if(dataReg.type == GEN_TYPE_L ||
+ dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_ANY
+ || wg_op == ir::WORKGROUP_OP_REDUCE_ADD
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x0));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(dataReg, GenRegister::immf(0x0));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x0));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x80000000));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ /* unsupported operation */
+ else
+ GBE_ASSERT(0);
+ }
+
+ /* Perform WORKGROUP OP on 2 input elements (registers) */
+ static void wgOpPerform(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ /* perform OP REDUCE on 2 elements */
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(dst, src1, src2);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN INCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN EXCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ else
+ GBE_ASSERT(0);
+ }
+
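+ /* Reduce/scan the SIMD lanes of a single thread.
+  * Summary of what the code below computes (descriptive only, derived from the code):
+  *   REDUCE:    threadDst = threadExchangeData = OP over all lanes
+  *   INCLUSIVE: threadDst = per-lane prefix results, threadExchangeData = last lane's prefix
+  *   EXCLUSIVE: threadDst = per-lane prefix results shifted by one,
+  *              threadExchangeData = OP over all lanes
+  * ALL/ANY are handled with flag-based SIMD all/any predication instead. */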
+ static void wgOpPerformThread(GenRegister threadDst,
+ GenRegister inputVal,
+ GenRegister threadExchangeData,
+ GenRegister resultVal,
+ uint32_t simd,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ /* setting the type */
+ resultVal = GenRegister::retype(resultVal, inputVal.type);
+ threadDst = GenRegister::retype(threadDst, inputVal.type);
+ threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
+
+ vector<GenRegister> input;
+ vector<GenRegister> result;
+
+ /* for workgroup all and any we can use simd_all/any for each thread */
+ if (wg_op == ir::WORKGROUP_OP_ALL || wg_op == ir::WORKGROUP_OP_ANY) {
+ GenRegister constZero = GenRegister::immuw(0);
+ GenRegister flag01 = GenRegister::flag(0, 1);
+
+ p->push();
+ {
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = simd;
+ p->MOV(resultVal, GenRegister::immud(1));
+ p->curr.execWidth = 1;
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->MOV(flag01, GenRegister::immw(-1));
+ else
+ p->MOV(flag01, constZero);
+
+ p->curr.execWidth = simd;
+ p->curr.noMask = 0;
+
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_NEQ, inputVal, constZero);
+
+ if (p->curr.execWidth == 16)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else if (p->curr.execWidth == 8)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else
+ NOT_IMPLEMENTED;
+ p->SEL(threadDst, resultVal, constZero);
+ p->SEL(threadExchangeData, resultVal, constZero);
+ }
+ p->pop();
+ } else {
+ if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->MOV(threadExchangeData, inputVal);
+ p->pop();
+ return;
+ }
+
+ /* init thread data to min/max/null values */
+ p->push(); {
+ p->curr.execWidth = simd;
+ wgOpInitValue(p, threadExchangeData, wg_op);
+ p->MOV(resultVal, inputVal);
+ } p->pop();
+
+ GenRegister resultValSingle = resultVal;
+ resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ resultValSingle.width = GEN_WIDTH_1;
+
+ GenRegister inputValSingle = inputVal;
+ inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ inputValSingle.width = GEN_WIDTH_1;
+
+
+ /* build arrays of per-lane registers for easy access */
+ for(uint32_t i = 0; i < simd; i++){
+ /* record the current result/input lane registers */
+ result.push_back(resultValSingle);
+ input.push_back(inputValSingle);
+
+ /* move to next position */
+ resultValSingle.subnr += typeSize(resultValSingle.type);
+ if (resultValSingle.subnr == 32) {
+ resultValSingle.subnr = 0;
+ resultValSingle.nr++;
+ }
+ /* move to next position */
+ inputValSingle.subnr += typeSize(inputValSingle.type);
+ if (inputValSingle.subnr == 32) {
+ inputValSingle.subnr = 0;
+ inputValSingle.nr++;
+ }
+ }
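+ /* Example of the mapping above (assuming the register starts at sub-register 0):
+  * for GEN_TYPE_D (4 bytes) at SIMD16, lanes 0-7 map to byte offsets 0,4,...,28
+  * of the first GRF and lanes 8-15 to the same offsets of the next GRF. */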
+
+ uint32_t start_i = 0;
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
+ p->MOV(result[0], input[0]);
+ start_i = 1;
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ p->MOV(result[1], input[0]);
+ start_i = 2;
+ }
+
+ /* serial reduce/scan across this thread's SIMD lanes */
+ for (uint32_t i = start_i; i < simd; i++)
+ {
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wgOpPerform(result[0], result[0], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
+
+ else
+ GBE_ASSERT(0);
+ }
+ }
+
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[0]);
+ /* partial result thread */
+ p->MOV(threadDst, result[0]);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[simd - 1]);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 1;
+ /* set result[0] to min/max/null */
+ wgOpInitValue(p, result[0], wg_op);
+
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+
+ p->pop();
+ }
+
+/**
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. Each thread first performs the workgroup op on its allocated work-items
+ * (SIMD16 => 16 work-items per thread)
+ * 2. Each thread writes its partial result to shared local memory, indexed by threadId
+ * 3. After a barrier, each thread reads the shared local memory region in chunks
+ * of 1-4 elements, using a loop based on the thread count (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is dominated by the chunked reads: reading in chunks of 4 elements
+ * is 2-3x faster than reading 1 element at a time.
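+ *
+ * Rough data-flow sketch (illustrative pseudo-code only, not the actual emitted code):
+ *   partial = OP over this thread's SIMD lanes;           // wgOpPerformThread
+ *   SLM[slmOff + threadId * sizeof(T)] = partial;         // UNTYPED_WRITE
+ *   barrier();
+ *   acc = identity(OP);                                   // wgOpInitValue
+ *   for (n = loopCount; n > 0; n--)                       // loopCount: threadN for
+ *     acc = OP(acc, SLM[slmOff + (n-1) * sizeof(T)]);     // ALL/ANY/REDUCE, threadId for scans
+ *   dst = acc;              // ALL/ANY/REDUCE
+ *   dst = OP(dst, acc);     // scans: per-lane prefix combined with lower threads' totals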
+ */
+ void Gen8Context::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
+ GenRegister threadData = ra->genReg(insn.src(3));
+ GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
+ GenRegister threadId = ra->genReg(insn.src(0));
+ GenRegister threadLoop = ra->genReg(insn.src(1));
+ GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ GenRegister localBarrier = ra->genReg(insn.src(5));
+
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+ int32_t jip0, jip1;
+
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
+ threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
+
+ /* relies on contiguous GRF allocation from instruction selection */
+ GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
+ GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+ GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
+ GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+
+ p->curr.execWidth = 16;
+ p->MOV(theVal, dst);
+ threadData = GenRegister::toUniform(threadData, dst.type);
+
+ /* store the SLM loop count: thread count for reduce ops, thread id for scan ops */
+ if (wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ }
+
+ /* all threads write the partial results to SLM memory */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
+ GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
+ p->MOV(msgData.offset(msgData, 0), threadDataL);
+ p->MOV(msgData.offset(msgData, 1), threadDataH);
+
+ p->curr.execWidth = 8;
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->MOV(msgData, threadData);
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+ }
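+ /* Resulting SLM layout (from the address math above): thread i's partial
+  * result occupies one slot at msgSlmOff + i*4 (i*8 for 64-bit types). */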
+
+ /* init the partialData register; it will accumulate the cross-thread result */
+ wgOpInitValue(p, partialData, wg_op);
+
+ /* barrier: wait until all threads have written their partial results to SLM */
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+ p->BARRIER(localBarrier);
+ p->curr.execWidth = 1;
+ p->WAIT();
+ p->pop();
+
+ /* perform a loop based on the thread count (which is now a multiple of 4) */
+ p->push();{
+ jip0 = p->n_instruction();
+
+ /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 2);
+
+ GenRegister msgDataL = msgData.retype(msgData.offset(msgData, 0, 4), GEN_TYPE_D);
+ GenRegister msgDataH = msgData.retype(msgData.offset(msgData, 1, 4), GEN_TYPE_D);
+ msgDataL.hstride = 2;
+ msgDataH.hstride = 2;
+ p->MOV(msgDataL, msgDataH);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+
+ /* while threadLoop is not 0, keep reading SLM and updating the value */
+ p->curr.noMask = 1;
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ jip1 = p->n_instruction();
+ p->JMPI(GenRegister::immud(0));
+ p->patchJMPI(jip1, jip0 - jip1, 0);
+ } p->pop();
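+ /* For REDUCE/ALL/ANY the loop above visited every thread's slot (threadLoop
+  * started at threadN); for scans it visited only lower-numbered threads
+  * (threadLoop started at threadId), and thread 0 is patched separately below. */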
+
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
+ }
+ else
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, dst, partialData);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ }
+
+ /* corner case for thread 0 */
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->push();{
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+ p->curr.execWidth = 16;
+ p->MOV(dst, theVal);
+ } p->pop();
+ }
+ }
+
}
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 2e6eae5..771e20b 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -76,6 +76,8 @@ namespace gbe
virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
+ virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+
static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
protected:
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 47713da..ebc55e6 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -180,7 +180,7 @@ namespace gbe
virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
void emitCalcTimestampInstruction(const SelectionInstruction &insn);
void emitStoreProfilingInstruction(const SelectionInstruction &insn);
- void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+ virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
void emitPrintfInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
--
2.7.4