[Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to handle each blocks.
Yang, Rong R
rong.r.yang at intel.com
Wed Apr 2 01:41:28 PDT 2014
-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Zhigang Gong
Sent: Friday, March 28, 2014 3:11 PM
To: beignet at lists.freedesktop.org
Cc: Gong, Zhigang
Subject: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to handle each blocks.
When SPF (single program flow) is enabled, we always need to use f0
as the predication of almost every instruction. This brings some
trouble when we want a two-level mask mechanism, for example with
the SEL instruction and some BOOL operations. We
have to use more than one instruction to do that, which simply
introduces 100% overhead for those instructions.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/gen/gen_mesa_disasm.c | 31 ++--
backend/src/backend/gen_context.cpp | 141 ++++++++---------
backend/src/backend/gen_defs.hpp | 5 +
backend/src/backend/gen_encoder.cpp | 29 +++-
backend/src/backend/gen_insn_selection.cpp | 237 +++++++++++++----------------
backend/src/backend/gen_insn_selection.hpp | 5 +
backend/src/backend/gen_insn_selection.hxx | 2 +-
backend/src/backend/gen_register.hpp | 13 +-
src/cl_api.c | 1 +
src/intel/intel_driver.c | 1 +
src/intel/intel_gpgpu.c | 2 +-
utests/compiler_long_cmp.cpp | 1 +
utests/compiler_unstructured_branch0.cpp | 4 +-
utests/compiler_unstructured_branch1.cpp | 3 +-
utests/compiler_unstructured_branch2.cpp | 10 +-
15 files changed, 249 insertions(+), 236 deletions(-)
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 84ef0c8..e58ef31 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -100,13 +100,13 @@ static const struct {
[GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
- [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 },
- [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 },
- [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+ [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
[GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
@@ -1126,17 +1126,18 @@ int gen_disasm (FILE *file, const void *opaque_insn)
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
inst->header.opcode == GEN_OPCODE_ELSE ||
inst->header.opcode == GEN_OPCODE_ENDIF ||
- inst->header.opcode == GEN_OPCODE_WHILE)) {
- // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
- assert(0);
+ inst->header.opcode == GEN_OPCODE_WHILE ||
+ inst->header.opcode == GEN_OPCODE_BRD ||
+ inst->header.opcode == GEN_OPCODE_JMPI)) {
+ format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
inst->header.opcode == GEN_OPCODE_CONTINUE ||
- inst->header.opcode == GEN_OPCODE_HALT)) {
- // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
- assert(0);
- } else if (inst->header.opcode == GEN_OPCODE_JMPI) {
+ inst->header.opcode == GEN_OPCODE_HALT ||
+ inst->header.opcode == GEN_OPCODE_BRC)) {
+ format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+ }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
format (file, " %d", inst->bits3.d);
- }
+ }*/
if (opcode[inst->header.opcode].nsrc > 0) {
pad (file, 32);
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index c46127a..bab059b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -87,33 +87,29 @@ namespace gbe
const LabelIndex label = pair.first;
const int32_t insnID = pair.second;
const int32_t targetID = labelPos.find(label)->second;
- p->patchJMPI(insnID, (targetID-insnID-1) * 2);
+ p->patchJMPI(insnID, (targetID - insnID) * 2);
+ }
+ for (auto pair : branchPos3) {
+ const LabelPair labelPair = pair.first;
+ const int32_t insnID = pair.second;
+ const int32_t jip = labelPos.find(labelPair.l0)->second + labelPair.offset0;
+ const int32_t uip = labelPos.find(labelPair.l1)->second + labelPair.offset1;
+ assert((jip - insnID) * 2 < 32767 && (jip - insnID) > -32768);
+ assert((uip - insnID) * 2 < 32767 && (uip - insnID) > -32768);
>>>>>>>>> the lower-bound check is wrong: it should be ((uip - insnID) * 2) > -32768, so that the doubled distance (the value actually encoded) is range-checked on both sides.
+ p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2));
}
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
- const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
- const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
- uint32_t execWidth = p->curr.execWidth;
+ const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- /* clear all the bit in f0.0. */
- p->curr.execWidth = 1;
- p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000));
- /* clear the barrier mask bits to all zero0*/
- p->curr.noMask = 0;
- p->curr.useFlag(0, 0);
- p->curr.execWidth = execWidth;
- /* set all the active lane to 1. Inactive lane remains 0. */
- p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
- p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
- p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF));
- p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+ p->curr.noMask = 0;
+ p->MOV(blockip, GenRegister::immuw(0));
p->pop();
}
@@ -148,7 +144,6 @@ namespace gbe
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
const uint32_t perThreadSize = perLaneSize * this->simdWidth;
- //const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
GBE_ASSERT(perLaneSize > 0);
GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
@@ -325,6 +320,7 @@ namespace gbe
for (int i = 0; i < w / 8; i ++) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
p->curr.accWrEnable = 1;
p->MACH(tmp, src0, src1);
@@ -500,6 +496,7 @@ namespace gbe
int execWidth = p->curr.execWidth;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->AND(dest, src.bottom_half(), GenRegister::immud(63));
@@ -539,6 +536,7 @@ namespace gbe
void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
p->SHR(sign, high, GenRegister::immud(31));
p->push();
+ p->curr.noMask = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
@@ -574,6 +572,7 @@ namespace gbe
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -626,6 +625,7 @@ namespace gbe
p->OR(a, e, f);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, a, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -639,6 +639,7 @@ namespace gbe
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -670,6 +671,7 @@ namespace gbe
p->push();
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_NZ, e, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
@@ -793,6 +795,7 @@ namespace gbe
case SEL_OP_I64SHL:
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
@@ -820,6 +823,7 @@ namespace gbe
case SEL_OP_I64SHR:
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
@@ -848,6 +852,7 @@ namespace gbe
f.type = GEN_TYPE_D;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
@@ -894,6 +899,7 @@ namespace gbe
p->push();
p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); //high == 0
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst, low);
@@ -911,6 +917,7 @@ namespace gbe
p->pop();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); //exp>23 && high!=0
@@ -936,6 +943,7 @@ namespace gbe
p->pop();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0
@@ -956,7 +964,7 @@ namespace gbe
p->SHL(high, low, tmp);
p->MOV(low, GenRegister::immud(0));
- p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
+ p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); //update dst where high != 0
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -970,7 +978,7 @@ namespace gbe
p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
- p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
+ p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2);
p->pop();
@@ -994,6 +1002,7 @@ namespace gbe
p->MOV(tmp_high, high);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -1006,6 +1015,7 @@ namespace gbe
UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -1039,6 +1049,7 @@ namespace gbe
if(dst.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -1066,11 +1077,10 @@ namespace gbe
f1.width = GEN_WIDTH_1;
GenRegister f2 = GenRegister::suboffset(f1, 1);
GenRegister f3 = GenRegister::suboffset(f1, 2);
- GenRegister f4 = GenRegister::suboffset(f1, 3);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
- saveFlag(f4, flag, subFlag);
+ p->curr.noMask = 1;
loadTopHalf(tmp0, src0);
loadTopHalf(tmp1, src1);
switch(insn.extra.function) {
@@ -1130,12 +1140,13 @@ namespace gbe
NOT_IMPLEMENTED;
}
p->curr.execWidth = 1;
- p->AND(f1, f1, f4);
p->MOV(GenRegister::flag(flag, subFlag), f1);
p->pop();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MOV(dst, GenRegister::immd(0));
+ p->curr.noMask = 0;
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst, GenRegister::immd(-1));
p->pop();
@@ -1163,6 +1174,7 @@ namespace gbe
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
@@ -1176,6 +1188,7 @@ namespace gbe
p->MOV(a, GenRegister::immud(0x80000000u));
p->MOV(b, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
@@ -1209,6 +1222,7 @@ namespace gbe
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
@@ -1238,6 +1252,7 @@ namespace gbe
src = src.top_half();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
@@ -1252,6 +1267,7 @@ namespace gbe
int execWidth = p->curr.execWidth;
dest = dest.top_half();
p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->curr.nibControl = 1;
@@ -1271,6 +1287,7 @@ namespace gbe
src = src.bottom_half();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
@@ -1286,6 +1303,7 @@ namespace gbe
dest = dest.bottom_half();
p->push();
p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dest, src);
p->curr.nibControl = 1;
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
@@ -1369,6 +1387,7 @@ namespace gbe
loadBottomHalf(d, y);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
p->ADD(e, e, f);
@@ -1443,6 +1462,7 @@ namespace gbe
// condition <- (c,d)==0 && (a,b)>=(e,f)
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MOV(l, zero);
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_EQ, a, e);
@@ -1477,6 +1497,7 @@ namespace gbe
p->ADD(m, m, one);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
@@ -1484,7 +1505,6 @@ namespace gbe
p->curr.noMask = 1;
p->AND(flagReg, flagReg, emaskReg);
- p->curr.predicate = GEN_PREDICATE_NORMAL;
// under condition, jump back to start point
if (simdWidth == 8)
p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
@@ -1493,8 +1513,9 @@ namespace gbe
else
NOT_IMPLEMENTED;
int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
+ p->curr.noMask = 1;
p->JMPI(zero);
- p->patchJMPI(p->n_instruction()-2, jip);
+ p->patchJMPI(p->n_instruction() - 2, jip + 2);
p->pop();
// end of loop
}
@@ -1502,6 +1523,7 @@ namespace gbe
if(x.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -1534,7 +1556,7 @@ namespace gbe
}
void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
- NOT_IMPLEMENTED;
+ p->NOP();
}
void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
@@ -1546,59 +1568,24 @@ namespace gbe
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
- GenRegister blockIP;
- uint32_t exeWidth = p->curr.execWidth;
- ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex();
-
- if (exeWidth == 16)
- blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
- else if (exeWidth == 8)
- blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
- p->push();
- /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
- after the barrier, If there is any lane still remains zero. */
- p->MOV(blockIP, GenRegister::immuw(0xFFFF));
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
- this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
- if (exeWidth == 16)
- p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
- else if (exeWidth == 8)
- p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
- else
- NOT_IMPLEMENTED;
- p->curr.inversePredicate = 1;
- // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
- // jump to next basic block.
- p->JMPI(GenRegister::immud(0));
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
- p->pop();
-
- p->push();
- p->curr.useFlag(0, 0);
- /* Restore the blockIP to current label. */
- p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
if (barrierType == ir::syncGlobalBarrier) {
p->FENCE(fenceDst);
p->MOV(fenceDst, fenceDst);
}
- p->curr.predicate = GEN_PREDICATE_NONE;
- // As only the payload.2 is used and all the other regions are ignored
- // SIMD8 mode here is safe.
- p->curr.execWidth = 8;
- p->curr.physicalFlag = 0;
- p->curr.noMask = 1;
- // Copy barrier id from r0.
- p->AND(src, barrierId, GenRegister::immud(0x0f000000));
- // A barrier is OK to start the thread synchronization *and* SLM fence
- p->BARRIER(src);
- // Now we wait for the other threads
- p->curr.execWidth = 1;
- p->WAIT();
- // we executed the barrier then restore the barrier soft mask to initial value.
- p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+ p->push();
+ // As only the payload.2 is used and all the other regions are ignored
+ // SIMD8 mode here is safe.
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ // Copy barrier id from r0.
+ p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+ // A barrier is OK to start the thread synchronization *and* SLM fence
+ p->BARRIER(src);
+ p->curr.execWidth = 1;
+ // Now we wait for the other threads
+ p->WAIT();
p->pop();
}
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 7c49497..e731174 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -896,6 +896,11 @@ struct GenInstruction
uint32_t end_of_thread:1;
} gen7_msg_gw;
+ struct {
+ uint32_t jip:16;
+ uint32_t uip:16;
+ } gen7_branch;
+
int d;
uint32_t ud;
float f;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index fc7e53d..06aa769 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -837,6 +837,7 @@ namespace gbe
GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
push();
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
curr.execWidth = 1;
MOV(r, GenRegister::immud(u.u[1]));
MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
@@ -907,6 +908,7 @@ namespace gbe
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
MOV(r0, src0);
MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
curr.predicate = GEN_PREDICATE_NORMAL;
@@ -920,6 +922,7 @@ namespace gbe
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
MOV(r0, GenRegister::suboffset(src0, 8));
MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
curr.predicate = GEN_PREDICATE_NORMAL;
@@ -1058,7 +1061,7 @@ namespace gbe
#define ALU2_BRA(OP) \
void GenEncoder::OP(GenRegister src) { \
- alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(), src); \
+ alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \
}
ALU2_BRA(IF)
@@ -1071,9 +1074,21 @@ namespace gbe
GBE_ASSERT(insnID < this->store.size());
GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
insn.header.opcode == GEN_OPCODE_BRD ||
- insn.header.opcode == GEN_OPCODE_ENDIF);
- if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
- this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ insn.header.opcode == GEN_OPCODE_ENDIF ||
+ insn.header.opcode == GEN_OPCODE_IF ||
+ insn.header.opcode == GEN_OPCODE_BRC);
+
+ if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) {
+ int offset = 0;
+ if (insn.header.opcode == GEN_OPCODE_IF) {
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ return;
+ }
+ else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+ offset = -2;
+ /*assert(jumpDistance > -32769 && jumpDistance < 32768);*/
+ }
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset));
} else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
// For the conditional jump distance out of S15 range, we need to use an
// inverted jmp followed by a add ip, ip, distance to implement.
@@ -1085,10 +1100,12 @@ namespace gbe
// for all the branching instruction. And need to adjust the distance
// for those branch instruction's start point and end point contains
// this instruction.
+ GenInstruction &insn2 = this->store[insnID+1];
+ GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
insn.header.opcode = GEN_OPCODE_ADD;
this->setDst(&insn, GenRegister::ip());
this->setSrc0(&insn, GenRegister::ip());
- this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
} else {
insn.header.predicate_inverse ^= 1;
this->setSrc1(&insn, GenRegister::immd(2));
@@ -1099,7 +1116,7 @@ namespace gbe
insn2.header.opcode = GEN_OPCODE_ADD;
this->setDst(&insn2, GenRegister::ip());
this->setSrc0(&insn2, GenRegister::ip());
- this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
+ this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
}
}
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d86e04c..147c3e6 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -76,8 +76,6 @@
*
* Also, there is some extra kludge to handle the predicates for JMPI.
*
- * See TODO for a better idea for branching and masking
- *
* TODO:
* =====
*
@@ -92,14 +90,9 @@
* interesting approach which consists in traversing the dominator tree in post
* order
*
- * About masking and branching, a much better idea (that I found later unfortunately)
- * is to replace the use of the flag by uses of if/endif to enclose the basic
- * block. So, instead of using predication, we use auto-masking. The very cool
- * consequence is that we can reintegrate back the structured branches.
- * Basically, we will be able to identify branches that can be mapped to
- * structured branches and mix nicely unstructured branches (which will use
- * jpmi, if/endif to mask the blocks) and structured branches (which are pretty
- * fast)
+ * We already use if/endif to enclose each basic block. We will continue to identify
+ * those blocks which could match to structured branching and use pure structured
+ * instruction to handle them completely.
*/
#include "backend/gen_insn_selection.hpp"
@@ -320,38 +313,6 @@ namespace gbe
INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
/*! indicate whether a register is a scalar/uniform register. */
INLINE bool isScalarReg(const ir::Register ®) const {
-#if 0
- printf("reg %d ", reg.value());
- printf("uniform: %d ", getRegisterData(reg).isUniform());
- if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n"); return true; }
- if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push location.\n"); return true; }
- if (reg == ir::ocl::groupid0 ||
- reg == ir::ocl::groupid1 ||
- reg == ir::ocl::groupid2 ||
- reg == ir::ocl::barrierid ||
- reg == ir::ocl::threadn ||
- reg == ir::ocl::numgroup0 ||
- reg == ir::ocl::numgroup1 ||
- reg == ir::ocl::numgroup2 ||
- reg == ir::ocl::lsize0 ||
- reg == ir::ocl::lsize1 ||
- reg == ir::ocl::lsize2 ||
- reg == ir::ocl::gsize0 ||
- reg == ir::ocl::gsize1 ||
- reg == ir::ocl::gsize2 ||
- reg == ir::ocl::goffset0 ||
- reg == ir::ocl::goffset1 ||
- reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim ||
- reg == ir::ocl::emask ||
- reg == ir::ocl::notemask ||
- reg == ir::ocl::barriermask
- ) {
- printf("special reg.\n");
- return true;
- }
- return false;
-#endif
const ir::RegisterData ®Data = getRegisterData(reg);
return regData.isUniform();
}
@@ -992,7 +953,7 @@ namespace gbe
}
void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
insn->src(0) = src;
insn->index = uint16_t(jip);
}
@@ -1412,9 +1373,17 @@ namespace gbe
for (uint32_t regID = 0; regID < this->regNum; ++regID)
this->regDAG[regID] = NULL;
+ this->block->hasBarrier = false;
+ this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
+ bb.getLastInstruction()->getOpcode() == OP_RET;
+ if (!this->block->hasBranch)
+ this->block->endifOffset = -1;
+
// Build the DAG on the fly
uint32_t insnNum = 0;
const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+ if (insn.getOpcode() == OP_SYNC)
+ this->block->hasBarrier = true;
// Build a selectionDAG node for instruction
SelectionDAG *dag = this->newSelectionDAG(insn);
@@ -1465,6 +1434,7 @@ namespace gbe
void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
{
// Bottom up code generation
+ bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
// Process all possible patterns for this instruction
SelectionDAG &dag = *insnDAG[insnID];
@@ -1476,8 +1446,10 @@ namespace gbe
// Start a new code fragment
this->startBackwardGeneration();
+ // If there is no branch at the end of this block.
// Try all the patterns from best to worst
+
do {
if ((*it)->emit(*this, dag))
break;
@@ -1485,6 +1457,13 @@ namespace gbe
} while (it != end);
GBE_ASSERT(it != end);
+ if (needEndif) {
+ const ir::BasicBlock *curr = insn.getParent();
+ const ir::BasicBlock *next = curr->getNextBlock();
+ this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+ needEndif = false;
+ }
+
// Output the code in the current basic block
this->endBackwardGeneration();
}
@@ -2133,6 +2112,7 @@ namespace gbe
const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
sel.push();
+ sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.execWidth = simdWidth;
sel.SEL_CMP(genCmp, tmp, src0, src1);
@@ -2329,7 +2309,6 @@ namespace gbe
const Type type = insn.getType();
const Immediate imm = insn.getImmediate();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
- GenRegister flagReg;
sel.push();
if (sel.isScalarOrBool(insn.getDst(0)) == true) {
@@ -2371,24 +2350,10 @@ namespace gbe
{
using namespace ir;
const ir::Register reg = sel.reg(FAMILY_DWORD);
- const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
const uint32_t params = insn.getParameters();
- sel.push();
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask);
- sel.MOV(GenRegister::flag(1, 1), barrierMask);
- sel.pop();
-
// A barrier is OK to start the thread synchronization *and* SLM fence
- sel.push();
- //sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.flag = 1;
- sel.curr.subFlag = 1;
- sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
- sel.pop();
+ sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
return true;
}
@@ -2696,7 +2661,7 @@ namespace gbe
GenRegister tmpDst;
if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)
- tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL);
+ tmpDst = sel.selReg(dst, TYPE_BOOL);
else
tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32);
@@ -2724,36 +2689,23 @@ namespace gbe
sel.push();
sel.curr.flag = 1;
sel.curr.subFlag = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[3];
for(int i=0; i<3; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
- sel.push();
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
- sel.pop();
- sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst);
} else if(opcode == OP_ORD) {
sel.push();
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
+ sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
sel.pop();
- sel.curr.predicate = GEN_PREDICATE_NORMAL;
-
- sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
- sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
} else
sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
sel.pop();
if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16))
sel.MOV(sel.selReg(dst, TYPE_U16), GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg));
- else
- sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst);
return true;
}
};
@@ -2979,11 +2931,6 @@ namespace gbe
markAllChildren(dag);
}
- // Since we cannot predicate the select instruction with our current mask,
- // we need to perform the selection in two steps (one to select, one to
- // update the destination register)
- const RegisterFamily family = getFamily(type);
- const GenRegister tmp = sel.selReg(sel.reg(family), type);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Register pred = insn.getPredicate();
sel.push();
@@ -2992,16 +2939,14 @@ namespace gbe
sel.curr.flag = 1;
sel.curr.subFlag = 1;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
- sel.curr.noMask = 0;
+ //sel.curr.noMask = 0;
sel.curr.predicate = GEN_PREDICATE_NORMAL;
if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
- sel.SEL_INT64(tmp, src0, src1);
+ sel.SEL_INT64(dst, src0, src1);
else
- sel.SEL(tmp, src0, src1);
+ sel.SEL(dst, src0, src1);
sel.pop();
- // Update the destination register properly now
- sel.MOV(dst, tmp);
return true;
}
};
@@ -3041,6 +2986,7 @@ namespace gbe
DECL_CTOR(TernaryInstruction, 1, 1);
};
+
/*! Label instruction pattern */
DECL_PATTERN(LabelInstruction)
{
@@ -3053,42 +2999,75 @@ namespace gbe
const uint32_t simdWidth = sel.ctx.getSimdWidth();
sel.LABEL(label);
- // Do not emit any code for the "returning" block. There is no need for it
- if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
+ // Do not emit any code for the "returning" block. There is no need for it
+ if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
return true;
+ LabelIndex jip;
+ const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
+ if (sel.ctx.hasJIP(&insn))
+ jip = sel.ctx.getLabelIndex(&insn);
+ else
+ jip = nextLabel;
+
// Emit the mask computation at the head of each basic block
sel.push();
+ sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.flag = 0;
- sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1);
sel.pop();
- // If it is required, insert a JUMP to bypass the block
- if (sel.ctx.hasJIP(&insn)) {
- const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+ if (sel.block->hasBarrier) {
+ // If this block has barrier, we don't execute the block until all lanes
+ // are 1s. Set each reached lane to 1, then check all lanes. If there is any
+ // lane not reached, we jump to jip. And no need to issue if/endif for
+ // this block, as it will always execute with all lanes activated.
sel.push();
-
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
sel.curr.predicate = GEN_PREDICATE_NONE;
- GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
- GenRegister flagReg = GenRegister::flag(0, 0);
- sel.AND(flagReg, flagReg, emaskReg);
-
+ sel.curr.noMask = 1;
+ sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
if (simdWidth == 8)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
else
NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
sel.curr.inversePredicate = 1;
- sel.curr.flag = 0;
- sel.curr.subFlag = 0;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ // FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+ sel.pop();
+ }
+ else {
+ if (sel.ctx.hasJIP(&insn)) {
+ // If it is required, insert a JUMP to bypass the block
+ sel.push();
+ if (simdWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else if (simdWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.inversePredicate = 1;
+ sel.JMPI(GenRegister::immd(0), jip);
+ sel.pop();
+ }
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ // It's easier to set the jip to a relative position over the next block.
+ sel.IF(GenRegister::immd(0), nextLabel, nextLabel, sel.block->endifOffset, sel.block->endifOffset);
+ sel.pop();
}
+
return true;
}
DECL_CTOR(LabelInstruction, 1, 1);
@@ -3225,7 +3204,6 @@ namespace gbe
/*! Branch instruction pattern */
DECL_PATTERN(BranchInstruction)
{
-
void emitForwardBranch(Selection::Opaque &sel,
const ir::BranchInstruction &insn,
ir::LabelIndex dst,
@@ -3233,16 +3211,13 @@ namespace gbe
{
using namespace ir;
const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
- const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
// We will not emit any jump if we must go the next block anyway
const BasicBlock *curr = insn.getParent();
const BasicBlock *next = curr->getNextBlock();
const LabelIndex nextLabel = next->getLabelIndex();
-
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
-
sel.push();
// we don't need to set next label to the pcip
// as if there is no backward jump latter, then obviously everything will work fine.
@@ -3250,22 +3225,30 @@ namespace gbe
sel.curr.flag = 0;
sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
sel.pop();
-
- if (nextLabel == jip) return;
} else {
// Update the PcIPs
+ const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
- // Do not emit branch when we go to the next block anyway
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
if (nextLabel == jip) return;
+ // Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ // FIXME just for the correct endif offset.
+ // JMPI still takes 2 instructions.
+ sel.block->endifOffset -= 2;
}
}
@@ -3290,37 +3273,32 @@ namespace gbe
// that actually take the branch
const LabelIndex next = bb.getNextBlock()->getLabelIndex();
sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
-
+ GBE_ASSERT(jip == dst);
sel.push();
sel.curr.flag = 0;
sel.curr.subFlag = 0;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
- // Re-update the PcIPs for the branches that takes the backward jump
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
- // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
- // will check those bits as well.
sel.curr.predicate = GEN_PREDICATE_NONE;
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
- sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1), emaskReg);
-
- // Branch to the jump target
- if (simdWidth == 8)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
- else if (simdWidth == 16)
+ if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
- NOT_SUPPORTED;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
+ sel.block->endifOffset = -3;
sel.pop();
-
} else {
-
+ const LabelIndex next = bb.getNextBlock()->getLabelIndex();
// Update the PcIPs
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
// Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
@@ -3328,6 +3306,7 @@ namespace gbe
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ sel.block->endifOffset = -3;
}
}
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 04fbb9f..8557768 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -42,6 +42,8 @@ namespace gbe
/*! Translate IR compare to Gen compare */
uint32_t getGenCompare(ir::Opcode opcode);
+ #define GEN_MAX_LABEL 0xFFFF
+
/*! Selection opcodes properly encoded from 0 to n for fast jump tables
* generations
*/
@@ -190,6 +192,9 @@ namespace gbe
void append(SelectionInstruction *insn);
/*! Append a new selection instruction at the beginning of the block */
void prepend(SelectionInstruction *insn);
+ int endifOffset;
+ bool hasBarrier;
+ bool hasBranch;
};
/*! Owns the selection engine */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index d318f8e..ddc9d5e 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -80,7 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
-DECL_SELECTION_IR(BRC, BinaryInstruction)
+DECL_SELECTION_IR(BRC, UnaryInstruction)
DECL_SELECTION_IR(BRD, UnaryInstruction)
DECL_SELECTION_IR(IF, UnaryInstruction)
DECL_SELECTION_IR(ENDIF, UnaryInstruction)
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 25cb428..051f16d 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -118,7 +118,7 @@ namespace gbe
this->noMask = 0;
this->flag = 0;
this->subFlag = 0;
- this->predicate = GEN_PREDICATE_NORMAL;
+ this->predicate = GEN_PREDICATE_NONE;
this->inversePredicate = 0;
this->physicalFlag = 1;
this->flagIndex = 0;
@@ -657,6 +657,17 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_1);
}
+ static INLINE GenRegister nullud(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_NULL,
+ 0,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+
static INLINE bool isNull(GenRegister reg) {
return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
&& reg.nr == GEN_ARF_NULL);
diff --git a/src/cl_api.c b/src/cl_api.c
index 9638994..b572500 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -2526,6 +2526,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+ clFinish(command_queue);
error:
return err;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 5e474de..c2308da 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -135,6 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver)
{
driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
assert(driver->bufmgr);
+ //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
}
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index e95b050..4819e9e 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -695,7 +695,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
memset(desc, 0, sizeof(*desc));
ker_bo = (drm_intel_bo *) kernel->bo;
desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
- desc->desc1.single_program_flow = 1;
+ desc->desc1.single_program_flow = 0;
desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
desc->desc5.rounding_mode = 0; /* round to nearest even */
diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
index 35d4c4f..f901fdf 100644
--- a/utests/compiler_long_cmp.cpp
+++ b/utests/compiler_long_cmp.cpp
@@ -45,6 +45,7 @@ void compiler_long_cmp(void)
int64_t *dest = (int64_t *)buf_data[2];
int64_t x = (src1[i] < src2[i]) ? 3 : 4;
OCL_ASSERT(x == dest[i]);
+ //printf("%d %ld %ld \n", i, dest[i], x);
}
OCL_UNMAP_BUFFER(2);
OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
diff --git a/utests/compiler_unstructured_branch0.cpp b/utests/compiler_unstructured_branch0.cpp
index 128a53e..1a371e9 100644
--- a/utests/compiler_unstructured_branch0.cpp
+++ b/utests/compiler_unstructured_branch0.cpp
@@ -27,7 +27,6 @@ static void compiler_unstructured_branch0(void)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
for (uint32_t i = 16; i < 32; ++i)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
-
// Second control flow
for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
OCL_UNMAP_BUFFER(0);
@@ -36,8 +35,7 @@ static void compiler_unstructured_branch0(void)
OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < 32; ++i)
- OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
-
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
// Third control flow
for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
OCL_UNMAP_BUFFER(0);
diff --git a/utests/compiler_unstructured_branch1.cpp b/utests/compiler_unstructured_branch1.cpp
index 6021f5b..fb24cec 100644
--- a/utests/compiler_unstructured_branch1.cpp
+++ b/utests/compiler_unstructured_branch1.cpp
@@ -25,7 +25,6 @@ static void compiler_unstructured_branch1(void)
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < n; ++i)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
-
// Second control flow
for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
OCL_UNMAP_BUFFER(0);
@@ -34,7 +33,7 @@ static void compiler_unstructured_branch1(void)
OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < n; ++i)
- OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
+ OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
// Third control flow
for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
diff --git a/utests/compiler_unstructured_branch2.cpp b/utests/compiler_unstructured_branch2.cpp
index d61c6b5..68c7448 100644
--- a/utests/compiler_unstructured_branch2.cpp
+++ b/utests/compiler_unstructured_branch2.cpp
@@ -23,6 +23,7 @@ static void compiler_unstructured_branch2(void)
// First control flow
OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
+#if 1
for (uint32_t i = 0; i < n; ++i)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
@@ -35,7 +36,7 @@ static void compiler_unstructured_branch2(void)
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < n; ++i)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
-
+#endif
// Third control flow
for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
@@ -45,9 +46,16 @@ static void compiler_unstructured_branch2(void)
OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < 8; ++i)
+ {
+ //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], 12);
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+ }
for (uint32_t i = 8; i < n; ++i)
+ {
+ //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], -6);
OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+ }
+ //exit(0);
// Fourth control flow
for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 1;
--
1.8.3.2
_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list