[Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to handle each blocks.
Gong, Zhigang
zhigang.gong at intel.com
Wed Apr 2 01:51:02 PDT 2014
> -----Original Message-----
> From: Yang, Rong R
> Sent: Wednesday, April 2, 2014 4:41 PM
> To: Gong, Zhigang; beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: RE: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF
> to handle each blocks.
>
>
>
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Friday, March 28, 2014 3:11 PM
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to
> handle each blocks.
>
> When SPF (single program flow) is enabled, we always need to use f0
> as the predicate of almost every instruction. This brings some
> trouble when we want a two-level mask mechanism, for example the
> SEL instruction and some BOOL operations. We have to use more than
> one instruction to do that, which simply introduces 100% overhead
> on those instructions.
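
To make the two-level mask issue concrete, here is a small standalone host-C++ sketch (not part of the patch): a predicated SEL must honour both the block's execution mask and the SEL's own condition per lane. Under SPF both have to be folded into f0 with extra instructions; with each block enclosed in IF/ENDIF, the hardware channel enables carry the block mask for free and f0 is left to the SEL condition alone.

// Hypothetical lane-mask sketch, not Beignet code: why SPF costs extra for SEL/BOOL ops.
#include <cstdint>
#include <cstdio>

static void simdSel(uint16_t blockMask, uint16_t cond, int a, int b, int dst[16]) {
  for (int lane = 0; lane < 16; ++lane)
    if ((blockMask >> lane) & 1)                 // block mask: channel enables (IF/ENDIF) or f0 (SPF)
      dst[lane] = ((cond >> lane) & 1) ? a : b;  // SEL condition: flag register
}

int main(void) {
  int dst[16] = {0};
  simdSel(/*blockMask*/0x00FF, /*cond*/0x0F0F, 3, 4, dst);
  for (int lane = 0; lane < 16; ++lane)
    printf("%d ", dst[lane]);                    // lanes 0-7 get 3 or 4, lanes 8-15 stay untouched
  printf("\n");
  return 0;
}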
>
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
> backend/src/backend/gen/gen_mesa_disasm.c | 31 ++--
> backend/src/backend/gen_context.cpp | 141 ++++++++---------
> backend/src/backend/gen_defs.hpp | 5 +
> backend/src/backend/gen_encoder.cpp | 29 +++-
> backend/src/backend/gen_insn_selection.cpp | 237 +++++++++++++----------------
> backend/src/backend/gen_insn_selection.hpp | 5 +
> backend/src/backend/gen_insn_selection.hxx | 2 +-
> backend/src/backend/gen_register.hpp | 13 +-
> src/cl_api.c | 1 +
> src/intel/intel_driver.c | 1 +
> src/intel/intel_gpgpu.c | 2 +-
> utests/compiler_long_cmp.cpp | 1 +
> utests/compiler_unstructured_branch0.cpp | 4 +-
> utests/compiler_unstructured_branch1.cpp | 3 +-
> utests/compiler_unstructured_branch2.cpp | 10 +-
> 15 files changed, 249 insertions(+), 236 deletions(-)
>
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
> b/backend/src/backend/gen/gen_mesa_disasm.c
> index 84ef0c8..e58ef31 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -100,13 +100,13 @@ static const struct {
> [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
> [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
> [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
> - [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 },
> - [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
> - [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 },
> - [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
> - [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
> - [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
> - [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
> + [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
> + [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
> + [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
> + [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
> + [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
> + [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
> + [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
> [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
> [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
> [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
> @@ -1126,17 +1126,18 @@ int gen_disasm (FILE *file, const void
> *opaque_insn)
> } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
> inst->header.opcode == GEN_OPCODE_ELSE ||
> inst->header.opcode == GEN_OPCODE_ENDIF ||
> - inst->header.opcode == GEN_OPCODE_WHILE)) {
> - // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
> - assert(0);
> + inst->header.opcode == GEN_OPCODE_WHILE ||
> + inst->header.opcode == GEN_OPCODE_BRD ||
> + inst->header.opcode == GEN_OPCODE_JMPI)) {
> + format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
> } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
> inst->header.opcode == GEN_OPCODE_CONTINUE ||
> - inst->header.opcode == GEN_OPCODE_HALT)) {
> - // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
> - assert(0);
> - } else if (inst->header.opcode == GEN_OPCODE_JMPI) {
> + inst->header.opcode == GEN_OPCODE_HALT ||
> + inst->header.opcode == GEN_OPCODE_BRC)) {
> + format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
> + }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
> format (file, " %d", inst->bits3.d);
> - }
> + }*/
>
> if (opcode[inst->header.opcode].nsrc > 0) {
> pad (file, 32);
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index c46127a..bab059b 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -87,33 +87,29 @@ namespace gbe
> const LabelIndex label = pair.first;
> const int32_t insnID = pair.second;
> const int32_t targetID = labelPos.find(label)->second;
> - p->patchJMPI(insnID, (targetID-insnID-1) * 2);
> + p->patchJMPI(insnID, (targetID - insnID) * 2);
> + }
> + for (auto pair : branchPos3) {
> + const LabelPair labelPair = pair.first;
> + const int32_t insnID = pair.second;
> + const int32_t jip = labelPos.find(labelPair.l0)->second + labelPair.offset0;
> + const int32_t uip = labelPos.find(labelPair.l1)->second + labelPair.offset1;
> + assert((jip - insnID) * 2 < 32767 && (jip - insnID) > -32768);
> + assert((uip - insnID) * 2 < 32767 && (uip - insnID) > -32768);
> >>>>>>>>> should be (uip - insnID) * 2 > -32768
Good catch. Will fix it later. Thanks.
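
For reference, a standalone sketch (illustrative names, not patch code) of how the check reads after that fix, together with the JIP/UIP packing that patchJMPI writes above. Distances are in 8-byte jump units, i.e. the instruction distance times two, and each field must fit a signed 16-bit immediate.

// Corrected range check plus the (uip << 16) | jip packing, as a host-side sketch.
#include <cassert>
#include <cstdint>

static uint32_t packBranchOffsets(int32_t insnID, int32_t jipTarget, int32_t uipTarget) {
  const int32_t jip = (jipTarget - insnID) * 2;  // 8-byte units
  const int32_t uip = (uipTarget - insnID) * 2;
  assert(jip < 32768 && jip > -32769);           // both bounds checked on the scaled value
  assert(uip < 32768 && uip > -32769);
  return (uint32_t(uint16_t(uip)) << 16) | uint16_t(jip);
}

int main(void) {
  // e.g. an IF at instruction 10 with JIP to instruction 14 and UIP to instruction 20
  const uint32_t word = packBranchOffsets(10, 14, 20);
  return word == ((20u << 16) | 8u) ? 0 : 1;     // uip = 20, jip = 8, in 8-byte units
}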
>
>
>
> + p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2));
> }
> }
>
> void GenContext::clearFlagRegister(void) {
> // when group size not aligned to simdWidth, flag register need clear to
> // make prediction(any8/16h) work correctly
> - const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
> - const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
> - uint32_t execWidth = p->curr.execWidth;
> + const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
> p->push();
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - /* clear all the bit in f0.0. */
> - p->curr.execWidth = 1;
> - p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000));
> - /* clear the barrier mask bits to all zero0*/
> - p->curr.noMask = 0;
> - p->curr.useFlag(0, 0);
> - p->curr.execWidth = execWidth;
> - /* set all the active lane to 1. Inactive lane remains 0. */
> - p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
> - p->curr.noMask = 1;
> - p->curr.execWidth = 1;
> - p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
> - p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF));
> - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
> + p->curr.noMask = 1;
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
> + p->curr.noMask = 0;
> + p->MOV(blockip, GenRegister::immuw(0));
> p->pop();
> }
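
A minimal sketch (host C++, not Beignet code) of what the rewritten clearFlagRegister achieves when the work-group size is not a multiple of the SIMD width: the noMask MOV tags every physical lane with GEN_MAX_LABEL, the masked MOV resets only the dispatched lanes, and the per-label CMP LE later never re-enables the tail lanes.

// blockip initialisation and the per-label activation check, simulated per lane.
#include <cstdint>
#include <cstdio>

int main(void) {
  const int simdWidth = 16, groupSize = 13;   // group size not aligned to the SIMD width
  uint16_t blockip[16];
  for (int lane = 0; lane < simdWidth; ++lane) blockip[lane] = 0xFFFF;  // noMask MOV
  for (int lane = 0; lane < groupSize; ++lane) blockip[lane] = 0;       // masked MOV

  const uint16_t label = 3;                   // some later basic-block label
  uint16_t flag = 0;                          // CMP LE blockip, label
  for (int lane = 0; lane < simdWidth; ++lane)
    if (blockip[lane] <= label) flag |= 1u << lane;
  printf("active lanes at label %u: 0x%04x\n", label, flag);  // tail lanes 13-15 stay off
  return 0;
}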
>
> @@ -148,7 +144,6 @@ namespace gbe
> // Check that everything is consistent in the kernel code
> const uint32_t perLaneSize = kernel->getStackSize();
> const uint32_t perThreadSize = perLaneSize * this->simdWidth;
> - //const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
> GBE_ASSERT(perLaneSize > 0);
> GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
> GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
> @@ -325,6 +320,7 @@ namespace gbe
> for (int i = 0; i < w / 8; i ++) {
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD),
> src0, src1);
> p->curr.accWrEnable = 1;
> p->MACH(tmp, src0, src1);
> @@ -500,6 +496,7 @@ namespace gbe
> int execWidth = p->curr.execWidth;
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.execWidth = 8;
> for (int nib = 0; nib < execWidth / 4; nib ++) {
> p->AND(dest, src.bottom_half(), GenRegister::immud(63));
> @@ -539,6 +536,7 @@ namespace gbe
> void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister
> low, GenRegister tmp, GenRegister flagReg) {
> p->SHR(sign, high, GenRegister::immud(31));
> p->push();
> + p->curr.noMask = 1;
> p->curr.predicate = GEN_PREDICATE_NONE;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
> @@ -574,6 +572,7 @@ namespace gbe
> I64FullMult(e, f, g, h, a, b, c, d);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -626,6 +625,7 @@ namespace gbe
> p->OR(a, e, f);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_NZ, a, zero);
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -639,6 +639,7 @@ namespace gbe
> I64FullMult(e, f, g, h, a, b, c, d);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_NZ, i, zero);
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -670,6 +671,7 @@ namespace gbe
> p->push();
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->CMP(GEN_CONDITIONAL_NZ, e, zero);
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->MOV(b, one);
> @@ -793,6 +795,7 @@ namespace gbe
> case SEL_OP_I64SHL:
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> collectShifter(a, y);
> loadBottomHalf(e, x);
> loadTopHalf(f, x);
> @@ -820,6 +823,7 @@ namespace gbe
> case SEL_OP_I64SHR:
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> collectShifter(a, y);
> loadBottomHalf(e, x);
> loadTopHalf(f, x);
> @@ -848,6 +852,7 @@ namespace gbe
> f.type = GEN_TYPE_D;
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> collectShifter(a, y);
> loadBottomHalf(e, x);
> loadTopHalf(f, x);
> @@ -894,6 +899,7 @@ namespace gbe
> p->push();
> p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32));
> //high == 0
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->MOV(dst, low);
> @@ -911,6 +917,7 @@ namespace gbe
> p->pop();
>
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32));
> //exp>23 && high!=0
> @@ -936,6 +943,7 @@ namespace gbe
> p->pop();
>
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0
> @@ -956,7 +964,7 @@ namespace gbe
> p->SHL(high, low, tmp);
> p->MOV(low, GenRegister::immud(0));
>
> - p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
> + p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2);
> p->curr.predicate = GEN_PREDICATE_NONE;
> p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31));
> //update dst where high != 0
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -970,7 +978,7 @@ namespace gbe
> p->CMP(GEN_CONDITIONAL_EQ, high,
> GenRegister::immud(0x80000000));
> p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
> p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
> - p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
> + p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2);
>
> p->pop();
>
> @@ -994,6 +1002,7 @@ namespace gbe
> p->MOV(tmp_high, high);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
> p->CMP(GEN_CONDITIONAL_GE, tmp_high,
> GenRegister::immud(0x80000000));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1006,6 +1015,7 @@ namespace gbe
> UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
> p->CMP(GEN_CONDITIONAL_GE, tmp_high,
> GenRegister::immud(0x80000000));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1039,6 +1049,7 @@ namespace gbe
> if(dst.is_signed_int()) {
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
> p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1066,11 +1077,10 @@ namespace gbe
> f1.width = GEN_WIDTH_1;
> GenRegister f2 = GenRegister::suboffset(f1, 1);
> GenRegister f3 = GenRegister::suboffset(f1, 2);
> - GenRegister f4 = GenRegister::suboffset(f1, 3);
>
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> - saveFlag(f4, flag, subFlag);
> + p->curr.noMask = 1;
> loadTopHalf(tmp0, src0);
> loadTopHalf(tmp1, src1);
> switch(insn.extra.function) {
> @@ -1130,12 +1140,13 @@ namespace gbe
> NOT_IMPLEMENTED;
> }
> p->curr.execWidth = 1;
> - p->AND(f1, f1, f4);
> p->MOV(GenRegister::flag(flag, subFlag), f1);
> p->pop();
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->MOV(dst, GenRegister::immd(0));
> + p->curr.noMask = 0;
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->MOV(dst, GenRegister::immd(-1));
> p->pop();
> @@ -1163,6 +1174,7 @@ namespace gbe
> p->ADD(c, c, d);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> if(! dst.is_signed_int()) {
> p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
> @@ -1176,6 +1188,7 @@ namespace gbe
> p->MOV(a, GenRegister::immud(0x80000000u));
> p->MOV(b, GenRegister::immud(0));
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->CMP(GEN_CONDITIONAL_GE, a,
> GenRegister::immud(0x80000000u));
> @@ -1209,6 +1222,7 @@ namespace gbe
> p->ADD(c, c, d);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> if(! dst.is_signed_int()) {
> p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
> @@ -1238,6 +1252,7 @@ namespace gbe
> src = src.top_half();
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.execWidth = 8;
> p->MOV(dest, src);
> p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src,
> 4));
> @@ -1252,6 +1267,7 @@ namespace gbe
> int execWidth = p->curr.execWidth;
> dest = dest.top_half();
> p->push();
> + p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->curr.execWidth = 8;
> p->MOV(dest, src);
> p->curr.nibControl = 1;
> @@ -1271,6 +1287,7 @@ namespace gbe
> src = src.bottom_half();
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.execWidth = 8;
> p->MOV(dest, src);
> p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src,
> 4));
> @@ -1286,6 +1303,7 @@ namespace gbe
> dest = dest.bottom_half();
> p->push();
> p->curr.execWidth = 8;
> + p->curr.predicate = GEN_PREDICATE_NORMAL;
> p->MOV(dest, src);
> p->curr.nibControl = 1;
> p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src,
> 4));
> @@ -1369,6 +1387,7 @@ namespace gbe
> loadBottomHalf(d, y);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b,
> c);
> I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a,
> d);
> p->ADD(e, e, f);
> @@ -1443,6 +1462,7 @@ namespace gbe
> // condition <- (c,d)==0 && (a,b)>=(e,f)
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->MOV(l, zero);
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_EQ, a, e);
> @@ -1477,6 +1497,7 @@ namespace gbe
> p->ADD(m, m, one);
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
>
> @@ -1484,7 +1505,6 @@ namespace gbe
> p->curr.noMask = 1;
> p->AND(flagReg, flagReg, emaskReg);
>
> - p->curr.predicate = GEN_PREDICATE_NORMAL;
> // under condition, jump back to start point
> if (simdWidth == 8)
> p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> @@ -1493,8 +1513,9 @@ namespace gbe
> else
> NOT_IMPLEMENTED;
> int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
> + p->curr.noMask = 1;
> p->JMPI(zero);
> - p->patchJMPI(p->n_instruction()-2, jip);
> + p->patchJMPI(p->n_instruction() - 2, jip + 2);
> p->pop();
> // end of loop
> }
> @@ -1502,6 +1523,7 @@ namespace gbe
> if(x.is_signed_int()) {
> p->push();
> p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
> p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
> p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1534,7 +1556,7 @@ namespace gbe
> }
>
> void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
> - NOT_IMPLEMENTED;
> + p->NOP();
> }
>
> void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
> @@ -1546,59 +1568,24 @@ namespace gbe
> const GenRegister fenceDst = ra->genReg(insn.dst(0));
> uint32_t barrierType = insn.extra.barrierType;
> const GenRegister barrierId =
> ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
> - GenRegister blockIP;
> - uint32_t exeWidth = p->curr.execWidth;
> - ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex();
> -
> - if (exeWidth == 16)
> - blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
> - else if (exeWidth == 8)
> - blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
>
> - p->push();
> - /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the
> instructions
> - after the barrier, If there is any lane still remains zero. */
> - p->MOV(blockIP, GenRegister::immuw(0xFFFF));
> - p->curr.noMask = 1;
> - p->curr.execWidth = 1;
> - this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
> - if (exeWidth == 16)
> - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
> - else if (exeWidth == 8)
> - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
> - else
> - NOT_IMPLEMENTED;
> - p->curr.inversePredicate = 1;
> - // If not all channel is set to 1, the barrier is still waiting for other lanes to
> complete,
> - // jump to next basic block.
> - p->JMPI(GenRegister::immud(0));
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->MOV(GenRegister::flag(0, 0),
> ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
> - p->pop();
> -
> - p->push();
> - p->curr.useFlag(0, 0);
> - /* Restore the blockIP to current label. */
> - p->MOV(blockIP,
> GenRegister::immuw(insn.parent->bb->getLabelIndex()));
> if (barrierType == ir::syncGlobalBarrier) {
> p->FENCE(fenceDst);
> p->MOV(fenceDst, fenceDst);
> }
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - // As only the payload.2 is used and all the other regions are ignored
> - // SIMD8 mode here is safe.
> - p->curr.execWidth = 8;
> - p->curr.physicalFlag = 0;
> - p->curr.noMask = 1;
> - // Copy barrier id from r0.
> - p->AND(src, barrierId, GenRegister::immud(0x0f000000));
> - // A barrier is OK to start the thread synchronization *and* SLM fence
> - p->BARRIER(src);
> - // Now we wait for the other threads
> - p->curr.execWidth = 1;
> - p->WAIT();
> - // we executed the barrier then restore the barrier soft mask to initial
> value.
> - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)),
> ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
> + p->push();
> + // As only the payload.2 is used and all the other regions are ignored
> + // SIMD8 mode here is safe.
> + p->curr.execWidth = 8;
> + p->curr.physicalFlag = 0;
> + p->curr.noMask = 1;
> + // Copy barrier id from r0.
> + p->AND(src, barrierId, GenRegister::immud(0x0f000000));
> + // A barrier is OK to start the thread synchronization *and* SLM fence
> + p->BARRIER(src);
> + p->curr.execWidth = 1;
> + // Now we wait for the other threads
> + p->WAIT();
> p->pop();
> }
>
> diff --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> index 7c49497..e731174 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -896,6 +896,11 @@ struct GenInstruction
> uint32_t end_of_thread:1;
> } gen7_msg_gw;
>
> + struct {
> + uint32_t jip:16;
> + uint32_t uip:16;
> + } gen7_branch;
> +
> int d;
> uint32_t ud;
> float f;
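
A tiny sketch of how these 16-bit fields are consumed: the bitfields are unsigned, so the disassembler change above casts to int16_t to recover negative (backward) offsets. The struct below only mirrors the layout for illustration.

// Reading back signed JIP/UIP values out of unsigned 16-bit bitfields.
#include <cstdint>
#include <cstdio>

struct Gen7Branch {       // illustrative mirror of gen7_branch
  uint32_t jip:16;
  uint32_t uip:16;
};

int main(void) {
  Gen7Branch b;
  b.jip = (uint16_t)-24;  // backward jump, 8-byte units
  b.uip = 48;
  printf("jip=%d uip=%d\n", (int16_t)b.jip, (int16_t)b.uip);  // prints jip=-24 uip=48
  return 0;
}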
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index fc7e53d..06aa769 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -837,6 +837,7 @@ namespace gbe
> GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
> push();
> curr.predicate = GEN_PREDICATE_NONE;
> + curr.noMask = 1;
> curr.execWidth = 1;
> MOV(r, GenRegister::immud(u.u[1]));
> MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
> @@ -907,6 +908,7 @@ namespace gbe
> push();
> curr.execWidth = 8;
> curr.predicate = GEN_PREDICATE_NONE;
> + curr.noMask = 1;
> MOV(r0, src0);
> MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
> curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -920,6 +922,7 @@ namespace gbe
> push();
> curr.execWidth = 8;
> curr.predicate = GEN_PREDICATE_NONE;
> + curr.noMask = 1;
> MOV(r0, GenRegister::suboffset(src0, 8));
> MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0,
> 12));
> curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1058,7 +1061,7 @@ namespace gbe
>
> #define ALU2_BRA(OP) \
> void GenEncoder::OP(GenRegister src) { \
> - alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(),
> src); \
> + alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(),
> GenRegister::nullud(), src); \
> }
>
> ALU2_BRA(IF)
> @@ -1071,9 +1074,21 @@ namespace gbe
> GBE_ASSERT(insnID < this->store.size());
> GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
> insn.header.opcode == GEN_OPCODE_BRD ||
> - insn.header.opcode == GEN_OPCODE_ENDIF);
> - if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
> - this->setSrc1(&insn, GenRegister::immd(jumpDistance));
> + insn.header.opcode == GEN_OPCODE_ENDIF ||
> + insn.header.opcode == GEN_OPCODE_IF ||
> + insn.header.opcode == GEN_OPCODE_BRC);
> +
> + if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) {
> + int offset = 0;
> + if (insn.header.opcode == GEN_OPCODE_IF) {
> + this->setSrc1(&insn, GenRegister::immd(jumpDistance));
> + return;
> + }
> + else if (insn.header.opcode == GEN_OPCODE_JMPI) {
> + offset = -2;
> + /*assert(jumpDistance > -32769 && jumpDistance < 32768);*/
> + }
> + this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset));
> } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
> // For the conditional jump distance out of S15 range, we need to use an
> // inverted jmp followed by a add ip, ip, distance to implement.
> @@ -1085,10 +1100,12 @@ namespace gbe
> // for all the branching instruction. And need to adjust the distance
> // for those branch instruction's start point and end point contains
> // this instruction.
> + GenInstruction &insn2 = this->store[insnID+1];
> + GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
> insn.header.opcode = GEN_OPCODE_ADD;
> this->setDst(&insn, GenRegister::ip());
> this->setSrc0(&insn, GenRegister::ip());
> - this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
> + this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
> } else {
> insn.header.predicate_inverse ^= 1;
> this->setSrc1(&insn, GenRegister::immd(2));
> @@ -1099,7 +1116,7 @@ namespace gbe
> insn2.header.opcode = GEN_OPCODE_ADD;
> this->setDst(&insn2, GenRegister::ip());
> this->setSrc0(&insn2, GenRegister::ip());
> - this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
> + this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
> }
> }
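
A standalone sketch (illustrative function name, not the encoder itself) of the distance bookkeeping after this change: offsets are now measured from the branch instruction itself in 8-byte units, JMPI gets an extra -2 because it executes relative to the next instruction, and an unpredicated JMPI whose distance does not fit a signed 16-bit immediate is rewritten as ADD ip, ip with the distance converted to bytes (the predicated out-of-range case additionally inverts the predicate and skips over the ADD, as in the code above).

// Immediate vs. ADD-ip fixup for a patched jump, as plain host C++.
#include <cstdint>
#include <cstdio>

enum class Fixup { Immediate, AddIp };

static Fixup patchDistance(bool isJMPI, int32_t jumpDistance /* 8-byte units */, int32_t &imm) {
  if (!isJMPI || (jumpDistance > -32769 && jumpDistance < 32768)) {
    imm = isJMPI ? jumpDistance - 2 : jumpDistance;  // JMPI is relative to the next instruction
    return Fixup::Immediate;
  }
  imm = jumpDistance * 8;                            // long jump: ADD ip, ip, <bytes>
  return Fixup::AddIp;
}

int main(void) {
  int32_t imm = 0;
  patchDistance(true, (14 - 10) * 2, imm);           // JMPI over 4 instructions
  printf("JMPI immediate = %d\n", imm);              // 8 - 2 = 6
  patchDistance(true, 40000, imm);                   // out of the signed 16-bit range
  printf("ADD ip immediate = %d bytes\n", imm);      // 320000
  return 0;
}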
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index d86e04c..147c3e6 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -76,8 +76,6 @@
> *
> * Also, there is some extra kludge to handle the predicates for JMPI.
> *
> - * See TODO for a better idea for branching and masking
> - *
> * TODO:
> * =====
> *
> @@ -92,14 +90,9 @@
> * interesting approach which consists in traversing the dominator tree in
> post
> * order
> *
> - * About masking and branching, a much better idea (that I found later unfortunately)
> - * is to replace the use of the flag by uses of if/endif to enclose the basic
> - * block. So, instead of using predication, we use auto-masking. The very cool
> - * consequence is that we can reintegrate back the structured branches.
> - * Basically, we will be able to identify branches that can be mapped to
> - * structured branches and mix nicely unstructured branches (which will use
> - * jpmi, if/endif to mask the blocks) and structured branches (which are pretty
> - * fast)
> + * We already use if/endif to enclose each basic block. We will continue to identify
> + * those blocks which can map to structured branching and use pure structured
> + * instructions to handle them completely.
> */
>
> #include "backend/gen_insn_selection.hpp"
> @@ -320,38 +313,6 @@ namespace gbe
> INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t
> registerPool);
> /*! indicate whether a register is a scalar/uniform register. */
> INLINE bool isScalarReg(const ir::Register ®) const {
> -#if 0
> - printf("reg %d ", reg.value());
> - printf("uniform: %d ", getRegisterData(reg).isUniform());
> - if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n");
> return true; }
> - if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push
> location.\n"); return true; }
> - if (reg == ir::ocl::groupid0 ||
> - reg == ir::ocl::groupid1 ||
> - reg == ir::ocl::groupid2 ||
> - reg == ir::ocl::barrierid ||
> - reg == ir::ocl::threadn ||
> - reg == ir::ocl::numgroup0 ||
> - reg == ir::ocl::numgroup1 ||
> - reg == ir::ocl::numgroup2 ||
> - reg == ir::ocl::lsize0 ||
> - reg == ir::ocl::lsize1 ||
> - reg == ir::ocl::lsize2 ||
> - reg == ir::ocl::gsize0 ||
> - reg == ir::ocl::gsize1 ||
> - reg == ir::ocl::gsize2 ||
> - reg == ir::ocl::goffset0 ||
> - reg == ir::ocl::goffset1 ||
> - reg == ir::ocl::goffset2 ||
> - reg == ir::ocl::workdim ||
> - reg == ir::ocl::emask ||
> - reg == ir::ocl::notemask ||
> - reg == ir::ocl::barriermask
> - ) {
> - printf("special reg.\n");
> - return true;
> - }
> - return false;
> -#endif
> const ir::RegisterData ®Data = getRegisterData(reg);
> return regData.isUniform();
> }
> @@ -992,7 +953,7 @@ namespace gbe
> }
>
> void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
> insn->src(0) = src;
> insn->index = uint16_t(jip);
> }
> @@ -1412,9 +1373,17 @@ namespace gbe
> for (uint32_t regID = 0; regID < this->regNum; ++regID)
> this->regDAG[regID] = NULL;
>
> + this->block->hasBarrier = false;
> + this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
> + bb.getLastInstruction()->getOpcode() == OP_RET;
> + if (!this->block->hasBranch)
> + this->block->endifOffset = -1;
> +
> // Build the DAG on the fly
> uint32_t insnNum = 0;
> const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
> + if (insn.getOpcode() == OP_SYNC)
> + this->block->hasBarrier = true;
>
> // Build a selectionDAG node for instruction
> SelectionDAG *dag = this->newSelectionDAG(insn);
> @@ -1465,6 +1434,7 @@ namespace gbe
> void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
> {
> // Bottom up code generation
> + bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
> for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
> // Process all possible patterns for this instruction
> SelectionDAG &dag = *insnDAG[insnID];
> @@ -1476,8 +1446,10 @@ namespace gbe
>
> // Start a new code fragment
> this->startBackwardGeneration();
> + // If there is no branch at the end of this block.
>
> // Try all the patterns from best to worst
> +
> do {
> if ((*it)->emit(*this, dag))
> break;
> @@ -1485,6 +1457,13 @@ namespace gbe
> } while (it != end);
> GBE_ASSERT(it != end);
>
> + if (needEndif) {
> + const ir::BasicBlock *curr = insn.getParent();
> + const ir::BasicBlock *next = curr->getNextBlock();
> + this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
> + needEndif = false;
> + }
> +
> // Output the code in the current basic block
> this->endBackwardGeneration();
> }
> @@ -2133,6 +2112,7 @@ namespace gbe
> const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
>
> sel.push();
> + sel.curr.noMask = 1;
> sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.curr.execWidth = simdWidth;
> sel.SEL_CMP(genCmp, tmp, src0, src1);
> @@ -2329,7 +2309,6 @@ namespace gbe
> const Type type = insn.getType();
> const Immediate imm = insn.getImmediate();
> const GenRegister dst = sel.selReg(insn.getDst(0), type);
> - GenRegister flagReg;
>
> sel.push();
> if (sel.isScalarOrBool(insn.getDst(0)) == true) {
> @@ -2371,24 +2350,10 @@ namespace gbe
> {
> using namespace ir;
> const ir::Register reg = sel.reg(FAMILY_DWORD);
> - const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
> const uint32_t params = insn.getParameters();
>
> - sel.push();
> - sel.curr.predicate = GEN_PREDICATE_NONE;
> - sel.curr.noMask = 1;
> - sel.curr.execWidth = 1;
> - sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask);
> - sel.MOV(GenRegister::flag(1, 1), barrierMask);
> - sel.pop();
> -
> // A barrier is OK to start the thread synchronization *and* SLM fence
> - sel.push();
> - //sel.curr.predicate = GEN_PREDICATE_NONE;
> - sel.curr.flag = 1;
> - sel.curr.subFlag = 1;
> - sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
> - sel.pop();
> + sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
> return true;
> }
>
> @@ -2696,7 +2661,7 @@ namespace gbe
> GenRegister tmpDst;
>
> if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)
> - tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL);
> + tmpDst = sel.selReg(dst, TYPE_BOOL);
> else
> tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32);
>
> @@ -2724,36 +2689,23 @@ namespace gbe
> sel.push();
> sel.curr.flag = 1;
> sel.curr.subFlag = 1;
> - sel.curr.predicate = GEN_PREDICATE_NONE;
> if (type == TYPE_S64 || type == TYPE_U64) {
> GenRegister tmp[3];
> for(int i=0; i<3; i++)
> tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
> - sel.push();
> - sel.curr.execWidth = 1;
> - sel.curr.noMask = 1;
> - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
> - sel.pop();
> - sel.curr.predicate = GEN_PREDICATE_NORMAL;
> sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst);
> } else if(opcode == OP_ORD) {
> sel.push();
> - sel.curr.execWidth = 1;
> - sel.curr.noMask = 1;
> - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
> + sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
> sel.pop();
> - sel.curr.predicate = GEN_PREDICATE_NORMAL;
> -
> - sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
> - sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
> } else
> sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
> sel.pop();
>
> if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16))
> sel.MOV(sel.selReg(dst, TYPE_U16),
> GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg));
> - else
> - sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst);
> return true;
> }
> };
> @@ -2979,11 +2931,6 @@ namespace gbe
> markAllChildren(dag);
> }
>
> - // Since we cannot predicate the select instruction with our current
> mask,
> - // we need to perform the selection in two steps (one to select, one to
> - // update the destination register)
> - const RegisterFamily family = getFamily(type);
> - const GenRegister tmp = sel.selReg(sel.reg(family), type);
> const uint32_t simdWidth = sel.ctx.getSimdWidth();
> const Register pred = insn.getPredicate();
> sel.push();
> @@ -2992,16 +2939,14 @@ namespace gbe
> sel.curr.flag = 1;
> sel.curr.subFlag = 1;
> sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16),
> GenRegister::immuw(0));
> - sel.curr.noMask = 0;
> + //sel.curr.noMask = 0;
> sel.curr.predicate = GEN_PREDICATE_NORMAL;
> if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
> - sel.SEL_INT64(tmp, src0, src1);
> + sel.SEL_INT64(dst, src0, src1);
> else
> - sel.SEL(tmp, src0, src1);
> + sel.SEL(dst, src0, src1);
> sel.pop();
>
> - // Update the destination register properly now
> - sel.MOV(dst, tmp);
> return true;
> }
> };
> @@ -3041,6 +2986,7 @@ namespace gbe
> DECL_CTOR(TernaryInstruction, 1, 1);
> };
>
> +
> /*! Label instruction pattern */
> DECL_PATTERN(LabelInstruction)
> {
> @@ -3053,42 +2999,75 @@ namespace gbe
> const uint32_t simdWidth = sel.ctx.getSimdWidth();
> sel.LABEL(label);
>
> - // Do not emit any code for the "returning" block. There is no need for it
> - if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
> + // Do not emit any code for the "returning" block. There is no need for
> it
> + if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
> return true;
>
> + LabelIndex jip;
> + const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
> + if (sel.ctx.hasJIP(&insn))
> + jip = sel.ctx.getLabelIndex(&insn);
> + else
> + jip = nextLabel;
> +
> // Emit the mask computation at the head of each basic block
> sel.push();
> + sel.curr.noMask = 1;
> sel.curr.predicate = GEN_PREDICATE_NONE;
> - sel.curr.flag = 0;
> - sel.curr.subFlag = 0;
> sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1);
> sel.pop();
>
> - // If it is required, insert a JUMP to bypass the block
> - if (sel.ctx.hasJIP(&insn)) {
> - const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
> + if (sel.block->hasBarrier) {
> + // If this block has a barrier, we don't execute the block until all lanes
> + // are 1s. Set each reached lane to 1, then check all lanes. If there is any
> + // lane not reached, we jump to jip. And there is no need to issue if/endif for
> + // this block, as it will always execute with all lanes activated.
> sel.push();
> -
> - sel.curr.noMask = 1;
> - sel.curr.execWidth = 1;
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
> sel.curr.predicate = GEN_PREDICATE_NONE;
> - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
> - GenRegister flagReg = GenRegister::flag(0, 0);
> - sel.AND(flagReg, flagReg, emaskReg);
> -
> + sel.curr.noMask = 1;
> + sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
> if (simdWidth == 8)
> - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
> else if (simdWidth == 16)
> - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
> + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
> else
> NOT_IMPLEMENTED;
> + sel.curr.noMask = 1;
> + sel.curr.execWidth = 1;
> sel.curr.inversePredicate = 1;
> - sel.curr.flag = 0;
> - sel.curr.subFlag = 0;
> sel.JMPI(GenRegister::immd(0), jip);
> sel.pop();
> + // FIXME, if the last BRA is an unconditional jump, we don't need to update the label here.
> + sel.push();
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
> + sel.pop();
> + }
> + else {
> + if (sel.ctx.hasJIP(&insn)) {
> + // If it is required, insert a JUMP to bypass the block
> + sel.push();
> + if (simdWidth == 8)
> + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> + else if (simdWidth == 16)
> + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
> + else
> + NOT_IMPLEMENTED;
> + sel.curr.noMask = 1;
> + sel.curr.execWidth = 1;
> + sel.curr.inversePredicate = 1;
> + sel.JMPI(GenRegister::immd(0), jip);
> + sel.pop();
> + }
> + sel.push();
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + // It's easier to set the jip to a relative position over next block.
> + sel.IF(GenRegister::immd(0), nextLabel, nextLabel, sel.block->endifOffset, sel.block->endifOffset);
> + sel.pop();
> }
> +
> return true;
> }
> DECL_CTOR(LabelInstruction, 1, 1);
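
A host-side sketch (illustrative, not selection code) of the barrier-block protocol emitted above: arriving lanes mark their blockip with GEN_MAX_LABEL, a noMask ALL8H/ALL16H check decides whether every lane has arrived, and only then does the block run and restore blockip to the current label. Undispatched tail lanes already hold GEN_MAX_LABEL from clearFlagRegister, so they never hold up the ALL check.

// Per-lane simulation of "enter the barrier block only when all lanes have arrived".
#include <cstdint>
#include <cstdio>

static const uint16_t kMaxLabel = 0xFFFF;            // stands in for GEN_MAX_LABEL

static bool tryEnterBarrierBlock(uint16_t blockip[], int simdWidth,
                                 uint16_t label, uint16_t activeMask) {
  for (int lane = 0; lane < simdWidth; ++lane)       // predicated MOV: arrived lanes mark themselves
    if ((activeMask >> lane) & 1) blockip[lane] = kMaxLabel;
  for (int lane = 0; lane < simdWidth; ++lane)       // noMask CMP + ALL8H/ALL16H
    if (blockip[lane] != kMaxLabel) return false;    // someone is missing: JMPI to jip instead
  for (int lane = 0; lane < simdWidth; ++lane)       // everyone arrived: restore blockip to this label
    blockip[lane] = label;
  return true;                                       // fall through into the barrier block
}

int main(void) {
  uint16_t blockip[8] = {2, 2, 2, 2, 5, 5, 5, 5};    // two divergent groups of lanes
  printf("%d\n", tryEnterBarrierBlock(blockip, 8, 7, 0x0F));  // 0: only half have arrived
  printf("%d\n", tryEnterBarrierBlock(blockip, 8, 7, 0xF0));  // 1: now every lane is in
  return 0;
}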
> @@ -3225,7 +3204,6 @@ namespace gbe
> /*! Branch instruction pattern */
> DECL_PATTERN(BranchInstruction)
> {
> -
> void emitForwardBranch(Selection::Opaque &sel,
> const ir::BranchInstruction &insn,
> ir::LabelIndex dst,
> @@ -3233,16 +3211,13 @@ namespace gbe
> {
> using namespace ir;
> const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
> - const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
>
> // We will not emit any jump if we must go the next block anyway
> const BasicBlock *curr = insn.getParent();
> const BasicBlock *next = curr->getNextBlock();
> const LabelIndex nextLabel = next->getLabelIndex();
> -
> if (insn.isPredicated() == true) {
> const Register pred = insn.getPredicateIndex();
> -
> sel.push();
> // we don't need to set next label to the pcip
> // as if there is no backward jump later, then obviously everything will work fine.
> @@ -3250,22 +3225,30 @@ namespace gbe
> sel.curr.flag = 0;
> sel.curr.subFlag = 0;
> sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16),
> GenRegister::immuw(0));
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> + if (!sel.block->hasBarrier)
> + sel.ENDIF(GenRegister::immd(0), nextLabel);
> + sel.block->endifOffset = -1;
> sel.pop();
> -
> - if (nextLabel == jip) return;
> } else {
> // Update the PcIPs
> + const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
> sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> -
> - // Do not emit branch when we go to the next block anyway
> + if (!sel.block->hasBarrier)
> + sel.ENDIF(GenRegister::immd(0), nextLabel);
> + sel.block->endifOffset = -1;
> if (nextLabel == jip) return;
> + // Branch to the jump target
> sel.push();
> sel.curr.execWidth = 1;
> sel.curr.noMask = 1;
> sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.JMPI(GenRegister::immd(0), jip);
> sel.pop();
> + // FIXME just for the correct endif offset.
> + // JMPI still has 2 instruction.
> + sel.block->endifOffset -= 2;
> }
> }
>
> @@ -3290,37 +3273,32 @@ namespace gbe
> // that actually take the branch
> const LabelIndex next = bb.getNextBlock()->getLabelIndex();
> sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
> -
> + GBE_ASSERT(jip == dst);
> sel.push();
> sel.curr.flag = 0;
> sel.curr.subFlag = 0;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16),
> GenRegister::immuw(0));
> - // Re-update the PcIPs for the branches that takes the backward
> jump
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> -
> - // We clear all the inactive channel to 0 as the
> GEN_PREDICATE_ALIGN1_ANY8/16
> - // will check those bits as well.
> sel.curr.predicate = GEN_PREDICATE_NONE;
> + if (!sel.block->hasBarrier)
> + sel.ENDIF(GenRegister::immd(0), next);
> sel.curr.execWidth = 1;
> - sel.curr.noMask = 1;
> - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
> - sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1),
> emaskReg);
> -
> - // Branch to the jump target
> - if (simdWidth == 8)
> - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> - else if (simdWidth == 16)
> + if (simdWidth == 16)
> sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
> else
> - NOT_SUPPORTED;
> + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> + sel.curr.noMask = 1;
> sel.JMPI(GenRegister::immd(0), jip);
> + sel.block->endifOffset = -3;
> sel.pop();
> -
> } else {
> -
> + const LabelIndex next = bb.getNextBlock()->getLabelIndex();
> // Update the PcIPs
> sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> -
> + if (!sel.block->hasBarrier)
> + sel.ENDIF(GenRegister::immd(0), next);
> // Branch to the jump target
> sel.push();
> sel.curr.execWidth = 1;
> @@ -3328,6 +3306,7 @@ namespace gbe
> sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.JMPI(GenRegister::immd(0), jip);
> sel.pop();
> + sel.block->endifOffset = -3;
> }
> }
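
And a matching sketch of the backward-branch bookkeeping: active lanes first point blockip at the fall-through label, the predicated MOV re-points the looping lanes at the branch target, and an ANY8H/ANY16H JMPI takes the whole thread back if at least one lane wants the branch; the CMP LE at the target label then re-enables exactly those lanes.

// Per-lane simulation of the conditional backward branch under block-IP masking.
#include <cstdint>
#include <cstdio>

static bool takeBackwardBranch(uint16_t blockip[], int simdWidth, uint16_t activeMask,
                               uint16_t predMask, uint16_t dstLabel, uint16_t nextLabel) {
  for (int lane = 0; lane < simdWidth; ++lane)
    if ((activeMask >> lane) & 1) blockip[lane] = nextLabel;   // default: fall through
  bool anyTaken = false;
  for (int lane = 0; lane < simdWidth; ++lane)
    if (((activeMask & predMask) >> lane) & 1) {               // predicated re-update of the PcIPs
      blockip[lane] = dstLabel;
      anyTaken = true;
    }
  return anyTaken;                                             // ANY8H/ANY16H decides the JMPI
}

int main(void) {
  uint16_t blockip[8] = {0};
  const bool jumpBack = takeBackwardBranch(blockip, 8, /*active*/0xFF, /*pred*/0x0F,
                                           /*dstLabel*/2, /*nextLabel*/5);
  uint16_t reenabled = 0;                                      // CMP LE at label 2
  for (int lane = 0; lane < 8; ++lane)
    if (blockip[lane] <= 2) reenabled |= 1u << lane;
  printf("jump back: %d, lanes active at label 2: 0x%02x\n", jumpBack, reenabled);  // 1, 0x0f
  return 0;
}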
>
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 04fbb9f..8557768 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -42,6 +42,8 @@ namespace gbe
> /*! Translate IR compare to Gen compare */
> uint32_t getGenCompare(ir::Opcode opcode);
>
> + #define GEN_MAX_LABEL 0xFFFF
> +
> /*! Selection opcodes properly encoded from 0 to n for fast jump tables
> * generations
> */
> @@ -190,6 +192,9 @@ namespace gbe
> void append(SelectionInstruction *insn);
> /*! Append a new selection instruction at the beginning of the block */
> void prepend(SelectionInstruction *insn);
> + int endifOffset;
> + bool hasBarrier;
> + bool hasBranch;
> };
>
> /*! Owns the selection engine */
> diff --git a/backend/src/backend/gen_insn_selection.hxx
> b/backend/src/backend/gen_insn_selection.hxx
> index d318f8e..ddc9d5e 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -80,7 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
> DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
> DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
> DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
> -DECL_SELECTION_IR(BRC, BinaryInstruction)
> +DECL_SELECTION_IR(BRC, UnaryInstruction)
> DECL_SELECTION_IR(BRD, UnaryInstruction)
> DECL_SELECTION_IR(IF, UnaryInstruction)
> DECL_SELECTION_IR(ENDIF, UnaryInstruction)
> diff --git a/backend/src/backend/gen_register.hpp
> b/backend/src/backend/gen_register.hpp
> index 25cb428..051f16d 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -118,7 +118,7 @@ namespace gbe
> this->noMask = 0;
> this->flag = 0;
> this->subFlag = 0;
> - this->predicate = GEN_PREDICATE_NORMAL;
> + this->predicate = GEN_PREDICATE_NONE;
> this->inversePredicate = 0;
> this->physicalFlag = 1;
> this->flagIndex = 0;
> @@ -657,6 +657,17 @@ namespace gbe
> GEN_HORIZONTAL_STRIDE_1);
> }
>
> + static INLINE GenRegister nullud(void) {
> + return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
> + GEN_ARF_NULL,
> + 0,
> + GEN_TYPE_UD,
> + GEN_VERTICAL_STRIDE_8,
> + GEN_WIDTH_8,
> + GEN_HORIZONTAL_STRIDE_1);
> + }
> +
> +
> static INLINE bool isNull(GenRegister reg) {
> return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
> && reg.nr == GEN_ARF_NULL);
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 9638994..b572500 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -2526,6 +2526,7 @@ clEnqueueNDRangeKernel(cl_command_queue
> command_queue,
>
> err = cl_command_queue_flush(command_queue);
> }
> + clFinish(command_queue);
>
> error:
> return err;
> diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
> index 5e474de..c2308da 100644
> --- a/src/intel/intel_driver.c
> +++ b/src/intel/intel_driver.c
> @@ -135,6 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver)
> {
> driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
> assert(driver->bufmgr);
> + //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
> drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
> }
>
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index e95b050..4819e9e 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -695,7 +695,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu,
> cl_gpgpu_kernel *kernel)
> memset(desc, 0, sizeof(*desc));
> ker_bo = (drm_intel_bo *) kernel->bo;
> desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
> - desc->desc1.single_program_flow = 1;
> + desc->desc1.single_program_flow = 0;
> desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
> desc->desc5.rounding_mode = 0; /* round to nearest even */
>
> diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
> index 35d4c4f..f901fdf 100644
> --- a/utests/compiler_long_cmp.cpp
> +++ b/utests/compiler_long_cmp.cpp
> @@ -45,6 +45,7 @@ void compiler_long_cmp(void)
> int64_t *dest = (int64_t *)buf_data[2];
> int64_t x = (src1[i] < src2[i]) ? 3 : 4;
> OCL_ASSERT(x == dest[i]);
> + //printf("%d %ld %ld \n", i, dest[i], x);
> }
> OCL_UNMAP_BUFFER(2);
> OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
> diff --git a/utests/compiler_unstructured_branch0.cpp
> b/utests/compiler_unstructured_branch0.cpp
> index 128a53e..1a371e9 100644
> --- a/utests/compiler_unstructured_branch0.cpp
> +++ b/utests/compiler_unstructured_branch0.cpp
> @@ -27,7 +27,6 @@ static void compiler_unstructured_branch0(void)
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
> for (uint32_t i = 16; i < 32; ++i)
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
> -
> // Second control flow
> for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
> OCL_UNMAP_BUFFER(0);
> @@ -36,8 +35,7 @@ static void compiler_unstructured_branch0(void)
> OCL_MAP_BUFFER(0);
> OCL_MAP_BUFFER(1);
> for (uint32_t i = 0; i < 32; ++i)
> - OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
> -
> + OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
> // Third control flow
> for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
> OCL_UNMAP_BUFFER(0);
> diff --git a/utests/compiler_unstructured_branch1.cpp
> b/utests/compiler_unstructured_branch1.cpp
> index 6021f5b..fb24cec 100644
> --- a/utests/compiler_unstructured_branch1.cpp
> +++ b/utests/compiler_unstructured_branch1.cpp
> @@ -25,7 +25,6 @@ static void compiler_unstructured_branch1(void)
> OCL_MAP_BUFFER(1);
> for (uint32_t i = 0; i < n; ++i)
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
> -
> // Second control flow
> for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
> OCL_UNMAP_BUFFER(0);
> @@ -34,7 +33,7 @@ static void compiler_unstructured_branch1(void)
> OCL_MAP_BUFFER(0);
> OCL_MAP_BUFFER(1);
> for (uint32_t i = 0; i < n; ++i)
> - OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
> + OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
>
> // Third control flow
> for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
> diff --git a/utests/compiler_unstructured_branch2.cpp
> b/utests/compiler_unstructured_branch2.cpp
> index d61c6b5..68c7448 100644
> --- a/utests/compiler_unstructured_branch2.cpp
> +++ b/utests/compiler_unstructured_branch2.cpp
> @@ -23,6 +23,7 @@ static void compiler_unstructured_branch2(void)
> // First control flow
> OCL_MAP_BUFFER(0);
> OCL_MAP_BUFFER(1);
> +#if 1
> for (uint32_t i = 0; i < n; ++i)
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
>
> @@ -35,7 +36,7 @@ static void compiler_unstructured_branch2(void)
> OCL_MAP_BUFFER(1);
> for (uint32_t i = 0; i < n; ++i)
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
> -
> +#endif
> // Third control flow
> for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
> for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
> @@ -45,9 +46,16 @@ static void compiler_unstructured_branch2(void)
> OCL_MAP_BUFFER(0);
> OCL_MAP_BUFFER(1);
> for (uint32_t i = 0; i < 8; ++i)
> + {
> + //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], 12);
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
> + }
> for (uint32_t i = 8; i < n; ++i)
> + {
> + //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], -6);
> OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
> + }
> + //exit(0);
>
> // Fourth control flow
> for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 1;
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet