[Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to handle each blocks.

Gong, Zhigang zhigang.gong at intel.com
Wed Apr 2 01:51:02 PDT 2014



> -----Original Message-----
> From: Yang, Rong R
> Sent: Wednesday, April 2, 2014 4:41 PM
> To: Gong, Zhigang; beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: RE: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF
> to handle each blocks.
> 
> 
> 
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Friday, March 28, 2014 3:11 PM
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to
> handle each blocks.
> 
> When enable SPF (single program flow), we always need to use f0
> as the predication of almost each instruction. This bring some
> trouble when we want to get tow levels mask mechanism, for an
> example the SEL instruction, and some BOOL operations. We
> have to use more than one instructions to do that and simply
> introduce 100% of overhead of those instructions.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen/gen_mesa_disasm.c  |  31 ++--
>  backend/src/backend/gen_context.cpp        | 141 ++++++++---------
>  backend/src/backend/gen_defs.hpp           |   5 +
>  backend/src/backend/gen_encoder.cpp        |  29 +++-
>  backend/src/backend/gen_insn_selection.cpp | 237
> +++++++++++++----------------
>  backend/src/backend/gen_insn_selection.hpp |   5 +
>  backend/src/backend/gen_insn_selection.hxx |   2 +-
>  backend/src/backend/gen_register.hpp       |  13 +-
>  src/cl_api.c                               |   1 +
>  src/intel/intel_driver.c                   |   1 +
>  src/intel/intel_gpgpu.c                    |   2 +-
>  utests/compiler_long_cmp.cpp               |   1 +
>  utests/compiler_unstructured_branch0.cpp   |   4 +-
>  utests/compiler_unstructured_branch1.cpp   |   3 +-
>  utests/compiler_unstructured_branch2.cpp   |  10 +-
>  15 files changed, 249 insertions(+), 236 deletions(-)
> 
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
> b/backend/src/backend/gen/gen_mesa_disasm.c
> index 84ef0c8..e58ef31 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -100,13 +100,13 @@ static const struct {
>    [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
>    [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
>    [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
> -  [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 },
> -  [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
> -  [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 },
> -  [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
> -  [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
> -  [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
> -  [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
> +  [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
> +  [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
> +  [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
> +  [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
> +  [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
> +  [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
> +  [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
>    [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
>    [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
>    [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
> @@ -1126,17 +1126,18 @@ int gen_disasm (FILE *file, const void
> *opaque_insn)
>      } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
>            inst->header.opcode == GEN_OPCODE_ELSE ||
>            inst->header.opcode == GEN_OPCODE_ENDIF ||
> -          inst->header.opcode == GEN_OPCODE_WHILE)) {
> -      // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
> -      assert(0);
> +          inst->header.opcode == GEN_OPCODE_WHILE ||
> +          inst->header.opcode == GEN_OPCODE_BRD ||
> +          inst->header.opcode == GEN_OPCODE_JMPI)) {
> +      format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
>      } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
>            inst->header.opcode == GEN_OPCODE_CONTINUE ||
> -          inst->header.opcode == GEN_OPCODE_HALT)) {
> -      // XXX format (file, " %d %d", inst->bits3.break_cont.uip,
> inst->bits3.break_cont.jip);
> -      assert(0);
> -    } else if (inst->header.opcode == GEN_OPCODE_JMPI) {
> +          inst->header.opcode == GEN_OPCODE_HALT ||
> +          inst->header.opcode == GEN_OPCODE_BRC)) {
> +      format (file, " %d %d", inst->bits3.gen7_branch.jip,
> inst->bits3.gen7_branch.uip);
> +    }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
>        format (file, " %d", inst->bits3.d);
> -    }
> +    }*/
> 
>      if (opcode[inst->header.opcode].nsrc > 0) {
>        pad (file, 32);
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index c46127a..bab059b 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -87,33 +87,29 @@ namespace gbe
>        const LabelIndex label = pair.first;
>        const int32_t insnID = pair.second;
>        const int32_t targetID = labelPos.find(label)->second;
> -      p->patchJMPI(insnID, (targetID-insnID-1) * 2);
> +      p->patchJMPI(insnID, (targetID - insnID) * 2);
> +    }
> +    for (auto pair : branchPos3) {
> +      const LabelPair labelPair = pair.first;
> +      const int32_t insnID = pair.second;
> +      const int32_t jip = labelPos.find(labelPair.l0)->second +
> labelPair.offset0;
> +      const int32_t uip = labelPos.find(labelPair.l1)->second +
> labelPair.offset1;
> +      assert((jip - insnID) * 2 < 32767 && (jip - insnID) > -32768);
> +      assert((uip - insnID) * 2 < 32767 && (uip - insnID) > -32768);
> >>>>>>>>> should be (uip - insnID) * 2 > -32768
Good catch. Will fix it latter. Thanks.

> 
> 
> 
> +      p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2));
>      }
>    }
> 
>    void GenContext::clearFlagRegister(void) {
>      // when group size not aligned to simdWidth, flag register need clear to
>      // make prediction(any8/16h) work correctly
> -    const GenRegister emaskReg =
> ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
> -    const GenRegister notEmaskReg =
> ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
> -    uint32_t execWidth = p->curr.execWidth;
> +    const GenRegister blockip =
> ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
>      p->push();
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    p->curr.noMask = 1;
> -    /* clear all the bit in f0.0. */
> -    p->curr.execWidth = 1;
> -    p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW),
> GenRegister::immuw(0x0000));
> -    /* clear the barrier mask bits to all zero0*/
> -    p->curr.noMask = 0;
> -    p->curr.useFlag(0, 0);
> -    p->curr.execWidth = execWidth;
> -    /* set all the active lane to 1. Inactive lane remains 0. */
> -    p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0),
> GenRegister::ud16grf(126, 0));
> -    p->curr.noMask = 1;
> -    p->curr.execWidth = 1;
> -    p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0),
> GEN_TYPE_UW));
> -    p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF));
> -    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)),
> notEmaskReg);
> +      p->curr.noMask = 1;
> +      p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
> +      p->curr.noMask = 0;
> +      p->MOV(blockip, GenRegister::immuw(0));
>      p->pop();
>    }
> 
> @@ -148,7 +144,6 @@ namespace gbe
>      // Check that everything is consistent in the kernel code
>      const uint32_t perLaneSize = kernel->getStackSize();
>      const uint32_t perThreadSize = perLaneSize * this->simdWidth;
> -    //const int32_t offset = GEN_REG_SIZE +
> kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT,
> GBE_STACK_BUFFER);
>      GBE_ASSERT(perLaneSize > 0);
>      GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
>      GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
> @@ -325,6 +320,7 @@ namespace gbe
>          for (int i = 0; i < w / 8; i ++) {
>            p->push();
>            p->curr.predicate = GEN_PREDICATE_NONE;
> +          p->curr.noMask = 1;
>            p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD),
> src0, src1);
>            p->curr.accWrEnable = 1;
>            p->MACH(tmp, src0, src1);
> @@ -500,6 +496,7 @@ namespace gbe
>      int execWidth = p->curr.execWidth;
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      p->curr.execWidth = 8;
>      for (int nib = 0; nib < execWidth / 4; nib ++) {
>        p->AND(dest, src.bottom_half(), GenRegister::immud(63));
> @@ -539,6 +536,7 @@ namespace gbe
>    void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister
> low, GenRegister tmp, GenRegister flagReg) {
>      p->SHR(sign, high, GenRegister::immud(31));
>      p->push();
> +    p->curr.noMask = 1;
>      p->curr.predicate = GEN_PREDICATE_NONE;
>      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>      p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
> @@ -574,6 +572,7 @@ namespace gbe
>        I64FullMult(e, f, g, h, a, b, c, d);
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -626,6 +625,7 @@ namespace gbe
>        p->OR(a, e, f);
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_NZ, a, zero);
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -639,6 +639,7 @@ namespace gbe
>        I64FullMult(e, f, g, h, a, b, c, d);
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_NZ, i, zero);
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -670,6 +671,7 @@ namespace gbe
>        p->push();
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->CMP(GEN_CONDITIONAL_NZ, e, zero);
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
>        p->MOV(b, one);
> @@ -793,6 +795,7 @@ namespace gbe
>        case SEL_OP_I64SHL:
>          p->push();
>          p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
>          collectShifter(a, y);
>          loadBottomHalf(e, x);
>          loadTopHalf(f, x);
> @@ -820,6 +823,7 @@ namespace gbe
>        case SEL_OP_I64SHR:
>          p->push();
>          p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
>          collectShifter(a, y);
>          loadBottomHalf(e, x);
>          loadTopHalf(f, x);
> @@ -848,6 +852,7 @@ namespace gbe
>          f.type = GEN_TYPE_D;
>          p->push();
>          p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
>          collectShifter(a, y);
>          loadBottomHalf(e, x);
>          loadTopHalf(f, x);
> @@ -894,6 +899,7 @@ namespace gbe
>      p->push();
>        p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32));
> //high == 0
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
>        p->MOV(dst, low);
> @@ -911,6 +917,7 @@ namespace gbe
>        p->pop();
> 
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
>        p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32));
> //exp>23 && high!=0
> @@ -936,6 +943,7 @@ namespace gbe
>        p->pop();
> 
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
>        p->MOV(dst_ud, GenRegister::immud(0));   //exp==9, SHR == 0
> @@ -956,7 +964,7 @@ namespace gbe
>        p->SHL(high, low, tmp);
>        p->MOV(low, GenRegister::immud(0));
> 
> -      p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
> +      p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2);
>        p->curr.predicate = GEN_PREDICATE_NONE;
>        p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31));
> //update dst where high != 0
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -970,7 +978,7 @@ namespace gbe
>        p->CMP(GEN_CONDITIONAL_EQ, high,
> GenRegister::immud(0x80000000));
>        p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
>        p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
> -      p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
> +      p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2);
> 
>      p->pop();
> 
> @@ -994,6 +1002,7 @@ namespace gbe
>        p->MOV(tmp_high, high);
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_GE, tmp_high,
> GenRegister::immud(0x80000000));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1006,6 +1015,7 @@ namespace gbe
>        UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_GE, tmp_high,
> GenRegister::immud(0x80000000));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1039,6 +1049,7 @@ namespace gbe
>      if(dst.is_signed_int()) {
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1066,11 +1077,10 @@ namespace gbe
>                  f1.width = GEN_WIDTH_1;
>      GenRegister f2 = GenRegister::suboffset(f1, 1);
>      GenRegister f3 = GenRegister::suboffset(f1, 2);
> -    GenRegister f4 = GenRegister::suboffset(f1, 3);
> 
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> -    saveFlag(f4, flag, subFlag);
> +    p->curr.noMask = 1;
>      loadTopHalf(tmp0, src0);
>      loadTopHalf(tmp1, src1);
>      switch(insn.extra.function) {
> @@ -1130,12 +1140,13 @@ namespace gbe
>          NOT_IMPLEMENTED;
>      }
>      p->curr.execWidth = 1;
> -    p->AND(f1, f1, f4);
>      p->MOV(GenRegister::flag(flag, subFlag), f1);
>      p->pop();
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      p->MOV(dst, GenRegister::immd(0));
> +    p->curr.noMask = 0;
>      p->curr.predicate = GEN_PREDICATE_NORMAL;
>      p->MOV(dst, GenRegister::immd(-1));
>      p->pop();
> @@ -1163,6 +1174,7 @@ namespace gbe
>      p->ADD(c, c, d);
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>      if(! dst.is_signed_int()) {
>        p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
> @@ -1176,6 +1188,7 @@ namespace gbe
>        p->MOV(a, GenRegister::immud(0x80000000u));
>        p->MOV(b, GenRegister::immud(0));
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
>        p->CMP(GEN_CONDITIONAL_GE, a,
> GenRegister::immud(0x80000000u));
> @@ -1209,6 +1222,7 @@ namespace gbe
>      p->ADD(c, c, d);
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>      if(! dst.is_signed_int()) {
>        p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
> @@ -1238,6 +1252,7 @@ namespace gbe
>      src = src.top_half();
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      p->curr.execWidth = 8;
>      p->MOV(dest, src);
>      p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src,
> 4));
> @@ -1252,6 +1267,7 @@ namespace gbe
>      int execWidth = p->curr.execWidth;
>      dest = dest.top_half();
>      p->push();
> +    p->curr.predicate = GEN_PREDICATE_NORMAL;
>      p->curr.execWidth = 8;
>      p->MOV(dest, src);
>      p->curr.nibControl = 1;
> @@ -1271,6 +1287,7 @@ namespace gbe
>      src = src.bottom_half();
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      p->curr.execWidth = 8;
>      p->MOV(dest, src);
>      p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src,
> 4));
> @@ -1286,6 +1303,7 @@ namespace gbe
>      dest = dest.bottom_half();
>      p->push();
>      p->curr.execWidth = 8;
> +    p->curr.predicate = GEN_PREDICATE_NORMAL;
>      p->MOV(dest, src);
>      p->curr.nibControl = 1;
>      p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src,
> 4));
> @@ -1369,6 +1387,7 @@ namespace gbe
>      loadBottomHalf(d, y);
>      p->push();
>      p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->curr.noMask = 1;
>      I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b,
> c);
>      I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a,
> d);
>      p->ADD(e, e, f);
> @@ -1443,6 +1462,7 @@ namespace gbe
>        // condition <- (c,d)==0 && (a,b)>=(e,f)
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->MOV(l, zero);
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_EQ, a, e);
> @@ -1477,6 +1497,7 @@ namespace gbe
>        p->ADD(m, m, one);
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
> 
> @@ -1484,7 +1505,6 @@ namespace gbe
>        p->curr.noMask = 1;
>        p->AND(flagReg, flagReg, emaskReg);
> 
> -      p->curr.predicate = GEN_PREDICATE_NORMAL;
>        // under condition, jump back to start point
>        if (simdWidth == 8)
>          p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> @@ -1493,8 +1513,9 @@ namespace gbe
>        else
>          NOT_IMPLEMENTED;
>        int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
> +      p->curr.noMask = 1;
>        p->JMPI(zero);
> -      p->patchJMPI(p->n_instruction()-2, jip);
> +      p->patchJMPI(p->n_instruction() - 2, jip + 2);
>        p->pop();
>        // end of loop
>      }
> @@ -1502,6 +1523,7 @@ namespace gbe
>      if(x.is_signed_int()) {
>        p->push();
>        p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
>        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
>        p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
>        p->curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1534,7 +1556,7 @@ namespace gbe
>    }
> 
>    void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
> -    NOT_IMPLEMENTED;
> +   p->NOP();
>    }
> 
>    void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
> @@ -1546,59 +1568,24 @@ namespace gbe
>      const GenRegister fenceDst = ra->genReg(insn.dst(0));
>      uint32_t barrierType = insn.extra.barrierType;
>      const GenRegister barrierId =
> ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
> -    GenRegister blockIP;
> -    uint32_t exeWidth = p->curr.execWidth;
> -    ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex();
> -
> -    if (exeWidth == 16)
> -      blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
> -    else if (exeWidth == 8)
> -      blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
> 
> -    p->push();
> -    /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the
> instructions
> -       after the barrier, If there is any lane still remains zero. */
> -    p->MOV(blockIP, GenRegister::immuw(0xFFFF));
> -    p->curr.noMask = 1;
> -    p->curr.execWidth = 1;
> -    this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
> -    if (exeWidth == 16)
> -      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
> -    else if (exeWidth == 8)
> -      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
> -    else
> -      NOT_IMPLEMENTED;
> -    p->curr.inversePredicate = 1;
> -    // If not all channel is set to 1, the barrier is still waiting for other lanes to
> complete,
> -    // jump to next basic block.
> -    p->JMPI(GenRegister::immud(0));
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    p->MOV(GenRegister::flag(0, 0),
> ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
> -    p->pop();
> -
> -    p->push();
> -    p->curr.useFlag(0, 0);
> -    /* Restore the blockIP to current label. */
> -    p->MOV(blockIP,
> GenRegister::immuw(insn.parent->bb->getLabelIndex()));
>      if (barrierType == ir::syncGlobalBarrier) {
>        p->FENCE(fenceDst);
>        p->MOV(fenceDst, fenceDst);
>      }
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    // As only the payload.2 is used and all the other regions are ignored
> -    // SIMD8 mode here is safe.
> -    p->curr.execWidth = 8;
> -    p->curr.physicalFlag = 0;
> -    p->curr.noMask = 1;
> -    // Copy barrier id from r0.
> -    p->AND(src, barrierId, GenRegister::immud(0x0f000000));
> -    // A barrier is OK to start the thread synchronization *and* SLM fence
> -    p->BARRIER(src);
> -    // Now we wait for the other threads
> -    p->curr.execWidth = 1;
> -    p->WAIT();
> -    // we executed the barrier then restore the barrier soft mask to initial
> value.
> -    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)),
> ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
> +    p->push();
> +      // As only the payload.2 is used and all the other regions are ignored
> +      // SIMD8 mode here is safe.
> +      p->curr.execWidth = 8;
> +      p->curr.physicalFlag = 0;
> +      p->curr.noMask = 1;
> +      // Copy barrier id from r0.
> +      p->AND(src, barrierId, GenRegister::immud(0x0f000000));
> +      // A barrier is OK to start the thread synchronization *and* SLM fence
> +      p->BARRIER(src);
> +      p->curr.execWidth = 1;
> +      // Now we wait for the other threads
> +      p->WAIT();
>      p->pop();
>    }
> 
> diff --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> index 7c49497..e731174 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -896,6 +896,11 @@ struct GenInstruction
>        uint32_t end_of_thread:1;
>      } gen7_msg_gw;
> 
> +    struct {
> +      uint32_t jip:16;
> +      uint32_t uip:16;
> +    } gen7_branch;
> +
>      int d;
>      uint32_t ud;
>      float f;
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index fc7e53d..06aa769 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -837,6 +837,7 @@ namespace gbe
>      GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
>      push();
>      curr.predicate = GEN_PREDICATE_NONE;
> +    curr.noMask = 1;
>      curr.execWidth = 1;
>      MOV(r, GenRegister::immud(u.u[1]));
>      MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
> @@ -907,6 +908,7 @@ namespace gbe
>        push();
>        curr.execWidth = 8;
>        curr.predicate = GEN_PREDICATE_NONE;
> +      curr.noMask = 1;
>        MOV(r0, src0);
>        MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
>        curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -920,6 +922,7 @@ namespace gbe
>          push();
>          curr.execWidth = 8;
>          curr.predicate = GEN_PREDICATE_NONE;
> +        curr.noMask = 1;
>          MOV(r0, GenRegister::suboffset(src0, 8));
>          MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0,
> 12));
>          curr.predicate = GEN_PREDICATE_NORMAL;
> @@ -1058,7 +1061,7 @@ namespace gbe
> 
>  #define ALU2_BRA(OP) \
>    void GenEncoder::OP(GenRegister src) { \
> -    alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(),
> src); \
> +    alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(),
> GenRegister::nullud(), src); \
>    }
> 
>    ALU2_BRA(IF)
> @@ -1071,9 +1074,21 @@ namespace gbe
>      GBE_ASSERT(insnID < this->store.size());
>      GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
>                 insn.header.opcode == GEN_OPCODE_BRD  ||
> -               insn.header.opcode == GEN_OPCODE_ENDIF);
> -    if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
> -          this->setSrc1(&insn, GenRegister::immd(jumpDistance));
> +               insn.header.opcode == GEN_OPCODE_ENDIF ||
> +               insn.header.opcode == GEN_OPCODE_IF ||
> +               insn.header.opcode == GEN_OPCODE_BRC);
> +
> +    if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769
> && jumpDistance < 32768))  {
> +          int offset = 0;
> +           if (insn.header.opcode == GEN_OPCODE_IF) {
> +             this->setSrc1(&insn, GenRegister::immd(jumpDistance));
> +             return;
> +           }
> +           else if (insn.header.opcode == GEN_OPCODE_JMPI) {
> +             offset = -2;
> +             /*assert(jumpDistance > -32769 && jumpDistance <
> 32768);*/
> +           }
> +          this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset));
>      } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
>        // For the conditional jump distance out of S15 range, we need to use
> an
>        // inverted jmp followed by a add ip, ip, distance to implement.
> @@ -1085,10 +1100,12 @@ namespace gbe
>        // for all the branching instruction. And need to adjust the distance
>        // for those branch instruction's start point and end point contains
>        // this instruction.
> +      GenInstruction &insn2 = this->store[insnID+1];
> +      GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
>        insn.header.opcode = GEN_OPCODE_ADD;
>        this->setDst(&insn, GenRegister::ip());
>        this->setSrc0(&insn, GenRegister::ip());
> -      this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
> +      this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
>      } else {
>        insn.header.predicate_inverse ^= 1;
>        this->setSrc1(&insn, GenRegister::immd(2));
> @@ -1099,7 +1116,7 @@ namespace gbe
>        insn2.header.opcode = GEN_OPCODE_ADD;
>        this->setDst(&insn2, GenRegister::ip());
>        this->setSrc0(&insn2, GenRegister::ip());
> -      this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
> +      this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
>      }
>    }
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index d86e04c..147c3e6 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -76,8 +76,6 @@
>   *
>   * Also, there is some extra kludge to handle the predicates for JMPI.
>   *
> - * See TODO for a better idea for branching and masking
> - *
>   * TODO:
>   * =====
>   *
> @@ -92,14 +90,9 @@
>   * interesting approach which consists in traversing the dominator tree in
> post
>   * order
>   *
> - * About masking and branching, a much better idea (that I found later
> unfortunately)
> - * is to replace the use of the flag by uses of if/endif to enclose the basic
> - * block. So, instead of using predication, we use auto-masking. The very cool
> - * consequence is that we can reintegrate back the structured branches.
> - * Basically, we will be able to identify branches that can be mapped to
> - * structured branches and mix nicely unstructured branches (which will use
> - * jpmi, if/endif to mask the blocks) and structured branches (which are pretty
> - * fast)
> + * We already use if/endif to enclose each basic block. We will continue to
> identify
> + * those blocks which could match to structured branching and use pure
> structured
> + * instruction to handle them completely.
>   */
> 
>  #include "backend/gen_insn_selection.hpp"
> @@ -320,38 +313,6 @@ namespace gbe
>      INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t
> registerPool);
>      /*! indicate whether a register is a scalar/uniform register. */
>      INLINE bool isScalarReg(const ir::Register &reg) const {
> -#if 0
> -      printf("reg %d ", reg.value());
> -      printf("uniform: %d ", getRegisterData(reg).isUniform());
> -      if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n");
> return true; }
> -      if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push
> location.\n"); return true; }
> -      if (reg == ir::ocl::groupid0  ||
> -          reg == ir::ocl::groupid1  ||
> -          reg == ir::ocl::groupid2  ||
> -          reg == ir::ocl::barrierid ||
> -          reg == ir::ocl::threadn   ||
> -          reg == ir::ocl::numgroup0 ||
> -          reg == ir::ocl::numgroup1 ||
> -          reg == ir::ocl::numgroup2 ||
> -          reg == ir::ocl::lsize0    ||
> -          reg == ir::ocl::lsize1    ||
> -          reg == ir::ocl::lsize2    ||
> -          reg == ir::ocl::gsize0    ||
> -          reg == ir::ocl::gsize1    ||
> -          reg == ir::ocl::gsize2    ||
> -          reg == ir::ocl::goffset0  ||
> -          reg == ir::ocl::goffset1  ||
> -          reg == ir::ocl::goffset2  ||
> -          reg == ir::ocl::workdim   ||
> -          reg == ir::ocl::emask     ||
> -          reg == ir::ocl::notemask  ||
> -          reg == ir::ocl::barriermask
> -        ) {
> -        printf("special reg.\n");
> -        return true;
> -      }
> -      return false;
> -#endif
>        const ir::RegisterData &regData = getRegisterData(reg);
>        return regData.isUniform();
>      }
> @@ -992,7 +953,7 @@ namespace gbe
>    }
> 
>    void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
>      insn->src(0) = src;
>      insn->index = uint16_t(jip);
>    }
> @@ -1412,9 +1373,17 @@ namespace gbe
>      for (uint32_t regID = 0; regID < this->regNum; ++regID)
>        this->regDAG[regID] = NULL;
> 
> +    this->block->hasBarrier = false;
> +    this->block->hasBranch = bb.getLastInstruction()->getOpcode() ==
> OP_BRA ||
> +                             bb.getLastInstruction()->getOpcode() ==
> OP_RET;
> +    if (!this->block->hasBranch)
> +      this->block->endifOffset = -1;
> +
>      // Build the DAG on the fly
>      uint32_t insnNum = 0;
>      const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
> +      if (insn.getOpcode() == OP_SYNC)
> +        this->block->hasBarrier = true;
> 
>        // Build a selectionDAG node for instruction
>        SelectionDAG *dag = this->newSelectionDAG(insn);
> @@ -1465,6 +1434,7 @@ namespace gbe
>    void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
>    {
>      // Bottom up code generation
> +    bool needEndif = this->block->hasBranch == false
> && !this->block->hasBarrier;
>      for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
>        // Process all possible patterns for this instruction
>        SelectionDAG &dag = *insnDAG[insnID];
> @@ -1476,8 +1446,10 @@ namespace gbe
> 
>          // Start a new code fragment
>          this->startBackwardGeneration();
> +        // If there is no branch at the end of this block.
> 
>          // Try all the patterns from best to worst
> +
>          do {
>            if ((*it)->emit(*this, dag))
>              break;
> @@ -1485,6 +1457,13 @@ namespace gbe
>          } while (it != end);
>          GBE_ASSERT(it != end);
> 
> +        if (needEndif) {
> +          const ir::BasicBlock *curr = insn.getParent();
> +          const ir::BasicBlock *next = curr->getNextBlock();
> +          this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
> +          needEndif = false;
> +        }
> +
>          // Output the code in the current basic block
>          this->endBackwardGeneration();
>        }
> @@ -2133,6 +2112,7 @@ namespace gbe
>        const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
> 
>        sel.push();
> +        sel.curr.noMask = 1;
>          sel.curr.predicate = GEN_PREDICATE_NONE;
>          sel.curr.execWidth = simdWidth;
>          sel.SEL_CMP(genCmp, tmp, src0, src1);
> @@ -2329,7 +2309,6 @@ namespace gbe
>        const Type type = insn.getType();
>        const Immediate imm = insn.getImmediate();
>        const GenRegister dst = sel.selReg(insn.getDst(0), type);
> -      GenRegister flagReg;
> 
>        sel.push();
>        if (sel.isScalarOrBool(insn.getDst(0)) == true) {
> @@ -2371,24 +2350,10 @@ namespace gbe
>      {
>        using namespace ir;
>        const ir::Register reg = sel.reg(FAMILY_DWORD);
> -      const GenRegister barrierMask = sel.selReg(ocl::barriermask,
> TYPE_BOOL);
>        const uint32_t params = insn.getParameters();
> 
> -      sel.push();
> -        sel.curr.predicate = GEN_PREDICATE_NONE;
> -        sel.curr.noMask = 1;
> -        sel.curr.execWidth = 1;
> -        sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask);
> -        sel.MOV(GenRegister::flag(1, 1), barrierMask);
> -      sel.pop();
> -
>        // A barrier is OK to start the thread synchronization *and* SLM fence
> -      sel.push();
> -        //sel.curr.predicate = GEN_PREDICATE_NONE;
> -        sel.curr.flag = 1;
> -        sel.curr.subFlag = 1;
> -        sel.BARRIER(GenRegister::ud8grf(reg),
> sel.selReg(sel.reg(FAMILY_DWORD)), params);
> -      sel.pop();
> +      sel.BARRIER(GenRegister::ud8grf(reg),
> sel.selReg(sel.reg(FAMILY_DWORD)), params);
>        return true;
>      }
> 
> @@ -2696,7 +2661,7 @@ namespace gbe
>        GenRegister tmpDst;
> 
>        if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)
> -        tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL);
> +        tmpDst = sel.selReg(dst, TYPE_BOOL);
>        else
>          tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32);
> 
> @@ -2724,36 +2689,23 @@ namespace gbe
>        sel.push();
>          sel.curr.flag = 1;
>          sel.curr.subFlag = 1;
> -        sel.curr.predicate  = GEN_PREDICATE_NONE;
>          if (type == TYPE_S64 || type == TYPE_U64) {
>            GenRegister tmp[3];
>            for(int i=0; i<3; i++)
>              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
> -          sel.push();
> -            sel.curr.execWidth = 1;
> -            sel.curr.noMask = 1;
> -            sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
> -          sel.pop();
> -          sel.curr.predicate = GEN_PREDICATE_NORMAL;
>            sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst);
>          } else if(opcode == OP_ORD) {
>            sel.push();
> -            sel.curr.execWidth = 1;
> -            sel.curr.noMask = 1;
> -            sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
> +            sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
> +            sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +            sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
>            sel.pop();
> -          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> -
> -          sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
> -          sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
>          } else
>            sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
>        sel.pop();
> 
>        if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16))
>          sel.MOV(sel.selReg(dst, TYPE_U16),
> GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg));
> -      else
> -        sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst);
>        return true;
>      }
>    };
> @@ -2979,11 +2931,6 @@ namespace gbe
>          markAllChildren(dag);
>        }
> 
> -      // Since we cannot predicate the select instruction with our current
> mask,
> -      // we need to perform the selection in two steps (one to select, one to
> -      // update the destination register)
> -      const RegisterFamily family = getFamily(type);
> -      const GenRegister tmp = sel.selReg(sel.reg(family), type);
>        const uint32_t simdWidth = sel.ctx.getSimdWidth();
>        const Register pred = insn.getPredicate();
>        sel.push();
> @@ -2992,16 +2939,14 @@ namespace gbe
>          sel.curr.flag = 1;
>          sel.curr.subFlag = 1;
>          sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16),
> GenRegister::immuw(0));
> -        sel.curr.noMask = 0;
> +        //sel.curr.noMask = 0;
>          sel.curr.predicate = GEN_PREDICATE_NORMAL;
>          if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
> -          sel.SEL_INT64(tmp, src0, src1);
> +          sel.SEL_INT64(dst, src0, src1);
>          else
> -          sel.SEL(tmp, src0, src1);
> +          sel.SEL(dst, src0, src1);
>        sel.pop();
> 
> -      // Update the destination register properly now
> -      sel.MOV(dst, tmp);
>        return true;
>      }
>    };
> @@ -3041,6 +2986,7 @@ namespace gbe
>      DECL_CTOR(TernaryInstruction, 1, 1);
>     };
> 
> +
>    /*! Label instruction pattern */
>    DECL_PATTERN(LabelInstruction)
>    {
> @@ -3053,42 +2999,75 @@ namespace gbe
>        const uint32_t simdWidth = sel.ctx.getSimdWidth();
>        sel.LABEL(label);
> 
> -     // Do not emit any code for the "returning" block. There is no need for it
> -     if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
> +      // Do not emit any code for the "returning" block. There is no need for
> it
> +      if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
>          return true;
> 
> +      LabelIndex jip;
> +      const LabelIndex nextLabel =
> insn.getParent()->getNextBlock()->getLabelIndex();
> +      if (sel.ctx.hasJIP(&insn))
> +        jip = sel.ctx.getLabelIndex(&insn);
> +      else
> +        jip = nextLabel;
> +
>        // Emit the mask computation at the head of each basic block
>        sel.push();
> +        sel.curr.noMask = 1;
>          sel.curr.predicate = GEN_PREDICATE_NONE;
> -        sel.curr.flag = 0;
> -        sel.curr.subFlag = 0;
>          sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0,
> GEN_TYPE_UW), src1);
>        sel.pop();
> 
> -      // If it is required, insert a JUMP to bypass the block
> -      if (sel.ctx.hasJIP(&insn)) {
> -        const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
> +      if (sel.block->hasBarrier) {
> +        // If this block has barrier, we don't execute the block until all lanes
> +        // are 1s. Set each reached lane to 1, then check all lanes. If there is
> any
> +        // lane not reached, we jump to jip. And no need to issue if/endif for
> +        // this block, as it will always excute with all lanes activated.
>          sel.push();
> -
> -          sel.curr.noMask = 1;
> -          sel.curr.execWidth = 1;
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +          sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW),
> GenRegister::immuw(GEN_MAX_LABEL));
>            sel.curr.predicate = GEN_PREDICATE_NONE;
> -          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
> -          GenRegister flagReg = GenRegister::flag(0, 0);
> -          sel.AND(flagReg, flagReg, emaskReg);
> -
> +          sel.curr.noMask = 1;
> +          sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0,
> GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
>            if (simdWidth == 8)
> -            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> +            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
>            else if (simdWidth == 16)
> -            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
> +            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
>            else
>              NOT_IMPLEMENTED;
> +          sel.curr.noMask = 1;
> +          sel.curr.execWidth = 1;
>            sel.curr.inversePredicate = 1;
> -          sel.curr.flag = 0;
> -          sel.curr.subFlag = 0;
>            sel.JMPI(GenRegister::immd(0), jip);
>          sel.pop();
> +        // FIXME, if the last BRA is unconditional jump, we don't need to
> update the label here.
> +        sel.push();
> +         sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +         sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW),
> GenRegister::immuw((uint16_t)label));
> +        sel.pop();
> +      }
> +      else {
> +        if (sel.ctx.hasJIP(&insn)) {
> +          // If it is required, insert a JUMP to bypass the block
> +          sel.push();
> +            if (simdWidth == 8)
> +              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> +            else if (simdWidth == 16)
> +              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
> +            else
> +              NOT_IMPLEMENTED;
> +            sel.curr.noMask = 1;
> +            sel.curr.execWidth = 1;
> +            sel.curr.inversePredicate = 1;
> +            sel.JMPI(GenRegister::immd(0), jip);
> +          sel.pop();
> +        }
> +        sel.push();
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +          // It's easier to set the jip to a relative position over next block.
> +          sel.IF(GenRegister::immd(0), nextLabel, nextLabel,
> sel.block->endifOffset, sel.block->endifOffset);
> +        sel.pop();
>        }
> +
>        return true;
>      }
>      DECL_CTOR(LabelInstruction, 1, 1);
> @@ -3225,7 +3204,6 @@ namespace gbe
>    /*! Branch instruction pattern */
>    DECL_PATTERN(BranchInstruction)
>    {
> -
>      void emitForwardBranch(Selection::Opaque &sel,
>                             const ir::BranchInstruction &insn,
>                             ir::LabelIndex dst,
> @@ -3233,16 +3211,13 @@ namespace gbe
>      {
>        using namespace ir;
>        const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
> -      const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
> 
>        // We will not emit any jump if we must go the next block anyway
>        const BasicBlock *curr = insn.getParent();
>        const BasicBlock *next = curr->getNextBlock();
>        const LabelIndex nextLabel = next->getLabelIndex();
> -
>        if (insn.isPredicated() == true) {
>          const Register pred = insn.getPredicateIndex();
> -
>          sel.push();
>            // we don't need to set next label to the pcip
>            // as if there is no backward jump latter, then obviously
> everything will work fine.
> @@ -3250,22 +3225,30 @@ namespace gbe
>            sel.curr.flag = 0;
>            sel.curr.subFlag = 0;
>            sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16),
> GenRegister::immuw(0));
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
>            sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> +          if (!sel.block->hasBarrier)
> +            sel.ENDIF(GenRegister::immd(0), nextLabel);
> +          sel.block->endifOffset = -1;
>          sel.pop();
> -
> -        if (nextLabel == jip) return;
>        } else {
>          // Update the PcIPs
> +        const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
>          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> -
> -        // Do not emit branch when we go to the next block anyway
> +        if (!sel.block->hasBarrier)
> +          sel.ENDIF(GenRegister::immd(0), nextLabel);
> +        sel.block->endifOffset = -1;
>          if (nextLabel == jip) return;
> +        // Branch to the jump target
>          sel.push();
>            sel.curr.execWidth = 1;
>            sel.curr.noMask = 1;
>            sel.curr.predicate = GEN_PREDICATE_NONE;
>            sel.JMPI(GenRegister::immd(0), jip);
>          sel.pop();
> +        // FIXME just for the correct endif offset.
> +        // JMPI still has 2 instruction.
> +        sel.block->endifOffset -= 2;
>        }
>      }
> 
> @@ -3290,37 +3273,32 @@ namespace gbe
>          // that actually take the branch
>          const LabelIndex next = bb.getNextBlock()->getLabelIndex();
>          sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
> -
> +        GBE_ASSERT(jip == dst);
>          sel.push();
>            sel.curr.flag = 0;
>            sel.curr.subFlag = 0;
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
>            sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16),
> GenRegister::immuw(0));
> -          // Re-update the PcIPs for the branches that takes the backward
> jump
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
>            sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> -
> -          // We clear all the inactive channel to 0 as the
> GEN_PREDICATE_ALIGN1_ANY8/16
> -          // will check those bits as well.
>            sel.curr.predicate = GEN_PREDICATE_NONE;
> +          if (!sel.block->hasBarrier)
> +            sel.ENDIF(GenRegister::immd(0), next);
>            sel.curr.execWidth = 1;
> -          sel.curr.noMask = 1;
> -          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
> -          sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1),
> emaskReg);
> -
> -          // Branch to the jump target
> -          if (simdWidth == 8)
> -            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> -          else if (simdWidth == 16)
> +          if (simdWidth == 16)
>              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
>            else
> -            NOT_SUPPORTED;
> +            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
> +          sel.curr.noMask = 1;
>            sel.JMPI(GenRegister::immd(0), jip);
> +          sel.block->endifOffset = -3;
>          sel.pop();
> -
>        } else {
> -
> +        const LabelIndex next = bb.getNextBlock()->getLabelIndex();
>          // Update the PcIPs
>          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
> -
> +        if (!sel.block->hasBarrier)
> +          sel.ENDIF(GenRegister::immd(0), next);
>          // Branch to the jump target
>          sel.push();
>            sel.curr.execWidth = 1;
> @@ -3328,6 +3306,7 @@ namespace gbe
>            sel.curr.predicate = GEN_PREDICATE_NONE;
>            sel.JMPI(GenRegister::immd(0), jip);
>          sel.pop();
> +        sel.block->endifOffset = -3;
>        }
>      }
> 
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 04fbb9f..8557768 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -42,6 +42,8 @@ namespace gbe
>    /*! Translate IR compare to Gen compare */
>    uint32_t getGenCompare(ir::Opcode opcode);
> 
> +  #define GEN_MAX_LABEL 0xFFFF
> +
>    /*! Selection opcodes properly encoded from 0 to n for fast jump tables
>     *  generations
>     */
> @@ -190,6 +192,9 @@ namespace gbe
>      void append(SelectionInstruction *insn);
>      /*! Append a new selection instruction at the beginning of the block */
>      void prepend(SelectionInstruction *insn);
> +    int endifOffset;
> +    bool hasBarrier;
> +    bool hasBranch;
>    };
> 
>    /*! Owns the selection engine */
> diff --git a/backend/src/backend/gen_insn_selection.hxx
> b/backend/src/backend/gen_insn_selection.hxx
> index d318f8e..ddc9d5e 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -80,7 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
>  DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
>  DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
>  DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
> -DECL_SELECTION_IR(BRC, BinaryInstruction)
> +DECL_SELECTION_IR(BRC, UnaryInstruction)
>  DECL_SELECTION_IR(BRD, UnaryInstruction)
>  DECL_SELECTION_IR(IF, UnaryInstruction)
>  DECL_SELECTION_IR(ENDIF, UnaryInstruction)
> diff --git a/backend/src/backend/gen_register.hpp
> b/backend/src/backend/gen_register.hpp
> index 25cb428..051f16d 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -118,7 +118,7 @@ namespace gbe
>        this->noMask = 0;
>        this->flag = 0;
>        this->subFlag = 0;
> -      this->predicate = GEN_PREDICATE_NORMAL;
> +      this->predicate = GEN_PREDICATE_NONE;
>        this->inversePredicate = 0;
>        this->physicalFlag = 1;
>        this->flagIndex = 0;
> @@ -657,6 +657,17 @@ namespace gbe
>                           GEN_HORIZONTAL_STRIDE_1);
>      }
> 
> +    static INLINE GenRegister nullud(void) {
> +      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
> +                         GEN_ARF_NULL,
> +                         0,
> +                         GEN_TYPE_UD,
> +                         GEN_VERTICAL_STRIDE_8,
> +                         GEN_WIDTH_8,
> +                         GEN_HORIZONTAL_STRIDE_1);
> +    }
> +
> +
>      static INLINE bool isNull(GenRegister reg) {
>        return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
>                && reg.nr == GEN_ARF_NULL);
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 9638994..b572500 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -2526,6 +2526,7 @@ clEnqueueNDRangeKernel(cl_command_queue
> command_queue,
> 
>      err = cl_command_queue_flush(command_queue);
>    }
> +  clFinish(command_queue);
> 
>  error:
>    return err;
> diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
> index 5e474de..c2308da 100644
> --- a/src/intel/intel_driver.c
> +++ b/src/intel/intel_driver.c
> @@ -135,6 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver)
>  {
>    driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
>    assert(driver->bufmgr);
> +  //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
>    drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
>  }
> 
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index e95b050..4819e9e 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -695,7 +695,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu,
> cl_gpgpu_kernel *kernel)
>    memset(desc, 0, sizeof(*desc));
>    ker_bo = (drm_intel_bo *) kernel->bo;
>    desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
> -  desc->desc1.single_program_flow = 1;
> +  desc->desc1.single_program_flow = 0;
>    desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
>    desc->desc5.rounding_mode = 0; /* round to nearest even */
> 
> diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
> index 35d4c4f..f901fdf 100644
> --- a/utests/compiler_long_cmp.cpp
> +++ b/utests/compiler_long_cmp.cpp
> @@ -45,6 +45,7 @@ void compiler_long_cmp(void)
>      int64_t *dest = (int64_t *)buf_data[2];
>      int64_t x = (src1[i] < src2[i]) ? 3 : 4;
>      OCL_ASSERT(x == dest[i]);
> +    //printf("%d %ld  %ld \n", i, dest[i], x);
>    }
>    OCL_UNMAP_BUFFER(2);
>    OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
> diff --git a/utests/compiler_unstructured_branch0.cpp
> b/utests/compiler_unstructured_branch0.cpp
> index 128a53e..1a371e9 100644
> --- a/utests/compiler_unstructured_branch0.cpp
> +++ b/utests/compiler_unstructured_branch0.cpp
> @@ -27,7 +27,6 @@ static void compiler_unstructured_branch0(void)
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
>    for (uint32_t i = 16; i < 32; ++i)
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
> -
>    // Second control flow
>    for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
>    OCL_UNMAP_BUFFER(0);
> @@ -36,8 +35,7 @@ static void compiler_unstructured_branch0(void)
>    OCL_MAP_BUFFER(0);
>    OCL_MAP_BUFFER(1);
>    for (uint32_t i = 0; i < 32; ++i)
> -    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
> -
> +   OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
>    // Third control flow
>    for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
>    OCL_UNMAP_BUFFER(0);
> diff --git a/utests/compiler_unstructured_branch1.cpp
> b/utests/compiler_unstructured_branch1.cpp
> index 6021f5b..fb24cec 100644
> --- a/utests/compiler_unstructured_branch1.cpp
> +++ b/utests/compiler_unstructured_branch1.cpp
> @@ -25,7 +25,6 @@ static void compiler_unstructured_branch1(void)
>    OCL_MAP_BUFFER(1);
>    for (uint32_t i = 0; i < n; ++i)
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
> -
>    // Second control flow
>    for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
>    OCL_UNMAP_BUFFER(0);
> @@ -34,7 +33,7 @@ static void compiler_unstructured_branch1(void)
>    OCL_MAP_BUFFER(0);
>    OCL_MAP_BUFFER(1);
>    for (uint32_t i = 0; i < n; ++i)
> -    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
> +   OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
> 
>    // Third control flow
>    for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
> diff --git a/utests/compiler_unstructured_branch2.cpp
> b/utests/compiler_unstructured_branch2.cpp
> index d61c6b5..68c7448 100644
> --- a/utests/compiler_unstructured_branch2.cpp
> +++ b/utests/compiler_unstructured_branch2.cpp
> @@ -23,6 +23,7 @@ static void compiler_unstructured_branch2(void)
>    // First control flow
>    OCL_MAP_BUFFER(0);
>    OCL_MAP_BUFFER(1);
> +#if 1
>    for (uint32_t i = 0; i < n; ++i)
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
> 
> @@ -35,7 +36,7 @@ static void compiler_unstructured_branch2(void)
>    OCL_MAP_BUFFER(1);
>    for (uint32_t i = 0; i < n; ++i)
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
> -
> +#endif
>    // Third control flow
>    for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
>    for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
> @@ -45,9 +46,16 @@ static void compiler_unstructured_branch2(void)
>    OCL_MAP_BUFFER(0);
>    OCL_MAP_BUFFER(1);
>    for (uint32_t i = 0; i < 8; ++i)
> +  {
> +    //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], 12);
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
> +  }
>    for (uint32_t i = 8; i < n; ++i)
> +  {
> +    //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], -6);
>      OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
> +  }
> +  //exit(0);
> 
>    // Fourth control flow
>    for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 1;
> --
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list