[Beignet] [PATCH 2/2] GBE: optimize IMM handling for SEL/SEL_CMP/CMP.

Zhigang Gong zhigang.gong at linux.intel.com
Wed May 14 01:44:24 PDT 2014


Please review the new version, this version has some indent issues.

On Wed, May 14, 2014 at 04:11:27PM +0800, Zhigang Gong wrote:
> Actually, all of the above 3 instructions could avoid
> one LOADI instruction by switching operands position.
> 
> This patch impemented this optimization. And consolidate
> all the same type of optimization into one place.
> 
> No obvious performance impact on luxmark.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 183 +++++++++++++++--------------
>  1 file changed, 97 insertions(+), 86 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index a554ee9..3d6897d 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -129,15 +129,15 @@ namespace gbe
>      }
>    }
>  
> -  uint32_t getGenCompare(ir::Opcode opcode) {
> +  uint32_t getGenCompare(ir::Opcode opcode, bool inverse = false) {
>      using namespace ir;
>      switch (opcode) {
> -      case OP_LE: return GEN_CONDITIONAL_LE;
> -      case OP_LT: return GEN_CONDITIONAL_L;
> -      case OP_GE: return GEN_CONDITIONAL_GE;
> -      case OP_GT: return GEN_CONDITIONAL_G;
> -      case OP_EQ: return GEN_CONDITIONAL_EQ;
> -      case OP_NE: return GEN_CONDITIONAL_NEQ;
> +      case OP_LE: return (!inverse) ? GEN_CONDITIONAL_LE : GEN_CONDITIONAL_G;
> +      case OP_LT: return (!inverse) ? GEN_CONDITIONAL_L : GEN_CONDITIONAL_GE;
> +      case OP_GE: return (!inverse) ? GEN_CONDITIONAL_GE : GEN_CONDITIONAL_L;
> +      case OP_GT: return (!inverse) ? GEN_CONDITIONAL_G : GEN_CONDITIONAL_LE;
> +      case OP_EQ: return (!inverse) ? GEN_CONDITIONAL_EQ : GEN_CONDITIONAL_NEQ;
> +      case OP_NE: return (!inverse) ? GEN_CONDITIONAL_NEQ : GEN_CONDITIONAL_EQ;
>        default: NOT_SUPPORTED; return 0u;
>      };
>    }
> @@ -576,6 +576,15 @@ namespace gbe
>      void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
>      /*! 64-bit integer remainder of division */
>      void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
> +    /* common functions for both binary instruction and sel_cmp and compare instruction.
> +       It will handle the IMM or normal register assignment, and will try to avoid LOADI
> +       as much as possible. */
> +    void getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
> +                      GenRegister &src1, ir::Type type, bool &inverse);
> +    void getSrcGenRegImm(SelectionDAG &dag,
> +                      SelectionDAG *dag0, SelectionDAG *dag1,
> +                      GenRegister &src0, GenRegister &src1,
> +                      ir::Type type, bool &inverse);
>      /*! Use custom allocators */
>      GBE_CLASS(Opaque);
>      friend class SelectionBlock;
> @@ -588,6 +597,7 @@ namespace gbe
>        currAuxLabel++;
>        return (ir::LabelIndex)currAuxLabel;
>      }
> +
>    };
>  
>    ///////////////////////////////////////////////////////////////////////////
> @@ -1665,11 +1675,11 @@ namespace gbe
>      return false;
>    }
>  
> -  GenRegister getRegisterFromImmediate(ir::Immediate imm, bool negate = false)
> +  GenRegister getRegisterFromImmediate(ir::Immediate imm, ir::Type type, bool negate = false)
>    {
>      using namespace ir;
>      int sign = negate ? -1 : 1;
> -    switch (imm.type) {
> +    switch (type) {
>        case TYPE_U32:   return GenRegister::immud(imm.data.u32 * sign);
>        case TYPE_S32:   return GenRegister::immd(imm.data.s32 * sign);
>        case TYPE_FLOAT: return GenRegister::immf(imm.data.f32 * sign);
> @@ -1683,6 +1693,68 @@ namespace gbe
>      }
>    }
>  
> +  BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
> +  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
> +                                       SelectionDAG *dag0, SelectionDAG *dag1,
> +                                       GenRegister &src0, GenRegister &src1,
> +                                       ir::Type type, bool &inverse) {
> +    using namespace ir;
> +      // Right source can always be an immediate
> +      //logica ops of bool shouldn't use 0xffff, may use flag reg, so can't optimize
> +      inverse = false;
> +      const int src0Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src0Index : 0;
> +      const int src1Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src1Index : 1;
> +      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
> +          canGetRegisterFromImmediate(dag1->insn)) {
> +        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> +        src0 = this->selReg(dag.insn.getSrc(src0Index), type);
> +        src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
> +        if (dag0) dag0->isRoot = 1;
> +      }
> +      // Left source cannot be immediate but it is OK if we can commute
> +      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && dag.insn.isMemberOf<BinaryInstruction>() &&
> +               ((cast<BinaryInstruction>(dag.insn)).commutes() || dag.insn.getOpcode() == OP_SUB) &&
> +               dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
> +        const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> +        src0 = dag.insn.getOpcode() != OP_SUB ?
> +               this->selReg(dag.insn.getSrc(src1Index), type) :
> +               GenRegister::negate(this->selReg(dag.insn.getSrc(src1Index), type));
> +        Immediate imm = childInsn.getImmediate();
> +        src1 = getRegisterFromImmediate(imm, type, dag.insn.getOpcode() == OP_SUB);
> +        if (dag1) dag1->isRoot = 1;
> +      }
> +      // If it's a compare instruction, theoritically, we can easily revert the condition code to
> +      // switch the two operands. But we can't do that for float due to the NaN's exist.
> +      // For a normal select instruction, we can always inverse the predication to switch the two
> +      // operands' position.
> +      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> +               dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
> +               ((dag.insn.isMemberOf<CompareInstruction>() && type != TYPE_FLOAT && type != TYPE_DOUBLE) ||
> +                (dag.insn.isMemberOf<SelectInstruction>()) ||
> +                (dag.insn.getOpcode() == OP_ORD))) {
> +        const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> +        src0 = this->selReg(dag.insn.getSrc(src1Index), type);
> +        src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
> +        inverse = true;
> +        if (dag1) dag1->isRoot = 1;
> +      }
> +      // Just grab the two sources
> +      else {
> +        src0 = this->selReg(dag.insn.getSrc(src0Index), type);
> +        src1 = this->selReg(dag.insn.getSrc(src1Index), type);
> +        markAllChildren(dag);
> +      }
> +  }
> +
> +  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
> +                                       GenRegister &src1, ir::Type type,
> +                                       bool &inverse) {
> +    SelectionDAG *dag0 = dag.child[0];
> +    SelectionDAG *dag1 = dag.child[1];
> +    getSrcGenRegImm(dag, dag0, dag1, src0, src1, type, inverse);
> +  }
> +
> +
>    /*! Template for the one-to-many instruction patterns */
>    template <typename T, typename U>
>    class OneToManyPattern : public SelectionPattern
> @@ -1846,7 +1918,6 @@ namespace gbe
>      DECL_CTOR(UnaryInstruction, 1, 1)
>    };
>  
> -  BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
>  
>    /*! Binary regular instruction pattern */
>    class BinaryInstructionPattern : public SelectionPattern
> @@ -1957,36 +2028,8 @@ namespace gbe
>  
>        // Look for immediate values
>        GenRegister src0, src1;
> -      SelectionDAG *dag0 = dag.child[0];
> -      SelectionDAG *dag1 = dag.child[1];
> -
> -      // Right source can always be an immediate
> -      //logica ops of bool shouldn't use 0xffff, may use flag reg, so can't optimize
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
> -          canGetRegisterFromImmediate(dag1->insn) && type != TYPE_BOOL) {
> -        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> -        src0 = sel.selReg(insn.getSrc(0), type);
> -        src1 = getRegisterFromImmediate(childInsn.getImmediate());
> -        if (dag0) dag0->isRoot = 1;
> -      }
> -      // Left source cannot be immediate but it is OK if we can commute
> -      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> -               (insn.commutes() || insn.getOpcode() == OP_SUB) &&
> -               dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
> -               type != TYPE_BOOL) {
> -        const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> -        src0 = insn.getOpcode() != OP_SUB ?
> -               sel.selReg(insn.getSrc(1), type) : GenRegister::negate(sel.selReg(insn.getSrc(1), type));
> -        src1 = getRegisterFromImmediate(childInsn.getImmediate(), insn.getOpcode() == OP_SUB); 
> -        if (dag1) dag1->isRoot = 1;
> -      }
> -      // Just grab the two sources
> -      else {
> -        src0 = sel.selReg(insn.getSrc(0), type);
> -        src1 = sel.selReg(insn.getSrc(1), type);
> -        markAllChildren(dag);
> -      }
> -
> +      bool inverse = false;
> +      sel.getSrcGenRegImm(dag, src0, src1, type, inverse);
>        // Output the binary instruction
>        if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) {
>          GBE_ASSERT(insn.getOpcode() == OP_AND ||
> @@ -2272,12 +2315,16 @@ namespace gbe
>        // So both sources must match
>        if (sourceMatch(cmp, 0, &dag, 1) == false) return false;
>        if (sourceMatch(cmp, 1, &dag, 2) == false) return false;
> -
>        // OK, we merge the instructions
>        const ir::CompareInstruction &cmpInsn = cast<CompareInstruction>(cmp->insn);
>        const ir::Opcode opcode = cmpInsn.getOpcode();
>        if(opcode == OP_ORD) return false;
> -      const uint32_t genCmp = getGenCompare(opcode);
> +      GenRegister src0, src1;
> +      const ir::Type type = cmpInsn.getType();
> +      bool inverse = false;
> +      sel.getSrcGenRegImm(*cmp, src0, src1, type, inverse);
> +
> +      const uint32_t genCmp = getGenCompare(opcode, inverse);
>        sel.push();
>          if (sel.isScalarReg(insn.getDst(0)) == true) {
>            sel.curr.execWidth = 1;
> @@ -2287,21 +2334,12 @@ namespace gbe
>  
>          // Like for regular selects, we need a temporary since we cannot predicate
>          // properly
> -        const ir::Type type = cmpInsn.getType();
>          const uint32_t simdWidth = sel.curr.execWidth;
>          const GenRegister dst  = sel.selReg(insn.getDst(0), type);
> -        const GenRegister src0 = sel.selReg(cmpInsn.getSrc(0), type);
> -        const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
> -
>          sel.curr.predicate = GEN_PREDICATE_NONE;
>          sel.curr.execWidth = simdWidth;
>          sel.SEL_CMP(genCmp, dst, src0, src1);
>        sel.pop();
> -
> -
> -      // We need the sources of the compare instruction
> -      markAllChildren(*cmp);
> -
>        return true;
>      }
>    };
> @@ -2913,25 +2951,8 @@ namespace gbe
>  
>        // Look for immediate values for the right source
>        GenRegister src0, src1;
> -      SelectionDAG *dag0 = dag.child[0];
> -      SelectionDAG *dag1 = dag.child[1];
> -
> -      // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
> -          canGetRegisterFromImmediate(dag1->insn) && opcode != OP_ORD) {
> -        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> -        src0 = sel.selReg(insn.getSrc(0), type);
> -        Immediate imm = childInsn.getImmediate();
> -        if(imm.type != type)
> -          imm.type = type;
> -        src1 = getRegisterFromImmediate(imm);
> -        if (dag0) dag0->isRoot = 1;
> -      } else {
> -        src0 = sel.selReg(insn.getSrc(0), type);
> -        src1 = sel.selReg(insn.getSrc(1), type);
> -        markAllChildren(dag);
> -      }
> -
> +      bool inverseCmp = false;
> +      sel.getSrcGenRegImm(dag, src0, src1, type, inverseCmp);
>        sel.push();
>          if (sel.isScalarReg(dst))
>            sel.curr.noMask = 1;
> @@ -2944,7 +2965,7 @@ namespace gbe
>            for(int i=0; i<3; i++)
>              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>            sel.curr.flagGen = 1;
> -          sel.I64CMP(getGenCompare(opcode), src0, src1, tmp);
> +          sel.I64CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmp);
>          } else if(opcode == OP_ORD) {
>            sel.push();
>              sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
> @@ -2966,7 +2987,7 @@ namespace gbe
>              sel.curr.flagGen = needStoreBool;
>              tmpDst = GenRegister::nullud();
>            }
> -          sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
> +          sel.CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmpDst);
>          }
>        sel.pop();
>        return true;
> @@ -3189,19 +3210,9 @@ namespace gbe
>        SelectionDAG *dag1 = dag.child[1];
>        SelectionDAG *dag2 = dag.child[2];
>  
> -      // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) {
> -        const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
> -        src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
> -        src1 = getRegisterFromImmediate(childInsn.getImmediate());
> -        if (dag0) dag0->isRoot = 1;
> -        if (dag1) dag1->isRoot = 1;
> -      } else {
> -        src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
> -        src1 = sel.selReg(insn.getSrc(SelectInstruction::src1Index), type);
> -        markAllChildren(dag);
> -      }
> -
> +      if (dag0) dag0->isRoot = 1;
> +      bool inverse = false;
> +      sel.getSrcGenRegImm(dag, dag1, dag2, src0, src1, type, inverse);
>        const Register pred = insn.getPredicate();
>        sel.push();
>          if (sel.isScalarReg(insn.getDst(0)) == true) {
> @@ -3209,7 +3220,7 @@ namespace gbe
>            sel.curr.predicate = GEN_PREDICATE_NONE;
>            sel.curr.noMask = 1;
>          }
> -
> +        sel.curr.inversePredicate ^= inverse;
>          sel.curr.physicalFlag = 0;
>          sel.curr.flagIndex = (uint16_t) pred;
>          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> -- 
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list