[Beignet] [PATCH V2] GBE: optimize IMM handling for SEL/SEL_CMP/CMP.
Zhigang Gong
zhigang.gong at linux.intel.com
Thu May 22 18:20:31 PDT 2014
Ping for review.
On Wed, May 14, 2014 at 04:43:45PM +0800, Zhigang Gong wrote:
> Actually, all of the above 3 instructions could avoid
> one LOADI instruction by switching operands position.
>
> This patch impemented this optimization. And consolidate
> all the same type of optimization into one place.
>
> No obvious performance impact on luxmark.
>
> v2:
> fix some wrong indent.
>
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 183 +++++++++++++++--------------
> 1 file changed, 97 insertions(+), 86 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index a554ee9..05925a3 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -129,15 +129,15 @@ namespace gbe
> }
> }
>
> - uint32_t getGenCompare(ir::Opcode opcode) {
> + uint32_t getGenCompare(ir::Opcode opcode, bool inverse = false) {
> using namespace ir;
> switch (opcode) {
> - case OP_LE: return GEN_CONDITIONAL_LE;
> - case OP_LT: return GEN_CONDITIONAL_L;
> - case OP_GE: return GEN_CONDITIONAL_GE;
> - case OP_GT: return GEN_CONDITIONAL_G;
> - case OP_EQ: return GEN_CONDITIONAL_EQ;
> - case OP_NE: return GEN_CONDITIONAL_NEQ;
> + case OP_LE: return (!inverse) ? GEN_CONDITIONAL_LE : GEN_CONDITIONAL_G;
> + case OP_LT: return (!inverse) ? GEN_CONDITIONAL_L : GEN_CONDITIONAL_GE;
> + case OP_GE: return (!inverse) ? GEN_CONDITIONAL_GE : GEN_CONDITIONAL_L;
> + case OP_GT: return (!inverse) ? GEN_CONDITIONAL_G : GEN_CONDITIONAL_LE;
> + case OP_EQ: return (!inverse) ? GEN_CONDITIONAL_EQ : GEN_CONDITIONAL_NEQ;
> + case OP_NE: return (!inverse) ? GEN_CONDITIONAL_NEQ : GEN_CONDITIONAL_EQ;
> default: NOT_SUPPORTED; return 0u;
> };
> }
> @@ -576,6 +576,15 @@ namespace gbe
> void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
> /*! 64-bit integer remainder of division */
> void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
> + /* common functions for both binary instruction and sel_cmp and compare instruction.
> + It will handle the IMM or normal register assignment, and will try to avoid LOADI
> + as much as possible. */
> + void getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
> + GenRegister &src1, ir::Type type, bool &inverse);
> + void getSrcGenRegImm(SelectionDAG &dag,
> + SelectionDAG *dag0, SelectionDAG *dag1,
> + GenRegister &src0, GenRegister &src1,
> + ir::Type type, bool &inverse);
> /*! Use custom allocators */
> GBE_CLASS(Opaque);
> friend class SelectionBlock;
> @@ -588,6 +597,7 @@ namespace gbe
> currAuxLabel++;
> return (ir::LabelIndex)currAuxLabel;
> }
> +
> };
>
> ///////////////////////////////////////////////////////////////////////////
> @@ -1665,11 +1675,11 @@ namespace gbe
> return false;
> }
>
> - GenRegister getRegisterFromImmediate(ir::Immediate imm, bool negate = false)
> + GenRegister getRegisterFromImmediate(ir::Immediate imm, ir::Type type, bool negate = false)
> {
> using namespace ir;
> int sign = negate ? -1 : 1;
> - switch (imm.type) {
> + switch (type) {
> case TYPE_U32: return GenRegister::immud(imm.data.u32 * sign);
> case TYPE_S32: return GenRegister::immd(imm.data.s32 * sign);
> case TYPE_FLOAT: return GenRegister::immf(imm.data.f32 * sign);
> @@ -1683,6 +1693,68 @@ namespace gbe
> }
> }
>
> + BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
> + void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
> + SelectionDAG *dag0, SelectionDAG *dag1,
> + GenRegister &src0, GenRegister &src1,
> + ir::Type type, bool &inverse) {
> + using namespace ir;
> + inverse = false;
> + // Right source can always be an immediate
> + //logica ops of bool shouldn't use 0xffff, may use flag reg, so can't optimize
> + const int src0Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src0Index : 0;
> + const int src1Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src1Index : 1;
> + if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
> + const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> + src0 = this->selReg(dag.insn.getSrc(src0Index), type);
> + src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
> + if (dag0) dag0->isRoot = 1;
> + }
> + // Left source cannot be immediate but it is OK if we can commute
> + else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && dag.insn.isMemberOf<BinaryInstruction>() &&
> + ((cast<BinaryInstruction>(dag.insn)).commutes() || dag.insn.getOpcode() == OP_SUB) &&
> + dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
> + const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> + src0 = dag.insn.getOpcode() != OP_SUB ?
> + this->selReg(dag.insn.getSrc(src1Index), type) :
> + GenRegister::negate(this->selReg(dag.insn.getSrc(src1Index), type));
> + Immediate imm = childInsn.getImmediate();
> + src1 = getRegisterFromImmediate(imm, type, dag.insn.getOpcode() == OP_SUB);
> + if (dag1) dag1->isRoot = 1;
> + }
> + // If it's a compare instruction, theoritically, we can easily revert the condition code to
> + // switch the two operands. But we can't do that for float due to the NaN's exist.
> + // For a normal select instruction, we can always inverse the predication to switch the two
> + // operands' position.
> + else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> + dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
> + ((dag.insn.isMemberOf<CompareInstruction>() && type != TYPE_FLOAT && type != TYPE_DOUBLE) ||
> + (dag.insn.isMemberOf<SelectInstruction>()) ||
> + (dag.insn.getOpcode() == OP_ORD))) {
> + const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> + src0 = this->selReg(dag.insn.getSrc(src1Index), type);
> + src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
> + inverse = true;
> + if (dag1) dag1->isRoot = 1;
> + }
> + // Just grab the two sources
> + else {
> + src0 = this->selReg(dag.insn.getSrc(src0Index), type);
> + src1 = this->selReg(dag.insn.getSrc(src1Index), type);
> + markAllChildren(dag);
> + }
> + }
> +
> + void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
> + GenRegister &src1, ir::Type type,
> + bool &inverse) {
> + SelectionDAG *dag0 = dag.child[0];
> + SelectionDAG *dag1 = dag.child[1];
> + getSrcGenRegImm(dag, dag0, dag1, src0, src1, type, inverse);
> + }
> +
> +
> /*! Template for the one-to-many instruction patterns */
> template <typename T, typename U>
> class OneToManyPattern : public SelectionPattern
> @@ -1846,7 +1918,6 @@ namespace gbe
> DECL_CTOR(UnaryInstruction, 1, 1)
> };
>
> - BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
>
> /*! Binary regular instruction pattern */
> class BinaryInstructionPattern : public SelectionPattern
> @@ -1957,36 +2028,8 @@ namespace gbe
>
> // Look for immediate values
> GenRegister src0, src1;
> - SelectionDAG *dag0 = dag.child[0];
> - SelectionDAG *dag1 = dag.child[1];
> -
> - // Right source can always be an immediate
> - //logica ops of bool shouldn't use 0xffff, may use flag reg, so can't optimize
> - if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
> - canGetRegisterFromImmediate(dag1->insn) && type != TYPE_BOOL) {
> - const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> - src0 = sel.selReg(insn.getSrc(0), type);
> - src1 = getRegisterFromImmediate(childInsn.getImmediate());
> - if (dag0) dag0->isRoot = 1;
> - }
> - // Left source cannot be immediate but it is OK if we can commute
> - else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> - (insn.commutes() || insn.getOpcode() == OP_SUB) &&
> - dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
> - type != TYPE_BOOL) {
> - const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> - src0 = insn.getOpcode() != OP_SUB ?
> - sel.selReg(insn.getSrc(1), type) : GenRegister::negate(sel.selReg(insn.getSrc(1), type));
> - src1 = getRegisterFromImmediate(childInsn.getImmediate(), insn.getOpcode() == OP_SUB);
> - if (dag1) dag1->isRoot = 1;
> - }
> - // Just grab the two sources
> - else {
> - src0 = sel.selReg(insn.getSrc(0), type);
> - src1 = sel.selReg(insn.getSrc(1), type);
> - markAllChildren(dag);
> - }
> -
> + bool inverse = false;
> + sel.getSrcGenRegImm(dag, src0, src1, type, inverse);
> // Output the binary instruction
> if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) {
> GBE_ASSERT(insn.getOpcode() == OP_AND ||
> @@ -2272,12 +2315,16 @@ namespace gbe
> // So both sources must match
> if (sourceMatch(cmp, 0, &dag, 1) == false) return false;
> if (sourceMatch(cmp, 1, &dag, 2) == false) return false;
> -
> // OK, we merge the instructions
> const ir::CompareInstruction &cmpInsn = cast<CompareInstruction>(cmp->insn);
> const ir::Opcode opcode = cmpInsn.getOpcode();
> if(opcode == OP_ORD) return false;
> - const uint32_t genCmp = getGenCompare(opcode);
> + GenRegister src0, src1;
> + const ir::Type type = cmpInsn.getType();
> + bool inverse = false;
> + sel.getSrcGenRegImm(*cmp, src0, src1, type, inverse);
> +
> + const uint32_t genCmp = getGenCompare(opcode, inverse);
> sel.push();
> if (sel.isScalarReg(insn.getDst(0)) == true) {
> sel.curr.execWidth = 1;
> @@ -2287,21 +2334,12 @@ namespace gbe
>
> // Like for regular selects, we need a temporary since we cannot predicate
> // properly
> - const ir::Type type = cmpInsn.getType();
> const uint32_t simdWidth = sel.curr.execWidth;
> const GenRegister dst = sel.selReg(insn.getDst(0), type);
> - const GenRegister src0 = sel.selReg(cmpInsn.getSrc(0), type);
> - const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
> -
> sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.curr.execWidth = simdWidth;
> sel.SEL_CMP(genCmp, dst, src0, src1);
> sel.pop();
> -
> -
> - // We need the sources of the compare instruction
> - markAllChildren(*cmp);
> -
> return true;
> }
> };
> @@ -2913,25 +2951,8 @@ namespace gbe
>
> // Look for immediate values for the right source
> GenRegister src0, src1;
> - SelectionDAG *dag0 = dag.child[0];
> - SelectionDAG *dag1 = dag.child[1];
> -
> - // Right source can always be an immediate
> - if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
> - canGetRegisterFromImmediate(dag1->insn) && opcode != OP_ORD) {
> - const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> - src0 = sel.selReg(insn.getSrc(0), type);
> - Immediate imm = childInsn.getImmediate();
> - if(imm.type != type)
> - imm.type = type;
> - src1 = getRegisterFromImmediate(imm);
> - if (dag0) dag0->isRoot = 1;
> - } else {
> - src0 = sel.selReg(insn.getSrc(0), type);
> - src1 = sel.selReg(insn.getSrc(1), type);
> - markAllChildren(dag);
> - }
> -
> + bool inverseCmp = false;
> + sel.getSrcGenRegImm(dag, src0, src1, type, inverseCmp);
> sel.push();
> if (sel.isScalarReg(dst))
> sel.curr.noMask = 1;
> @@ -2944,7 +2965,7 @@ namespace gbe
> for(int i=0; i<3; i++)
> tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
> sel.curr.flagGen = 1;
> - sel.I64CMP(getGenCompare(opcode), src0, src1, tmp);
> + sel.I64CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmp);
> } else if(opcode == OP_ORD) {
> sel.push();
> sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
> @@ -2966,7 +2987,7 @@ namespace gbe
> sel.curr.flagGen = needStoreBool;
> tmpDst = GenRegister::nullud();
> }
> - sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
> + sel.CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmpDst);
> }
> sel.pop();
> return true;
> @@ -3189,19 +3210,9 @@ namespace gbe
> SelectionDAG *dag1 = dag.child[1];
> SelectionDAG *dag2 = dag.child[2];
>
> - // Right source can always be an immediate
> - if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) {
> - const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
> - src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
> - src1 = getRegisterFromImmediate(childInsn.getImmediate());
> - if (dag0) dag0->isRoot = 1;
> - if (dag1) dag1->isRoot = 1;
> - } else {
> - src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
> - src1 = sel.selReg(insn.getSrc(SelectInstruction::src1Index), type);
> - markAllChildren(dag);
> - }
> -
> + if (dag0) dag0->isRoot = 1;
> + bool inverse = false;
> + sel.getSrcGenRegImm(dag, dag1, dag2, src0, src1, type, inverse);
> const Register pred = insn.getPredicate();
> sel.push();
> if (sel.isScalarReg(insn.getDst(0)) == true) {
> @@ -3209,7 +3220,7 @@ namespace gbe
> sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.curr.noMask = 1;
> }
> -
> + sel.curr.inversePredicate ^= inverse;
> sel.curr.physicalFlag = 0;
> sel.curr.flagIndex = (uint16_t) pred;
> sel.curr.predicate = GEN_PREDICATE_NORMAL;
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list