[Mesa-dev] [PATCH 5/5] R600: Fold immediates into ALU instructions when possible v2

Thu Nov 29 08:32:16 PST 2012

It's a nice optimisation, and it also makes the machine code easier to read.

Reviewed-by: Vincent Lejeune <vljn at ovi.com>

----- Mail original -----
> De : Tom Stellard <tom at stellard.net>
> À : mesa-dev at lists.freedesktop.org
> Cc : Tom Stellard <thomas.stellard at amd.com>
> Envoyé le : Mercredi 28 novembre 2012 23h50
> Objet : [Mesa-dev] [PATCH 5/5] R600: Fold immediates into ALU instructions when possible v2
> 
> From: Tom Stellard <thomas.stellard at amd.com>
> 
> v2:
>   - Fold the immediates using the SelectionDAG
> ---
> lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp         | 94 +++++++++++++++++++++++++
> lib/Target/AMDGPU/R600InstrInfo.cpp             | 16 ++++-
> lib/Target/AMDGPU/R600InstrInfo.h               |  7 ++
> test/CodeGen/R600/fcmp-cnd.ll                   |  4 +-
> test/CodeGen/R600/fcmp-cnde-int-args.ll         |  2 +-
> test/CodeGen/R600/literals.ll                   | 30 ++++++++
> test/CodeGen/R600/selectcc-icmp-select-float.ll |  2 +-
> test/CodeGen/R600/selectcc_cnde.ll              |  2 +-
> test/CodeGen/R600/selectcc_cnde_int.ll          |  2 +-
> 9 files changed, 153 insertions(+), 6 deletions(-)
> create mode 100644 test/CodeGen/R600/literals.ll
> 
> diff --git a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> index 10ce6ad..2a80f1b 100644
> --- a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> +++ b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> @@ -14,6 +14,7 @@
> #include "AMDGPUISelLowering.h" // For AMDGPUISD
> #include "AMDGPURegisterInfo.h"
> #include "AMDILDevices.h"
> +#include "R600InstrInfo.h"
> #include "llvm/ADT/ValueMap.h"
> #include "llvm/CodeGen/PseudoSourceValue.h"
> #include "llvm/CodeGen/SelectionDAGISel.h"
> @@ -167,6 +168,99 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
>        }
>      }
>      break;
> +  case ISD::ConstantFP:
> +  case ISD::Constant:
> +    {
> +      const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> +      // XXX: Custom immediate lowering not implemented yet.  Instead we use
> +      // pseudo instructions defined in SIInstructions.td
> +      if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
> +        break;
> +      }
> +      const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
> +
> +      uint64_t ImmValue = 0;
> +      unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
> +
> +      if (N->getOpcode() == ISD::ConstantFP) {
> +        // XXX: 64-bit Immediates not supported yet
> +        assert(N->getValueType(0) != MVT::f64);
> +
> +        ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
> +        APFloat Value = C->getValueAPF();
> +        float FloatValue = Value.convertToFloat();
> +        if (FloatValue == 0.0) {
> +          ImmReg = AMDGPU::ZERO;
> +        } else if (FloatValue == 0.5) {
> +          ImmReg = AMDGPU::HALF;
> +        } else if (FloatValue == 1.0) {
> +          ImmReg = AMDGPU::ONE;
> +        } else {
> +          ImmValue = Value.bitcastToAPInt().getZExtValue();
> +        }
> +      } else {
> +        // XXX: 64-bit Immediates not supported yet
> +        assert(N->getValueType(0) != MVT::i64);
> +
> +        ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
> +        if (C->getZExtValue() == 0) {
> +          ImmReg = AMDGPU::ZERO;
> +        } else if (C->getZExtValue() == 1) {
> +          ImmReg = AMDGPU::ONE_INT;
> +        } else {
> +          ImmValue = C->getZExtValue();
> +        }
> +      }
> +
> +      for (SDNode::use_iterator Use = N->use_begin(), E = SDNode::use_end();
> +                                                      Use != E; ++Use) {
> +        std::vector<SDValue> Ops;
> +        for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
> +          Ops.push_back(Use->getOperand(i));
> +        }
> +
> +        if (!Use->isMachineOpcode()) {
> +            if (ImmReg == AMDGPU::ALU_LITERAL_X) {
> +              // We can only use literal constants (e.g. AMDGPU::ZERO,
> +              // AMDGPU::ONE, etc) in machine opcodes.
> +              continue;
> +            }
> +        } else {
> +          if (!TII->isALUInstr(Use->getMachineOpcode())) {
> +            continue;
> +          }
> +
> +          int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
> +          assert(ImmIdx != -1);
> +
> +          // subtract one from ImmIdx, because the DST operand is usually index
> +          // 0 for MachineInstrs, but we have no DST in the Ops vector.
> +          ImmIdx--;
> +
> +          // Check that we aren't already using an immediate.
> +          // XXX: It's possible for an instruction to have more than one
> +          // immediate operand, but this is not supported yet.
> +          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
> +            ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
> +            assert(C);
> +
> +            if (C->getZExtValue() != 0) {
> +              // This instruction is already using an immediate.
> +              continue;
> +            }
> +
> +            // Set the immediate value
> +            Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
> +          }
> +        }
> +        // Set the immediate register
> +        Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
> +
> +        CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
> +      }
> +      break;
> +    }
> +
>    }
>    return SelectCode(N);
> }
> diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
> index 20b1aa3..814e0a2 100644
> --- a/lib/Target/AMDGPU/R600InstrInfo.cpp
> +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
> @@ -127,6 +127,15 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
>    }
> }
> 
> +bool R600InstrInfo::isALUInstr(unsigned Opcode) const
> +{
> +  unsigned TargetFlags = get(Opcode).TSFlags;
> +
> +  return ((TargetFlags & R600_InstFlag::OP1) |
> +          (TargetFlags & R600_InstFlag::OP2) |
> +          (TargetFlags & R600_InstFlag::OP3));
> +}
> +
> DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
>      const ScheduleDAG *DAG) const {
>    const InstrItineraryData *II = TM->getInstrItineraryData();
> @@ -505,6 +514,11 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
> 
> int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
>                                   R600Operands::Ops Op) const {
> +  return getOperandIdx(MI.getOpcode(), Op);
> +}
> +
> +int R600InstrInfo::getOperandIdx(unsigned Opcode,
> +                                 R600Operands::Ops Op) const {
>    const static int OpTable[3][R600Operands::COUNT] = {
> //            W        C     S  S  S     S  S  S     S  S
> //            R  O  D  L  S  R  R  R  S  R  R  R  S  R  R  L  P
> @@ -515,7 +529,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
>      {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,-1,-1,-1,13,14,15,16,17},
>      {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14}
>    };
> -  unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
> +  unsigned TargetFlags = get(Opcode).TSFlags;
>    unsigned OpTableIdx;
> 
>    if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
> diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
> index cec1c3b..81e1828 100644
> --- a/lib/Target/AMDGPU/R600InstrInfo.h
> +++ b/lib/Target/AMDGPU/R600InstrInfo.h
> @@ -50,6 +50,9 @@ namespace llvm {
>    bool isReductionOp(unsigned opcode) const;
>    bool isCubeOp(unsigned opcode) const;
> 
> +  /// isALUInstr - Returns true if this Opcode represents an ALU instruction.
> +  bool isALUInstr(unsigned Opcode) const;
> +
>    /// isVector - Vector instructions are instructions that must fill all
>    /// instruction slots within an instruction group.
>    bool isVector(const MachineInstr &MI) const;
> @@ -130,6 +133,10 @@ namespace llvm {
>    /// if the Instruction does not contain the specified Op.
>    int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
> 
> +  /// getOperandIdx - Get the index of Op for the given Opcode.  Returns -1
> +  /// if the Instruction does not contain the specified Op.
> +  int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
> +
>    /// setImmOperand - Helper function for setting instruction flag values.
>    void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
> 
> diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
> index c6b6236..a94cfb5 100644
> --- a/test/CodeGen/R600/fcmp-cnd.ll
> +++ b/test/CodeGen/R600/fcmp-cnd.ll
> @@ -1,6 +1,8 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> 
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;Not checking arguments 2 and 3 to CNDE, because they may change between
> +;registers and literal.x depending on what the optimizer does.
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> 
> define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
> entry:
> diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
> index 92f3b5f..5c981ef 100644
> --- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
> +++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
> @@ -4,7 +4,7 @@
> ; chance to optimize the fcmp + select instructions to CNDE was missed
> ; due to the fact that the operands to fcmp and select had different types
> 
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
> 
> define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
> entry:
> diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
> new file mode 100644
> index 0000000..4c731b2
> --- /dev/null
> +++ b/test/CodeGen/R600/literals.ll
> @@ -0,0 +1,30 @@
> +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> +
> +; Test using an integer literal constant.
> +; Generated ASM should be:
> +; ADD_INT REG literal.x, 5
> +; or
> +; ADD_INT literal.x REG, 5
> +
> +; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
> +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
> +entry:
> +  %0 = add i32 5, %in
> +  store i32 %0, i32 addrspace(1)* %out
> +  ret void
> +}
> +
> +; Test using a float literal constant.
> +; Generated ASM should be:
> +; ADD REG literal.x, 5.0
> +; or
> +; ADD literal.x REG, 5.0
> +
> +; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
> +define void @float_literal(float addrspace(1)* %out, float %in) {
> +entry:
> +  %0 = fadd float 5.0, %in
> +  store float %0, float addrspace(1)* %out
> +  ret void
> +}
> +
> diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
> index f1f8ab1..f65a300 100644
> --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll
> +++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
> @@ -2,7 +2,7 @@
> 
> ; Note additional optimizations may cause this SGT to be replaced with a
> ; CND* instruction.
> -; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
> ; Test a selectcc with i32 LHS/RHS and float True/False
> 
> define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
> diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll
> index e06a170..f0a0f51 100644
> --- a/test/CodeGen/R600/selectcc_cnde.ll
> +++ b/test/CodeGen/R600/selectcc_cnde.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> 
> ;CHECK-NOT: SETE
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
> define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
>    %1 = load float addrspace(1)* %in
>    %2 = fcmp oeq float %1, 0.0
> diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll 
> b/test/CodeGen/R600/selectcc_cnde_int.ll
> index 03d000f..b38078e 100644
> --- a/test/CodeGen/R600/selectcc_cnde_int.ll
> +++ b/test/CodeGen/R600/selectcc_cnde_int.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> 
> ;CHECK-NOT: SETE_INT
> -;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
> define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
>    %1 = load i32 addrspace(1)* %in
>    %2 = icmp eq i32 %1, 0
> -- 
> 1.7.11.4
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>