[Mesa-dev] [PATCH 4/4] R600: Fold immediates into ALU instructions when possible

Vincent Lejeune vljn at ovi.com
Tue Nov 13 07:41:20 PST 2012

----- Original Message -----
> From: Tom Stellard <tom at stellard.net>
> To: mesa-dev at lists.freedesktop.org
> Cc: Tom Stellard <thomas.stellard at amd.com>
> Sent: Friday, November 9, 2012, 9:47 PM
> Subject: [Mesa-dev] [PATCH 4/4] R600: Fold immediates into ALU instructions when possible
> 
> From: Tom Stellard <thomas.stellard at amd.com>
> 
> ---
> lib/Target/AMDGPU/R600ISelLowering.cpp          | 65 ++++++++++++++++++++++---
> lib/Target/AMDGPU/R600InstrInfo.cpp             | 40 +++++++++++++++
> lib/Target/AMDGPU/R600InstrInfo.h               | 12 +++++
> test/CodeGen/R600/fcmp-cnd.ll                   |  4 +-
> test/CodeGen/R600/fcmp-cnde-int-args.ll         |  2 +-
> test/CodeGen/R600/selectcc-icmp-select-float.ll |  2 +-
> test/CodeGen/R600/selectcc_cnde.ll              |  2 +-
> test/CodeGen/R600/selectcc_cnde_int.ll          |  2 +-
> 8 files changed, 117 insertions(+), 12 deletions(-)
> 
> diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
> index 75a2a90..dc2247a 100644
> --- a/lib/Target/AMDGPU/R600ISelLowering.cpp
> +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
> @@ -131,15 +131,66 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
>      }
> 
>    case AMDGPU::MOV_IMM_F32:
> -    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
> -                     MI->getOperand(1).getFPImm()->getValueAPF()
> -                         .bitcastToAPInt().getZExtValue());
> -    break;
>    case AMDGPU::MOV_IMM_I32:
> -    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
> -                     MI->getOperand(1).getImm());
> -    break;
> +    {
> +      unsigned DstReg = MI->getOperand(0).getReg();
> +      bool CanReplace = true;
> +      int64_t Imm;
> +      unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
> +      bool IsInlineConstant = false;
> +
> +      if (MI->getOpcode() == AMDGPU::MOV_IMM_F32) {
> +        APFloat Value = MI->getOperand(1).getFPImm()->getValueAPF();
> +        float FloatValue = Value.convertToFloat();
> +        if (FloatValue == 0.0f) {
> +          ImmReg = AMDGPU::ZERO;
> +          IsInlineConstant = true;
> +        } else if (FloatValue == 1.0f) {
> +          ImmReg = AMDGPU::ONE;
> +          IsInlineConstant = true;
> +        }
> +        Imm = Value.bitcastToAPInt().getZExtValue();
> +      } else {
> +        Imm = MI->getOperand(1).getImm();
> +        switch (Imm) {
> +        case 0:
> +          ImmReg = AMDGPU::ZERO;
> +          IsInlineConstant = true;
> +          break;
> +        case 1:
> +          ImmReg = AMDGPU::ONE_INT;
> +          IsInlineConstant = true;
> +          break;
> +        }
> +      }
> +
> +      // Check all the uses of DstReg to make sure it is safe to fold the
> +      // immediate.
> +      for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(DstReg);
> +                                   UI != MachineRegisterInfo::use_end(); ++UI) {

I think use_iterator is a bit tricky to use here; I was told by people on #llvm that use_iterator makes no guarantees about order and is not a def-use chain per se.
The use iterator returns every machine operand marked as "use" or "implicit use" by a machine instruction. For a virtual register that is defined only once, this matches the definition of a def-use chain, but not for a physical register. For instance, in the following block:

vreg1 = MOV T2_X (1)
...
T2_X = MOV vreg2 (2)
...
vreg3 = MOV T2_X (3)

the use iterator for T2_X will return the operands in instructions (1) and (3), even though the register is redefined in (2).
This situation can occur in our backend in the InstrEmitterPass, because we generate physical-register reads for some input-loading intrinsics and physical-register writes for some output-storing intrinsics.

I think using a COPY instruction here would be better, because we would let the register coalescer fold literals for us.
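Something along these lines for the inline-constant case (again a rough, untested sketch; it reuses BB, I, MI, TII, DstReg, ImmReg and Imm from the patch, and the exact insertion point is only an assumption):

if (IsInlineConstant) {
  // Copy from the inline-constant register (ZERO / ONE / ONE_INT) and let
  // the register coalescer fold it into the users.
  BuildMI(*BB, I, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), DstReg)
      .addReg(ImmReg);
} else {
  TII->buildMovImm(*BB, I, DstReg, Imm);
}

That would make the hand-written per-use rewriting unnecessary for inline constants; plain literals would still go through ALU_LITERAL_X and setImmOperand as in the patch.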

> 
> +        if (!TII->canUseImmediate(UI.getOperand()) &&
> +            !(IsInlineConstant && TII->isALUInstr((*UI).getOpcode()))) {
> +          CanReplace = false;
> +          break;
> +        }
> +      }
> +
> +      if (!CanReplace) {
> +        TII->buildMovImm(*BB, I, DstReg, Imm);
> +      } else {
> +        if (!IsInlineConstant) {
> +          // Add the immediate to the use instructions
> +          for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(DstReg);
> +                                     UI != MachineRegisterInfo::use_end(); ++UI) {
> +            TII->setImmOperand(&*UI, R600Operands::IMM, Imm);
> +          }
> +        }
> +        // Update the use's register
> +        MRI.replaceRegWith(DstReg, ImmReg);
> +      }
> +     break;
> +    }
> 
>    case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
>    case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
> diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
> index e437313..2ca5e45 100644
> --- a/lib/Target/AMDGPU/R600InstrInfo.cpp
> +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
> @@ -138,6 +138,15 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const
>    }
> }
> 
> +bool R600InstrInfo::isALUInstr(unsigned Opcode) const
> +{
> +  unsigned TargetFlags = get(Opcode).TSFlags;
> +
> +  return ((TargetFlags & R600_InstFlag::OP1) |
> +          (TargetFlags & R600_InstFlag::OP2) |
> +          (TargetFlags & R600_InstFlag::OP3));
> +}
> +
> DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
>      const ScheduleDAG *DAG) const
> {
> @@ -582,6 +591,37 @@ void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
>    MI->getOperand(Idx).setImm(Imm);
> }
> 
> +bool R600InstrInfo::operandUsesImmediate(const MachineInstr &MI,
> +                                         R600Operands::Ops Op) const
> +{
> +  int OpIndex = getOperandIdx(MI, Op);
> +  return OpIndex != -1 &&
> +         MI.getOperand(OpIndex).getReg() == AMDGPU::ALU_LITERAL_X;
> +}
> +
> +bool R600InstrInfo::canUseImmediate(const MachineOperand &MO) const
> +{
> +  const MachineInstr *MI = MO.getParent();
> +
> +  if (!isALUInstr(MI->getOpcode())) {
> +    return false;
> +  }
> +
> +  // XXX: Handle cases when an instruction can reference an immediate from
> +  // multiple operands:
> +  //
> +  // 1. Each operand uses the same immediate value
> +  // 2. There is room in the instruction group (MIBundle) for another immediate.
> +  //    We need to start using bundles first.
> +  if (operandUsesImmediate(*MI, R600Operands::SRC0) ||
> +      operandUsesImmediate(*MI, R600Operands::SRC1) ||
> +      operandUsesImmediate(*MI, R600Operands::SRC2)) {
> +    return false;
> +  }
> +
> +  return true;
> +}
> +
> //===----------------------------------------------------------------------===//
> // Instruction flag getters/setters
> //===----------------------------------------------------------------------===//
> diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
> index cec1c3b..7c62a53 100644
> --- a/lib/Target/AMDGPU/R600InstrInfo.h
> +++ b/lib/Target/AMDGPU/R600InstrInfo.h
> @@ -50,6 +50,9 @@ namespace llvm {
>    bool isReductionOp(unsigned opcode) const;
>    bool isCubeOp(unsigned opcode) const;
> 
> +  /// isALUInstr - Returns true if this Opcode represents an ALU instruction.
> +  bool isALUInstr(unsigned Opcode) const;
> +
>    /// isVector - Vector instructions are instructions that must fill all
>    /// instruction slots within an instruction group.
>    bool isVector(const MachineInstr &MI) const;
> @@ -133,6 +136,15 @@ namespace llvm {
>    /// setImmOperand - Helper function for setting instruction flag values.
>    void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
> 
> +  /// operandUsesImmediate - Return true if it is legal for this instruction
> +  /// to reference an immediate from one of its operands and the given operand
> +  /// is using an immediate.
> +  bool operandUsesImmediate(const MachineInstr &MI, R600Operands::Ops Op) const;
> +
> +  /// canUseImmediate - Return true if this operand can be replaced with an
> +  /// immediate.
> +  bool canUseImmediate(const MachineOperand &MO) const;
> +
>    ///hasFlagOperand - Returns true if this instruction has an operand for
>    /// storing target flags.
>    bool hasFlagOperand(const MachineInstr &MI) const;
> diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
> index c6b6236..a94cfb5 100644
> --- a/test/CodeGen/R600/fcmp-cnd.ll
> +++ b/test/CodeGen/R600/fcmp-cnd.ll
> @@ -1,6 +1,8 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> 
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;Not checking arguments 2 and 3 to CNDE, because they may change between
> +;registers and literal.x depending on what the optimizer does.
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> 
> define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
> entry:
> diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
> index 92f3b5f..5c981ef 100644
> --- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
> +++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
> @@ -4,7 +4,7 @@
> ; chance to optimize the fcmp + select instructions to CNDE was missed
> ; due to the fact that the operands to fcmp and select had different types
> 
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
> 
> define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
> entry:
> diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
> index f1f8ab1..f65a300 100644
> --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll
> +++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
> @@ -2,7 +2,7 @@
> 
> ; Note additional optimizations may cause this SGT to be replaced with a
> ; CND* instruction.
> -; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
> ; Test a selectcc with i32 LHS/RHS and float True/False
> 
> define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
> diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll
> index e06a170..f0a0f51 100644
> --- a/test/CodeGen/R600/selectcc_cnde.ll
> +++ b/test/CodeGen/R600/selectcc_cnde.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> 
> ;CHECK-NOT: SETE
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
> define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
>    %1 = load float addrspace(1)* %in
>    %2 = fcmp oeq float %1, 0.0
> diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll b/test/CodeGen/R600/selectcc_cnde_int.ll
> index 03d000f..b38078e 100644
> --- a/test/CodeGen/R600/selectcc_cnde_int.ll
> +++ b/test/CodeGen/R600/selectcc_cnde_int.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> 
> ;CHECK-NOT: SETE_INT
> -;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
> define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
>    %1 = load i32 addrspace(1)* %in
>    %2 = icmp eq i32 %1, 0
> -- 
> 1.7.11.4
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 