[Mesa-dev] [PATCH 4/4] R600: Fold immediates into ALU instructions when possible

Fri Nov 9 12:47:37 PST 2012

From: Tom Stellard <thomas.stellard at amd.com>

---
 lib/Target/AMDGPU/R600ISelLowering.cpp          | 65 ++++++++++++++++++++++---
 lib/Target/AMDGPU/R600InstrInfo.cpp             | 40 +++++++++++++++
 lib/Target/AMDGPU/R600InstrInfo.h               | 12 +++++
 test/CodeGen/R600/fcmp-cnd.ll                   |  4 +-
 test/CodeGen/R600/fcmp-cnde-int-args.ll         |  2 +-
 test/CodeGen/R600/selectcc-icmp-select-float.ll |  2 +-
 test/CodeGen/R600/selectcc_cnde.ll              |  2 +-
 test/CodeGen/R600/selectcc_cnde_int.ll          |  2 +-
 8 files changed, 117 insertions(+), 12 deletions(-)

diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 75a2a90..dc2247a 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -131,15 +131,66 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     }
 
   case AMDGPU::MOV_IMM_F32:
-    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
-                     MI->getOperand(1).getFPImm()->getValueAPF()
-                         .bitcastToAPInt().getZExtValue());
-    break;
   case AMDGPU::MOV_IMM_I32:
-    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
-                     MI->getOperand(1).getImm());
-    break;
+    {
+      unsigned DstReg = MI->getOperand(0).getReg();
+      bool CanReplace = true;
+      int64_t Imm;
+      unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+      bool IsInlineConstant = false;
+
+      if (MI->getOpcode() == AMDGPU::MOV_IMM_F32) {
+        APFloat Value = MI->getOperand(1).getFPImm()->getValueAPF();
+        float FloatValue = Value.convertToFloat();
+        if (FloatValue == 0.0f) {
+          ImmReg = AMDGPU::ZERO;
+          IsInlineConstant = true;
+        } else if (FloatValue == 1.0f) {
+          ImmReg = AMDGPU::ONE;
+          IsInlineConstant = true;
+        }
+        Imm = Value.bitcastToAPInt().getZExtValue();
+      } else {
+        Imm = MI->getOperand(1).getImm();
+        switch (Imm) {
+        case 0:
+          ImmReg = AMDGPU::ZERO;
+          IsInlineConstant = true;
+          break;
+        case 1:
+          ImmReg = AMDGPU::ONE_INT;
+          IsInlineConstant = true;
+          break;
+        }
+      }
+
+      // Check all the uses of DstReg to make sure it is safe to fold the
+      // immediate.
+      for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(DstReg);
+                                   UI != MachineRegisterInfo::use_end(); ++UI) {
 
+        if (!TII->canUseImmediate(UI.getOperand()) &&
+            !(IsInlineConstant && TII->isALUInstr((*UI).getOpcode()))) {
+          CanReplace = false;
+          break;
+        }
+      }
+
+      if (!CanReplace) {
+        TII->buildMovImm(*BB, I, DstReg, Imm);
+      } else {
+        if (!IsInlineConstant) {
+          // Add the immediate to the use instructions
+          for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(DstReg);
+                                     UI != MachineRegisterInfo::use_end(); ++UI) {
+            TII->setImmOperand(&*UI, R600Operands::IMM, Imm);
+          }
+        }
+        // Update the use's register
+        MRI.replaceRegWith(DstReg, ImmReg);
+      }
+     break;
+    }
 
   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
   case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index e437313..2ca5e45 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -138,6 +138,15 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const
   }
 }
 
+bool R600InstrInfo::isALUInstr(unsigned Opcode) const
+{
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::OP1) |
+          (TargetFlags & R600_InstFlag::OP2) |
+          (TargetFlags & R600_InstFlag::OP3));
+}
+
 DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
     const ScheduleDAG *DAG) const
 {
@@ -582,6 +591,37 @@ void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
   MI->getOperand(Idx).setImm(Imm);
 }
 
+bool R600InstrInfo::operandUsesImmediate(const MachineInstr &MI,
+                                         R600Operands::Ops Op) const
+{
+  int OpIndex = getOperandIdx(MI, Op);
+  return OpIndex != -1 &&
+         MI.getOperand(OpIndex).getReg() == AMDGPU::ALU_LITERAL_X;
+}
+
+bool R600InstrInfo::canUseImmediate(const MachineOperand &MO) const
+{
+  const MachineInstr *MI = MO.getParent();
+
+  if (!isALUInstr(MI->getOpcode())) {
+    return false;
+  }
+
+  // XXX: Handle cases when an instruction can reference an immediate from
+  // multiple operands:
+  //
+  // 1. Each operand uses the same immediate value
+  // 2. There is room in the instruction group (MIBundle) for another immediate.
+  //    We need to start using bundles first.
+  if (operandUsesImmediate(*MI, R600Operands::SRC0) ||
+      operandUsesImmediate(*MI, R600Operands::SRC1) ||
+      operandUsesImmediate(*MI, R600Operands::SRC2)) {
+    return false;
+  }
+
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction flag getters/setters
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index cec1c3b..7c62a53 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -50,6 +50,9 @@ namespace llvm {
   bool isReductionOp(unsigned opcode) const;
   bool isCubeOp(unsigned opcode) const;
 
+  /// isALUInstr - Returns true if this Opcode represents an ALU instruction.
+  bool isALUInstr(unsigned Opcode) const;
+
   /// isVector - Vector instructions are instructions that must fill all
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
@@ -133,6 +136,15 @@ namespace llvm {
   /// setImmOperand - Helper function for setting instruction flag values.
   void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
 
+  /// operandUsesImmediate - Return true if it is legal for this instruction
+  /// to reference an immediate from one of its opreand and the given operand
+  /// is using an immediate.
+  bool operandUsesImmediate(const MachineInstr &MI, R600Operands::Ops Op) const;
+
+  /// canUseImmediate - Return true if this operand can be replaced with an
+  /// immediate.
+  bool canUseImmediate(const MachineOperand &MO) const;
+
   ///hasFlagOperand - Returns true if this instruction has an operand for
   /// storing target flags.
   bool hasFlagOperand(const MachineInstr &MI) const;
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
index c6b6236..a94cfb5 100644
--- a/test/CodeGen/R600/fcmp-cnd.ll
+++ b/test/CodeGen/R600/fcmp-cnd.ll
@@ -1,6 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;Not checking arguments 2 and 3 to CNDE, because they may change between
+;registers and literal.x depending on what the optimizer does.
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
index 92f3b5f..5c981ef 100644
--- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
@@ -4,7 +4,7 @@
 ; chance to optimize the fcmp + select instructions to CNDE was missed
 ; due to the fact that the operands to fcmp and select had different types
 
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
index f1f8ab1..f65a300 100644
--- a/test/CodeGen/R600/selectcc-icmp-select-float.ll
+++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
@@ -2,7 +2,7 @@
 
 ; Note additional optimizations may cause this SGT to be replaced with a
 ; CND* instruction.
-; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
 ; Test a selectcc with i32 LHS/RHS and float True/False
 
 define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll
index e06a170..f0a0f51 100644
--- a/test/CodeGen/R600/selectcc_cnde.ll
+++ b/test/CodeGen/R600/selectcc_cnde.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK-NOT: SETE
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
 define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
   %1 = load float addrspace(1)* %in
   %2 = fcmp oeq float %1, 0.0
diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll b/test/CodeGen/R600/selectcc_cnde_int.ll
index 03d000f..b38078e 100644
--- a/test/CodeGen/R600/selectcc_cnde_int.ll
+++ b/test/CodeGen/R600/selectcc_cnde_int.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK-NOT: SETE_INT
-;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %1 = load i32 addrspace(1)* %in
   %2 = icmp eq i32 %1, 0
-- 
1.7.11.4