[Mesa-dev] [PATCH 5/5] R600: Fold immediates into ALU instructions when possible v2

Wed Nov 28 14:50:11 PST 2012

From: Tom Stellard <thomas.stellard at amd.com>

v2:
  - Fold the immediates using the SelectionDAG
---
 lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp         | 94 +++++++++++++++++++++++++
 lib/Target/AMDGPU/R600InstrInfo.cpp             | 16 ++++-
 lib/Target/AMDGPU/R600InstrInfo.h               |  7 ++
 test/CodeGen/R600/fcmp-cnd.ll                   |  4 +-
 test/CodeGen/R600/fcmp-cnde-int-args.ll         |  2 +-
 test/CodeGen/R600/literals.ll                   | 30 ++++++++
 test/CodeGen/R600/selectcc-icmp-select-float.ll |  2 +-
 test/CodeGen/R600/selectcc_cnde.ll              |  2 +-
 test/CodeGen/R600/selectcc_cnde_int.ll          |  2 +-
 9 files changed, 153 insertions(+), 6 deletions(-)
 create mode 100644 test/CodeGen/R600/literals.ll

diff --git a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
index 10ce6ad..2a80f1b 100644
--- a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPURegisterInfo.h"
 #include "AMDILDevices.h"
+#include "R600InstrInfo.h"
 #include "llvm/ADT/ValueMap.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
@@ -167,6 +168,99 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       }
     }
     break;
+  case ISD::ConstantFP:
+  case ISD::Constant:
+    {
+      // Fold the constant N into its users (inline constant or literal).
+      const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+      // XXX: Custom immediate lowering not implemented yet.  Instead we use
+      // pseudo instructions defined in SIInstructions.td
+      if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+        break;
+      }
+      const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+
+      uint64_t ImmValue = 0;
+      unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+
+      if (N->getOpcode() == ISD::ConstantFP) {
+        // XXX: 64-bit Immediates not supported yet
+        assert(N->getValueType(0) != MVT::f64);
+        ConstantFPSDNode *C = cast<ConstantFPSDNode>(N);
+        APFloat Value = C->getValueAPF();
+        float FloatValue = Value.convertToFloat();
+        // -0.0 == 0.0 is true, but ZERO (shared with integer 0) is +0.0.
+        if (FloatValue == 0.0 && !Value.isNegative()) {
+          ImmReg = AMDGPU::ZERO;
+        } else if (FloatValue == 0.5) {
+          ImmReg = AMDGPU::HALF;
+        } else if (FloatValue == 1.0) {
+          ImmReg = AMDGPU::ONE;
+        } else {
+          ImmValue = Value.bitcastToAPInt().getZExtValue();
+        }
+      } else {
+        // XXX: 64-bit Immediates not supported yet
+        assert(N->getValueType(0) != MVT::i64);
+        ConstantSDNode *C = cast<ConstantSDNode>(N);
+        if (C->getZExtValue() == 0) {
+          ImmReg = AMDGPU::ZERO;
+        } else if (C->getZExtValue() == 1) {
+          ImmReg = AMDGPU::ONE_INT;
+        } else {
+          ImmValue = C->getZExtValue();
+        }
+      }
+
+      // UpdateNodeOperands() moves the rewritten operand off N's use list,
+      // so step to the next use before the current one is modified.
+      for (SDNode::use_iterator Use = N->use_begin(), Next = Use,
+                                E = SDNode::use_end(); Use != E; Use = Next) {
+        ++Next;
+        std::vector<SDValue> Ops(Use->op_begin(), Use->op_end());
+
+        if (!Use->isMachineOpcode()) {
+          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+            // Only machine opcodes have the immediate operand a literal needs;
+            // inline constants (AMDGPU::ZERO, AMDGPU::ONE, etc) work anywhere.
+            continue;
+          }
+        } else {
+          if (!TII->isALUInstr(Use->getMachineOpcode())) {
+            continue;
+          }
+
+          int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
+          assert(ImmIdx != -1);
+
+          // subtract one from ImmIdx, because the DST operand is usually index
+          // 0 for MachineInstrs, but we have no DST in the Ops vector.
+          ImmIdx--;
+
+          // Check that we aren't already using an immediate.
+          // XXX: It's possible for an instruction to have more than one
+          // immediate operand, but this is not supported yet.
+          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+            ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
+            assert(C);
+
+            if (C->getZExtValue() != 0) {
+              // This instruction is already using an immediate.
+              continue;
+            }
+
+            // Set the immediate value
+            Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
+          }
+        }
+        // Set the immediate register
+        Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
+
+        CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
+      }
+      break;
+    }
+
   }
   return SelectCode(N);
 }
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 20b1aa3..814e0a2 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -127,6 +127,15 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
   }
 }
 
+// An opcode is an ALU instruction when its TSFlags carry any of the OP1, OP2
+// or OP3 instruction flags.
+bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
+  const unsigned Flags = get(Opcode).TSFlags;
+  const unsigned ALUMask = R600_InstFlag::OP1 | R600_InstFlag::OP2 |
+                           R600_InstFlag::OP3;
+  return (Flags & ALUMask) != 0;
+}
+
 DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
     const ScheduleDAG *DAG) const {
   const InstrItineraryData *II = TM->getInstrItineraryData();
@@ -505,6 +514,11 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
 
 int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
                                  R600Operands::Ops Op) const {
+  return getOperandIdx(MI.getOpcode(), Op);
+}
+
+int R600InstrInfo::getOperandIdx(unsigned Opcode,
+                                 R600Operands::Ops Op) const {
   const static int OpTable[3][R600Operands::COUNT] = {
 //            W        C     S  S  S     S  S  S     S  S
 //            R  O  D  L  S  R  R  R  S  R  R  R  S  R  R  L  P
@@ -515,7 +529,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
     {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,-1,-1,-1,13,14,15,16,17},
     {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14}
   };
-  unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  unsigned TargetFlags = get(Opcode).TSFlags;
   unsigned OpTableIdx;
 
   if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index cec1c3b..81e1828 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -50,6 +50,9 @@ namespace llvm {
   bool isReductionOp(unsigned opcode) const;
   bool isCubeOp(unsigned opcode) const;
 
+  /// isALUInstr - Returns true if this Opcode represents an ALU instruction.
+  bool isALUInstr(unsigned Opcode) const;
+
   /// isVector - Vector instructions are instructions that must fill all
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
@@ -130,6 +133,10 @@ namespace llvm {
   /// if the Instruction does not contain the specified Op.
   int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
 
+  /// getOperandIdx - Get the index of Op for the given Opcode.  Returns -1
+  /// if the Instruction does not contain the specified Op.
+  int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
+
   /// setImmOperand - Helper function for setting instruction flag values.
   void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
 
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
index c6b6236..a94cfb5 100644
--- a/test/CodeGen/R600/fcmp-cnd.ll
+++ b/test/CodeGen/R600/fcmp-cnd.ll
@@ -1,6 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;Not checking arguments 2 and 3 to CNDE, because they may change between
+;registers and literal.x depending on what the optimizer does.
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
index 92f3b5f..5c981ef 100644
--- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
@@ -4,7 +4,7 @@
 ; chance to optimize the fcmp + select instructions to CNDE was missed
 ; due to the fact that the operands to fcmp and select had different types
 
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
new file mode 100644
index 0000000..4c731b2
--- /dev/null
+++ b/test/CodeGen/R600/literals.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Test that an i32 constant is folded into the ADD_INT as a literal operand.
+; Either source operand may become the literal, so the generated ASM is:
+; ADD_INT REG literal.x, 5
+; or
+; ADD_INT literal.x REG, 5
+
+; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
+define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 5, %in
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; Test that a float constant is folded into the ADD as a literal operand.
+; Either source operand may become the literal, so the generated ASM is:
+; ADD REG literal.x, 5.0
+; or
+; ADD literal.x REG, 5.0
+
+; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
+define void @float_literal(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = fadd float 5.0, %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
index f1f8ab1..f65a300 100644
--- a/test/CodeGen/R600/selectcc-icmp-select-float.ll
+++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
@@ -2,7 +2,7 @@
 
 ; Note additional optimizations may cause this SGT to be replaced with a
 ; CND* instruction.
-; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
 ; Test a selectcc with i32 LHS/RHS and float True/False
 
 define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll
index e06a170..f0a0f51 100644
--- a/test/CodeGen/R600/selectcc_cnde.ll
+++ b/test/CodeGen/R600/selectcc_cnde.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK-NOT: SETE
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
 define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
   %1 = load float addrspace(1)* %in
   %2 = fcmp oeq float %1, 0.0
diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll b/test/CodeGen/R600/selectcc_cnde_int.ll
index 03d000f..b38078e 100644
--- a/test/CodeGen/R600/selectcc_cnde_int.ll
+++ b/test/CodeGen/R600/selectcc_cnde_int.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK-NOT: SETE_INT
-;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %1 = load i32 addrspace(1)* %in
   %2 = icmp eq i32 %1, 0
-- 
1.7.11.4