[Mesa-dev] [PATCH 2/2] R600: Proper insert S_WAITCNT instructions
Christian König
deathsimple at vodafone.de
Mon Dec 17 06:32:24 PST 2012
Some instructions like memory reads/writes are executed
asynchronously, so we need to insert S_WAITCNT instructions
to block before accessing their results. Previously we
just inserted an S_WAITCNT instruction after each async
instruction; this patch fixes that and adds a proper
insertion pass.
Signed-off-by: Christian König <deathsimple at vodafone.de>
---
lib/Target/AMDGPU/AMDGPU.h | 1 +
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +
lib/Target/AMDGPU/SIISelLowering.cpp | 12 -
lib/Target/AMDGPU/SIISelLowering.h | 5 -
lib/Target/AMDGPU/SIInsertWaits.cpp | 353 +++++++++++++++++++++++++++++
lib/Target/AMDGPU/SIInstrInfo.h | 4 +-
lib/Target/AMDGPU/SIInstrInfo.td | 30 +--
7 files changed, 378 insertions(+), 32 deletions(-)
create mode 100644 lib/Target/AMDGPU/SIInsertWaits.cpp
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 0f5125d..c75ec24 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -30,6 +30,7 @@ FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
+FunctionPass *createSIInsertWaits(TargetMachine &tm);
// Passes common to R600 and SI
Pass *createAMDGPUStructurizeCFGPass();
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 098d42e..1edef6f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -116,6 +116,11 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
}
bool AMDGPUPassConfig::addPostRegAlloc() {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ addPass(createSIInsertWaits(*TM));
+ }
return false;
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 435b0b3..11ba452 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -66,11 +66,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
MachineBasicBlock::iterator I = MI;
- if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
- AppendS_WAITCNT(MI, *BB, llvm::next(I));
- return BB;
- }
-
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@@ -141,13 +136,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
return BB;
}
-void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
- MachineBasicBlock::iterator I) const {
- BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
-}
-
-
void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index db36eef..8528c24 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -23,11 +23,6 @@ namespace llvm {
class SITargetLowering : public AMDGPUTargetLowering {
const SIInstrInfo * TII;
- /// Memory reads and writes are syncronized using the S_WAITCNT instruction.
- /// This function takes the most conservative approach and inserts an
- /// S_WAITCNT instruction after every read and write.
- void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
- MachineBasicBlock::iterator I) const;
void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
MachineBasicBlock::iterator I, unsigned Opocde) const;
void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
new file mode 100644
index 0000000..24fc929
--- /dev/null
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -0,0 +1,353 @@
+//===-- SIInsertWaits.cpp - Insert S_WAITCNT instructions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+///
+/// The three values can be accessed by name through \c Named or by index
+/// through \c Array (0 = VM, 1 = EXP, 2 = LGKM), which lets the pass loop
+/// over all counters uniformly.
+typedef union {
+ struct {
+ unsigned VM;
+ unsigned EXP;
+ unsigned LGKM;
+ } Named;
+ unsigned Array[3];
+
+} Counters;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+/// \brief Machine function pass that inserts S_WAITCNT instructions.
+///
+/// The pass walks the instructions in order and keeps track of how many
+/// asynchronous operations have been issued per hardware counter
+/// (LastIssued), how far we have already waited (WaitedOn) and which
+/// registers are still read or written asynchronously (UsedRegs and
+/// DefinedRegs). A wait is inserted in front of every instruction that
+/// touches such a register and at the end of every basic block.
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo &TRI;
+  const MachineRegisterInfo *MRI;
+
+  /// \brief Constant hardware limits
+  static const Counters WaitCounts;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  ///
+  /// Bit mask: 1 is set for EXP instructions, 2 for every other
+  /// instruction that increments EXP_CNT.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Get increment/decrement amount for this instruction.
+  Counters getHwCounts(MachineInstr &MI);
+
+  /// \brief Is operand relevant for async execution?
+  bool isOpRelevant(MachineOperand &Op);
+
+  /// \brief Get register interval an operand affects.
+  RegInterval getRegInterval(MachineOperand &Op);
+
+  /// \brief Handle instructions async components
+  void pushInstruction(MachineInstr &MI);
+
+  /// \brief Insert the actual wait instruction
+  bool insertWait(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  const Counters &Counts);
+
+  /// \brief Resolve all operand dependencies to counter requirements
+  Counters handleOperands(MachineInstr &MI);
+
+public:
+  /// \brief Set up the pass for the instruction info of \p tm.
+  ///
+  /// MRI and ExpInstrTypesSeen are given defined start values here so that
+  /// they are never read uninitialized; the counter state itself is reset
+  /// in runOnMachineFunction.
+  SIInsertWaits(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
+    TRI(TII->getRegisterInfo()),
+    MRI(0),
+    ExpInstrTypesSeen(0) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI insert wait instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIInsertWaits::ID = 0;
+
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+/// \brief Create a new instance of the S_WAITCNT insertion pass for \p tm.
+FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
+ return new SIInsertWaits(tm);
+}
+
+/// \brief Get increment/decrement amount for this instruction.
+///
+/// Uses the TSFlags of \p MI to determine by how much each of the three
+/// hardware counters is incremented when the instruction is issued.
+///
+/// \returns the per counter increments, all zero for synchronous
+/// instructions.
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+
+  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
+  Counters Result;
+
+  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
+
+  // Only consider stores or EXP for EXP_CNT. The flag alone is not enough,
+  // so instructions other than EXP must also be marked as mayStore().
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
+      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+
+  // LGKM may use larger values
+  if (TSFlags & SIInstrFlags::LGKM_CNT) {
+
+    MachineOperand &Op = MI.getOperand(0);
+    assert(Op.isReg() && "First LGKM operand must be a register!");
+
+    // Results larger than 4 bytes count twice
+    unsigned Reg = Op.getReg();
+    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+    Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+  } else {
+    Result.Named.LGKM = 0;
+  }
+
+  return Result;
+}
+
+/// \brief Is operand relevant for async execution?
+///
+/// \returns true if \p Op is a register that its instruction may still
+/// access after it has been issued.
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+
+  // Anything that isn't a register never needs tracking
+  if (!Op.isReg())
+    return false;
+
+  // Every register written by the instruction is tracked
+  if (Op.isDef())
+    return true;
+
+  MachineInstr &Parent = *Op.getParent();
+
+  // An export keeps reading all of its registers
+  if (Parent.getOpcode() == AMDGPU::EXP)
+    return true;
+
+  // Besides exports only stores have a relevant source register
+  if (!Parent.getDesc().mayStore())
+    return false;
+
+  // The stored value is the first register use of the instruction,
+  // so look it up and compare it with the operand in question
+  MachineInstr::mop_iterator It = Parent.operands_begin();
+  MachineInstr::mop_iterator End = Parent.operands_end();
+  while (It != End && !(It->isReg() && It->isUse()))
+    ++It;
+
+  return It != End && Op.isIdenticalTo(*It);
+}
+
+/// \brief Get register interval an operand affects.
+///
+/// \returns the half open range [first, second) of slots covered by the
+/// register of \p Op, or the empty range for non register operands.
+RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
+
+  // Non register operands don't cover anything
+  if (!Op.isReg())
+    return RegInterval(0, 0);
+
+  const unsigned PhysReg = Op.getReg();
+  const unsigned SizeInBytes = TRI.getMinimalPhysRegClass(PhysReg)->getSize();
+
+  assert(SizeInBytes >= 4);
+
+  // One slot for every 4 bytes, starting at the hardware encoding
+  const unsigned First = TRI.getEncodingValue(PhysReg);
+  return RegInterval(First, First + SizeInBytes / 4);
+}
+
+/// \brief Handle instructions async components
+///
+/// Advances LastIssued by the counter increments of \p MI and, if the
+/// instruction is asynchronous, stamps every relevant register with the
+/// new counter values.
+void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+
+  const Counters Delta = getHwCounts(MI);
+
+  // Advance the issue counters and note if any of them moved at all
+  bool IsAsync = false;
+  for (unsigned Cnt = 0; Cnt != 3; ++Cnt) {
+    LastIssued.Array[Cnt] += Delta.Array[Cnt];
+    IsAsync |= Delta.Array[Cnt] != 0;
+  }
+
+  // Synchronous instructions leave no registers behind
+  if (!IsAsync)
+    return;
+
+  // Keep track of the kinds of instructions counted by EXP_CNT
+  if (Delta.Named.EXP) {
+    if (MI.getOpcode() == AMDGPU::EXP)
+      ExpInstrTypesSeen |= 1;
+    else
+      ExpInstrTypesSeen |= 2;
+  }
+
+  for (unsigned OpIdx = 0, NumOps = MI.getNumOperands();
+       OpIdx != NumOps; ++OpIdx) {
+
+    MachineOperand &Op = MI.getOperand(OpIdx);
+    if (!isOpRelevant(Op))
+      continue;
+
+    // Relevant operands are always registers, so each one is either
+    // a definition or a use
+    RegCounters &Tracked = Op.isDef() ? DefinedRegs : UsedRegs;
+
+    const RegInterval Range = getRegInterval(Op);
+    for (unsigned Slot = Range.first; Slot < Range.second; ++Slot)
+      Tracked[Slot] = LastIssued;
+  }
+}
+
+/// \brief Insert the actual wait instruction
+///
+/// Emits an S_WAITCNT in front of \p I if at least one counter value in
+/// \p Required has not been waited on yet and updates WaitedOn to match.
+///
+/// \returns true if a wait instruction was inserted.
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const Counters &Required) {
+
+  // End of program? No need to wait on anything
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+    return false;
+
+  // Figure out if the async instructions execute in order
+  bool Ordered[3];
+
+  // VM_CNT is always ordered
+  Ordered[0] = true;
+
+  // EXP_CNT is unordered if we have both EXP & VM-writes, e.g. when both
+  // bits of ExpInstrTypesSeen are set. With only one type outstanding
+  // the instructions complete in order.
+  Ordered[1] = ExpInstrTypesSeen != 3;
+
+  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+  Ordered[2] = false;
+
+  // The values we are going to put into the S_WAITCNT instruction,
+  // the hardware limit means "don't wait" for that counter
+  Counters Counts = WaitCounts;
+
+  // Do we really need to wait?
+  bool NeedWait = false;
+
+  for (unsigned i = 0; i < 3; ++i) {
+
+    if (Required.Array[i] <= WaitedOn.Array[i])
+      continue;
+
+    NeedWait = true;
+
+    if (Ordered[i]) {
+      // Everything issued after the required one may stay outstanding
+      unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+      // adjust the value to the real hardware possibilities
+      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
+
+    } else
+      Counts.Array[i] = 0;
+
+    // Remember on what we have waited on
+    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+  }
+
+  if (!NeedWait)
+    return false;
+
+  // Reset EXP_CNT instruction types, nothing is outstanding any more
+  if (Counts.Named.EXP == 0)
+    ExpInstrTypesSeen = 0;
+
+  // Build the wait instruction
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+          .addImm((Counts.Named.VM & 0xF) |
+                  ((Counts.Named.EXP & 0x7) << 4) |
+                  ((Counts.Named.LGKM & 0x7) << 8));
+
+  return true;
+}
+
+/// \brief Raise every counter in \p Dst to at least the value in \p Src
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+  for (unsigned Cnt = 0; Cnt != 3; ++Cnt) {
+    if (Dst.Array[Cnt] < Src.Array[Cnt])
+      Dst.Array[Cnt] = Src.Array[Cnt];
+  }
+}
+
+/// \brief Resolve all operand dependencies to counter requirements
+///
+/// \returns for each hardware counter the highest value that must have
+/// completed before \p MI may touch its register operands.
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+  Counters Needed = ZeroCounts;
+
+  for (unsigned OpIdx = 0, NumOps = MI.getNumOperands();
+       OpIdx != NumOps; ++OpIdx) {
+
+    MachineOperand &Op = MI.getOperand(OpIdx);
+    const RegInterval Range = getRegInterval(Op);
+
+    // The range is empty for anything that isn't a register
+    for (unsigned Slot = Range.first; Slot < Range.second; ++Slot) {
+
+      // Overwriting a register conflicts with async reads of it,
+      // reading a register conflicts with async writes to it
+      const RegCounters &Pending = Op.isDef() ? UsedRegs : DefinedRegs;
+      increaseCounters(Needed, Pending[Slot]);
+    }
+  }
+
+  return Needed;
+}
+
+/// \brief Insert the needed S_WAITCNT instructions into \p MF.
+///
+/// All tracking state is reset first, then every instruction gets a wait
+/// in front of it if its operands require one, and every basic block gets
+/// a wait for all outstanding operations in front of its terminators.
+///
+/// \returns true if any instruction was inserted.
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+
+  bool Changes = false;
+
+  MRI = &MF.getRegInfo();
+
+  // Start every function with a clean state, including the EXP_CNT
+  // instruction types which insertWait and pushInstruction read
+  WaitedOn = ZeroCounts;
+  LastIssued = ZeroCounts;
+  ExpInstrTypesSeen = 0;
+
+  memset(&UsedRegs, 0, sizeof(UsedRegs));
+  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+
+      // Wait for the operands first, then account for the instruction
+      Changes |= insertWait(MBB, I, handleOperands(*I));
+      pushInstruction(*I);
+    }
+
+    // Wait for everything at the end of the MBB
+    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+  }
+
+  return Changes;
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 631f6c0..783cd9f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -55,7 +55,9 @@ public:
namespace SIInstrFlags {
enum Flags {
// First 4 bits are the instruction encoding
- NEED_WAIT = 1 << 4
+ VM_CNT = 1 << 4,
+ EXP_CNT = 1 << 5,
+ LGKM_CNT = 1 << 6
};
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 873a451..8ff2d6d 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -42,11 +42,14 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
AMDGPUInst<outs, ins, asm, pattern> {
field bits<4> EncodingType = 0;
- field bits<1> NeedWait = 0;
+ field bits<1> VM_CNT = 0;
+ field bits<1> EXP_CNT = 0;
+ field bits<1> LGKM_CNT = 0;
let TSFlags{3-0} = EncodingType;
- let TSFlags{4} = NeedWait;
-
+ let TSFlags{4} = VM_CNT;
+ let TSFlags{5} = EXP_CNT;
+ let TSFlags{6} = LGKM_CNT;
}
class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -140,8 +143,7 @@ def EXP : Enc64<
let Inst{63-56} = VSRC3;
let EncodingType = 0; //SIInstrEncodingType::EXP
- let NeedWait = 1;
- let usesCustomInserter = 1;
+ let EXP_CNT = 1;
}
class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -174,11 +176,10 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{47-40} = VDATA;
let Inst{52-48} = SRSRC;
let Inst{57-53} = SSAMP;
-
let EncodingType = 2; //SIInstrEncodingType::MIMG
- let NeedWait = 1;
- let usesCustomInserter = 1;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
}
class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -215,8 +216,9 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{63-56} = SOFFSET;
let EncodingType = 3; //SIInstrEncodingType::MTBUF
- let NeedWait = 1;
- let usesCustomInserter = 1;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+
let neverHasSideEffects = 1;
}
@@ -252,8 +254,9 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{63-56} = SOFFSET;
let EncodingType = 4; //SIInstrEncodingType::MUBUF
- let NeedWait = 1;
- let usesCustomInserter = 1;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+
let neverHasSideEffects = 1;
}
@@ -276,8 +279,7 @@ class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{31-27} = 0x18; //encoding
let EncodingType = 5; //SIInstrEncodingType::SMRD
- let NeedWait = 1;
- let usesCustomInserter = 1;
+ let LGKM_CNT = 1;
}
class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
--
1.7.9.5
More information about the mesa-dev
mailing list