[Mesa-dev] [PATCH] AMDGPU: s/flow control/control flow/g in SI code.
Christian König
deathsimple at vodafone.de
Fri Nov 9 08:02:20 PST 2012
On 09.11.2012 16:13, Michel Dänzer wrote:
> From: Michel Dänzer <michel.daenzer at amd.com>
>
>
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
Reviewed-by: Christian König <christian.koenig at amd.com>
> ---
>
> This patch applies on top of Christian's SGPR liveness patch.
>
> lib/Target/AMDGPU/AMDGPU.h | 2 +-
> lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
> lib/Target/AMDGPU/CMakeLists.txt | 2 +-
> lib/Target/AMDGPU/SIFixSGPRLiveness.cpp | 2 +-
> lib/Target/AMDGPU/SILowerControlFlow.cpp | 193 +++++++++++++++++++++++++++++
> lib/Target/AMDGPU/SILowerFlowControl.cpp | 193 -----------------------------
> 6 files changed, 197 insertions(+), 197 deletions(-)
> create mode 100644 lib/Target/AMDGPU/SILowerControlFlow.cpp
> delete mode 100644 lib/Target/AMDGPU/SILowerFlowControl.cpp
>
> diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
> index 33a74dc..2a06ade 100644
> --- a/lib/Target/AMDGPU/AMDGPU.h
> +++ b/lib/Target/AMDGPU/AMDGPU.h
> @@ -25,7 +25,7 @@ FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
>
> // SI Passes
> FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
> -FunctionPass *createSILowerFlowControlPass(TargetMachine &tm);
> +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
> FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
> FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
> FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm);
> diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> index 024ff3d..5c4af91 100644
> --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> @@ -137,7 +137,7 @@ bool AMDGPUPassConfig::addPreEmitPass() {
> addPass(&FinalizeMachineBundlesID);
> } else {
> addPass(createSILowerLiteralConstantsPass(*TM));
> - addPass(createSILowerFlowControlPass(*TM));
> + addPass(createSILowerControlFlowPass(*TM));
> }
>
> return false;
> diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
> index 5e013f6..cd3f174 100644
> --- a/lib/Target/AMDGPU/CMakeLists.txt
> +++ b/lib/Target/AMDGPU/CMakeLists.txt
> @@ -41,7 +41,7 @@ add_llvm_target(AMDGPUCodeGen
> SIInstrInfo.cpp
> SIISelLowering.cpp
> SILowerLiteralConstants.cpp
> - SILowerFlowControl.cpp
> + SILowerControlFlow.cpp
> SIMachineFunctionInfo.cpp
> SIRegisterInfo.cpp
> SIPreColorSGPRs.cpp
> diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
> index 028753e..f92eff5 100644
> --- a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
> +++ b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
> @@ -7,7 +7,7 @@
> //
> //===----------------------------------------------------------------------===//
> //
> -// SGPRs are not affected by flow control. This pass adjust SGPR liveness in
> +// SGPRs are not affected by control flow. This pass adjusts SGPR liveness
> // so that the register allocator can still correctly allocate them.
> //
> //===----------------------------------------------------------------------===//
> diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
> new file mode 100644
> index 0000000..b43fdeb
> --- /dev/null
> +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
> @@ -0,0 +1,193 @@
> +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF)
> +// to predicated instructions.
> +//
> +// All control flow (except loops) is handled using predicated instructions and
> +// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
> +// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
> +// by writing to the 64-bit EXEC register (each bit corresponds to a
> +// single vector ALU). Typically, for predicates, a vector ALU will write
> +// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
> +// Vector ALU) and then the Scalar ALU will AND the VCC register with the
> +// EXEC to update the predicates.
> +//
> +// For example:
> +// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
> +// SI_IF_NZ %VCC
> +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
> +// ELSE
> +// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
> +// ENDIF
> +//
> +// becomes:
> +//
> +// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask
> +// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask
> +// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
> +// S_CBRANCH_EXECZ label0 // This instruction is an
> +// // optimization which allows us to
> +// // branch if all the bits of
> +// // EXEC are zero.
> +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
> +//
> +// label0:
> +// %SGPR2 = S_MOV_B64 %EXEC // Save the current exec mask
> +// %EXEC = S_MOV_B64 %SGPR0 // Restore the exec mask for the Else block
> +// %SGPR0 = S_MOV_B64 %SGPR2 // Save the exec mask from the If block
> +// S_CBRANCH_EXECZ label1 // Use our branch optimization
> +// // instruction again.
> +// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
> +// label1:
> +// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
> +//===----------------------------------------------------------------------===//
> +
> +#include "AMDGPU.h"
> +#include "SIInstrInfo.h"
> +#include "SIMachineFunctionInfo.h"
> +#include "llvm/CodeGen/MachineFunction.h"
> +#include "llvm/CodeGen/MachineFunctionPass.h"
> +#include "llvm/CodeGen/MachineInstrBuilder.h"
> +#include "llvm/CodeGen/MachineRegisterInfo.h"
> +
> +using namespace llvm;
> +
> +namespace {
> +
> +class SILowerControlFlowPass : public MachineFunctionPass {
> +
> +private:
> + static char ID;
> + const TargetInstrInfo *TII;
> + std::vector<unsigned> PredicateStack;
> + std::vector<unsigned> UnusedRegisters;
> +
> + void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> + void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> +
> +public:
> + SILowerControlFlowPass(TargetMachine &tm) :
> + MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
> +
> + virtual bool runOnMachineFunction(MachineFunction &MF);
> +
> + const char *getPassName() const {
> + return "SI Lower control flow instructions";
> + }
> +
> +};
> +
> +} // End anonymous namespace
> +
> +char SILowerControlFlowPass::ID = 0;
> +
> +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
> + return new SILowerControlFlowPass(tm);
> +}
> +
> +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
> +
> + // Find all the unused registers that can be used for the predicate stack.
> + for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
> + S = AMDGPU::SReg_64RegClass.end();
> + I != S; ++I) {
> + unsigned Reg = *I;
> + if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
> + UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
> + }
> + }
> +
> + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
> + BB != BB_E; ++BB) {
> + MachineBasicBlock &MBB = *BB;
> + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
> + I != MBB.end(); I = Next) {
> + Next = llvm::next(I);
> + MachineInstr &MI = *I;
> + switch (MI.getOpcode()) {
> + default: break;
> + case AMDGPU::SI_IF_NZ:
> + pushExecMask(MBB, I);
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
> + AMDGPU::EXEC)
> + .addOperand(MI.getOperand(0)) // VCC
> + .addReg(AMDGPU::EXEC);
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
> + PredicateStack.back())
> + .addReg(PredicateStack.back())
> + .addReg(AMDGPU::EXEC);
> + MI.eraseFromParent();
> + break;
> + case AMDGPU::ELSE:
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> + UnusedRegisters.back())
> + .addReg(AMDGPU::EXEC);
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> + AMDGPU::EXEC)
> + .addReg(PredicateStack.back());
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> + PredicateStack.back())
> + .addReg(UnusedRegisters.back());
> + MI.eraseFromParent();
> + break;
> + case AMDGPU::ENDIF:
> + popExecMask(MBB, I);
> + if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
> + PredicateStack.empty()) {
> + // If the exec mask is non-zero, skip the next two instructions
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> + .addImm(3)
> + .addReg(AMDGPU::EXEC);
> +
> + // Exec mask is zero: Export to NULL target...
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
> + .addImm(0)
> + .addImm(0x09) // V_008DFC_SQ_EXP_NULL
> + .addImm(0)
> + .addImm(1)
> + .addImm(1)
> + .addReg(AMDGPU::SREG_LIT_0)
> + .addReg(AMDGPU::SREG_LIT_0)
> + .addReg(AMDGPU::SREG_LIT_0)
> + .addReg(AMDGPU::SREG_LIT_0);
> +
> + // ... and terminate wavefront
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
> + }
> + MI.eraseFromParent();
> + break;
> + }
> + }
> + }
> + return false;
> +}
> +
> +void SILowerControlFlowPass::pushExecMask(MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator I) {
> +
> + assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
> + unsigned StackReg = UnusedRegisters.back();
> + UnusedRegisters.pop_back();
> + PredicateStack.push_back(StackReg);
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> + StackReg)
> + .addReg(AMDGPU::EXEC);
> +}
> +
> +void SILowerControlFlowPass::popExecMask(MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator I) {
> + unsigned StackReg = PredicateStack.back();
> + PredicateStack.pop_back();
> + UnusedRegisters.push_back(StackReg);
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
> + AMDGPU::EXEC)
> + .addReg(AMDGPU::EXEC)
> + .addReg(StackReg);
> +}
> diff --git a/lib/Target/AMDGPU/SILowerFlowControl.cpp b/lib/Target/AMDGPU/SILowerFlowControl.cpp
> deleted file mode 100644
> index 0d90c13..0000000
> --- a/lib/Target/AMDGPU/SILowerFlowControl.cpp
> +++ /dev/null
> @@ -1,193 +0,0 @@
> -//===-- SILowerFlowControl.cpp - Use predicates for flow control ----------===//
> -//
> -// The LLVM Compiler Infrastructure
> -//
> -// This file is distributed under the University of Illinois Open Source
> -// License. See LICENSE.TXT for details.
> -//
> -//===----------------------------------------------------------------------===//
> -//
> -// This pass lowers the pseudo flow control instructions (SI_IF_NZ, ELSE, ENDIF)
> -// to predicated instructions.
> -//
> -// All flow control (except loops) is handled using predicated instructions and
> -// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
> -// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
> -// by writting to the 64-bit EXEC register (each bit corresponds to a
> -// single vector ALU). Typically, for predicates, a vector ALU will write
> -// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
> -// Vector ALU) and then the ScalarALU will AND the VCC register with the
> -// EXEC to update the predicates.
> -//
> -// For example:
> -// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
> -// SI_IF_NZ %VCC
> -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
> -// ELSE
> -// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
> -// ENDIF
> -//
> -// becomes:
> -//
> -// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask
> -// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask
> -// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
> -// S_CBRANCH_EXECZ label0 // This instruction is an
> -// // optimization which allows us to
> -// // branch if all the bits of
> -// // EXEC are zero.
> -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
> -//
> -// label0:
> -// %SGPR2 = S_MOV_B64 %EXEC // Save the current exec mask
> -// %EXEC = S_MOV_B64 %SGPR0 // Restore the exec mask for the Then block
> -// %SGPR0 = S_MOV_B64 %SGPR2 // Save the exec mask from the If block
> -// S_BRANCH_EXECZ label1 // Use our branch optimization
> -// // instruction again.
> -// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
> -// label1:
> -// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
> -//===----------------------------------------------------------------------===//
> -
> -#include "AMDGPU.h"
> -#include "SIInstrInfo.h"
> -#include "SIMachineFunctionInfo.h"
> -#include "llvm/CodeGen/MachineFunction.h"
> -#include "llvm/CodeGen/MachineFunctionPass.h"
> -#include "llvm/CodeGen/MachineInstrBuilder.h"
> -#include "llvm/CodeGen/MachineRegisterInfo.h"
> -
> -using namespace llvm;
> -
> -namespace {
> -
> -class SILowerFlowControlPass : public MachineFunctionPass {
> -
> -private:
> - static char ID;
> - const TargetInstrInfo *TII;
> - std::vector<unsigned> PredicateStack;
> - std::vector<unsigned> UnusedRegisters;
> -
> - void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> - void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> -
> -public:
> - SILowerFlowControlPass(TargetMachine &tm) :
> - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
> -
> - virtual bool runOnMachineFunction(MachineFunction &MF);
> -
> - const char *getPassName() const {
> - return "SI Lower flow control instructions";
> - }
> -
> -};
> -
> -} // End anonymous namespace
> -
> -char SILowerFlowControlPass::ID = 0;
> -
> -FunctionPass *llvm::createSILowerFlowControlPass(TargetMachine &tm) {
> - return new SILowerFlowControlPass(tm);
> -}
> -
> -bool SILowerFlowControlPass::runOnMachineFunction(MachineFunction &MF) {
> -
> - // Find all the unused registers that can be used for the predicate stack.
> - for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
> - S = AMDGPU::SReg_64RegClass.end();
> - I != S; ++I) {
> - unsigned Reg = *I;
> - if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
> - UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
> - }
> - }
> -
> - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
> - BB != BB_E; ++BB) {
> - MachineBasicBlock &MBB = *BB;
> - for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
> - I != MBB.end(); I = Next) {
> - Next = llvm::next(I);
> - MachineInstr &MI = *I;
> - switch (MI.getOpcode()) {
> - default: break;
> - case AMDGPU::SI_IF_NZ:
> - pushExecMask(MBB, I);
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
> - AMDGPU::EXEC)
> - .addOperand(MI.getOperand(0)) // VCC
> - .addReg(AMDGPU::EXEC);
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
> - PredicateStack.back())
> - .addReg(PredicateStack.back())
> - .addReg(AMDGPU::EXEC);
> - MI.eraseFromParent();
> - break;
> - case AMDGPU::ELSE:
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> - UnusedRegisters.back())
> - .addReg(AMDGPU::EXEC);
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> - AMDGPU::EXEC)
> - .addReg(PredicateStack.back());
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> - PredicateStack.back())
> - .addReg(UnusedRegisters.back());
> - MI.eraseFromParent();
> - break;
> - case AMDGPU::ENDIF:
> - popExecMask(MBB, I);
> - if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
> - PredicateStack.empty()) {
> - // If the exec mask is non-zero, skip the next two instructions
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> - .addImm(3)
> - .addReg(AMDGPU::EXEC);
> -
> - // Exec mask is zero: Export to NULL target...
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
> - .addImm(0)
> - .addImm(0x09) // V_008DFC_SQ_EXP_NULL
> - .addImm(0)
> - .addImm(1)
> - .addImm(1)
> - .addReg(AMDGPU::SREG_LIT_0)
> - .addReg(AMDGPU::SREG_LIT_0)
> - .addReg(AMDGPU::SREG_LIT_0)
> - .addReg(AMDGPU::SREG_LIT_0);
> -
> - // ... and terminate wavefront
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
> - }
> - MI.eraseFromParent();
> - break;
> - }
> - }
> - }
> - return false;
> -}
> -
> -void SILowerFlowControlPass::pushExecMask(MachineBasicBlock &MBB,
> - MachineBasicBlock::iterator I) {
> -
> - assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
> - unsigned StackReg = UnusedRegisters.back();
> - UnusedRegisters.pop_back();
> - PredicateStack.push_back(StackReg);
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> - StackReg)
> - .addReg(AMDGPU::EXEC);
> -}
> -
> -void SILowerFlowControlPass::popExecMask(MachineBasicBlock &MBB,
> - MachineBasicBlock::iterator I) {
> - unsigned StackReg = PredicateStack.back();
> - PredicateStack.pop_back();
> - UnusedRegisters.push_back(StackReg);
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
> - AMDGPU::EXEC)
> - .addReg(AMDGPU::EXEC)
> - .addReg(StackReg);
> -}
More information about the mesa-dev
mailing list