[Mesa-dev] [PATCH 5/5] R600: Fold immediates into ALU instructions when possible v2
Vincent Lejeune
vljn at ovi.com
Thu Nov 29 08:32:16 PST 2012
It's a nice optimisation; it also makes the machine code easier to read.
Reviewed-by: Vincent Lejeune <vljn at ovi.com>
----- Mail original -----
> De : Tom Stellard <tom at stellard.net>
> À : mesa-dev at lists.freedesktop.org
> Cc : Tom Stellard <thomas.stellard at amd.com>
> Envoyé le : Mercredi 28 novembre 2012 23h50
> Objet : [Mesa-dev] [PATCH 5/5] R600: Fold immediates into ALU instructions when possible v2
>
> From: Tom Stellard <thomas.stellard at amd.com>
>
> v2:
> - Fold the immediates using the SelectionDAG
> ---
> lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp | 94 +++++++++++++++++++++++++
> lib/Target/AMDGPU/R600InstrInfo.cpp | 16 ++++-
> lib/Target/AMDGPU/R600InstrInfo.h | 7 ++
> test/CodeGen/R600/fcmp-cnd.ll | 4 +-
> test/CodeGen/R600/fcmp-cnde-int-args.ll | 2 +-
> test/CodeGen/R600/literals.ll | 30 ++++++++
> test/CodeGen/R600/selectcc-icmp-select-float.ll | 2 +-
> test/CodeGen/R600/selectcc_cnde.ll | 2 +-
> test/CodeGen/R600/selectcc_cnde_int.ll | 2 +-
> 9 files changed, 153 insertions(+), 6 deletions(-)
> create mode 100644 test/CodeGen/R600/literals.ll
>
> diff --git a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> index 10ce6ad..2a80f1b 100644
> --- a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> +++ b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
> @@ -14,6 +14,7 @@
> #include "AMDGPUISelLowering.h" // For AMDGPUISD
> #include "AMDGPURegisterInfo.h"
> #include "AMDILDevices.h"
> +#include "R600InstrInfo.h"
> #include "llvm/ADT/ValueMap.h"
> #include "llvm/CodeGen/PseudoSourceValue.h"
> #include "llvm/CodeGen/SelectionDAGISel.h"
> @@ -167,6 +168,99 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
> }
> }
> break;
> + case ISD::ConstantFP:
> + case ISD::Constant:
> + {
> + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> + // XXX: Custom immediate lowering not implemented yet. Instead we use
> + // pseudo instructions defined in SIInstructions.td
> + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
> + break;
> + }
> + const R600InstrInfo *TII = static_cast<const
> R600InstrInfo*>(TM.getInstrInfo());
> +
> + uint64_t ImmValue = 0;
> + unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
> +
> + if (N->getOpcode() == ISD::ConstantFP) {
> + // XXX: 64-bit Immediates not supported yet
> + assert(N->getValueType(0) != MVT::f64);
> +
> + ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
> + APFloat Value = C->getValueAPF();
> + float FloatValue = Value.convertToFloat();
> + if (FloatValue == 0.0) {
> + ImmReg = AMDGPU::ZERO;
> + } else if (FloatValue == 0.5) {
> + ImmReg = AMDGPU::HALF;
> + } else if (FloatValue == 1.0) {
> + ImmReg = AMDGPU::ONE;
> + } else {
> + ImmValue = Value.bitcastToAPInt().getZExtValue();
> + }
> + } else {
> + // XXX: 64-bit Immediates not supported yet
> + assert(N->getValueType(0) != MVT::i64);
> +
> + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
> + if (C->getZExtValue() == 0) {
> + ImmReg = AMDGPU::ZERO;
> + } else if (C->getZExtValue() == 1) {
> + ImmReg = AMDGPU::ONE_INT;
> + } else {
> + ImmValue = C->getZExtValue();
> + }
> + }
> +
> + for (SDNode::use_iterator Use = N->use_begin(), E = SDNode::use_end();
> + Use != E; ++Use) {
> + std::vector<SDValue> Ops;
> + for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
> + Ops.push_back(Use->getOperand(i));
> + }
> +
> + if (!Use->isMachineOpcode()) {
> + if (ImmReg == AMDGPU::ALU_LITERAL_X) {
> + // We can only use literal constants (e.g. AMDGPU::ZERO,
> + // AMDGPU::ONE, etc) in machine opcodes.
> + continue;
> + }
> + } else {
> + if (!TII->isALUInstr(Use->getMachineOpcode())) {
> + continue;
> + }
> +
> + int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
> R600Operands::IMM);
> + assert(ImmIdx != -1);
> +
> + // subtract one from ImmIdx, because the DST operand is usually index
> + // 0 for MachineInstrs, but we have no DST in the Ops vector.
> + ImmIdx--;
> +
> + // Check that we aren't already using an immediate.
> + // XXX: It's possible for an instruction to have more than one
> + // immediate operand, but this is not supported yet.
> + if (ImmReg == AMDGPU::ALU_LITERAL_X) {
> + ConstantSDNode *C =
> dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
> + assert(C);
> +
> + if (C->getZExtValue() != 0) {
> + // This instruction is already using an immediate.
> + continue;
> + }
> +
> + // Set the immediate value
> + Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
> + }
> + }
> + // Set the immediate register
> + Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
> +
> + CurDAG->UpdateNodeOperands(*Use, Ops.data(),
> Use->getNumOperands());
> + }
> + break;
> + }
> +
> }
> return SelectCode(N);
> }
> diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp
> b/lib/Target/AMDGPU/R600InstrInfo.cpp
> index 20b1aa3..814e0a2 100644
> --- a/lib/Target/AMDGPU/R600InstrInfo.cpp
> +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
> @@ -127,6 +127,15 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
> }
> }
>
> +bool R600InstrInfo::isALUInstr(unsigned Opcode) const
> +{
> + unsigned TargetFlags = get(Opcode).TSFlags;
> +
> + return ((TargetFlags & R600_InstFlag::OP1) |
> + (TargetFlags & R600_InstFlag::OP2) |
> + (TargetFlags & R600_InstFlag::OP3));
> +}
> +
> DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
> const ScheduleDAG *DAG) const {
> const InstrItineraryData *II = TM->getInstrItineraryData();
> @@ -505,6 +514,11 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock
> &BB,
>
> int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
> R600Operands::Ops Op) const {
> + return getOperandIdx(MI.getOpcode(), Op);
> +}
> +
> +int R600InstrInfo::getOperandIdx(unsigned Opcode,
> + R600Operands::Ops Op) const {
> const static int OpTable[3][R600Operands::COUNT] = {
> // W C S S S S S S S S
> // R O D L S R R R S R R R S R R L P
> @@ -515,7 +529,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
> {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,-1,-1,-1,13,14,15,16,17},
> {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14}
> };
> - unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
> + unsigned TargetFlags = get(Opcode).TSFlags;
> unsigned OpTableIdx;
>
> if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
> diff --git a/lib/Target/AMDGPU/R600InstrInfo.h
> b/lib/Target/AMDGPU/R600InstrInfo.h
> index cec1c3b..81e1828 100644
> --- a/lib/Target/AMDGPU/R600InstrInfo.h
> +++ b/lib/Target/AMDGPU/R600InstrInfo.h
> @@ -50,6 +50,9 @@ namespace llvm {
> bool isReductionOp(unsigned opcode) const;
> bool isCubeOp(unsigned opcode) const;
>
> + /// isALUInstr - Returns true if this Opcode represents an ALU instruction.
> + bool isALUInstr(unsigned Opcode) const;
> +
> /// isVector - Vector instructions are instructions that must fill all
> /// instruction slots within an instruction group.
> bool isVector(const MachineInstr &MI) const;
> @@ -130,6 +133,10 @@ namespace llvm {
> /// if the Instruction does not contain the specified Op.
> int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
>
> + /// getOperandIdx - Get the index of Op for the given Opcode. Returns -1
> + /// if the Instruction does not contain the specified Op.
> + int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
> +
> /// setImmOperand - Helper function for setting instruction flag values.
> void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm)
> const;
>
> diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
> index c6b6236..a94cfb5 100644
> --- a/test/CodeGen/R600/fcmp-cnd.ll
> +++ b/test/CodeGen/R600/fcmp-cnd.ll
> @@ -1,6 +1,8 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW],
> T[0-9]+\.[XYZW]}}
> +;Not checking arguments 2 and 3 to CNDE, because they may change between
> +;registers and literal.x depending on what the optimizer does.
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
> entry:
> diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll
> b/test/CodeGen/R600/fcmp-cnde-int-args.ll
> index 92f3b5f..5c981ef 100644
> --- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
> +++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
> @@ -4,7 +4,7 @@
> ; chance to optimize the fcmp + select instructions to CNDE was missed
> ; due to the fact that the operands to fcmp and select had different types
>
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW],
> T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
>
> define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
> entry:
> diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
> new file mode 100644
> index 0000000..4c731b2
> --- /dev/null
> +++ b/test/CodeGen/R600/literals.ll
> @@ -0,0 +1,30 @@
> +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> +
> +; Test using an integer literal constant.
> +; Generated ASM should be:
> +; ADD_INT REG literal.x, 5
> +; or
> +; ADD_INT literal.x REG, 5
> +
> +; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
> +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
> +entry:
> + %0 = add i32 5, %in
> + store i32 %0, i32 addrspace(1)* %out
> + ret void
> +}
> +
> +; Test using a float literal constant.
> +; Generated ASM should be:
> +; ADD REG literal.x, 5.0
> +; or
> +; ADD literal.x REG, 5.0
> +
> +; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
> +define void @float_literal(float addrspace(1)* %out, float %in) {
> +entry:
> + %0 = fadd float 5.0, %in
> + store float %0, float addrspace(1)* %out
> + ret void
> +}
> +
> diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll
> b/test/CodeGen/R600/selectcc-icmp-select-float.ll
> index f1f8ab1..f65a300 100644
> --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll
> +++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
> @@ -2,7 +2,7 @@
>
> ; Note additional optimizations may cause this SGT to be replaced with a
> ; CND* instruction.
> -; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW],
> T[0-9]+\.[XYZW]}}
> +; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
> ; Test a selectcc with i32 LHS/RHS and float True/False
>
> define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
> diff --git a/test/CodeGen/R600/selectcc_cnde.ll
> b/test/CodeGen/R600/selectcc_cnde.ll
> index e06a170..f0a0f51 100644
> --- a/test/CodeGen/R600/selectcc_cnde.ll
> +++ b/test/CodeGen/R600/selectcc_cnde.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> ;CHECK-NOT: SETE
> -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW],
> T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x,
> [-0-9]+\(2.0}}
> define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
> %1 = load float addrspace(1)* %in
> %2 = fcmp oeq float %1, 0.0
> diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll
> b/test/CodeGen/R600/selectcc_cnde_int.ll
> index 03d000f..b38078e 100644
> --- a/test/CodeGen/R600/selectcc_cnde_int.ll
> +++ b/test/CodeGen/R600/selectcc_cnde_int.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> ;CHECK-NOT: SETE_INT
> -;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW],
> T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
> define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
> %1 = load i32 addrspace(1)* %in
> %2 = icmp eq i32 %1, 0
> --
> 1.7.11.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
More information about the mesa-dev
mailing list