[Mesa-dev] PATCH: R600 + SI: Add support for lds atomic add
Tom Stellard
tom at stellard.net
Wed Aug 21 11:30:27 PDT 2013
Hi,
The attached patches fix some LDS bugs on SI and add support for atomic
add for R600 and SI.
Please Review.
-Tom
-------------- next part --------------
>From 0447b0918efca9000e66414e9ff1bd291936f702 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 21 Aug 2013 11:35:27 -0400
Subject: [PATCH 1/4] R600/SI: Don't emit S_WQM_B64 instruction for compute
shaders
---
lib/Target/R600/SILowerControlFlow.cpp | 3 ++-
test/CodeGen/R600/load.ll | 13 +++++++++++++
test/CodeGen/R600/local-memory.ll | 1 +
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index c2e8f02..09cf25b 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -409,6 +409,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getTarget().getInstrInfo();
TRI = MF.getTarget().getRegisterInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
bool NeedM0 = false;
@@ -508,7 +509,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::M0).addImm(0xffffffff);
}
- if (NeedWQM) {
+ if (NeedWQM && MFI->ShaderType != ShaderType::COMPUTE) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index c7fe611..8829ff5 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -446,6 +446,7 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(
; R600-CHECK: @load_i8_local
; R600-CHECK: LDS_UBYTE_READ_RET
; SI-CHECK: @load_i8_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_U8
define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
%1 = load i8 addrspace(3)* %in
@@ -458,6 +459,7 @@ define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: ASHR
; SI-CHECK: @load_i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_I8
define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
entry:
@@ -471,6 +473,7 @@ entry:
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: LDS_UBYTE_READ_RET
; SI-CHECK: @load_v2i8_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_U8
; SI-CHECK: DS_READ_U8
define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
@@ -487,6 +490,7 @@ entry:
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
; SI-CHECK: @load_v2i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_I8
; SI-CHECK: DS_READ_I8
define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
@@ -503,6 +507,7 @@ entry:
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: LDS_UBYTE_READ_RET
; SI-CHECK: @load_v4i8_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_U8
; SI-CHECK: DS_READ_U8
; SI-CHECK: DS_READ_U8
@@ -525,6 +530,7 @@ entry:
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
; SI-CHECK: @load_v4i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_I8
; SI-CHECK: DS_READ_I8
; SI-CHECK: DS_READ_I8
@@ -541,6 +547,7 @@ entry:
; R600-CHECK: @load_i16_local
; R600-CHECK: LDS_USHORT_READ_RET
; SI-CHECK: @load_i16_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_U16
define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
entry:
@@ -554,6 +561,7 @@ entry:
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: ASHR
; SI-CHECK: @load_i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_I16
define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
entry:
@@ -567,6 +575,7 @@ entry:
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: LDS_USHORT_READ_RET
; SI-CHECK: @load_v2i16_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_U16
; SI-CHECK: DS_READ_U16
define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
@@ -583,6 +592,7 @@ entry:
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
; SI-CHECK: @load_v2i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_I16
; SI-CHECK: DS_READ_I16
define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
@@ -599,6 +609,7 @@ entry:
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: LDS_USHORT_READ_RET
; SI-CHECK: @load_v4i16_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_U16
; SI-CHECK: DS_READ_U16
; SI-CHECK: DS_READ_U16
@@ -621,6 +632,7 @@ entry:
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
; SI-CHECK: @load_v4i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_I16
; SI-CHECK: DS_READ_I16
; SI-CHECK: DS_READ_I16
@@ -637,6 +649,7 @@ entry:
; R600-CHECK: @load_i32_local
; R600-CHECK: LDS_READ_RET
; SI-CHECK: @load_i32_local
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_READ_B32
define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 9ebb769..bd0d59c 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -13,6 +13,7 @@
; SI-CHECK-NEXT: .long 32768
; EG-CHECK: LDS_WRITE
+; SI-CHECK-NOT: S_WQM_B64
; SI-CHECK: DS_WRITE_B32 0
; GROUP_BARRIER must be the last instruction in a clause
--
1.7.11.4
-------------- next part --------------
>From a3606062a83cbe95916b2983f5e966a80540e6ff Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 20 Aug 2013 13:17:22 -0700
Subject: [PATCH 2/4] R600: Fix incorrect LDS size calculation
GlobalAddress nodes that appeared in more than one basic block were
being counted twice.
---
lib/Target/R600/AMDGPUISelLowering.cpp | 14 ++++++++++----
lib/Target/R600/AMDGPUMachineFunction.h | 4 ++++
test/CodeGen/R600/lds-size.ll | 26 ++++++++++++++++++++++++++
3 files changed, 40 insertions(+), 4 deletions(-)
create mode 100644 test/CodeGen/R600/lds-size.ll
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 24b826b..5497356 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -248,12 +248,18 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
- unsigned Offset = MFI->LDSSize;
const GlobalValue *GV = G->getGlobal();
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
+ unsigned Offset;
+ if (MFI->LocalMemoryObjects.count(GV) == 0) {
+ uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ Offset = MFI->LDSSize;
+ MFI->LocalMemoryObjects[GV] = Offset;
+ // XXX: Account for alignment?
+ MFI->LDSSize += Size;
+ } else {
+ Offset = MFI->LocalMemoryObjects[GV];
+ }
return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
}
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 789b96a..fe80ce3 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -14,6 +14,7 @@
#define AMDGPUMACHINEFUNCTION_H
#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
namespace llvm {
@@ -21,6 +22,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
public:
AMDGPUMachineFunction(const MachineFunction &MF);
unsigned ShaderType;
+ /// A map to keep track of local memory objects and their offsets within
+ /// the local memory space.
+ std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
};
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
new file mode 100644
index 0000000..2185180
--- /dev/null
+++ b/test/CodeGen/R600/lds-size.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test makes sure we do not double count global values when they are
+; used in different basic blocks.
+
+; CHECK-LABEL: @test
+; CHECK: .long 166120
+; CHECK-NEXT: .long 1
+@lds = internal addrspace(3) unnamed_addr global i32 zeroinitializer, align 4
+
+define void @test(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %else
+
+if:
+ store i32 1, i32 addrspace(3)* @lds
+ br label %endif
+
+else:
+ store i32 2, i32 addrspace(3)* @lds
+ br label %endif
+
+endif:
+ ret void
+}
--
1.7.11.4
-------------- next part --------------
>From 48dce43f670fe6c58987e4c8a3c0d31c3e3f9c3a Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 19 Aug 2013 07:28:48 -0700
Subject: [PATCH 3/4] R600: Expand SELECT nodes rather than custom lowering
them
---
lib/Target/R600/R600ISelLowering.cpp | 20 +++++-----------
lib/Target/R600/R600ISelLowering.h | 1 -
test/CodeGen/R600/select.ll | 46 ++++++++++++++++++++++++++++++++++++
3 files changed, 52 insertions(+), 15 deletions(-)
create mode 100644 test/CodeGen/R600/select.ll
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index f0242b8..450e2a8 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -60,8 +60,12 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
- setOperationAction(ISD::SELECT, MVT::i32, Custom);
- setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -480,7 +484,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::FCOS:
case ISD::FSIN: return LowerTrig(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
@@ -930,17 +933,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
DAG.getCondCode(ISD::SETNE));
}
-SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- return DAG.getNode(ISD::SELECT_CC,
- SDLoc(Op),
- Op.getValueType(),
- Op.getOperand(0),
- DAG.getConstant(0, MVT::i32),
- Op.getOperand(1),
- Op.getOperand(2),
- DAG.getCondCode(ISD::SETNE));
-}
-
/// LLVM generates byte-addresed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index a033fcb..811850d 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -56,7 +56,6 @@ private:
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll
new file mode 100644
index 0000000..f940142
--- /dev/null
+++ b/test/CodeGen/R600/select.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Normally icmp + select is optimized to select_cc, when this happens the
+; DAGLegalizer never sees the select and doesn't have a chance to legalize it.
+;
+; In order to avoid the select_cc optimization, this test case calculates the
+; condition for the select in a separate basic block.
+
+; CHECK-LABEL: @select
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+ <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
+ <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
+ i32 %cond) {
+entry:
+ br label %for
+body:
+ %inc = add i32 %i, 1
+ %br_cmp.i = icmp eq i1 %br_cmp, 0
+ br label %for
+for:
+ %i = phi i32 [ %inc, %body], [ 0, %entry ]
+ %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ]
+ %0 = icmp eq i32 %cond, %i
+ %1 = select i1 %br_cmp, i32 2, i32 3
+ %2 = select i1 %br_cmp, float 2.0 , float 5.0
+ %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 4, i32 5>
+ %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 3.0>, <2 x float> <float 4.0, float 5.0>
+ %5 = select i1 %br_cmp, <4 x i32> <i32 2 , i32 3, i32 4, i32 5>, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+ %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 3.0, float 4.0, float 5.0>, <4 x float> <float 6.0, float 7.0, float 8.0, float 9.0>
+ br i1 %0, label %body, label %done
+
+done:
+ store i32 %1, i32 addrspace(1)* %i32out
+ store float %2, float addrspace(1)* %f32out
+ store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out
+ store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out
+ store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out
+ store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out
+ ret void
+}
--
1.7.11.4
-------------- next part --------------
>From 8f258501b8e45434125fed4d0b88bfaadc0a62ce Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 20 Aug 2013 13:22:28 -0700
Subject: [PATCH 4/4] R600: Add support for local memory atomic add
---
lib/Target/R600/AMDGPUInstructions.td | 5 ++++
lib/Target/R600/R600ISelLowering.cpp | 22 ++++++++++++------
lib/Target/R600/R600InstrInfo.h | 6 +++++
lib/Target/R600/R600Instructions.td | 42 ++++++++++++++++++++++++++++------
lib/Target/R600/SIInstrInfo.td | 12 ++++++++++
lib/Target/R600/SIInstructions.td | 4 ++++
lib/Target/R600/SILowerControlFlow.cpp | 1 +
test/CodeGen/R600/atomic_load_add.ll | 23 +++++++++++++++++++
8 files changed, 101 insertions(+), 14 deletions(-)
create mode 100644 test/CodeGen/R600/atomic_load_add.ll
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index dec6082..6745fed 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -191,6 +191,11 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
+ (atomic_load_add node:$ptr, node:$value), [{
+ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 450e2a8..ff9ba52 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -109,16 +109,24 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
switch (MI->getOpcode()) {
default:
- if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::LDS_1A) {
- MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(MI->getOpcode()),
- AMDGPU::OQAP);
+ if (TII->isLDSInstr(MI->getOpcode()) &&
+ TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) {
+ int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ MachineInstrBuilder NewMI;
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) {
+ NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()),
+ AMDGPU::OQAP);
+ TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
+ MI->getOperand(0).getReg(),
+ AMDGPU::OQAP);
+ } else {
+ NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+ TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
+ }
for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
NewMI.addOperand(MI->getOperand(i));
}
- TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- AMDGPU::OQAP);
} else {
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index e28d771..189d062 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -273,6 +273,12 @@ namespace llvm {
void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
};
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
} // End llvm namespace
#endif // R600INSTRINFO_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index f5c0266..76c3c4f 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1640,23 +1640,39 @@ class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
let DisableEncoding = "$dst";
}
-class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+ string dst =""> :
R600_LDS <
- lds_op,
- (outs),
+ lds_op, outs,
(ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
LAST:$last, R600_Pred:$pred_sel,
BANK_SWIZZLE:$bank_swizzle),
- " "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel",
+ " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
pattern
> {
+ field string BaseOp;
+
let src2 = 0;
let src2_rel = 0;
let LDS_1A1D = 1;
}
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+ let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+ let BaseOp = name;
+ let usesCustomInserter = 1;
+ let DisableEncoding = "$dst";
+ let Defs = [OQAP];
+}
+
class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
R600_LDS <
lds_op,
@@ -1670,15 +1686,19 @@ class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
let LDS_1A2D = 1;
}
-def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
[(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
>;
-def LDS_BYTE_WRITE : R600_LDS_1A1D<0x12, "LDS_BYTE_WRITE",
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
[(truncstorei8_local i32:$src1, i32:$src0)]
>;
-def LDS_SHORT_WRITE : R600_LDS_1A1D<0x13, "LDS_SHORT_WRITE",
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
[(truncstorei16_local i32:$src1, i32:$src0)]
>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+ [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
>;
@@ -2442,3 +2462,11 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>;
def : DwordAddrPat <i32, R600_Reg32>;
} // End isR600toCayman Predicate
+
+def getLDSNoRetOp : InstrMapping {
+ let FilterClass = "R600_LDS_1A1D";
+ let RowFields = ["BaseOp"];
+ let ColFields = ["DisableEncoding"];
+ let KeyCol = ["$dst"];
+ let ValueCols = [[""""]];
+}
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index ecc4718..09d5f01 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -362,6 +362,18 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
let vdst = 0;
}
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS <
+ op,
+ (outs rc:$vdst),
+ (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i8imm:$offset0,
+ i8imm:$offset1),
+ asm#" $gds, $vdst, $addr, $data0, $offset0, $offset1, [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 1;
+ let data1 = 0;
+}
+
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs),
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 136f69c..31a5ad2 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -391,6 +391,7 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
} // End isCompare = 1
+def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
@@ -1775,6 +1776,9 @@ def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
+ (DS_ADD_U32_RTN 0, $ptr, $val, 0, 0)>;
+
/********** ================== **********/
/********** SMRD Patterns **********/
/********** ================== **********/
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 09cf25b..a6c43bb 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -488,6 +488,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
NeedWQM = true;
// Fall through
case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_ADD_U32_RTN:
NeedM0 = true;
break;
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
new file mode 100644
index 0000000..054d9cd
--- /dev/null
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @atomic_add_local
+; R600-CHECK: LDS_ADD *
+; SI-CHECK-LABEL: @atomic_add_local
+; SI-CHECK: DS_ADD_U32_RTN 0
+define void @atomic_add_local(i32 addrspace(3)* %local) {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ ret void
+}
+
+; R600-CHECK-LABEL: @atomic_add_ret_local
+; R600-CHECK: LDS_ADD_RET *
+; SI-CHECK-LABEL: @atomic_add_ret_local
+; SI-CHECK: DS_ADD_U32_RTN 0
+define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
--
1.7.11.4
More information about the mesa-dev
mailing list