[Mesa-dev] R600 Patchset: Optimizations for bfgminer
Tom Stellard
tom at stellard.net
Mon Apr 29 13:24:17 PDT 2013
Hi,
The attached patchset implements a few optimizations for the bfgminer
bitcoin mining program.
Please review.
-Tom
-------------- next part --------------
From 661e832408a8bafc03a7c4c600c4a140b03054b4 Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov <dcherkassov at gmail.com>
Date: Thu, 7 Mar 2013 20:17:59 +0400
Subject: [PATCH 1/3] R600: Add 64-bit load/store support
* Added R600_Reg64 class
* Added T#Index#.XY register definitions
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added conversions between v2i32 and v2f32 (bitconvert patterns in both directions)
Signed-off-by: Dmitry Cherkassov <dcherkassov at gmail.com>
Tom Stellard:
- Mark vec2 operations as expand. The addition of a vec2 register
class made them all legal.
---
lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++
lib/Target/R600/AMDILISelDAGToDAG.cpp | 10 ++++-
lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 3 ++
lib/Target/R600/R600ISelLowering.cpp | 17 +++++++++
lib/Target/R600/R600InstrInfo.cpp | 19 ++++++----
lib/Target/R600/R600Instructions.td | 44 ++++++++++++++++++++++
lib/Target/R600/R600RegisterInfo.td | 16 ++++++++
test/CodeGen/R600/64bit-kernel-args.ll | 41 ++++++++++++++++++++
test/CodeGen/R600/fadd.ll | 10 +++++
test/CodeGen/R600/fdiv.ll | 37 +++++++++++++-----
test/CodeGen/R600/fmul.ll | 10 +++++
test/CodeGen/R600/fp_to_sint.ll | 10 +++++
test/CodeGen/R600/fp_to_uint.ll | 10 +++++
test/CodeGen/R600/fsub.ll | 20 +++++++---
test/CodeGen/R600/setcc.ll | 18 +++++++--
test/CodeGen/R600/sint_to_fp.ll | 10 +++++
test/CodeGen/R600/udiv.ll | 20 +++++++---
test/CodeGen/R600/uint_to_fp.ll | 10 +++++
test/CodeGen/R600/urem.ll | 21 ++++++++---
19 files changed, 292 insertions(+), 40 deletions(-)
create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..4a064b1 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+ setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index ba75a44..198cd7e 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -167,12 +167,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
break;
}
+ unsigned RegSequenceClassID;
+ EVT VT = N->getValueType(0);
+ assert(VT.isVector());
+ switch (VT.getVectorNumElements()) {
+ case 4: RegSequenceClassID = AMDGPU::R600_Reg128RegClassID; break;
+ case 2: RegSequenceClassID = AMDGPU::R600_Reg64RegClassID; break;
+ default: llvm_unreachable("Unhandled vector width in BUILD_VECTOR");
+ }
// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
// that adds a 128 bits reg copy when going through TwoAddressInstructions
// pass. We want to avoid 128 bits copies as much as possible because they
// can't be bundled by our scheduler.
SDValue RegSeqArgs[9] = {
- CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+ CurDAG->getTargetConstant(RegSequenceClassID, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 7c83d86..030fc87 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -150,6 +150,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
} else {
switch(MI.getOpcode()) {
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
EmitByte(INSTR_NATIVE, OS);
@@ -160,9 +161,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case AMDGPU::VTX_READ_PARAM_8_eg:
case AMDGPU::VTX_READ_PARAM_16_eg:
case AMDGPU::VTX_READ_PARAM_32_eg:
+ case AMDGPU::VTX_READ_PARAM_64_eg:
case AMDGPU::VTX_READ_PARAM_128_eg:
case AMDGPU::VTX_READ_GLOBAL_8_eg:
case AMDGPU::VTX_READ_GLOBAL_32_eg:
+ case AMDGPU::VTX_READ_GLOBAL_64_eg:
case AMDGPU::VTX_READ_GLOBAL_128_eg:
case AMDGPU::TEX_VTX_CONSTBUF:
case AMDGPU::TEX_VTX_TEXBUF : {
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index a66baca..b6b7c32 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -32,22 +32,38 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
computeRegisterProperties();
setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+ setOperationAction(ISD::FADD, MVT::v2f32, Expand);
setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+ setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+ setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+ setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
setOperationAction(ISD::ADD, MVT::v4i32, Expand);
+ setOperationAction(ISD::ADD, MVT::v2i32, Expand);
setOperationAction(ISD::AND, MVT::v4i32, Expand);
+ setOperationAction(ISD::AND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+ setOperationAction(ISD::UREM, MVT::v2i32, Expand);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -158,6 +174,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 85288c9..eb09665 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -50,9 +50,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
- if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
- && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
- for (unsigned I = 0; I < 4; I++) {
+ unsigned VectorComponents = 0;
+ if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+ AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+ VectorComponents = 4;
+ } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+ AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+ VectorComponents = 2;
+ }
+
+ if (VectorComponents > 0) {
+ for (unsigned I = 0; I < VectorComponents; I++) {
unsigned SubRegIndex = RI.getSubRegFromChannel(I);
buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
RI.getSubReg(DestReg, SubRegIndex),
@@ -61,11 +69,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
RegState::Define | RegState::Implicit);
}
} else {
-
- // We can't copy vec4 registers
- assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
- && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
DestReg, SrcReg);
NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index ea8ee05..b1e8d1c 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1794,6 +1794,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
[(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
>;
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+ (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ 0x3, "RAT_WRITE_CACHELESS_64_eg",
+ [(global_store (v2i32 R600_Reg64:$rw_gpr), R600_TReg32_X:$index_gpr)]
+>;
+
//128-bit store
def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
@@ -1901,6 +1908,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
let Constraints = "$ptr.ptr = $dst";
}
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_64", buffer_id, (outs R600_Reg64:$dst),
+ pattern> {
+
+ let MEGA_FETCH_COUNT = 8;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
: VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
pattern> {
@@ -1934,6 +1953,11 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
[(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
>;
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+ [(set (v2i32 R600_Reg64:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+
def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
[(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
>;
@@ -1952,6 +1976,12 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
[(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
>;
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+ [(set (v2i32 R600_Reg64:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+
// 128-bit reads
def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
[(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
@@ -2439,10 +2469,24 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+def : Extract_Element <f32, v2f32, R600_Reg64, 0, sub0>;
+def : Extract_Element <f32, v2f32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, R600_Reg64, 0, sub0>;
+def : Extract_Element <i32, v2i32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 1, sub1>;
+
// bitconvert patterns
def : BitConvert <i32, f32, R600_Reg32>;
def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
def : BitConvert <v4f32, v4i32, R600_Reg128>;
def : BitConvert <v4i32, v4f32, R600_Reg128>;
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 5a2e65c..6bde923 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
let HWEncoding = encoding;
}
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+ RegisterWithSubRegs<n, subregs> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = encoding;
+}
+
+
foreach Index = 0-127 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
// 32-bit Temporary Registers
@@ -41,6 +49,11 @@ foreach Index = 0-127 in {
!cast<Register>("T"#Index#"_Z"),
!cast<Register>("T"#Index#"_W")],
Index>;
+
+ def T#Index#_XY : R600Reg_64 <"T"#Index#".XY",
+ [!cast<Register>("T"#Index#"_X"),
+ !cast<Register>("T"#Index#"_Y")],
+ Index>;
}
// KCACHE_BANK0
@@ -184,6 +197,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
let CopyCost = -1;
}
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add (sequence "T%u_XY", 0, 63))>;
+
//===----------------------------------------------------------------------===//
// Register classes for indirect addressing
//===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
new file mode 100644
index 0000000..6f03b68
--- /dev/null
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -0,0 +1,41 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @v2i32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_extract_store(i32 addrspace(1)* nocapture %out, <2 x i32> %in) {
+entry:
+ %0 = extractelement <2 x i32> %in, i32 0
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ %1 = extractelement <2 x i32> %in, i32 1
+ %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
+ store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+ ret void
+}
+
+; CHECK: @v2f32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_extract_store(float addrspace(1)* nocapture %out, <2 x float> %in) {
+entry:
+ %0 = extractelement <2 x float> %in, i32 0
+ store float %0, float addrspace(1)* %out, align 4
+ %1 = extractelement <2 x float> %in, i32 1
+ %arrayidx1 = getelementptr inbounds float addrspace(1)* %out, i32 1
+ store float %1, float addrspace(1)* %arrayidx1, align 4
+ ret void
+}
+
+; CHECK: @v2i32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_store(<2 x i32> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @v2f32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_store(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ store <2 x float> %in, <2 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 81a4fa5..1e51c35 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
+; CHECK: @fadd_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fadd <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fadd_v4f32
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 79e677f..240f1e5 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,15 +1,32 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; These tests check that fdiv is expanded correctly and also test that the
+; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
+; instruction groups.
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; CHECK: @fdiv_v2f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @fdiv_v4f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
%b = load <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 7fd22d8..74c277d 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
+; CHECK: @fmul_v2f32
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fmul <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fmul_v4f32
; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 9c21ad2..dabfe41 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @fp_to_sint_v2i32
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+ %result = fptosi <2 x float> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fp_to_sint_v4i32
; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index d91098f..95c62f7 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @fp_to_uint_v2i32
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+ %result = fptoui <2 x float> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fp_to_uint_v4i32
; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 812388b..f93212c 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -2,7 +2,6 @@
; CHECK: @fsub_f32
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-
define void @fsub_f32() {
%r0 = call float @llvm.R600.load.input(i32 0)
%r1 = call float @llvm.R600.load.input(i32 1)
@@ -15,12 +14,21 @@ declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: @fsub_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fsub <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+; CHECK: @fsub_v4f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 0752f2e..ba8fca7 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,7 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @setcc_v2i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = icmp eq <2 x i32> %a, %b
+ %sext = sext <2 x i1> %result to <2 x i32>
+ store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @setcc_v4i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32> addrspace(1) * %in
%b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 6a56db3..dc163da 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @sint_to_fp_v2i32
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+ %result = sitofp <2 x i32> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @sint_to_fp_v4i32
; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index b81e366..0e91b2b 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,11 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;The code generated by udiv is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 udiv
-;CHECK: CF_END
+; The code generated by udiv is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail on udiv
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @udiv_v2i32
+; CHECK: CF_END
+define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = udiv <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @udiv_v4i32
+; CHECK: CF_END
+define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32> addrspace(1) * %in
%b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index ae8fc8e..791f117 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @uint_to_fp_v2i32
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+ %result = uitofp <2 x i32> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @uint_to_fp_v4i32
; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index a2cc0bd..1a50e65 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,11 +1,20 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;The code generated by urem is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 urem
-;CHECK: CF_END
+; The code generated by urem is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail when it gets
+; a urem.
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @urem_v2i32
+; CHECK: CF_END
+define void @urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = urem <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @urem_v4i32
+; CHECK: CF_END
+define void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32> addrspace(1) * %in
%b = load <4 x i32> addrspace(1) * %b_ptr
--
1.8.1.5
-------------- next part --------------
From b4a725a06bc00ec00c8d13e207d187d0fcca2ea2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 17 Apr 2013 11:55:44 -0400
Subject: [PATCH 2/3] R600: Add pattern for SHA-256 Ma function
This can be optimized using the BFI_INT instruction.
---
lib/Target/R600/AMDGPUInstructions.td | 8 ++++++++
lib/Target/R600/R600Instructions.td | 3 +++
lib/Target/R600/SIInstructions.td | 6 ++++++
test/CodeGen/R600/bfi_int.ll | 18 ++++++++++++++++++
4 files changed, 35 insertions(+)
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 4b37a53..d801b32 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -281,6 +281,14 @@ multiclass BFIPatterns <Instruction BFI_INT> {
}
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
include "R600Instructions.td"
include "SIInstrInfo.td"
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index b1e8d1c..1630936 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1691,6 +1691,9 @@ let hasSideEffects = 1 in {
def : Pat<(fp_to_uint R600_Reg32:$src0),
(FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
+ // SHA-256 Patterns
+ def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
let Word1{20} = 1; // VALID_PIXEL_MODE
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9faf89b..b6217e6 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1617,4 +1617,10 @@ def : Pat<
(V_CMP_U_F32_e64 f32:$src0, f32:$src1)
>;
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
+
} // End isSI predicate
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index c9015a6..abe2591 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -32,3 +32,21 @@ entry:
store i32 %2, i32 addrspace(1)* %out
ret void
}
+
+; SHA-256 Ma function
+; ((x & z) | (y & (x | z)))
+; R600-CHECK: @bfi_sha256_ma
+; R600-CHECK: XOR_INT [[DST:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: BFI_INT {{T[0-9]+\.[XYZW]}}, [[DST]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: V_XOR_B32_e32 [[DST:VGPR[0-9]+]], {{VGPR[0-9]+, VGPR[0-9]+}}
+; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %0 = and i32 %x, %z
+ %1 = or i32 %x, %z
+ %2 = and i32 %y, %1
+ %3 = or i32 %0, %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
--
1.8.1.5
-------------- next part --------------
From 2bf7212e559190087101b49e7b09a18f10a85db2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 17 Apr 2013 12:10:21 -0400
Subject: [PATCH 3/3] R600: BFI_INT is a vector-only instruction
---
lib/Target/R600/R600Instructions.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 1630936..78d2320 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1635,7 +1635,7 @@ let Predicates = [isEGorCayman] in {
VecALU
>;
- def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>;
+ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
defm : BFIPatterns <BFI_INT_eg>;
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
--
1.8.1.5
-------------- next part --------------
From 05c4903985bbe60a54b43ce745d34143895cdf9a Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 29 Apr 2013 13:40:01 -0400
Subject: [PATCH] R600: Expand vector or, shl, srl, and xor nodes
---
lib/Target/R600/R600ISelLowering.cpp | 8 ++++++++
test/CodeGen/R600/or.ll | 23 +++++++++++++++++++++++
test/CodeGen/R600/shl.ll | 23 +++++++++++++++++++++++
test/CodeGen/R600/srl.ll | 23 +++++++++++++++++++++++
test/CodeGen/R600/xor.ll | 23 +++++++++++++++++++++++
5 files changed, 100 insertions(+)
create mode 100644 test/CodeGen/R600/or.ll
create mode 100644 test/CodeGen/R600/shl.ll
create mode 100644 test/CodeGen/R600/srl.ll
create mode 100644 test/CodeGen/R600/xor.ll
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index b6b7c32..ce3f16f 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -54,8 +54,14 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
+ setOperationAction(ISD::OR, MVT::v4i32, Expand);
+ setOperationAction(ISD::OR, MVT::v2i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i32, Expand);
+ setOperationAction(ISD::SHL, MVT::v2i32, Expand);
+ setOperationAction(ISD::SRL, MVT::v4i32, Expand);
+ setOperationAction(ISD::SRL, MVT::v2i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
@@ -64,6 +70,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::UREM, MVT::v2i32, Expand);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v4i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v2i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
new file mode 100644
index 0000000..e102a5e
--- /dev/null
+++ b/test/CodeGen/R600/or.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @or_v2i32
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = or <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @or_v4i32
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %result = or <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
new file mode 100644
index 0000000..88db44d
--- /dev/null
+++ b/test/CodeGen/R600/shl.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @shl_v2i32
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = shl <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @shl_v4i32
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %result = shl <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
new file mode 100644
index 0000000..ebfb9bc
--- /dev/null
+++ b/test/CodeGen/R600/srl.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @lshr_v2i32
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = lshr <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @lshr_v4i32
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %result = lshr <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
new file mode 100644
index 0000000..109019f
--- /dev/null
+++ b/test/CodeGen/R600/xor.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @xor_v2i32
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = xor <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @xor_v4i32
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %result = xor <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
--
1.8.1.5
More information about the mesa-dev mailing list