[Mesa-dev] R600/SI Patches: A few cleanups for compute

Tom Stellard tom at stellard.net
Wed May 15 14:26:14 PDT 2013


Hi,

The attached patches add some new patterns and instructions for SI and
are a prerequisite for more invasive compute shader changes that I'm
working on.

Please review.

-Tom
-------------- next part --------------
From 5b87402d1290df5ec8bdbe1333cadb5739a8c8bd Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 13 May 2013 21:50:35 -0400
Subject: [PATCH 1/7] R600/SI: Make fitsRegClass() operands const

---
 lib/Target/R600/SIISelLowering.cpp | 2 +-
 lib/Target/R600/SIISelLowering.h   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 6bd82a5..d7e2981 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -513,7 +513,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
 }
 
 /// \brief Does "Op" fit into register class "RegClass" ?
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op,
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                     unsigned RegClass) const {
 
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index de637be..e9ea68a 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -30,7 +30,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
   bool foldImm(SDValue &Operand, int32_t &Immediate,
                bool &ScalarSlotUsed) const;
-  bool fitsRegClass(SelectionDAG &DAG, SDValue &Op, unsigned RegClass) const;
+  bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
+                    unsigned RegClass) const;
   void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, 
                        unsigned RegClass, bool &ScalarSlotUsed) const;
 
-- 
1.8.1.5

-------------- next part --------------
From a2d4b16a0022110c6198ed330966911b2bad3361 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 9 May 2013 16:44:22 -0400
Subject: [PATCH 2/7] R600/SI: Use the same names for VOP3 operands and
 encoding fields

This makes it possible to reorder the operands without breaking the
encoding.
---
 lib/Target/R600/SIInstrFormats.td | 62 +++++++++++++++++++--------------------
 lib/Target/R600/SIInstrInfo.td    | 12 ++++----
 2 files changed, 37 insertions(+), 37 deletions(-)
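
As an aside (illustration only, not from the diff): the TableGen code
emitter binds each bits<> encoding field to the ins/outs operand of the
same name, so once the names match, operands bind by name rather than
by position and the dags can be reordered freely. A hypothetical
instruction (ExampleInst is made up; Enc64/VReg_32/VSrc_32 are the real
classes used below) to show the idea:

  class ExampleInst : Enc64 <
    (outs VReg_32:$dst),
    // $src1 deliberately listed before $src0; the encoding below
    // still comes out right, because fields bind by name:
    (ins VSrc_32:$src1, VSrc_32:$src0),
    "EXAMPLE $dst, $src0, $src1", []> {

    bits<8> dst;
    bits<9> src0;
    bits<9> src1;

    let Inst{7-0}   = dst;
    let Inst{40-32} = src0;
    let Inst{49-41} = src1;
  }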

diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index f737ddd..51f323d 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -185,25 +185,25 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
 class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64 <outs, ins, asm, pattern> {
 
-  bits<8> VDST;
-  bits<9> SRC0;
-  bits<9> SRC1;
-  bits<9> SRC2;
-  bits<3> ABS; 
-  bits<1> CLAMP;
-  bits<2> OMOD;
-  bits<3> NEG;
-
-  let Inst{7-0} = VDST;
-  let Inst{10-8} = ABS;
-  let Inst{11} = CLAMP;
+  bits<8> dst;
+  bits<9> src0;
+  bits<9> src1;
+  bits<9> src2;
+  bits<3> abs;
+  bits<1> clamp;
+  bits<2> omod;
+  bits<3> neg;
+
+  let Inst{7-0} = dst;
+  let Inst{10-8} = abs;
+  let Inst{11} = clamp;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = SRC0;
-  let Inst{49-41} = SRC1;
-  let Inst{58-50} = SRC2;
-  let Inst{60-59} = OMOD;
-  let Inst{63-61} = NEG;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{63-61} = neg;
   
   let mayLoad = 0;
   let mayStore = 0;
@@ -213,23 +213,23 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
 class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64 <outs, ins, asm, pattern> {
 
-  bits<8> VDST;
-  bits<9> SRC0;
-  bits<9> SRC1;
-  bits<9> SRC2;
-  bits<7> SDST;
-  bits<2> OMOD;
-  bits<3> NEG;
+  bits<8> dst;
+  bits<9> src0;
+  bits<9> src1;
+  bits<9> src2;
+  bits<7> sdst;
+  bits<2> omod;
+  bits<3> neg;
 
-  let Inst{7-0} = VDST;
-  let Inst{14-8} = SDST;
+  let Inst{7-0} = dst;
+  let Inst{14-8} = sdst;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = SRC0;
-  let Inst{49-41} = SRC1;
-  let Inst{58-50} = SRC2;
-  let Inst{60-59} = OMOD;
-  let Inst{63-61} = NEG;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{63-61} = neg;
 
   let mayLoad = 0;
   let mayStore = 0;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index c8aecb7..11c8f9d 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -163,8 +163,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
          i32imm:$omod, i32imm:$neg),
     opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", []
   >, VOP <opName> {
-    let SRC1 = SIOperand.ZERO;
-    let SRC2 = SIOperand.ZERO;
+    let src1 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
   }
 }
 
@@ -189,7 +189,7 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
          i32imm:$omod, i32imm:$neg),
     opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
   >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
-    let SRC2 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
   }
 }
 
@@ -217,11 +217,11 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
          i32imm:$omod, i32imm:$neg),
     opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
   >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
-    let SRC2 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
     /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
        can write it into any SGPR. We currently don't use the carry out,
        so for now hardcode it to VCC as well */
-    let SDST = SIOperand.VCC;
+    let sdst = SIOperand.VCC;
   }
 }
 
@@ -244,7 +244,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
       [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
     )
   >, VOP <opName> {
-    let SRC2 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
   }
 }
 
-- 
1.8.1.5

-------------- next part --------------
From 6e388fbb80c079e7f88555174ea0252248d9b6df Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 9 May 2013 16:46:47 -0400
Subject: [PATCH 3/7] R600/SI: Add patterns for 64-bit shift operations

---
 lib/Target/R600/SIInstrInfo.td    | 13 +++++++++++++
 lib/Target/R600/SIInstructions.td | 12 +++++++++---
 test/CodeGen/R600/shl.ll          |  3 +++
 3 files changed, 25 insertions(+), 3 deletions(-)
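
V_ASHR_I64 is left without a pattern here; if one is wanted later, it
would presumably take the same shape as the other two (sketch only, not
part of this patch):

  def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64",
    [(set i64:$dst, (sra i64:$src0, i32:$src1))]
  >;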

diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 11c8f9d..6d5325b 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -263,6 +263,19 @@ class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
   opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
 >, VOP <opName>;
 
+class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+  op, (outs VReg_64:$dst),
+  (ins VSrc_64:$src0, VSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>, VOP <opName> {
+
+  let src2 = SIOperand.ZERO;
+  let abs = 0;
+  let clamp = 0;
+  let omod = 0;
+  let neg = 0;
+}
+
 class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
   op, (outs VReg_64:$dst),
   (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 0d50c5d..f557922 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -982,9 +982,15 @@ def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
 ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
 def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
 def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
-def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
-def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
-def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
+
+def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64", []>;
+
 def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
 def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
 def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index 43cc1e2..db970e9 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -11,3 +11,6 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b)
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; XXX: Add SI test for i64 shl once i64 stores and i64 function arguments are
+; supported.
-- 
1.8.1.5

-------------- next part --------------
From dc547a89dac5039ce521f3c27fb23346251d488d Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 7 May 2013 16:26:26 -0400
Subject: [PATCH 4/7] R600: Swap the legality of rotl and rotr

The hardware supports rotr and not rotl.
---
 lib/Target/R600/AMDGPUISelLowering.cpp |  3 +++
 lib/Target/R600/AMDGPUISelLowering.h   |  1 -
 lib/Target/R600/AMDGPUInstrInfo.td     |  6 ------
 lib/Target/R600/AMDGPUInstructions.td  |  6 ++++++
 lib/Target/R600/AMDILISelLowering.cpp  |  2 --
 lib/Target/R600/R600ISelLowering.cpp   | 15 ---------------
 lib/Target/R600/R600Instructions.td    |  6 ++----
 test/CodeGen/R600/rotr.ll              | 29 +++++++++++++++++++++++++++++
 8 files changed, 40 insertions(+), 28 deletions(-)
 create mode 100644 test/CodeGen/R600/rotr.ll
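
Two notes on how the pieces fit together (illustration, not part of the
diff): with ROTL expanded and ROTR still selectable, a 32-bit rotl gets
rewritten as rotr with a complemented amount, rotl(x, n) == rotr(x, 32 - n),
which is exactly the SUB_INT followed by BIT_ALIGN_INT that the new @rotl
test checks for. And the ROTRPattern instantiation is just shorthand for
an anonymous Pat:

  def : ROTRPattern <BIT_ALIGN_INT_eg>;
  // ...expands to:
  def : Pat <
    (rotr i32:$src0, i32:$src1),
    (BIT_ALIGN_INT_eg $src0, $src0, $src1)
  >;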

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..b3c51e3 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -46,6 +46,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
 
+  // The hardware supports ROTR, but not ROTL
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::STORE, MVT::f32, Promote);
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index c2a79ea..6f8ab8b 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -115,7 +115,6 @@ enum {
   RET_FLAG,
   BRANCH_COND,
   // End AMDIL ISD Opcodes
-  BITALIGN,
   BUFFER_STORE,
   DWORDADDR,
   FRACT,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index b66ae87..a0a3410 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
 // AMDGPU DAG Nodes
 //
 
-// out = ((a << 32) | b) >> c)
-//
-// Can be used to optimize rtol:
-// rotl(a, b) = bitalign(a, a, 32 - b)
-def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
-
 // This argument to this node is a dword address.
 def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
 
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index d2620b2..54df7d0 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -295,6 +295,12 @@ class BFEPattern <Instruction BFE> : Pat <
   (BFE $x, $y, $z)
 >;
 
+// rotr pattern
+class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+  (rotr i32:$src0, i32:$src1),
+  (BIT_ALIGN $src0, $src0, $src1)
+>;
+
 include "R600Instructions.td"
 
 include "SIInstrInfo.td"
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 922cac1..e20dbe0 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -138,8 +138,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
-    // GPU doesn't have a rotl, rotr, or byteswap instruction
-    setOperationAction(ISD::ROTR, VT, Expand);
     setOperationAction(ISD::BSWAP, VT, Expand);
 
     // GPU doesn't have any counting operators
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 7252235..e58a8dd 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
 
-  setOperationAction(ISD::ROTL, MVT::i32, Custom);
-
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
@@ -327,7 +325,6 @@ using namespace llvm::AMDGPUIntrinsic;
 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::ROTL: return LowerROTL(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
@@ -518,18 +515,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const
   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 }
 
-SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-
-  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
-                     Op.getOperand(0),
-                     Op.getOperand(0),
-                     DAG.getNode(ISD::SUB, DL, VT,
-                                 DAG.getConstant(32, MVT::i32),
-                                 Op.getOperand(1)));
-}
-
 bool R600TargetLowering::isZero(SDValue Op) const {
   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     return Cst->isNullValue();
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8f47523..83d465a 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1620,10 +1620,8 @@ let Predicates = [isEGorCayman] in {
   def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
   defm : BFIPatterns <BFI_INT_eg>;
 
-  def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
-    [(set i32:$dst, (AMDGPUbitalign i32:$src0, i32:$src1, i32:$src2))],
-    VecALU
-  >;
+  def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+  def : ROTRPattern <BIT_ALIGN_INT_eg>;
 
   def MULADD_eg : MULADD_Common<0x14>;
   def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
new file mode 100644
index 0000000..30fa01f
--- /dev/null
+++ b/test/CodeGen/R600/rotr.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -debug-only=isel -march=r600 -mcpu=redwood -o - 2>&1 | FileCheck %s
+
+; CHECK: rotr
+; CHECK: @rotr
+; CHECK: BIT_ALIGN_INT
+define void @rotr(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+entry:
+  %0 = sub i32 32, %y
+  %1 = shl i32 %x, %0
+  %2 = lshr i32 %x, %y
+  %3 = or i32 %1, %2
+  store i32 %3, i32 addrspace(1)* %in
+  ret void
+}
+
+; CHECK: rotr
+; CHECK: @rotl
+; CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
+; CHECK-NEXT: 32
+; CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PV.x
+define void @rotl(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+entry:
+  %0 = shl i32 %x, %y
+  %1 = sub i32 32, %y
+  %2 = lshr i32 %x, %1
+  %3 = or i32 %0, %2
+  store i32 %3, i32 addrspace(1)* %in
+  ret void
+}
-- 
1.8.1.5

-------------- next part --------------
From e31ff11aa4c59f2ee24d1315dbbff0d75b2c7c67 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 7 May 2013 17:04:22 -0400
Subject: [PATCH 5/7] R600/SI: Add pattern for rotr

---
 lib/Target/R600/SIInstructions.td |  2 ++
 test/CodeGen/R600/rotr.ll         | 28 +++++++++++++++++++---------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index f557922..7c725cc 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -964,6 +964,8 @@ def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
 def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
 def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
 def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
 def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
 ////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index 30fa01f..5deb280 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -1,8 +1,13 @@
-; RUN: llc < %s -debug-only=isel -march=r600 -mcpu=redwood -o - 2>&1 | FileCheck %s
+; RUN: llc < %s -debug-only=isel -march=r600 -mcpu=redwood -o - 2>&1 | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -debug-only=isel -march=r600 -mcpu=SI -o - 2>&1 | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: rotr
-; CHECK: @rotr
-; CHECK: BIT_ALIGN_INT
+; R600-CHECK: rotr
+; R600-CHECK: @rotr
+; R600-CHECK: BIT_ALIGN_INT
+
+; SI-CHECK: rotr
+; SI-CHECK: @rotr
+; SI-CHECK: V_ALIGNBIT_B32
 define void @rotr(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = sub i32 32, %y
@@ -13,11 +18,16 @@ entry:
   ret void
 }
 
-; CHECK: rotr
-; CHECK: @rotl
-; CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
-; CHECK-NEXT: 32
-; CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PV.x
+; R600-CHECK: rotr
+; R600-CHECK: @rotl
+; R600-CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
+; R600-CHECK-NEXT: 32
+; R600-CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PV.x
+
+; SI-CHECK: rotr
+; SI-CHECK: @rotl
+; SI-CHECK: V_SUB_I32_e32 [[DST:VGPR[0-9]+]], 32, {{VGPR[0-9]+}}
+; SI-CHECK: V_ALIGNBIT_B32 {{VGPR[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}, [[DST]]
 define void @rotl(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
-- 
1.8.1.5

-------------- next part --------------
From a62d359ff84b8bd85ec87880db83683ece95dbae Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 6 May 2013 22:13:01 -0400
Subject: [PATCH 6/7] R600/SI: Add a pattern for S_LOAD_DWORDX2_* instructions

---
 lib/Target/R600/SIInstructions.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 7c725cc..334c567 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1589,6 +1589,7 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
 
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
 defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
 
-- 
1.8.1.5

-------------- next part --------------
From 16d0350caa782a3aa0a39b24220f01ba3975ade6 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 7 May 2013 12:55:30 -0400
Subject: [PATCH 7/7] R600/SI: Use a multiclass for MUBUF_Load_Helper

This will simplify both the instruction definitions and the patterns
that use them.
---
 lib/Target/R600/SIInstrInfo.td    | 34 +++++++++++++++++++++++-----------
 lib/Target/R600/SIInstructions.td | 16 +++++++---------
 2 files changed, 30 insertions(+), 20 deletions(-)
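
Each defm now stamps out three concrete opcodes, one per addressing mode
(_OFFEN, _IDXEN, _ADDR64), with the unused encoding fields tied off inside
the multiclass. Patterns can then name the exact variant they need with
only its live operands; the rewritten int_SI_load_const pattern from this
patch is the clearest example:

  def : Pat <
    (int_SI_load_const v16i8:$sbase, i32:$voff),
    (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
  >;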

diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 6d5325b..99df1f6 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -300,17 +300,29 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
   let mayLoad = 0;
 }
 
-class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
-  op,
-  (outs regClass:$vdata),
-  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
-       i1imm:$tfe, SSrc_32:$soffset),
-  asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, "
-     #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
-  []> {
-  let mayLoad = 1;
-  let mayStore = 0;
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
+
+  let glc = 0, lds = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */,
+                                          mayLoad = 1 in {
+
+  let offen = 1, idxen = 0, addr64 = 0, offset = 0 in {
+    def _OFFEN  : MUBUF <op, (outs regClass:$vdata),
+                         (ins SReg_128:$srsrc, VReg_32:$vaddr),
+                         asm#" $srsrc + $vaddr", []>;
+  }
+
+  let offen = 0, idxen = 1, addr64 = 0 in {
+    def _IDXEN  : MUBUF <op, (outs regClass:$vdata),
+                         (ins SReg_128:$srsrc, VReg_32:$vaddr, i16imm:$offset),
+                         asm#" $srsrc[$vaddr] + $offset", []>;
+  }
+
+  let offen = 0, idxen = 0, addr64 = 1 in {
+    def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
+                         (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+                         asm#" $srsrc + $vaddr + $offset", []>;
+  }
+  }
 }
 
 class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 334c567..03eced0 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -394,7 +394,7 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
 //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
 //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
 //def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
-def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
 //def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
 //def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
 //def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
@@ -403,9 +403,9 @@ def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT
 //def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
 //def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
 //def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
-def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
-def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
-def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
 //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
 //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
 
@@ -1214,10 +1214,8 @@ def : Pat <
 
 /* int_SI_vs_load_input */
 def : Pat<
-  (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset,
-                        i32:$buf_idx_vgpr),
-  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
-                           $buf_idx_vgpr, $tlst, 0, 0, 0)
+  (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
 >;
 
 /* int_SI_export */
@@ -1542,7 +1540,7 @@ def : Pat <
 // 3. Offset in an 32Bit VGPR
 def : Pat <
   (int_SI_load_const v16i8:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, $voff, $sbase, 0, 0, 0)
+  (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
 >;
 
 // The multiplication scales from [0,1] to the unsigned integer range
-- 
1.8.1.5


