[Mesa-dev] [PATCH 4/4] R600/SI: Support AMDGPU.ddx/y intrinsics

Michel Dänzer michel at daenzer.net
Fri Feb 22 08:02:13 PST 2013


From: Michel Dänzer <michel.daenzer at amd.com>

Use LDS to calculate the deltas between neighbouring pixels. I'm not sure the
sign of the delta is correct for both dimensions (the sign doesn't seem to
matter for the relevant piglit tests), but that will be easy to fix up if it
turns out to be wrong.

Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
---
 lib/Target/R600/SIISelLowering.cpp | 84 ++++++++++++++++++++++++++++++++++++++
 lib/Target/R600/SIISelLowering.h   |  2 +
 lib/Target/R600/SIInstructions.td  | 29 ++++++++++++-
 3 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 212e3f2..b21a7e8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -75,6 +75,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
 
+  case AMDGPU::SI_DD:
+    LowerSI_DD(MI, *BB, I, MRI);
+    break;
   case AMDGPU::SI_INTERP:
     LowerSI_INTERP(MI, *BB, I, MRI);
     break;
@@ -93,6 +96,87 @@ void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
   MI->eraseFromParent();
 }
 
+void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
+  unsigned mbcnt_lo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned mbcnt = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned tid = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned tid0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned tid1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned coord0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned coord1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  MachineOperand dst = MI->getOperand(0);
+  MachineOperand coord = MI->getOperand(1);
+  MachineOperand incr = MI->getOperand(2);
+
+  // Get this thread's ID via V_MBCNT_LO/HI with all-ones masks (presumably the lane index in the wavefront - confirm)
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo)
+          .addImm(0xffffffff)
+          .addImm(0x80) // Inline constant 0
+          .addImm(0)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0);
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e32), mbcnt)
+          .addImm(0xffffffff)
+          .addReg(mbcnt_lo);
+
+  // Shift the thread ID left by 2, i.e. multiply by 4, so each thread gets its own DWORD offset (tid)
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_LSHL_B32_e64), tid)
+          .addReg(mbcnt)
+          .addImm(0x82) // Inline constant 2
+          .addImm(0)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0);
+
+  // Write this thread's coordinate to LDS, addressed by tid
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_WRITE_B32))
+          .addOperand(coord)
+          .addImm(0) // LDS
+          .addReg(tid)
+          .addOperand(coord)
+          .addOperand(coord)
+          .addImm(0)
+          .addImm(0);
+
+  // Clear the low 4 bits of tid: (first thread ID in this group of 4 threads) * 4, treated as bottom left
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_AND_B32_e32), tid0)
+          .addImm(0xfffffff0)
+          .addReg(tid);
+
+  // Read the bottom left thread's coordinate back from LDS into coord0
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord0)
+          .addImm(0) // LDS
+          .addReg(tid0)
+          .addReg(tid0)
+          .addReg(tid0)
+          .addImm(0)
+          .addImm(0);
+
+  // Add incr (4 in the ddx pattern, 8 in the ddy pattern) to get bottom right / top left thread ID * 4
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_ADD_I32_e32), tid1)
+          .addOperand(incr)
+          .addReg(tid0);
+
+  // Read the bottom right / top left thread's coordinate back from LDS into coord1
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord1)
+          .addImm(0) // LDS
+          .addReg(tid1)
+          .addReg(tid1)
+          .addReg(tid1)
+          .addImm(0)
+          .addImm(0);
+
+  // Subtract bottom left coordinate from bottom right / top left one. NOTE(review): sign of the delta is unconfirmed
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_SUB_F32_e32))
+          .addOperand(dst)
+          .addReg(coord1)
+          .addReg(coord0);
+
+  MI->eraseFromParent();
+}
+
 void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
     MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
   unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 5d048f8..905a43e 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -25,6 +25,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
   void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
               MachineBasicBlock::iterator I, unsigned Opocde) const;
+  void LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
   void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
               MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
   void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 7152c49..490fb99 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -806,8 +806,8 @@ defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
 defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
 defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
 //defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
-//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
-//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
+defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
 let Defs = [VCC] in { // Carry-out goes to VCC
 defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32",
   [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
@@ -996,6 +996,13 @@ def LOAD_CONST : AMDGPUShaderInst <
 
 let usesCustomInserter = 1 in {
 
+def SI_DD : InstSI <
+  (outs VReg_32:$dst),
+  (ins VReg_32:$src, i32imm:$incr),
+  "SI_DD $src, $incr",
+  []
+>;
+
 def SI_INTERP : InstSI <
   (outs VReg_32:$dst),
   (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
@@ -1396,6 +1403,24 @@ def : Pat <
   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
 >;
 
+def : Pat <
+  (int_AMDGPU_ddx VReg_128:$src, imm, imm, imm),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub0), 4), sub0),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub1), 4), sub1),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub2), 4), sub2),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub3), 4), sub3)
+>;
+
+def : Pat <
+  (int_AMDGPU_ddy VReg_128:$src, imm, imm, imm),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub0), 8), sub0),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub1), 8), sub1),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub2), 8), sub2),
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub3), 8), sub3)
+>;
+
 /********** ================== **********/
 /**********   VOP3 Patterns    **********/
 /********** ================== **********/
-- 
1.8.1.3



More information about the mesa-dev mailing list