<div dir="ltr">On 22 February 2013 08:02, Michel Dänzer <span dir="ltr">&lt;<a href="mailto:michel@daenzer.net" target="_blank">michel@daenzer.net</a>&gt;</span> wrote:<br><div class="gmail_extra"><div class="gmail_quote">
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Michel Dänzer &lt;<a href="mailto:michel.daenzer@amd.com">michel.daenzer@amd.com</a>&gt;<br>
<br>
Use LDS for calculating the deltas between neighbouring pixels. Not sure the<br>
sign of the delta is correct for both dimensions (the sign doesn't seem to<br>
matter for the relevant piglit tests), but it'll be easy to fix that up if<br>
not.<br></blockquote><div><br></div><div>Is this the back-end code that implements the dFdx() and dFdy() GLSL functions?  If so, then there is a piglit test that verifies that the sign of the delta is correct for both dimensions.  It's in tests/fbo/fbo-deriv.c.</div>
<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
Signed-off-by: Michel Dänzer &lt;<a href="mailto:michel.daenzer@amd.com">michel.daenzer@amd.com</a>&gt;<br>
---<br>
 lib/Target/R600/SIISelLowering.cpp | 84 ++++++++++++++++++++++++++++++++++++++<br>
 lib/Target/R600/SIISelLowering.h   |  2 +<br>
 lib/Target/R600/SIInstructions.td  | 29 ++++++++++++-<br>
 3 files changed, 113 insertions(+), 2 deletions(-)<br>
<br>
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp<br>
index 212e3f2..b21a7e8 100644<br>
--- a/lib/Target/R600/SIISelLowering.cpp<br>
+++ b/lib/Target/R600/SIISelLowering.cpp<br>
@@ -75,6 +75,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(<br>
     MI->eraseFromParent();<br>
     break;<br>
<br>
+  case AMDGPU::SI_DD:<br>
+    LowerSI_DD(MI, *BB, I, MRI);<br>
+    break;<br>
   case AMDGPU::SI_INTERP:<br>
     LowerSI_INTERP(MI, *BB, I, MRI);<br>
     break;<br>
@@ -93,6 +96,87 @@ void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,<br>
   MI->eraseFromParent();<br>
 }<br>
<br>
+void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,<br>
+    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {<br>
+  unsigned mbcnt_lo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  unsigned mbcnt = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  unsigned tid = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  unsigned tid0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  unsigned tid1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  unsigned coord0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  unsigned coord1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
+  MachineOperand dst = MI->getOperand(0);<br>
+  MachineOperand coord = MI->getOperand(1);<br>
+  MachineOperand incr = MI->getOperand(2);<br>
+<br>
+  // Get this thread's ID<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo)<br>
+          .addImm(0xffffffff)<br>
+          .addImm(0x80) // Inline constant 0<br>
+          .addImm(0)<br>
+          .addImm(0)<br>
+          .addImm(0)<br>
+          .addImm(0);<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e32), mbcnt)<br>
+          .addImm(0xffffffff)<br>
+          .addReg(mbcnt_lo);<br>
+<br>
+  // Multiply by 4 to get a DWORD offset<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_LSHL_B32_e64), tid)<br>
+          .addReg(mbcnt)<br>
+          .addImm(0x82) // Inline constant 2<br>
+          .addImm(0)<br>
+          .addImm(0)<br>
+          .addImm(0)<br>
+          .addImm(0);<br>
+<br>
+  // Write this thread's coordinate to LDS<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_WRITE_B32))<br>
+          .addOperand(coord)<br>
+          .addImm(0) // LDS<br>
+          .addReg(tid)<br>
+          .addOperand(coord)<br>
+          .addOperand(coord)<br>
+          .addImm(0)<br>
+          .addImm(0);<br>
+<br>
+  // Get bottom left thread ID * 4<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_AND_B32_e32), tid0)<br>
+          .addImm(0xfffffff0)<br>
+          .addReg(tid);<br>
+<br>
+  // Read bottom left thread's coordinate from LDS<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord0)<br>
+          .addImm(0) // LDS<br>
+          .addReg(tid0)<br>
+          .addReg(tid0)<br>
+          .addReg(tid0)<br>
+          .addImm(0)<br>
+          .addImm(0);<br>
+<br>
+  // Get bottom right / top left thread ID * 4<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_ADD_I32_e32), tid1)<br>
+          .addOperand(incr)<br>
+          .addReg(tid0);<br>
+<br>
+  // Read bottom right / top left thread's coordinate from LDS<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord1)<br>
+          .addImm(0) // LDS<br>
+          .addReg(tid1)<br>
+          .addReg(tid1)<br>
+          .addReg(tid1)<br>
+          .addImm(0)<br>
+          .addImm(0);<br>
+<br>
+  // Subtract bottom left coordinate from bottom right / top left coordinate<br>
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_SUB_F32_e32))<br>
+          .addOperand(dst)<br>
+          .addReg(coord1)<br>
+          .addReg(coord0);<br>
+<br>
+  MI->eraseFromParent();<br>
+}<br>
+<br>
 void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,<br>
     MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {<br>
   unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);<br>
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h<br>
index 5d048f8..905a43e 100644<br>
--- a/lib/Target/R600/SIISelLowering.h<br>
+++ b/lib/Target/R600/SIISelLowering.h<br>
@@ -25,6 +25,8 @@ class SITargetLowering : public AMDGPUTargetLowering {<br>
<br>
   void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,<br>
               MachineBasicBlock::iterator I, unsigned Opocde) const;<br>
+  void LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,<br>
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;<br>
   void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,<br>
               MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;<br>
   void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,<br>
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td<br>
index 7152c49..490fb99 100644<br>
--- a/lib/Target/R600/SIInstructions.td<br>
+++ b/lib/Target/R600/SIInstructions.td<br>
@@ -806,8 +806,8 @@ defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;<br>
 defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;<br>
 defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;<br>
 //defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;<br>
-//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;<br>
-//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;<br>
+defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;<br>
+defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;<br>
 let Defs = [VCC] in { // Carry-out goes to VCC<br>
 defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32",<br>
   [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]<br>
@@ -996,6 +996,13 @@ def LOAD_CONST : AMDGPUShaderInst <<br>
<br>
 let usesCustomInserter = 1 in {<br>
<br>
+def SI_DD : InstSI <<br>
+  (outs VReg_32:$dst),<br>
+  (ins VReg_32:$src, i32imm:$incr),<br>
+  "SI_DD $src, $incr",<br>
+  []<br>
+>;<br>
+<br>
 def SI_INTERP : InstSI <<br>
   (outs VReg_32:$dst),<br>
   (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),<br>
@@ -1396,6 +1403,24 @@ def : Pat <<br>
   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)<br>
 >;<br>
<br>
+def : Pat <<br>
+  (int_AMDGPU_ddx VReg_128:$src, imm, imm, imm),<br>
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub0), 4), sub0),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub1), 4), sub1),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub2), 4), sub2),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub3), 4), sub3)<br>
+>;<br>
+<br>
+def : Pat <<br>
+  (int_AMDGPU_ddy VReg_128:$src, imm, imm, imm),<br>
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub0), 8), sub0),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub1), 8), sub1),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub2), 8), sub2),<br>
+    (SI_DD (EXTRACT_SUBREG VReg_128:$src, sub3), 8), sub3)<br>
+>;<br>
+<br>
 /********** ================== **********/<br>
 /**********   VOP3 Patterns    **********/<br>
 /********** ================== **********/<br>
<span class="HOEnZb"><font color="#888888">--<br>
1.8.1.3<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="http://lists.freedesktop.org/mailman/listinfo/mesa-dev" target="_blank">http://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>