[Mesa-dev] R600/SI: Support for local memory and derivatives

Wed Jul 10 08:15:28 PDT 2013

On Wed, Jul 10, 2013 at 12:32:25PM +0200, Michel Dänzer wrote:
> On Fri, 2013-06-28 at 14:37 -0700, Tom Stellard wrote:
> > On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote:
> > > 
> > > These patches implement enough of local memory support to allow radeonsi
> > > to use that for computing derivatives, as suggested by Tom.
> > > 
> > > They also almost allow test/CodeGen/R600/local-memory.ll to generate
> > > code for SI. Right now it still fails because it tries to copy a VGPR to
> > > an SGPR, which is not possible.
> > 
> > Can you add some lit tests for these new intrinsics
> 
> Done, updated patches attached.
> 
> 
> > and also add CHECK lines for SI to the existing local-memory.ll test.
> 
> Can't do that while it still fails to generate SI code. Should I commit
> the other patches anyway, which are only necessary for that test?
> 
>

Can you add a TODO comment to that test for adding SI checks?

With that change, the patches are:

Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

> -- 
> Earthling Michel Dänzer           |                   http://www.amd.com
> Libre software enthusiast         |          Debian, X and DRI developer

> From 3572bab6a6b5c967d19add0b0497a96123754ec2 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer at amd.com>
> Date: Thu, 21 Feb 2013 16:12:45 +0100
> Subject: [PATCH v2 1/4] R600/SI: Add intrinsics for texture sampling with user
>  derivatives
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
> ---
> 
> v2: Add lit test
> 
>  lib/Target/R600/SIInstructions.td    |   7 +-
>  lib/Target/R600/SIIntrinsics.td      |   1 +
>  test/CodeGen/R600/llvm.SI.sampled.ll | 140 +++++++++++++++++++++++++++++++++++
>  3 files changed, 147 insertions(+), 1 deletion(-)
>  create mode 100644 test/CodeGen/R600/llvm.SI.sampled.ll
> 
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 9c96c08..c9eac7d 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
>  //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
>  def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
>  //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
> -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
> +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper <0x0000002a, "IMAGE_SAMPLE_C_D">;
>  //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
>  def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
>  def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
> @@ -1296,6 +1296,11 @@ multiclass SamplePatterns<ValueType addr_type> {
>    def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
>    def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
>    def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
> +
> +  def : SamplePattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
> +  def : SampleArrayPattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
> +  def : SampleShadowPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
> +  def : SampleShadowArrayPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
>  }
>  
>  defm : SamplePatterns<v2i32>;
> diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
> index 224cd2f..d2643e0 100644
> --- a/lib/Target/R600/SIIntrinsics.td
> +++ b/lib/Target/R600/SIIntrinsics.td
> @@ -23,6 +23,7 @@ let TargetPrefix = "SI", isTarget = 1 in {
>  
>    def int_SI_sample : Sample;
>    def int_SI_sampleb : Sample;
> +  def int_SI_sampled : Sample;
>    def int_SI_samplel : Sample;
>  
>    def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
> diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
> new file mode 100644
> index 0000000..71b8ef5
> --- /dev/null
> +++ b/test/CodeGen/R600/llvm.SI.sampled.ll
> @@ -0,0 +1,140 @@
> +;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
> +
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
> +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5
> +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9
> +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12
> +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
> +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
> +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
> +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
> +
> +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
> +   %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
> +   %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
> +   %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
> +   %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
> +   %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
> +   %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
> +   %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
> +   %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
> +   %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
> +   %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
> +   %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
> +   %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
> +   %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
> +   %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
> +   %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
> +   %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
> +   %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
> +      <8 x i32> undef, <4 x i32> undef, i32 1)
> +   %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
> +      <8 x i32> undef, <4 x i32> undef, i32 2)
> +   %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
> +      <8 x i32> undef, <4 x i32> undef, i32 3)
> +   %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
> +      <8 x i32> undef, <4 x i32> undef, i32 4)
> +   %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
> +      <8 x i32> undef, <4 x i32> undef, i32 5)
> +   %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
> +      <8 x i32> undef, <4 x i32> undef, i32 6)
> +   %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
> +      <8 x i32> undef, <4 x i32> undef, i32 7)
> +   %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
> +      <8 x i32> undef, <4 x i32> undef, i32 8)
> +   %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
> +      <8 x i32> undef, <4 x i32> undef, i32 9)
> +   %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
> +      <8 x i32> undef, <4 x i32> undef, i32 10)
> +   %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
> +      <8 x i32> undef, <4 x i32> undef, i32 11)
> +   %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
> +      <8 x i32> undef, <4 x i32> undef, i32 12)
> +   %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
> +      <8 x i32> undef, <4 x i32> undef, i32 13)
> +   %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
> +      <8 x i32> undef, <4 x i32> undef, i32 14)
> +   %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
> +      <8 x i32> undef, <4 x i32> undef, i32 15)
> +   %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
> +      <8 x i32> undef, <4 x i32> undef, i32 16)
> +   %e1 = extractelement <4 x float> %res1, i32 0
> +   %e2 = extractelement <4 x float> %res2, i32 1
> +   %e3 = extractelement <4 x float> %res3, i32 2
> +   %e4 = extractelement <4 x float> %res4, i32 3
> +   %t0 = extractelement <4 x float> %res5, i32 0
> +   %t1 = extractelement <4 x float> %res5, i32 1
> +   %e5 = fadd float %t0, %t1
> +   %t2 = extractelement <4 x float> %res6, i32 0
> +   %t3 = extractelement <4 x float> %res6, i32 2
> +   %e6 = fadd float %t2, %t3
> +   %t4 = extractelement <4 x float> %res7, i32 0
> +   %t5 = extractelement <4 x float> %res7, i32 3
> +   %e7 = fadd float %t4, %t5
> +   %t6 = extractelement <4 x float> %res8, i32 1
> +   %t7 = extractelement <4 x float> %res8, i32 2
> +   %e8 = fadd float %t6, %t7
> +   %t8 = extractelement <4 x float> %res9, i32 1
> +   %t9 = extractelement <4 x float> %res9, i32 3
> +   %e9 = fadd float %t8, %t9
> +   %t10 = extractelement <4 x float> %res10, i32 2
> +   %t11 = extractelement <4 x float> %res10, i32 3
> +   %e10 = fadd float %t10, %t11
> +   %t12 = extractelement <4 x float> %res11, i32 0
> +   %t13 = extractelement <4 x float> %res11, i32 1
> +   %t14 = extractelement <4 x float> %res11, i32 2
> +   %t15 = fadd float %t12, %t13
> +   %e11 = fadd float %t14, %t15
> +   %t16 = extractelement <4 x float> %res12, i32 0
> +   %t17 = extractelement <4 x float> %res12, i32 1
> +   %t18 = extractelement <4 x float> %res12, i32 3
> +   %t19 = fadd float %t16, %t17
> +   %e12 = fadd float %t18, %t19
> +   %t20 = extractelement <4 x float> %res13, i32 0
> +   %t21 = extractelement <4 x float> %res13, i32 2
> +   %t22 = extractelement <4 x float> %res13, i32 3
> +   %t23 = fadd float %t20, %t21
> +   %e13 = fadd float %t22, %t23
> +   %t24 = extractelement <4 x float> %res14, i32 1
> +   %t25 = extractelement <4 x float> %res14, i32 2
> +   %t26 = extractelement <4 x float> %res14, i32 3
> +   %t27 = fadd float %t24, %t25
> +   %e14 = fadd float %t26, %t27
> +   %t28 = extractelement <4 x float> %res15, i32 0
> +   %t29 = extractelement <4 x float> %res15, i32 1
> +   %t30 = extractelement <4 x float> %res15, i32 2
> +   %t31 = extractelement <4 x float> %res15, i32 3
> +   %t32 = fadd float %t28, %t29
> +   %t33 = fadd float %t30, %t31
> +   %e15 = fadd float %t32, %t33
> +   %e16 = extractelement <4 x float> %res16, i32 3
> +   %s1 = fadd float %e1, %e2
> +   %s2 = fadd float %s1, %e3
> +   %s3 = fadd float %s2, %e4
> +   %s4 = fadd float %s3, %e5
> +   %s5 = fadd float %s4, %e6
> +   %s6 = fadd float %s5, %e7
> +   %s7 = fadd float %s6, %e8
> +   %s8 = fadd float %s7, %e9
> +   %s9 = fadd float %s8, %e10
> +   %s10 = fadd float %s9, %e11
> +   %s11 = fadd float %s10, %e12
> +   %s12 = fadd float %s11, %e13
> +   %s13 = fadd float %s12, %e14
> +   %s14 = fadd float %s13, %e15
> +   %s15 = fadd float %s14, %e16
> +   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
> +   ret void
> +}
> +
> +declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
> +
> +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> -- 
> 1.8.3.2
> 

> From 6aeea8ac2d419fff3465b04ed4bd6ea5ff889650 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer at amd.com>
> Date: Thu, 21 Feb 2013 18:51:38 +0100
> Subject: [PATCH 2/4] R600/SI: Initial support for LDS/GDS instructions
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
> ---
>  lib/Target/R600/SIInsertWaits.cpp      |  2 ++
>  lib/Target/R600/SIInstrFormats.td      | 24 ++++++++++++++++++++++++
>  lib/Target/R600/SIInstrInfo.td         | 23 +++++++++++++++++++++++
>  lib/Target/R600/SIInstructions.td      |  3 +++
>  lib/Target/R600/SILowerControlFlow.cpp | 16 ++++++++++++++++
>  5 files changed, 68 insertions(+)
> 
> diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
> index c36e1dc..d31da45 100644
> --- a/lib/Target/R600/SIInsertWaits.cpp
> +++ b/lib/Target/R600/SIInsertWaits.cpp
> @@ -134,6 +134,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
>    if (TSFlags & SIInstrFlags::LGKM_CNT) {
>  
>      MachineOperand &Op = MI.getOperand(0);
> +    if (!Op.isReg())
> +      Op = MI.getOperand(1);
>      assert(Op.isReg() && "First LGKM operand must be a register!");
>  
>      unsigned Reg = Op.getReg();
> diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
> index 51f323d..434aa7e 100644
> --- a/lib/Target/R600/SIInstrFormats.td
> +++ b/lib/Target/R600/SIInstrFormats.td
> @@ -281,6 +281,30 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
>  
>  let Uses = [EXEC] in {
>  
> +class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
> +    Enc64 <outs, ins, asm, pattern> {
> +
> +  bits<8> vdst;
> +  bits<1> gds;
> +  bits<8> addr;
> +  bits<8> data0;
> +  bits<8> data1;
> +  bits<8> offset0;
> +  bits<8> offset1;
> +
> +  let Inst{7-0} = offset0;
> +  let Inst{15-8} = offset1;
> +  let Inst{17} = gds;
> +  let Inst{25-18} = op;
> +  let Inst{31-26} = 0x36; //encoding
> +  let Inst{39-32} = addr;
> +  let Inst{47-40} = data0;
> +  let Inst{55-48} = data1;
> +  let Inst{63-56} = vdst;
> +
> +  let LGKM_CNT = 1;
> +}
> +
>  class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
>      Enc64<outs, ins, asm, pattern> {
>  
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 36812ca..655a8b1 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -287,6 +287,29 @@ class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
>  // Vector I/O classes
>  //===----------------------------------------------------------------------===//
>  
> +class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
> +  op,
> +  (outs regClass:$vdst),
> +  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
> +       i8imm:$offset0, i8imm:$offset1),
> +  asm#" $vdst, $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
> +  []> {
> +  let mayLoad = 1;
> +  let mayStore = 0;
> +}
> +
> +class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
> +  op,
> +  (outs),
> +  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
> +       i8imm:$offset0, i8imm:$offset1),
> +  asm#" $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
> +  []> {
> +  let mayStore = 1;
> +  let mayLoad = 0;
> +  let vdst = 0;
> +}
> +
>  class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
>    op,
>    (outs),
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index c9eac7d..09460d8 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -391,6 +391,9 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
>  
>  } // End isCompare = 1
>  
> +def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
> +def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
> +
>  //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
>  //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
>  //def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
> diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
> index 5b434fb..c2e8f02 100644
> --- a/lib/Target/R600/SILowerControlFlow.cpp
> +++ b/lib/Target/R600/SILowerControlFlow.cpp
> @@ -411,6 +411,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>    TRI = MF.getTarget().getRegisterInfo();
>  
>    bool HaveKill = false;
> +  bool NeedM0 = false;
>    bool NeedWQM = false;
>    unsigned Depth = 0;
>  
> @@ -482,6 +483,13 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>            IndirectDst(MI);
>            break;
>  
> +        case AMDGPU::DS_READ_B32:
> +          NeedWQM = true;
> +          // Fall through
> +        case AMDGPU::DS_WRITE_B32:
> +          NeedM0 = true;
> +          break;
> +
>          case AMDGPU::V_INTERP_P1_F32:
>          case AMDGPU::V_INTERP_P2_F32:
>          case AMDGPU::V_INTERP_MOV_F32:
> @@ -492,6 +500,14 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>      }
>    }
>  
> +  if (NeedM0) {
> +    MachineBasicBlock &MBB = MF.front();
> +    // Initialize M0 to a value that won't cause LDS access to be discarded
> +    // due to offset clamping
> +    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
> +            AMDGPU::M0).addImm(0xffffffff);
> +  }
> +
>    if (NeedWQM) {
>      MachineBasicBlock &MBB = MF.front();
>      BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
> -- 
> 1.8.3.2
> 

> From 18c3a2f532989d867634a1e140935de3beef4bc1 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer at amd.com>
> Date: Fri, 14 Jun 2013 11:12:53 +0200
> Subject: [PATCH v2 3/4] R600/SI: Add intrinsic for retrieving the current thread
>  ID
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
> ---
> 
> v2: Add lit test
> 
>  lib/Target/R600/SIInstructions.td | 10 ++++++++--
>  lib/Target/R600/SIIntrinsics.td   |  1 +
>  test/CodeGen/R600/llvm.SI.tid.ll  | 16 ++++++++++++++++
>  3 files changed, 25 insertions(+), 2 deletions(-)
>  create mode 100644 test/CodeGen/R600/llvm.SI.tid.ll
> 
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 09460d8..61755b4 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -903,8 +903,8 @@ defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
>  defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
>  defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
>  //defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
> -//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
> -//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
> +defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
> +defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
>  
>  let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
>  defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
> @@ -1575,6 +1575,12 @@ def : Pat <
>                     (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>  >;
>  
> +def : Pat <
> +  (int_SI_tid),
> +  (V_MBCNT_HI_U32_B32_e32 0xffffffff,
> +                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0))
> +>;
> +
>  /********** ================== **********/
>  /**********   VOP3 Patterns    **********/
>  /********** ================== **********/
> diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
> index d2643e0..2fa073e 100644
> --- a/lib/Target/R600/SIIntrinsics.td
> +++ b/lib/Target/R600/SIIntrinsics.td
> @@ -14,6 +14,7 @@
>  
>  let TargetPrefix = "SI", isTarget = 1 in {
>  
> +  def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
>    def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
>    def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
>    def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
> diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
> new file mode 100644
> index 0000000..238d9f2
> --- /dev/null
> +++ b/test/CodeGen/R600/llvm.SI.tid.ll
> @@ -0,0 +1,16 @@
> +;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
> +
> +;CHECK: V_MBCNT_LO_U32_B32_e64
> +;CHECK: V_MBCNT_HI_U32_B32_e32
> +
> +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
> +main_body:
> +  %4 = call i32 @llvm.SI.tid()
> +  %5 = bitcast i32 %4 to float
> +  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
> +  ret void
> +}
> +
> +declare i32 @llvm.SI.tid() readnone
> +
> +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> -- 
> 1.8.3.2
> 

> From a4defa41c9e607c9a3039507cb7fc9337ef30be3 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer at amd.com>
> Date: Mon, 17 Jun 2013 12:21:29 +0200
> Subject: [PATCH 4/4] R600/SI: Initial local memory support
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
> ---
>  lib/Target/R600/AMDGPUAsmPrinter.cpp   |  7 +++++++
>  lib/Target/R600/AMDGPUISelLowering.cpp |  4 +---
>  lib/Target/R600/R600ISelLowering.cpp   |  2 ++
>  lib/Target/R600/SIDefines.h            |  4 ++++
>  lib/Target/R600/SIISelLowering.cpp     |  5 +++++
>  lib/Target/R600/SIInstructions.td      | 15 +++++++++++++++
>  6 files changed, 34 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> index 996d2a6..e039b77 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> @@ -233,7 +233,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
>  
>    OutStreamer.EmitIntValue(RsrcReg, 4);
>    OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
> +
> +  if (MFI->ShaderType == ShaderType::COMPUTE) {
> +    OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
> +    OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4);
> +  }
>    if (MFI->ShaderType == ShaderType::PIXEL) {
> +    OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
> +    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4);
>      OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
>      OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
>    }
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index 4019a1f..7fad3bb 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -72,8 +72,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
>    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
>    setOperationAction(ISD::UREM, MVT::i32, Expand);
>  
> -  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
> -
>    int types[] = {
>      (int)MVT::v2i32,
>      (int)MVT::v4i32
> @@ -158,7 +156,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
>    // XXX: Account for alignment?
>    MFI->LDSSize += Size;
>  
> -  return DAG.getConstant(Offset, MVT::i32);
> +  return DAG.getConstant(Offset, TD->getPointerSize() == 8 ? MVT::i64 : MVT::i32);
>  }
>  
>  SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
> diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> index ad4fd87..7aef08a 100644
> --- a/lib/Target/R600/R600ISelLowering.cpp
> +++ b/lib/Target/R600/R600ISelLowering.cpp
> @@ -92,6 +92,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
>    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
>    setTargetDAGCombine(ISD::SELECT_CC);
>  
> +  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
> +
>    setBooleanContents(ZeroOrNegativeOneBooleanContent);
>    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
>    setSchedulingPreference(Sched::VLIW);
> diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
> index 716b093..147578c 100644
> --- a/lib/Target/R600/SIDefines.h
> +++ b/lib/Target/R600/SIDefines.h
> @@ -12,11 +12,15 @@
>  #define SIDEFINES_H_
>  
>  #define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
> +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
> +#define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
>  #define R_00B128_SPI_SHADER_PGM_RSRC1_VS                                0x00B128
>  #define R_00B228_SPI_SHADER_PGM_RSRC1_GS                                0x00B228
>  #define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
>  #define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
>  #define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
> +#define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
> +#define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
>  #define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
>  
>  #endif // SIDEFINES_H_
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 9d4cfef..a314bc4 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -75,6 +75,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
>  
>    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
>  
> +  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
> +
>    setTargetDAGCombine(ISD::SELECT_CC);
>  
>    setTargetDAGCombine(ISD::SETCC);
> @@ -310,11 +312,14 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
>  //===----------------------------------------------------------------------===//
>  
>  SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
> +  MachineFunction &MF = DAG.getMachineFunction();
> +  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
>    switch (Op.getOpcode()) {
>    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
>    case ISD::BRCOND: return LowerBRCOND(Op, DAG);
>    case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
>    case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
> +  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
>    case ISD::INTRINSIC_WO_CHAIN: {
>      unsigned IntrinsicID =
>                           cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 61755b4..e3cfdad 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1590,6 +1590,21 @@ def : Pat <
>    (V_MAD_F32 $src0, $src1, $src2)
>  >;
>  
> +/********** ======================= **********/
> +/**********   Load/Store Patterns   **********/
> +/********** ======================= **********/
> +
> +def : Pat <
> +    (local_load i64:$src0),
> +    (i32 (DS_READ_B32 0, (EXTRACT_SUBREG $src0, sub0),
> +                      (EXTRACT_SUBREG $src0, sub0), (EXTRACT_SUBREG $src0, sub0), 0, 0))
> +>;
> +
> +def : Pat <
> +    (local_store i32:$src1, i64:$src0),
> +    (DS_WRITE_B32 0, (EXTRACT_SUBREG $src0, sub0), $src1, $src1, 0, 0)
> +>;
> +
>  /********** ================== **********/
>  /**********   SMRD Patterns    **********/
>  /********** ================== **********/
> -- 
> 1.8.3.2
>