[Mesa-dev] [PATCH 1/2] radeon/llvm: add an intrinsic converting clipvertex to clipdistance

Sun Dec 16 19:01:36 PST 2012

I think the clipvertex emulation via clipdistance is rather the driver's
job and the compiler shouldn't be aware of it; the driver should simply
emit all the corresponding DOTs and exports.

As for the kcache regs, in your patch it looks like you are going to
hardcode some logic to access another const buffer where clip planes are
stored, but I don't think we need kcache regs until we implement kcache
allocation for alu clauses in the backend, and this implies that we need
an instruction scheduler that knows about clauses etc.

IMO first we might want to rework handling of the constants to add
proper support for multiple constant buffers and probably get rid of
Cxxx regs (I think we'll have to store kc_bank and const index in the
instructions instead of using Cxxx regs). Perhaps LOAD_CONST intrinsic
will take a kcache bank (constant buffer) index, a constant
index/address, and the optional base value for relative addressing (or
we can use separate intrinsics for direct/indirect addressing). Then in
case of direct addressing LOAD_CONST will be folded into the alu
instruction. With proper kcache lines allocation for alu clauses const
indices will be mapped to the kcache regs (it's when we'll need them),
but for now we can rely on the existing driver's kcache allocation logic
and simply pass src_sel = 512 + const_index and kc_bank to the driver.
And in case of relative addressing LOAD_CONST will be translated into
FETCH (in some cases we can try to do it directly in the alu
instructions as well, it should be more efficient but it's not always
possible, there are limitations).

With support for multiple const buffers, clipvertex emulation should be
pretty simple - we'll just need to make the driver emit LOAD_CONSTs
from the clipplane const buffer, DOTs and exports.

Vadim

On Sun, 2012-12-16 at 21:01 +0100, Vincent Lejeune wrote:
> ---
>  lib/Target/AMDGPU/R600ISelLowering.cpp | 21 ++++++++++++++++++++-
>  lib/Target/AMDGPU/R600Instructions.td  | 12 ++++++++++++
>  lib/Target/AMDGPU/R600Intrinsics.td    |  2 ++
>  lib/Target/AMDGPU/R600RegisterInfo.td  | 21 ++++++++++++++++++++-
>  4 files changed, 54 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
> index 3a4283c..6c594cc 100644
> --- a/lib/Target/AMDGPU/R600ISelLowering.cpp
> +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
> @@ -328,7 +328,26 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
>  
>      return BB;
>    }
> -
> +  case AMDGPU::ClipVertexAdjust: {
> +    unsigned Temp[4];
> +    for (unsigned i = 0; i < 4; i++) {
> +      unsigned KcacheReg = AMDGPU::R600_KCache128RegClass.getRegister(i + 4 * MI->getOperand(2).getImm());
> +      Temp[i] = MRI.createVirtualRegister(&AMDGPU::R600_Reg32RegClass);
> +       BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::DOT4_r600_pseudo), Temp[i])
> +           .addOperand(MI->getOperand(1))
> +           .addReg(KcacheReg);
> +    }
> +    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(TargetOpcode::REG_SEQUENCE), MI->getOperand(0).getReg())
> +        .addReg(Temp[0])
> +        .addImm(TII->getRegisterInfo().getSubRegFromChannel(0))
> +        .addReg(Temp[1])
> +        .addImm(TII->getRegisterInfo().getSubRegFromChannel(1))
> +        .addReg(Temp[2])
> +        .addImm(TII->getRegisterInfo().getSubRegFromChannel(2))
> +        .addReg(Temp[3])
> +        .addImm(TII->getRegisterInfo().getSubRegFromChannel(3));
> +    break;
> +  }
>    case AMDGPU::EG_ExportSwz:
>    case AMDGPU::R600_ExportSwz: {
>      bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
> diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
> index d89b03b..c3ffe97 100644
> --- a/lib/Target/AMDGPU/R600Instructions.td
> +++ b/lib/Target/AMDGPU/R600Instructions.td
> @@ -598,6 +598,18 @@ class ExportBufInst : InstR600ISA<(
>    let Inst{63-32} = Word1;
>  }
>  
> +let usesCustomInserter = 1 in {
> +
> +def ClipVertexAdjust : AMDGPUInst <(outs R600_Reg128:$dst),
> +  (ins R600_Reg128:$src0, i32imm:$src1),
> +  "DOT4 $dst $src0",
> +  [(set R600_Reg128:$dst,
> +      (int_R600_clipvertex R600_Reg128:$src0, imm:$src1))]
> +> {
> +  field bits<64> Inst;
> +}
> +} // End usesCustomInserter = 1
> +
>  let Predicates = [isR600toCayman] in { 
>  
>  //===----------------------------------------------------------------------===//
> diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
> index 3825bc4..0186f9d 100644
> --- a/lib/Target/AMDGPU/R600Intrinsics.td
> +++ b/lib/Target/AMDGPU/R600Intrinsics.td
> @@ -19,6 +19,8 @@ let TargetPrefix = "R600", isTarget = 1 in {
>      Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
>    def int_R600_load_input_linear :
>      Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
> +  def int_R600_clipvertex :
> +    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
>    def int_R600_store_stream_output :
>      Intrinsic<[], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], []>;
>    def int_R600_store_pixel_color :
> diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
> index 3b21825..67449d8 100644
> --- a/lib/Target/AMDGPU/R600RegisterInfo.td
> +++ b/lib/Target/AMDGPU/R600RegisterInfo.td
> @@ -50,6 +50,19 @@ foreach Index = 448-464 in {
>    def ArrayBase#Index :  R600Reg<"ARRAY_BASE", Index>;
>  }
>  
> +foreach Index = 160-168 in {
> +  foreach Chan = [ "X", "Y", "Z", "W" ] in {
> +    // 32-bit Temporary Registers
> +    def K#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
> +  }
> +  def K#Index#_XYZW : R600Reg_128 <"K"#Index#".XYZW",
> +                                   [!cast<Register>("K"#Index#"_X"),
> +                                   !cast<Register>("K"#Index#"_Y"),
> +                                   !cast<Register>("K"#Index#"_Z"),
> +                                   !cast<Register>("K"#Index#"_W")],
> +                                   Index>;
> +}
> +
>  // Special Registers
>  
>  def ZERO : R600Reg<"0.0", 248>;
> @@ -117,7 +130,13 @@ def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
>  def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
>      PREDICATE_BIT)>;
>  
> +def R600_KCache128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
> +                                (add (sequence "K%u_XYZW", 160, 168))> {
> +  let isAllocatable = 0;
> +}
> +
>  def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
> -                                (add (sequence "T%u_XYZW", 0, 127))> {
> +                                (add (sequence "T%u_XYZW", 0, 127),
> +                                     R600_KCache128)> {
>    let CopyCost = -1;
>  }