[Mesa-dev] [PATCH 1/2] radeon/llvm: support for multiple const buffer

Mon Dec 17 16:36:34 PST 2012

AFAICS, your KC registers still represent the constant indices, same as
Cxxx registers previously. The problem with such representation is that
hw supports 16 buffers * 4096 constants per buffer = 65536 constants. So
we'll need 65536 registers for full support, and it looks like a bit too
much of registers. I never liked the idea of using registers to
represent constants, even though it makes instruction defs more simple
and requires less operands. Constant buffers are memory, so I think more
correct representation would be to use constant index/address as
operand. Also I suspect that thousands of registers affect the backend's
performance negatively, not to mention that code to handle them looks a
bit ugly.

As I said, I think we may need kcache registers in the backend later,
when we'll be able to perform kcache lines allocation per ALU clause and
clause creation in the backend. And then we can use the registers to
represent real kcache src_sel values used by hw, that is, to represent
[up to 4 kcache line sets per clause] * [up to 32 constants accessible
per kcache line set] = 128 total registers, each is mapped to real hw
src_sel, something like KC0_0 ... KC0_31, ..., KC3_0 ... KC3_31. But it
makes sense only when we'll be able to perform initialization of the
kcache line sets per ALU clause in the backend, that is, probably not
very soon. And even then it still might be better and simpler to use
numerical indices for that instead of registers.

What I think we need now is to get rid of registers for constants
completely, and use numerical indices as operands. We might want to pack
bank and index for constant in a single number (e.g. bank*4096 + index)
to reduce the number of the operands. I'm not sure what is a best way to
implement it, so far I think we might want to introduce some special
register "C" to designate that the alu src operand refers to a constant,
and add the operand for each alu src to store packed bank&index.

Vadim

On Mon, 2012-12-17 at 18:42 +0100, Vincent Lejeune wrote:
> ---
>  lib/Target/AMDGPU/AMDGPUIntrinsics.td              |   2 +-
>  .../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp      |  26 +++-
>  lib/Target/AMDGPU/R600ISelLowering.cpp             |  57 +++++++-
>  lib/Target/AMDGPU/R600Instructions.td              |   4 +-
>  lib/Target/AMDGPU/R600RegisterInfo.td              | 149 +++++++++++++++++++--
>  lib/Target/AMDGPU/SIInstructions.td                |   6 +-
>  6 files changed, 223 insertions(+), 21 deletions(-)
> 
> diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
> index 2ba2d4b..3bb2eee 100644
> --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
> +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
> @@ -13,7 +13,7 @@
>  
>  let TargetPrefix = "AMDGPU", isTarget = 1 in {
>  
> -  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
> +  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
>    def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
>    def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
>    def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
> diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
> index 018234a..94f169d 100644
> --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
> +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
> @@ -307,11 +307,29 @@ void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned OpIdx,
>    // value of the source select is defined in the r600isa docs.
>    if (MO.isReg()) {
>      unsigned Reg = MO.getReg();
> -    if (AMDGPUMCRegisterClasses[AMDGPU::R600_CReg32RegClassID].contains(Reg)) {
> -      EmitByte(1, OS);
> -    } else {
> -      EmitByte(0, OS);
> +    unsigned EmittedBank = 0;
> +    unsigned BankClassId[16] = {
> +        AMDGPU::R600_KC0_CReg32RegClassID,
> +        AMDGPU::R600_KC1_CReg32RegClassID,
> +        AMDGPU::R600_KC2_CReg32RegClassID,
> +        AMDGPU::R600_KC3_CReg32RegClassID,
> +        AMDGPU::R600_KC4_CReg32RegClassID,
> +        AMDGPU::R600_KC5_CReg32RegClassID,
> +        AMDGPU::R600_KC6_CReg32RegClassID,
> +        AMDGPU::R600_KC7_CReg32RegClassID,
> +        AMDGPU::R600_KC8_CReg32RegClassID,
> +        AMDGPU::R600_KC9_CReg32RegClassID,
> +        AMDGPU::R600_KC10_CReg32RegClassID,
> +        AMDGPU::R600_KC11_CReg32RegClassID,
> +        AMDGPU::R600_KC12_CReg32RegClassID,
> +        AMDGPU::R600_KC13_CReg32RegClassID,
> +        AMDGPU::R600_KC14_CReg32RegClassID,
> +        AMDGPU::R600_KC15_CReg32RegClassID};
> +    for (unsigned i = 0; i < 16; i++) {
> +      if (AMDGPUMCRegisterClasses[BankClassId[i]].contains(Reg))
> +        EmittedBank = i + 1;
>      }
> +    EmitByte(EmittedBank, OS);
>  
>      if (Reg == AMDGPU::ALU_LITERAL_X) {
>        unsigned ImmOpIndex = MI.getNumOperands() - 1;
> diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
> index 3a4283c..16d2280 100644
> --- a/lib/Target/AMDGPU/R600ISelLowering.cpp
> +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
> @@ -161,7 +161,62 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
>  
>    case AMDGPU::R600_LOAD_CONST: {
>      int64_t RegIndex = MI->getOperand(1).getImm();
> -    unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex);
> +    int64_t BankIndex = MI->getOperand(2).getImm();
> +    unsigned ConstantReg;
> +    switch (BankIndex) {
> +    case 0:
> +      ConstantReg = AMDGPU::R600_KC0_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 1:
> +      ConstantReg = AMDGPU::R600_KC1_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 2:
> +      ConstantReg = AMDGPU::R600_KC2_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 3:
> +      ConstantReg = AMDGPU::R600_KC3_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 4:
> +      ConstantReg = AMDGPU::R600_KC4_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 5:
> +      ConstantReg = AMDGPU::R600_KC5_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 6:
> +      ConstantReg = AMDGPU::R600_KC6_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 7:
> +      ConstantReg = AMDGPU::R600_KC7_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 8:
> +      ConstantReg = AMDGPU::R600_KC8_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 9:
> +      ConstantReg = AMDGPU::R600_KC9_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 10:
> +      ConstantReg = AMDGPU::R600_KC10_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 11:
> +      ConstantReg = AMDGPU::R600_KC11_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 12:
> +      ConstantReg = AMDGPU::R600_KC12_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 13:
> +      ConstantReg = AMDGPU::R600_KC13_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 14:
> +      ConstantReg = AMDGPU::R600_KC14_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    case 15:
> +      ConstantReg = AMDGPU::R600_KC15_CReg32RegClass.getRegister(RegIndex);
> +      break;
> +    default:
> +      assert( 0 && "Not a valid Bank Index !");
> +    }
> +
> +    
>      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY))
>                  .addOperand(MI->getOperand(0))
>                  .addReg(ConstantReg);
> diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
> index d89b03b..621ed09 100644
> --- a/lib/Target/AMDGPU/R600Instructions.td
> +++ b/lib/Target/AMDGPU/R600Instructions.td
> @@ -1572,9 +1572,9 @@ def MASK_WRITE : AMDGPUShaderInst <
>  
>  def R600_LOAD_CONST : AMDGPUShaderInst <
>    (outs R600_Reg32:$dst),
> -  (ins i32imm:$src0),
> +  (ins i32imm:$src0, i32imm:$src1),
>    "R600_LOAD_CONST $dst, $src0",
> -  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))]
> +  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0, imm:$src1))]
>  >;
>  
>  def RESERVE_REG : AMDGPUShaderInst <
> diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
> index 3b21825..577938d 100644
> --- a/lib/Target/AMDGPU/R600RegisterInfo.td
> +++ b/lib/Target/AMDGPU/R600RegisterInfo.td
> @@ -28,10 +28,6 @@ foreach Index = 0-127 in {
>      // 32-bit Temporary Registers
>      def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
>  
> -    // 32-bit Constant Registers (There are more than 128, this the number
> -    // that is currently supported.
> -    def C#Index#_#Chan : R600RegWithChan <"C"#Index#"."#Chan, Index, Chan>;
> -
>      // Indirect addressing offset registers
>      def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
>                                                Index, Chan>;
> @@ -45,6 +41,16 @@ foreach Index = 0-127 in {
>                                     Index>;
>  }
>  
> +foreach Index = 0 - 255 in {
> +  foreach Bank = 0 - 15 in {
> +    foreach Chan = [ "X", "Y", "Z", "W" ] in {
> +      // 32-bit Constant Registers (There are more than 128, this the number
> +      // that is currently supported.
> +      def KC#Bank#_C#Index#_#Chan : R600RegWithChan <"KC"#Bank#"_C"#Index#"."#Chan, Index, Chan>;
> +    }
> +  }
> +}
> +
>  // Array Base Register holding input in FS
>  foreach Index = 448-464 in {
>    def ArrayBase#Index :  R600Reg<"ARRAY_BASE", Index>;
> @@ -80,12 +86,135 @@ def R600_Addr : RegisterClass <"AMDGPU", [i32], 127,
>  
>  } // End isAllocatable = 0
>  
> -def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> -                          (add (interleave
> -                                  (interleave (sequence "C%u_X", 0, 127),
> -                                              (sequence "C%u_Z", 0, 127)),
> -                                  (interleave (sequence "C%u_Y", 0, 127),
> -                                              (sequence "C%u_W", 0, 127))))>;
> +def R600_KC0_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC0_C%u_X", 0, 255),
> +                    (sequence "KC0_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC0_C%u_Y", 0, 255),
> +                    (sequence "KC0_C%u_W", 0, 255))))>;
> +
> +def R600_KC1_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC1_C%u_X", 0, 255),
> +                    (sequence "KC1_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC1_C%u_Y", 0, 255),
> +                    (sequence "KC1_C%u_W", 0, 255))))>;
> +
> +def R600_KC2_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC2_C%u_X", 0, 255),
> +                    (sequence "KC2_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC2_C%u_Y", 0, 255),
> +                    (sequence "KC2_C%u_W", 0, 255))))>;
> +
> +def R600_KC3_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC3_C%u_X", 0, 255),
> +                    (sequence "KC3_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC3_C%u_Y", 0, 255),
> +                    (sequence "KC3_C%u_W", 0, 255))))>;
> +
> +def R600_KC4_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC4_C%u_X", 0, 255),
> +                    (sequence "KC4_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC4_C%u_Y", 0, 255),
> +                    (sequence "KC4_C%u_W", 0, 255))))>;
> +
> +def R600_KC5_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC5_C%u_X", 0, 255),
> +                    (sequence "KC5_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC5_C%u_Y", 0, 255),
> +                    (sequence "KC5_C%u_W", 0, 255))))>;
> +
> +def R600_KC6_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC6_C%u_X", 0, 255),
> +                    (sequence "KC6_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC6_C%u_Y", 0, 255),
> +                    (sequence "KC6_C%u_W", 0, 255))))>;
> +
> +def R600_KC7_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC7_C%u_X", 0, 255),
> +                    (sequence "KC7_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC7_C%u_Y", 0, 255),
> +                    (sequence "KC7_C%u_W", 0, 255))))>;
> +
> +def R600_KC8_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC8_C%u_X", 0, 255),
> +                    (sequence "KC8_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC8_C%u_Y", 0, 255),
> +                    (sequence "KC8_C%u_W", 0, 255))))>;
> +
> +def R600_KC9_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC9_C%u_X", 0, 255),
> +                    (sequence "KC9_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC9_C%u_Y", 0, 255),
> +                    (sequence "KC9_C%u_W", 0, 255))))>;
> +
> +def R600_KC10_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC10_C%u_X", 0, 255),
> +                    (sequence "KC10_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC10_C%u_Y", 0, 255),
> +                    (sequence "KC10_C%u_W", 0, 255))))>;
> +
> +def R600_KC11_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC11_C%u_X", 0, 255),
> +                    (sequence "KC11_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC11_C%u_Y", 0, 255),
> +                    (sequence "KC11_C%u_W", 0, 255))))>;
> +
> +def R600_KC12_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC12_C%u_X", 0, 255),
> +                    (sequence "KC12_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC12_C%u_Y", 0, 255),
> +                    (sequence "KC12_C%u_W", 0, 255))))>;
> +
> +def R600_KC13_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC13_C%u_X", 0, 255),
> +                    (sequence "KC13_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC13_C%u_Y", 0, 255),
> +                    (sequence "KC13_C%u_W", 0, 255))))>;
> +
> +def R600_KC14_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC14_C%u_X", 0, 255),
> +                    (sequence "KC14_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC14_C%u_Y", 0, 255),
> +                    (sequence "KC14_C%u_W", 0, 255))))>;
> +
> +def R600_KC15_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
> +    (add (interleave
> +        (interleave (sequence "KC15_C%u_X", 0, 255),
> +                    (sequence "KC15_C%u_Z", 0, 255)),
> +        (interleave (sequence "KC15_C%u_Y", 0, 255),
> +                    (sequence "KC15_C%u_W", 0, 255))))>;
> +
> +def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
> +    R600_KC0_CReg32,
> +    R600_KC1_CReg32,
> +    R600_KC2_CReg32,
> +    R600_KC3_CReg32,
> +    R600_KC4_CReg32,
> +    R600_KC5_CReg32,
> +    R600_KC6_CReg32,
> +    R600_KC7_CReg32,
> +    R600_KC8_CReg32,
> +    R600_KC9_CReg32,
> +    R600_KC10_CReg32,
> +    R600_KC11_CReg32,
> +    R600_KC12_CReg32,
> +    R600_KC13_CReg32,
> +    R600_KC14_CReg32,
> +    R600_KC15_CReg32)>;
>  
>  def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
>                                     (add (sequence "T%u_X", 0, 127))>;
> diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
> index e9bbe23..bcf0635 100644
> --- a/lib/Target/AMDGPU/SIInstructions.td
> +++ b/lib/Target/AMDGPU/SIInstructions.td
> @@ -1050,9 +1050,9 @@ def SET_M0 : InstSI <
>  
>  def LOAD_CONST : AMDGPUShaderInst <
>    (outs GPRF32:$dst),
> -  (ins i32imm:$src),
> -  "LOAD_CONST $dst, $src",
> -  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
> +  (ins i32imm:$src0),
> +  "LOAD_CONST $dst, $src0",
> +  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src0, imm))]
>  >;
>  
>  let usesCustomInserter = 1 in {