[Mesa-dev] [PATCH 2/2] radeonsi/compute: Use the HSA abi for non-TGSI compute shaders

Wed Jul 27 22:35:35 UTC 2016

On Tue, Jul 26, 2016 at 5:39 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 26.07.2016 01:11, Tom Stellard wrote:
>>
>> This patche switches non-TGSI compute shaders over to using the HSA
>
>
> Typo: patch :)
>
>
>> ABI described here:
>>
>> https://github.com/RadeonOpenCompute/ROCm-Docs/blob/master/AMDGPU-ABI.md
>>
>> The HSA ABI provides a much cleaner interface for compute shaders and
>> allows
>> us to share more code in the compiler with the HSA stack.
>>
>> The main changes in this patch are:
>>   - We now pass the scratch buffer resource into the shader via user sgprs
>>     rather than using relocations.
>>   - Grid/Block sizes are now passed to the shader via the dispatch packet
>>     rather than at the beginning of the kernel arguments.
>>
>> Typically for HSA, the CP firmware will create the dispatch packet and set
>> up the user sgprs automatically.  However, in Mesa we let the driver do
>> this work.  The main reason for this is that I haven't researched how to
>> get the CP to do all these things, and I'm not sure if it is supported
>> for all GPUs.
>> ---
>>  src/gallium/drivers/radeon/r600_pipe_common.c    |   6 +-
>>  src/gallium/drivers/radeonsi/amd_kernel_code_t.h | 534
>> +++++++++++++++++++++++
>>  src/gallium/drivers/radeonsi/si_compute.c        | 234 +++++++++-
>>  3 files changed, 756 insertions(+), 18 deletions(-)
>>  create mode 100644 src/gallium/drivers/radeonsi/amd_kernel_code_t.h
>>
>> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c
>> b/src/gallium/drivers/radeon/r600_pipe_common.c
>> index cd4908f..9ecf666 100644
>> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
>> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
>> @@ -784,7 +784,11 @@ static int r600_get_compute_param(struct pipe_screen
>> *screen,
>>                 if (rscreen->family <= CHIP_ARUBA) {
>>                         triple = "r600--";
>>                 } else {
>> -                       triple = "amdgcn--";
>> +                       if (HAVE_LLVM < 0x0400) {
>> +                               triple = "amdgcn--";
>> +                       } else {
>> +                               triple = "amdgcn--mesa3d";
>> +                       }
>>                 }
>>                 switch(rscreen->family) {
>>                 /* Clang < 3.6 is missing Hainan in its list of
>
> [snip]
>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>> b/src/gallium/drivers/radeonsi/si_compute.c
>> index 949ab1a..1aced60 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -28,6 +28,7 @@
>>  #include "radeon/r600_pipe_common.h"
>>  #include "radeon/radeon_elf_util.h"
>>
>> +#include "amd_kernel_code_t.h"
>>  #include "radeon/r600_cs.h"
>>  #include "si_pipe.h"
>>  #include "si_shader.h"
>> @@ -43,8 +44,52 @@ struct si_compute {
>>         struct si_shader shader;
>>
>>         struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
>> +       bool use_code_object_v2;
>>  };
>>
>> +struct dispatch_packet {
>> +       uint16_t header;
>> +       uint16_t setup;
>> +       uint16_t workgroup_size_x;
>> +       uint16_t workgroup_size_y;
>> +       uint16_t workgroup_size_z;
>> +       uint16_t reserved0;
>> +       uint32_t grid_size_x;
>> +       uint32_t grid_size_y;
>> +       uint32_t grid_size_z;
>> +       uint32_t private_segment_size;
>> +       uint32_t group_segment_size;
>> +       uint64_t kernel_object;
>> +       uint64_t kernarg_address;
>> +       uint64_t reserved2;
>> +};
>> +
>> +static const amd_kernel_code_t *si_compute_get_code_object(
>> +       const struct si_compute *program,
>> +       uint64_t symbol_offset)
>> +{
>> +       if (!program->use_code_object_v2) {
>> +               return NULL;
>> +       }
>> +       return (const amd_kernel_code_t*)
>> +               (program->shader.binary.code + symbol_offset);
>> +}
>> +
>> +static void code_object_to_config(const amd_kernel_code_t *code_object,
>> +                                 struct si_shader_config *out_config) {
>> +
>> +       uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
>> +       uint32_t rsrc2 = code_object->compute_pgm_resource_registers >>
>> 32;
>> +       out_config->num_sgprs = code_object->wavefront_sgpr_count;
>> +       out_config->num_vgprs = code_object->workitem_vgpr_count;
>> +       out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
>> +       out_config->rsrc1 = rsrc1;
>> +       out_config->lds_size = MAX2(out_config->lds_size,
>> G_00B84C_LDS_SIZE(rsrc2));
>> +       out_config->rsrc2 = rsrc2;
>> +       out_config->scratch_bytes_per_wave =
>> +               align(code_object->workitem_private_segment_byte_size *
>> 64, 1024);
>> +}
>> +
>>  static void *si_create_compute_state(
>>         struct pipe_context *ctx,
>>         const struct pipe_compute_state *cso)
>> @@ -59,6 +104,8 @@ static void *si_create_compute_state(
>>         program->local_size = cso->req_local_mem;
>>         program->private_size = cso->req_private_mem;
>>         program->input_size = cso->req_input_mem;
>> +       program->use_code_object_v2 = HAVE_LLVM >= 0x0400 &&
>> +                                       cso->ir_type ==
>> PIPE_SHADER_IR_NATIVE;
>>
>>
>>         if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
>> @@ -110,8 +157,14 @@ static void *si_create_compute_state(
>>                 code = cso->prog + sizeof(struct
>> pipe_llvm_program_header);
>>
>>                 radeon_elf_read(code, header->num_bytes,
>> &program->shader.binary);
>> -               si_shader_binary_read_config(&program->shader.binary,
>> -                            &program->shader.config, 0);
>> +               if (program->use_code_object_v2) {
>> +                       const amd_kernel_code_t *code_object =
>> +                               si_compute_get_code_object(program, 0);
>> +                       code_object_to_config(code_object,
>> &program->shader.config);
>> +               } else {
>> +
>> si_shader_binary_read_config(&program->shader.binary,
>> +                                    &program->shader.config, 0);
>> +               }
>>                 si_shader_dump(sctx->screen, &program->shader,
>> &sctx->b.debug,
>>                                PIPE_SHADER_COMPUTE, stderr);
>>                 si_shader_binary_upload(sctx->screen, &program->shader);
>> @@ -234,7 +287,9 @@ static bool si_setup_compute_scratch_buffer(struct
>> si_context *sctx,
>>
>>  static bool si_switch_compute_shader(struct si_context *sctx,
>>                                       struct si_compute *program,
>> -                                     struct si_shader *shader, unsigned
>> offset)
>> +                                    struct si_shader *shader,
>> +                                    const amd_kernel_code_t *code_object,
>> +                                    unsigned offset)
>>  {
>>         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>         struct si_shader_config inline_config = {0};
>> @@ -251,7 +306,11 @@ static bool si_switch_compute_shader(struct
>> si_context *sctx,
>>                 unsigned lds_blocks;
>>
>>                 config = &inline_config;
>> -               si_shader_binary_read_config(&shader->binary, config,
>> offset);
>> +               if (code_object) {
>> +                       code_object_to_config(code_object, config);
>> +               } else {
>> +                       si_shader_binary_read_config(&shader->binary,
>> config, offset);
>> +               }
>>
>>                 lds_blocks = config->lds_size;
>>                 /* XXX: We are over allocating LDS.  For SI, the shader
>> reports
>> @@ -287,6 +346,11 @@ static bool si_switch_compute_shader(struct
>> si_context *sctx,
>>         }
>>
>>         shader_va = shader->bo->gpu_address + offset;
>> +       if (program->use_code_object_v2) {
>> +               /* Shader code is placed after the amd_kernel_code_t
>> +                * struct. */
>> +               shader_va += sizeof(amd_kernel_code_t);
>> +       }
>>
>>         radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
>>                                   RADEON_USAGE_READ,
>> RADEON_PRIO_USER_SHADER);
>> @@ -314,14 +378,140 @@ static bool si_switch_compute_shader(struct
>> si_context *sctx,
>>         return true;
>>  }
>>
>> +static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
>> +                                         const amd_kernel_code_t
>> *code_object,
>> +                                         unsigned user_sgpr)
>> +{
>> +       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>> +       uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
>> +
>> +       unsigned max_private_element_size = AMD_HSA_BITS_GET(
>> +                       code_object->code_properties,
>> +                       AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
>> +
>> +       uint32_t scratch_dword0 = scratch_va & 0xffffffff;
>> +       uint32_t scratch_dword1 =
>> +               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
>> +               S_008F04_SWIZZLE_ENABLE(1);
>> +       uint32_t scratch_dword2 = 0xffffffff;
>> //sctx->compute_scratch_buffer->b.b.width0;
>
>
> What's the reason for not setting that?

The comment can be removed. We should always set 0xffffffff. The
hardware won't use the scratch buffer beyond what's allowed by the
TMPRING register. Address clamping here would be very very bad.

>
>> +       uint32_t scratch_dword3 =
>> +               S_008F0C_ELEMENT_SIZE(max_private_element_size) |
>> +               S_008F0C_INDEX_STRIDE(3) |
>> +               S_008F0C_ADD_TID_ENABLE(1);
>> +
>> +
>> +       if (sctx->screen->b.family < CHIP_TONGA) {
>> +               /* XXX: I have no idea why we use NUM_FORMAT_FLOAT. */
>> +               scratch_dword3 |=
>> +                       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT)
>> |
>> +                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
>
>
> chip_class < VI? Or is this really about Tonga specifically?
>
> And yes, those settings are peculiar. I don't think LLVM emits
> buffer_*_format instructions -- the non-format instructions should ignore
> those fields anyway. I'm also not aware of any quirks needed related to
> this...

VI only: If ADD_TID_ENABLE is set and untyped loads/stores are used,
the DATA_FORMAT bits specify high bits of STRIDE, allowing a much
greater stride. Always set to 0 unless you want a huge stride.

SI-CIK: I think NUM_FORMAT has no effect, but DATA_FORMAT shouldn't be
INVALID (0).

Marek