[Mesa-dev] [PATCH 2/2] radeonsi/compute: Use the HSA abi for non-TGSI compute shaders
Nicolai Hähnle
nhaehnle at gmail.com
Tue Jul 26 15:39:23 UTC 2016
On 26.07.2016 01:11, Tom Stellard wrote:
> This patche switches non-TGSI compute shaders over to using the HSA
Typo: patch :)
> ABI described here:
>
> https://github.com/RadeonOpenCompute/ROCm-Docs/blob/master/AMDGPU-ABI.md
>
> The HSA ABI provides a much cleaner interface for compute shaders and allows
> us to share more code in the compiler with the HSA stack.
>
> The main changes in this patch are:
> - We now pass the scratch buffer resource into the shader via user sgprs
> rather than using relocations.
> - Grid/Block sizes are now passed to the shader via the dispatch packet
> rather than at the beginning of the kernel arguments.
>
> Typically for HSA, the CP firmware will create the dispatch packet and set
> up the user sgprs automatically. However, in Mesa we let the driver do
> this work. The main reason for this is that I haven't researched how to
> get the CP to do all these things, and I'm not sure if it is supported
> for all GPUs.
> ---
> src/gallium/drivers/radeon/r600_pipe_common.c | 6 +-
> src/gallium/drivers/radeonsi/amd_kernel_code_t.h | 534 +++++++++++++++++++++++
> src/gallium/drivers/radeonsi/si_compute.c | 234 +++++++++-
> 3 files changed, 756 insertions(+), 18 deletions(-)
> create mode 100644 src/gallium/drivers/radeonsi/amd_kernel_code_t.h
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index cd4908f..9ecf666 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -784,7 +784,11 @@ static int r600_get_compute_param(struct pipe_screen *screen,
> if (rscreen->family <= CHIP_ARUBA) {
> triple = "r600--";
> } else {
> - triple = "amdgcn--";
> + if (HAVE_LLVM < 0x0400) {
> + triple = "amdgcn--";
> + } else {
> + triple = "amdgcn--mesa3d";
> + }
> }
> switch(rscreen->family) {
> /* Clang < 3.6 is missing Hainan in its list of
[snip]
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 949ab1a..1aced60 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -28,6 +28,7 @@
> #include "radeon/r600_pipe_common.h"
> #include "radeon/radeon_elf_util.h"
>
> +#include "amd_kernel_code_t.h"
> #include "radeon/r600_cs.h"
> #include "si_pipe.h"
> #include "si_shader.h"
> @@ -43,8 +44,52 @@ struct si_compute {
> struct si_shader shader;
>
> struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
> + bool use_code_object_v2;
> };
>
> +struct dispatch_packet {
> + uint16_t header;
> + uint16_t setup;
> + uint16_t workgroup_size_x;
> + uint16_t workgroup_size_y;
> + uint16_t workgroup_size_z;
> + uint16_t reserved0;
> + uint32_t grid_size_x;
> + uint32_t grid_size_y;
> + uint32_t grid_size_z;
> + uint32_t private_segment_size;
> + uint32_t group_segment_size;
> + uint64_t kernel_object;
> + uint64_t kernarg_address;
> + uint64_t reserved2;
> +};
> +
> +static const amd_kernel_code_t *si_compute_get_code_object(
> + const struct si_compute *program,
> + uint64_t symbol_offset)
> +{
> + if (!program->use_code_object_v2) {
> + return NULL;
> + }
> + return (const amd_kernel_code_t*)
> + (program->shader.binary.code + symbol_offset);
> +}
> +
> +static void code_object_to_config(const amd_kernel_code_t *code_object,
> + struct si_shader_config *out_config) {
> +
> + uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
> + uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
> + out_config->num_sgprs = code_object->wavefront_sgpr_count;
> + out_config->num_vgprs = code_object->workitem_vgpr_count;
> + out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
> + out_config->rsrc1 = rsrc1;
> + out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
> + out_config->rsrc2 = rsrc2;
> + out_config->scratch_bytes_per_wave =
> + align(code_object->workitem_private_segment_byte_size * 64, 1024);
> +}
> +
> static void *si_create_compute_state(
> struct pipe_context *ctx,
> const struct pipe_compute_state *cso)
> @@ -59,6 +104,8 @@ static void *si_create_compute_state(
> program->local_size = cso->req_local_mem;
> program->private_size = cso->req_private_mem;
> program->input_size = cso->req_input_mem;
> + program->use_code_object_v2 = HAVE_LLVM >= 0x0400 &&
> + cso->ir_type == PIPE_SHADER_IR_NATIVE;
>
>
> if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
> @@ -110,8 +157,14 @@ static void *si_create_compute_state(
> code = cso->prog + sizeof(struct pipe_llvm_program_header);
>
> radeon_elf_read(code, header->num_bytes, &program->shader.binary);
> - si_shader_binary_read_config(&program->shader.binary,
> - &program->shader.config, 0);
> + if (program->use_code_object_v2) {
> + const amd_kernel_code_t *code_object =
> + si_compute_get_code_object(program, 0);
> + code_object_to_config(code_object, &program->shader.config);
> + } else {
> + si_shader_binary_read_config(&program->shader.binary,
> + &program->shader.config, 0);
> + }
> si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
> PIPE_SHADER_COMPUTE, stderr);
> si_shader_binary_upload(sctx->screen, &program->shader);
> @@ -234,7 +287,9 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
>
> static bool si_switch_compute_shader(struct si_context *sctx,
> struct si_compute *program,
> - struct si_shader *shader, unsigned offset)
> + struct si_shader *shader,
> + const amd_kernel_code_t *code_object,
> + unsigned offset)
> {
> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> struct si_shader_config inline_config = {0};
> @@ -251,7 +306,11 @@ static bool si_switch_compute_shader(struct si_context *sctx,
> unsigned lds_blocks;
>
> config = &inline_config;
> - si_shader_binary_read_config(&shader->binary, config, offset);
> + if (code_object) {
> + code_object_to_config(code_object, config);
> + } else {
> + si_shader_binary_read_config(&shader->binary, config, offset);
> + }
>
> lds_blocks = config->lds_size;
> /* XXX: We are over allocating LDS. For SI, the shader reports
> @@ -287,6 +346,11 @@ static bool si_switch_compute_shader(struct si_context *sctx,
> }
>
> shader_va = shader->bo->gpu_address + offset;
> + if (program->use_code_object_v2) {
> + /* Shader code is placed after the amd_kernel_code_t
> + * struct. */
> + shader_va += sizeof(amd_kernel_code_t);
> + }
>
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
> RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
> @@ -314,14 +378,140 @@ static bool si_switch_compute_shader(struct si_context *sctx,
> return true;
> }
>
> +static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
> + const amd_kernel_code_t *code_object,
> + unsigned user_sgpr)
> +{
> + struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> + uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
> +
> + unsigned max_private_element_size = AMD_HSA_BITS_GET(
> + code_object->code_properties,
> + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
> +
> + uint32_t scratch_dword0 = scratch_va & 0xffffffff;
> + uint32_t scratch_dword1 =
> + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> + S_008F04_SWIZZLE_ENABLE(1);
> + uint32_t scratch_dword2 = 0xffffffff; //sctx->compute_scratch_buffer->b.b.width0;
What's the reason for not setting that?
> + uint32_t scratch_dword3 =
> + S_008F0C_ELEMENT_SIZE(max_private_element_size) |
> + S_008F0C_INDEX_STRIDE(3) |
> + S_008F0C_ADD_TID_ENABLE(1);
> +
> +
> + if (sctx->screen->b.family < CHIP_TONGA) {
> + /* XXX: I have no idea why we use NUM_FORMAT_FLOAT. */
> + scratch_dword3 |=
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
chip_class < VI? Or is this really about Tonga specifically?
And yes, those settings are peculiar. I don't think LLVM emits
buffer_*_format instructions -- the non-format instructions should
ignore those fields anyway. I'm also not aware of any quirks needed
related to this...
Nicolai
> + }
> +
> + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
> + (user_sgpr * 4), 4);
> + radeon_emit(cs, scratch_dword0);
> + radeon_emit(cs, scratch_dword1);
> + radeon_emit(cs, scratch_dword2);
> + radeon_emit(cs, scratch_dword3);
> +}
> +
> +static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
> + const amd_kernel_code_t *code_object,
> + const struct pipe_grid_info *info,
> + uint64_t kernel_args_va)
> +{
> + struct si_compute *program = sctx->cs_shader_state.program;
> + struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> +
> + static const enum amd_code_property_mask_t workgroup_count_masks [] = {
> + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
> + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
> + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
> + };
> +
> + unsigned i, user_sgpr = 0;
> + if (AMD_HSA_BITS_GET(code_object->code_properties,
> + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
> + if (code_object->workitem_private_segment_byte_size > 0) {
> + setup_scratch_rsrc_user_sgprs(sctx, code_object,
> + user_sgpr);
> + }
> + user_sgpr += 4;
> + }
> +
> + if (AMD_HSA_BITS_GET(code_object->code_properties,
> + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
> + struct dispatch_packet dispatch;
> + unsigned dispatch_offset;
> + struct r600_resource *dispatch_buf = NULL;
> + uint64_t dispatch_va;
> +
> + /* Upload dispatch ptr */
> + memset(&dispatch, 0, sizeof(dispatch));
> +
> + dispatch.workgroup_size_x = info->block[0];
> + dispatch.workgroup_size_y = info->block[1];
> + dispatch.workgroup_size_z = info->block[2];
> +
> + dispatch.grid_size_x = info->grid[0] * info->block[0];
> + dispatch.grid_size_y = info->grid[1] * info->block[1];
> + dispatch.grid_size_z = info->grid[2] * info->block[2];
> +
> + dispatch.private_segment_size = program->private_size;
> + dispatch.group_segment_size = program->local_size;
> +
> + dispatch.kernarg_address = kernel_args_va;
> +
> + u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256,
> + &dispatch, &dispatch_offset,
> + (struct pipe_resource**)&dispatch_buf);
> +
> + assert(dispatch_buf);
> + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
> + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
> +
> + dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
> +
> + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
> + (user_sgpr * 4), 2);
> + radeon_emit(cs, dispatch_va);
> + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
> + S_008F04_STRIDE(0));
> +
> + r600_resource_reference(&dispatch_buf, NULL);
> + user_sgpr += 2;
> + }
> +
> + if (AMD_HSA_BITS_GET(code_object->code_properties,
> + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
> + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
> + (user_sgpr * 4), 2);
> + radeon_emit(cs, kernel_args_va);
> + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
> + S_008F04_STRIDE(0));
> + user_sgpr += 2;
> + }
> +
> + for (i = 0; i < 3 && user_sgpr < 16; i++) {
> + if (code_object->code_properties & workgroup_count_masks[i]) {
> + radeon_set_sh_reg_seq(cs,
> + R_00B900_COMPUTE_USER_DATA_0 +
> + (user_sgpr * 4), 1);
> + radeon_emit(cs, info->grid[i]);
> + user_sgpr += 1;
> + }
> + }
> +}
> +
> static void si_upload_compute_input(struct si_context *sctx,
> - const struct pipe_grid_info *info)
> + const amd_kernel_code_t *code_object,
> + const struct pipe_grid_info *info)
> {
> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> struct si_compute *program = sctx->cs_shader_state.program;
> struct r600_resource *input_buffer = NULL;
> unsigned kernel_args_size;
> - unsigned num_work_size_bytes = 36;
> + unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
> uint32_t kernel_args_offset = 0;
> uint32_t *kernel_args;
> void *kernel_args_ptr;
> @@ -336,10 +526,14 @@ static void si_upload_compute_input(struct si_context *sctx,
> (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
>
> kernel_args = (uint32_t*)kernel_args_ptr;
> - for (i = 0; i < 3; i++) {
> - kernel_args[i] = info->grid[i];
> - kernel_args[i + 3] = info->grid[i] * info->block[i];
> - kernel_args[i + 6] = info->block[i];
> + kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
> +
> + if (!code_object) {
> + for (i = 0; i < 3; i++) {
> + kernel_args[i] = info->grid[i];
> + kernel_args[i + 3] = info->grid[i] * info->block[i];
> + kernel_args[i + 6] = info->block[i];
> + }
> }
>
> memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
> @@ -351,15 +545,18 @@ static void si_upload_compute_input(struct si_context *sctx,
> kernel_args[i]);
> }
>
> - kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
>
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
> RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
>
> - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
> - radeon_emit(cs, kernel_args_va);
> - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
> - S_008F04_STRIDE(0));
> + if (code_object) {
> + si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
> + } else {
> + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
> + radeon_emit(cs, kernel_args_va);
> + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
> + S_008F04_STRIDE(0));
> + }
>
> r600_resource_reference(&input_buffer, NULL);
> }
> @@ -442,6 +639,8 @@ static void si_launch_grid(
> {
> struct si_context *sctx = (struct si_context*)ctx;
> struct si_compute *program = sctx->cs_shader_state.program;
> + const amd_kernel_code_t *code_object =
> + si_compute_get_code_object(program, info->pc);
> int i;
> /* HW bug workaround when CS threadgroups > 256 threads and async
> * compute isn't used, i.e. only one compute job can run at a time.
> @@ -469,7 +668,8 @@ static void si_launch_grid(
> if (sctx->b.flags)
> si_emit_cache_flush(sctx, NULL);
>
> - if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
> + if (!si_switch_compute_shader(sctx, program, &program->shader,
> + code_object, info->pc))
> return;
>
> si_upload_compute_shader_descriptors(sctx);
> @@ -482,7 +682,7 @@ static void si_launch_grid(
> }
>
> if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
> - si_upload_compute_input(sctx, info);
> + si_upload_compute_input(sctx, code_object, info);
>
> /* Global buffers */
> for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
>
More information about the mesa-dev
mailing list