[Mesa-dev] [PATCH 2/2] radeonsi/compute: Use the HSA abi for non-TGSI compute shaders

Tue Jul 26 15:39:23 UTC 2016

On 26.07.2016 01:11, Tom Stellard wrote:
> This patche switches non-TGSI compute shaders over to using the HSA

Typo: patch :)

> ABI described here:
>
> https://github.com/RadeonOpenCompute/ROCm-Docs/blob/master/AMDGPU-ABI.md
>
> The HSA ABI provides a much cleaner interface for compute shaders and allows
> us to share more code in the compiler with the HSA stack.
>
> The main changes in this patch are:
>   - We now pass the scratch buffer resource into the shader via user sgprs
>     rather than using relocations.
>   - Grid/Block sizes are now passed to the shader via the dispatch packet
>     rather than at the beginning of the kernel arguments.
>
> Typically for HSA, the CP firmware will create the dispatch packet and set
> up the user sgprs automatically.  However, in Mesa we let the driver do
> this work.  The main reason for this is that I haven't researched how to
> get the CP to do all these things, and I'm not sure if it is supported
> for all GPUs.
> ---
>  src/gallium/drivers/radeon/r600_pipe_common.c    |   6 +-
>  src/gallium/drivers/radeonsi/amd_kernel_code_t.h | 534 +++++++++++++++++++++++
>  src/gallium/drivers/radeonsi/si_compute.c        | 234 +++++++++-
>  3 files changed, 756 insertions(+), 18 deletions(-)
>  create mode 100644 src/gallium/drivers/radeonsi/amd_kernel_code_t.h
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index cd4908f..9ecf666 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -784,7 +784,11 @@ static int r600_get_compute_param(struct pipe_screen *screen,
>  		if (rscreen->family <= CHIP_ARUBA) {
>  			triple = "r600--";
>  		} else {
> -			triple = "amdgcn--";
> +			if (HAVE_LLVM < 0x0400) {
> +				triple = "amdgcn--";
> +			} else {
> +				triple = "amdgcn--mesa3d";
> +			}
>  		}
>  		switch(rscreen->family) {
>  		/* Clang < 3.6 is missing Hainan in its list of
[snip]
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 949ab1a..1aced60 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -28,6 +28,7 @@
>  #include "radeon/r600_pipe_common.h"
>  #include "radeon/radeon_elf_util.h"
>
> +#include "amd_kernel_code_t.h"
>  #include "radeon/r600_cs.h"
>  #include "si_pipe.h"
>  #include "si_shader.h"
> @@ -43,8 +44,52 @@ struct si_compute {
>  	struct si_shader shader;
>
>  	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
> +	bool use_code_object_v2;
>  };
>
> +struct dispatch_packet {
> +	uint16_t header;
> +	uint16_t setup;
> +	uint16_t workgroup_size_x;
> +	uint16_t workgroup_size_y;
> +	uint16_t workgroup_size_z;
> +	uint16_t reserved0;
> +	uint32_t grid_size_x;
> +	uint32_t grid_size_y;
> +	uint32_t grid_size_z;
> +	uint32_t private_segment_size;
> +	uint32_t group_segment_size;
> +	uint64_t kernel_object;
> +	uint64_t kernarg_address;
> +	uint64_t reserved2;
> +};
> +
> +static const amd_kernel_code_t *si_compute_get_code_object(
> +	const struct si_compute *program,
> +	uint64_t symbol_offset)
> +{
> +	if (!program->use_code_object_v2) {
> +		return NULL;
> +	}
> +	return (const amd_kernel_code_t*)
> +		(program->shader.binary.code + symbol_offset);
> +}
> +
> +static void code_object_to_config(const amd_kernel_code_t *code_object,
> +				  struct si_shader_config *out_config) {
> +
> +	uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
> +	uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
> +	out_config->num_sgprs = code_object->wavefront_sgpr_count;
> +	out_config->num_vgprs = code_object->workitem_vgpr_count;
> +	out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
> +	out_config->rsrc1 = rsrc1;
> +	out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
> +	out_config->rsrc2 = rsrc2;
> +	out_config->scratch_bytes_per_wave =
> +		align(code_object->workitem_private_segment_byte_size * 64, 1024);
> +}
> +
>  static void *si_create_compute_state(
>  	struct pipe_context *ctx,
>  	const struct pipe_compute_state *cso)
> @@ -59,6 +104,8 @@ static void *si_create_compute_state(
>  	program->local_size = cso->req_local_mem;
>  	program->private_size = cso->req_private_mem;
>  	program->input_size = cso->req_input_mem;
> +	program->use_code_object_v2 = HAVE_LLVM >= 0x0400 &&
> +					cso->ir_type == PIPE_SHADER_IR_NATIVE;
>
>
>  	if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
> @@ -110,8 +157,14 @@ static void *si_create_compute_state(
>  		code = cso->prog + sizeof(struct pipe_llvm_program_header);
>
>  		radeon_elf_read(code, header->num_bytes, &program->shader.binary);
> -		si_shader_binary_read_config(&program->shader.binary,
> -			     &program->shader.config, 0);
> +		if (program->use_code_object_v2) {
> +			const amd_kernel_code_t *code_object =
> +				si_compute_get_code_object(program, 0);
> +			code_object_to_config(code_object, &program->shader.config);
> +		} else {
> +			si_shader_binary_read_config(&program->shader.binary,
> +				     &program->shader.config, 0);
> +		}
>  		si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
>  			       PIPE_SHADER_COMPUTE, stderr);
>  		si_shader_binary_upload(sctx->screen, &program->shader);
> @@ -234,7 +287,9 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
>
>  static bool si_switch_compute_shader(struct si_context *sctx,
>                                       struct si_compute *program,
> -                                     struct si_shader *shader, unsigned offset)
> +				     struct si_shader *shader,
> +				     const amd_kernel_code_t *code_object,
> +				     unsigned offset)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	struct si_shader_config inline_config = {0};
> @@ -251,7 +306,11 @@ static bool si_switch_compute_shader(struct si_context *sctx,
>  		unsigned lds_blocks;
>
>  		config = &inline_config;
> -		si_shader_binary_read_config(&shader->binary, config, offset);
> +		if (code_object) {
> +			code_object_to_config(code_object, config);
> +		} else {
> +			si_shader_binary_read_config(&shader->binary, config, offset);
> +		}
>
>  		lds_blocks = config->lds_size;
>  		/* XXX: We are over allocating LDS.  For SI, the shader reports
> @@ -287,6 +346,11 @@ static bool si_switch_compute_shader(struct si_context *sctx,
>  	}
>
>  	shader_va = shader->bo->gpu_address + offset;
> +	if (program->use_code_object_v2) {
> +		/* Shader code is placed after the amd_kernel_code_t
> +		 * struct. */
> +		shader_va += sizeof(amd_kernel_code_t);
> +	}
>
>  	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
>  	                          RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
> @@ -314,14 +378,140 @@ static bool si_switch_compute_shader(struct si_context *sctx,
>  	return true;
>  }
>
> +static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
> +					  const amd_kernel_code_t *code_object,
> +					  unsigned user_sgpr)
> +{
> +	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> +	uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
> +
> +	unsigned max_private_element_size = AMD_HSA_BITS_GET(
> +			code_object->code_properties,
> +			AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
> +
> +	uint32_t scratch_dword0 = scratch_va & 0xffffffff;
> +	uint32_t scratch_dword1 =
> +		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +		S_008F04_SWIZZLE_ENABLE(1);
> +	uint32_t scratch_dword2 = 0xffffffff; //sctx->compute_scratch_buffer->b.b.width0;

What's the reason for not setting that?

> +	uint32_t scratch_dword3 =
> +		S_008F0C_ELEMENT_SIZE(max_private_element_size) |
> +		S_008F0C_INDEX_STRIDE(3) |
> +		S_008F0C_ADD_TID_ENABLE(1);
> +
> +
> +	if (sctx->screen->b.family < CHIP_TONGA) {
> +		/* XXX: I have no idea why we use NUM_FORMAT_FLOAT. */
> +		scratch_dword3 |=
> +			S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +			S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);

chip_class < VI? Or is this really about Tonga specifically?

And yes, those settings are peculiar. I don't think LLVM emits 
buffer_*_format instructions -- the non-format instructions should 
ignore those fields anyway. I'm also not aware of any quirks needed 
related to this...

Nicolai

> +	}
> +
> +	radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
> +							(user_sgpr * 4), 4);
> +	radeon_emit(cs, scratch_dword0);
> +	radeon_emit(cs, scratch_dword1);
> +	radeon_emit(cs, scratch_dword2);
> +	radeon_emit(cs, scratch_dword3);
> +}
> +
> +static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
> +                                      const amd_kernel_code_t *code_object,
> +				      const struct pipe_grid_info *info,
> +				      uint64_t kernel_args_va)
> +{
> +	struct si_compute *program = sctx->cs_shader_state.program;
> +	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> +
> +	static const enum amd_code_property_mask_t workgroup_count_masks [] = {
> +		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
> +		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
> +		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
> +	};
> +
> +	unsigned i, user_sgpr = 0;
> +	if (AMD_HSA_BITS_GET(code_object->code_properties,
> +			AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
> +		if (code_object->workitem_private_segment_byte_size > 0) {
> +			setup_scratch_rsrc_user_sgprs(sctx, code_object,
> +								user_sgpr);
> +		}
> +		user_sgpr += 4;
> +	}
> +
> +	if (AMD_HSA_BITS_GET(code_object->code_properties,
> +			AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
> +		struct dispatch_packet dispatch;
> +		unsigned dispatch_offset;
> +		struct r600_resource *dispatch_buf = NULL;
> +		uint64_t dispatch_va;
> +
> +		/* Upload dispatch ptr */
> +		memset(&dispatch, 0, sizeof(dispatch));
> +
> +		dispatch.workgroup_size_x = info->block[0];
> +		dispatch.workgroup_size_y = info->block[1];
> +		dispatch.workgroup_size_z = info->block[2];
> +
> +		dispatch.grid_size_x = info->grid[0] * info->block[0];
> +		dispatch.grid_size_y = info->grid[1] * info->block[1];
> +		dispatch.grid_size_z = info->grid[2] * info->block[2];
> +
> +		dispatch.private_segment_size = program->private_size;
> +		dispatch.group_segment_size = program->local_size;
> +
> +		dispatch.kernarg_address = kernel_args_va;
> +
> +		u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256,
> +				&dispatch, &dispatch_offset,
> +				(struct pipe_resource**)&dispatch_buf);
> +
> +		assert(dispatch_buf);
> +		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
> +				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
> +
> +		dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
> +
> +		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
> +							(user_sgpr * 4), 2);
> +		radeon_emit(cs, dispatch_va);
> +		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
> +                                S_008F04_STRIDE(0));
> +
> +		r600_resource_reference(&dispatch_buf, NULL);
> +		user_sgpr += 2;
> +	}
> +
> +	if (AMD_HSA_BITS_GET(code_object->code_properties,
> +			AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
> +		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
> +							(user_sgpr * 4), 2);
> +		radeon_emit(cs, kernel_args_va);
> +		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
> +		                S_008F04_STRIDE(0));
> +		user_sgpr += 2;
> +	}
> +
> +	for (i = 0; i < 3 && user_sgpr < 16; i++) {
> +		if (code_object->code_properties & workgroup_count_masks[i]) {
> +			radeon_set_sh_reg_seq(cs,
> +				R_00B900_COMPUTE_USER_DATA_0 +
> +				(user_sgpr * 4), 1);
> +			radeon_emit(cs, info->grid[i]);
> +			user_sgpr += 1;
> +		}
> +	}
> +}
> +
>  static void si_upload_compute_input(struct si_context *sctx,
> -                                  const struct pipe_grid_info *info)
> +				    const amd_kernel_code_t *code_object,
> +				    const struct pipe_grid_info *info)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	struct si_compute *program = sctx->cs_shader_state.program;
>  	struct r600_resource *input_buffer = NULL;
>  	unsigned kernel_args_size;
> -	unsigned num_work_size_bytes = 36;
> +	unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
>  	uint32_t kernel_args_offset = 0;
>  	uint32_t *kernel_args;
>  	void *kernel_args_ptr;
> @@ -336,10 +526,14 @@ static void si_upload_compute_input(struct si_context *sctx,
>  		       (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
>
>  	kernel_args = (uint32_t*)kernel_args_ptr;
> -	for (i = 0; i < 3; i++) {
> -		kernel_args[i] = info->grid[i];
> -		kernel_args[i + 3] = info->grid[i] * info->block[i];
> -		kernel_args[i + 6] = info->block[i];
> +	kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
> +
> +	if (!code_object) {
> +		for (i = 0; i < 3; i++) {
> +			kernel_args[i] = info->grid[i];
> +			kernel_args[i + 3] = info->grid[i] * info->block[i];
> +			kernel_args[i + 6] = info->block[i];
> +		}
>  	}
>
>  	memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
> @@ -351,15 +545,18 @@ static void si_upload_compute_input(struct si_context *sctx,
>  			kernel_args[i]);
>  	}
>
> -	kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
>
>  	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
>  				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
>
> -	radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
> -	radeon_emit(cs, kernel_args_va);
> -	radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
> -	                S_008F04_STRIDE(0));
> +	if (code_object) {
> +		si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
> +	} else {
> +		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
> +		radeon_emit(cs, kernel_args_va);
> +		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
> +		                S_008F04_STRIDE(0));
> +	}
>
>  	r600_resource_reference(&input_buffer, NULL);
>  }
> @@ -442,6 +639,8 @@ static void si_launch_grid(
>  {
>  	struct si_context *sctx = (struct si_context*)ctx;
>  	struct si_compute *program = sctx->cs_shader_state.program;
> +	const amd_kernel_code_t *code_object =
> +		si_compute_get_code_object(program, info->pc);
>  	int i;
>  	/* HW bug workaround when CS threadgroups > 256 threads and async
>  	 * compute isn't used, i.e. only one compute job can run at a time.
> @@ -469,7 +668,8 @@ static void si_launch_grid(
>  	if (sctx->b.flags)
>  		si_emit_cache_flush(sctx, NULL);
>
> -	if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
> +	if (!si_switch_compute_shader(sctx, program, &program->shader,
> +					code_object, info->pc))
>  		return;
>
>  	si_upload_compute_shader_descriptors(sctx);
> @@ -482,7 +682,7 @@ static void si_launch_grid(
>  	}
>
>  	if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
> -		si_upload_compute_input(sctx, info);
> +		si_upload_compute_input(sctx, code_object, info);
>
>  	/* Global buffers */
>  	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
>