[Mesa-dev] [PATCH] radeonsi: pass the scratch buffer via user SGPRs on LLVM 4.0

Fri Dec 9 17:17:35 UTC 2016

From: Marek Olšák <marek.olsak at amd.com>

TGSI compute shaders don't have RW_BUFFERS, so the scratch buffer address is
passed to them in user SGPR[0:1]. Graphics shaders get the scratch buffer from
the first slot of RW_BUFFERS.

TODO: Dave's patch only implements the latter; the attribute names still need
to be fixed.

UNTESTED
---
 src/gallium/drivers/radeonsi/si_compute.c       |  27 +++++--
 src/gallium/drivers/radeonsi/si_shader.c        |  34 +++++---
 src/gallium/drivers/radeonsi/si_shader.h        |   1 +
 src/gallium/drivers/radeonsi/si_state.h         |   1 +
 src/gallium/drivers/radeonsi/si_state_draw.c    |   8 ++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 102 +++++++++++++-----------
 6 files changed, 111 insertions(+), 62 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 9d83cb3..8a4c02e 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -287,21 +287,23 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
 		r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
 
 		sctx->compute_scratch_buffer = (struct r600_resource*)
 			pipe_buffer_create(&sctx->screen->b.b, 0,
 					   PIPE_USAGE_DEFAULT, scratch_needed);
 
 		if (!sctx->compute_scratch_buffer)
 			return false;
 	}
 
-	if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
+	if (HAVE_LLVM <= 0x0309 &&
+	    scratch_needed &&
+	    sctx->compute_scratch_buffer != shader->scratch_bo) {
 		uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
 
 		si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va);
 
 		if (si_shader_binary_upload(sctx->screen, shader))
 			return false;
 
 		r600_resource_reference(&shader->scratch_bo,
 		                        sctx->compute_scratch_buffer);
 	}
@@ -351,30 +353,43 @@ static bool si_switch_compute_shader(struct si_context *sctx,
 		/* TODO: use si_multiwave_lds_size_workaround */
 		assert(lds_blocks <= 0xFF);
 
 		config->rsrc2 &= C_00B84C_LDS_SIZE;
 		config->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
 	}
 
 	if (!si_setup_compute_scratch_buffer(sctx, shader, config))
 		return false;
 
-	if (shader->scratch_bo) {
+	if (config->scratch_bytes_per_wave) {
 		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
 		            "Total Scratch: %u bytes\n", sctx->scratch_waves,
 			    config->scratch_bytes_per_wave,
 			    config->scratch_bytes_per_wave *
 			    sctx->scratch_waves);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-			      shader->scratch_bo, RADEON_USAGE_READWRITE,
-			      RADEON_PRIO_SCRATCH_BUFFER);
+					  sctx->compute_scratch_buffer,
+					  RADEON_USAGE_READWRITE,
+					  RADEON_PRIO_SCRATCH_BUFFER);
+
+		/* Write the scratch pointer to SGPR[0:1]. */
+		if (HAVE_LLVM >= 0x0400 &&
+		    program->ir_type == PIPE_SHADER_IR_TGSI) {
+			uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+			radeon_emit(cs, scratch_va);
+			radeon_emit(cs,
+				    S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+				    S_008F04_SWIZZLE_ENABLE(1));
+		}
 	}
 
 	shader_va = shader->bo->gpu_address + offset;
 	if (program->use_code_object_v2) {
 		/* Shader code is placed after the amd_kernel_code_t
 		 * struct. */
 		shader_va += sizeof(amd_kernel_code_t);
 	}
 
 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
@@ -729,21 +744,23 @@ static void si_launch_grid(
 
 	si_upload_compute_shader_descriptors(sctx);
 	si_emit_compute_shader_userdata(sctx);
 
 	if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
 		sctx->atoms.s.render_cond->emit(&sctx->b,
 		                                sctx->atoms.s.render_cond);
 		si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
 	}
 
-	if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
+	if (program->ir_type == PIPE_SHADER_IR_TGSI)
+		assert(program->input_size == 0);
+	else if (program->ir_type == PIPE_SHADER_IR_NATIVE)
 		si_upload_compute_input(sctx, code_object, info);
 
 	/* Global buffers */
 	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
 		struct r600_resource *buffer =
 				(struct r600_resource*)program->global_buffers[i];
 		if (!buffer) {
 			continue;
 		}
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ed8eff4..507a44d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5321,20 +5321,28 @@ static void si_create_function(struct si_shader_context *ctx,
 		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 						   "no-infs-fp-math",
 						   "true");
 		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 						   "no-nans-fp-math",
 						   "true");
 		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 						   "unsafe-fp-math",
 						   "true");
 	}
+
+	if (ctx->type == PIPE_SHADER_COMPUTE) {
+		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
+						   "amdgpu-spill-bufsgpr01", "true");
+	} else {
+		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
+						   "amdgpu-spill-bufsgpr01-load", "true");
+	}
 }
 
 static void create_meta_data(struct si_shader_context *ctx)
 {
 	struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm;
 
 	ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
 							       "invariant.load", 14);
 	ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
 						     "range", 5);
@@ -5762,32 +5770,36 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
 	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
 }
 
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				  struct si_shader_config *conf,
 				  unsigned symbol_offset)
 {
 	unsigned i;
 	const unsigned char *config =
 		radeon_shader_binary_config_start(binary, symbol_offset);
-	bool really_needs_scratch = false;
+	bool may_need_scratch = true;
 
-	/* LLVM adds SGPR spills to the scratch size.
-	 * Find out if we really need the scratch buffer.
-	 */
-	for (i = 0; i < binary->reloc_count; i++) {
-		const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+	if (HAVE_LLVM <= 0x0309) {
+		/* LLVM adds SGPR spills to the scratch size.
+		 * Find out if we really need the scratch buffer.
+		 */
+		may_need_scratch = false;
 
-		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
-		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-			really_needs_scratch = true;
-			break;
+		for (i = 0; i < binary->reloc_count; i++) {
+			const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+
+			if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
+			    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+				may_need_scratch = true;
+				break;
+			}
 		}
 	}
 
 	/* XXX: We may be able to emit some of these values directly rather than
 	 * extracting fields to be emitted later.
 	 */
 
 	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
 		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
 		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
@@ -5810,21 +5822,21 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			conf->spi_ps_input_ena = value;
 			break;
 		case R_0286D0_SPI_PS_INPUT_ADDR:
 			conf->spi_ps_input_addr = value;
 			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			if (really_needs_scratch)
+			if (may_need_scratch)
 				conf->scratch_bytes_per_wave =
 					G_00B860_WAVESIZE(value) * 256 * 4;
 			break;
 		case 0x4: /* SPILLED_SGPRS */
 			conf->spilled_sgprs = value;
 			break;
 		case 0x8: /* SPILLED_VGPRS */
 			conf->spilled_vgprs = value;
 			break;
 		default:
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 129e571..b30f61b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -483,20 +483,21 @@ struct si_shader_info {
 
 struct si_shader {
 	struct si_shader_selector	*selector;
 	struct si_shader		*next_variant;
 
 	struct si_shader_part		*prolog;
 	struct si_shader_part		*epilog;
 
 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
+	/* for tracking which scratch address the binary contains (<= LLVM 3.9) */
 	struct r600_resource		*scratch_bo;
 	struct si_shader_key		key;
 	struct util_queue_fence		optimized_ready;
 	bool				compilation_failed;
 	bool				is_monolithic;
 	bool				is_optimized;
 	bool				is_binary_shared;
 	bool				is_gs_copy_shader;
 
 	/* The following data is all that's needed for binary shaders. */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index d8e6024..b6b089a 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -154,20 +154,21 @@ union si_state_atoms {
 
 #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
 
 struct si_shader_data {
 	struct r600_atom	atom;
 	uint32_t		sh_base[SI_NUM_SHADERS];
 };
 
 /* Private read-write buffer slots. */
 enum {
+	SI_SCRATCH_BUFFER,
 	SI_HS_RING_TESS_FACTOR,
 	SI_HS_RING_TESS_OFFCHIP,
 
 	SI_ES_RING_ESGS,
 	SI_GS_RING_ESGS,
 
 	SI_GS_RING_GSVS0,
 	SI_GS_RING_GSVS1,
 	SI_GS_RING_GSVS2,
 	SI_GS_RING_GSVS3,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cae19dc..e447e32 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1016,20 +1016,28 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 		if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
 			sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
 			sctx->do_update_shaders = true;
 		}
 	}
 
 	if (sctx->do_update_shaders && !si_update_shaders(sctx))
 		return;
 
+	/* Do it after si_update_shaders, but before
+	 * si_upload_graphics_shader_descriptors. */
+	if (HAVE_LLVM >= 0x0400 && sctx->emit_scratch_reloc) {
+		si_set_ring_buffer(ctx, SI_SCRATCH_BUFFER,
+				   &sctx->scratch_buffer->b.b,
+				   0, 0xffffffff, true, true, 4, 64, 0);
+	}
+
 	if (!si_upload_graphics_shader_descriptors(sctx))
 		return;
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
 		pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer);
 		ib.user_buffer = sctx->index_buffer.user_buffer;
 		ib.index_size = sctx->index_buffer.index_size;
 		ib.offset = sctx->index_buffer.offset;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 0afc3b4..bb9f3a8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2130,90 +2130,100 @@ static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 	unsigned bytes = 0;
 
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
 	return bytes;
 }
 
+static bool si_update_scratch_relocs(struct si_context *sctx)
+{
+	int r;
+
+	/* Update the shaders, so they are using the latest scratch.  The
+	 * scratch buffer may have been changed since these shaders were
+	 * last used, so we still need to try to update them, even if
+	 * they require scratch buffers smaller than the current size.
+	 */
+	r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1)
+		si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+	r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1)
+		si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+	r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1)
+		si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
+
+	/* VS can be bound as LS, ES, or VS. */
+	r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1) {
+		if (sctx->tes_shader.current)
+			si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+		else if (sctx->gs_shader.current)
+			si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+		else
+			si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+	}
+
+	/* TES can be bound as ES or VS. */
+	r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1) {
+		if (sctx->gs_shader.current)
+			si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+		else
+			si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+	}
+
+	return true;
+}
+
 static bool si_update_spi_tmpring_size(struct si_context *sctx)
 {
 	unsigned current_scratch_buffer_size =
 		si_get_current_scratch_buffer_size(sctx);
 	unsigned scratch_bytes_per_wave =
 		si_get_max_scratch_bytes_per_wave(sctx);
 	unsigned scratch_needed_size = scratch_bytes_per_wave *
 		sctx->scratch_waves;
 	unsigned spi_tmpring_size;
-	int r;
 
 	if (scratch_needed_size > 0) {
 		if (scratch_needed_size > current_scratch_buffer_size) {
 			/* Create a bigger scratch buffer */
 			r600_resource_reference(&sctx->scratch_buffer, NULL);
 
 			sctx->scratch_buffer = (struct r600_resource*)
 					pipe_buffer_create(&sctx->screen->b.b, 0,
 	                                PIPE_USAGE_DEFAULT, scratch_needed_size);
 			if (!sctx->scratch_buffer)
 				return false;
 			sctx->emit_scratch_reloc = true;
 		}
 
-		/* Update the shaders, so they are using the latest scratch.  The
-		 * scratch buffer may have been changed since these shaders were
-		 * last used, so we still need to try to update them, even if
-		 * they require scratch buffers smaller than the current size.
-		 */
-		r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1)
-			si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
-		r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1)
-			si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-
-		r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1)
-			si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
-
-		/* VS can be bound as LS, ES, or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1) {
-			if (sctx->tes_shader.current)
-				si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
-			else if (sctx->gs_shader.current)
-				si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
-			else
-				si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
-		}
-
-		/* TES can be bound as ES or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
-		if (r < 0)
+		if (HAVE_LLVM <= 0x0309 &&
+		    !si_update_scratch_relocs(sctx))
 			return false;
-		if (r == 1) {
-			if (sctx->gs_shader.current)
-				si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
-			else
-				si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
-		}
 	}
 
 	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
 	assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
 		"scratch size should already be aligned correctly.");
 
 	spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
 			   S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
 	if (spi_tmpring_size != sctx->spi_tmpring_size) {
 		sctx->spi_tmpring_size = spi_tmpring_size;
-- 
2.7.4