[Mesa-dev] [PATCH 4/4] radeonsi: Enable VGPR spilling for all shader types

Tom Stellard thomas.stellard at amd.com
Wed Jan 7 13:03:45 PST 2015


---
 src/gallium/drivers/radeonsi/si_compute.c       | 40 ++------------
 src/gallium/drivers/radeonsi/si_shader.c        | 69 ++++++++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_shader.h        |  7 ++-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 36 +++++++++++--
 4 files changed, 108 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 37e5c42..e5f158d 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -42,12 +42,6 @@
 #define NUM_USER_SGPRS 4
 #endif
 
-static const char *scratch_rsrc_dword0_symbol =
-	"SCRATCH_RSRC_DWORD0";
-
-static const char *scratch_rsrc_dword1_symbol =
-	"SCRATCH_RSRC_DWORD1";
-
 struct si_compute {
 	struct si_context *ctx;
 
@@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch(
 	return scratch_waves;
 }
 
-static void apply_scratch_relocs(const struct si_screen *sscreen,
-			const struct radeon_shader_binary *binary,
-			struct si_shader *shader, uint64_t scratch_va) {
-	unsigned i;
-	char *ptr;
-	uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
-	uint32_t scratch_rsrc_dword1 =
-		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
-
-	if (!binary->reloc_count) {
-		return;
-	}
-
-	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
-					PIPE_TRANSFER_READ_WRITE);
-	for (i = 0 ; i < binary->reloc_count; i++) {
-		const struct radeon_shader_reloc *reloc = &binary->relocs[i];
-		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
-			util_memcpy_cpu_to_le32(ptr + reloc->offset,
-				&scratch_rsrc_dword0, 4);
-		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-			util_memcpy_cpu_to_le32(ptr + reloc->offset,
-				&scratch_rsrc_dword1, 4);
-		}
-	}
-	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
-}
-
 static void si_launch_grid(
 		struct pipe_context *ctx,
 		const uint *block_layout, const uint *grid_layout,
@@ -256,7 +221,8 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
 	/* Read the config information */
-	si_shader_binary_read_config(&program->binary, &program->program, pc);
+	si_shader_binary_read_config(sctx->screen, &program->binary,
+					&program->program, pc);
 #endif
 
 	/* Upload the kernel arguments */
@@ -295,7 +261,7 @@ static void si_launch_grid(
 				RADEON_PRIO_SHADER_RESOURCE_RW);
 
 		/* Patch the shader with the scratch buffer address. */
-		apply_scratch_relocs(sctx->screen,
+		si_shader_apply_scratch_relocs(sctx->screen,
 			&program->binary, shader, scratch_buffer_va);
 
 	}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cf28860..d59e736 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -46,6 +46,12 @@
 
 #include <errno.h>
 
+static const char *scratch_rsrc_dword0_symbol =
+	"SCRATCH_RSRC_DWORD0";
+
+static const char *scratch_rsrc_dword1_symbol =
+	"SCRATCH_RSRC_DWORD1";
+
 struct si_shader_output_values
 {
 	LLVMValueRef values[4];
@@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_binary_read_config(const struct si_screen *sscreen,
+				const struct radeon_shader_binary *binary,
 				struct si_shader *shader,
 				unsigned symbol_offset)
 {
@@ -2549,6 +2556,14 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
 			break;
+		case R_0286E8_SPI_TMPRING_SIZE:
+			/* XXX: This is the maximum value allowed.  I'm not sure
+			 * how to compute this for non-cs shaders.
+			 */
+			shader->scratch_waves =
+				32 * sscreen->b.info.max_compute_units;
+			/* Fall-through */
+
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
 			shader->scratch_bytes_per_wave =
@@ -2562,6 +2577,36 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
 	}
 }
 
+void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
+			const struct radeon_shader_binary *binary,
+			struct si_shader *shader, uint64_t scratch_va) {
+	unsigned i;
+	char *ptr;
+	uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
+	uint32_t scratch_rsrc_dword1 =
+		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
+		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+
+	if (!binary->reloc_count) {
+		return;
+	}
+
+	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
+					PIPE_TRANSFER_READ_WRITE);
+	for (i = 0 ; i < binary->reloc_count; i++) {
+		const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
+			util_memcpy_cpu_to_le32(ptr + reloc->offset,
+				&scratch_rsrc_dword0, 4);
+		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+			util_memcpy_cpu_to_le32(ptr + reloc->offset,
+				&scratch_rsrc_dword1, 4);
+		}
+	}
+	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
+}
+
+
 int si_shader_binary_read(struct si_screen *sscreen,
 			struct si_shader *shader,
 			const struct radeon_shader_binary *binary)
@@ -2582,7 +2627,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
 		}
 	}
 
-	si_shader_binary_read_config(binary, shader, 0);
+	si_shader_binary_read_config(sscreen, binary, shader, 0);
 
 	/* copy new shader */
 	code_size = binary->code_size + binary->rodata_size;
@@ -2601,6 +2646,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
 		util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
 	}
 
+
 	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
 
 	return 0;
@@ -2621,6 +2667,25 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		return r;
 	}
 	r = si_shader_binary_read(sscreen, shader, &binary);
+
+	if (shader->scratch_bytes_per_wave > 0) {
+		uint64_t scratch_buffer_va;
+		unsigned scratch_bytes = shader->scratch_bytes_per_wave *
+					shader->scratch_waves;
+		/* It's possible for different shader variants to have
+		 * different scratch buffer sizes.  The scratch buffer sizes
+		 * won't vary by too much, so allocating double the scratch
+		 * space needed for the first variant should be enough for the
+		 * rest if they need it.
+		 */
+		shader->scratch_bo = (struct r600_resource*)
+				si_resource_create_custom(&sscreen->b.b,
+				PIPE_USAGE_DEFAULT, scratch_bytes * 2);
+		scratch_buffer_va = shader->scratch_bo->gpu_address;
+		si_shader_apply_scratch_relocs(sscreen, &binary, shader,
+			scratch_buffer_va);
+	}
+
 	FREE(binary.code);
 	FREE(binary.config);
 	FREE(binary.rodata);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 08e344a..c5b274e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -147,6 +147,7 @@ struct si_shader {
 	unsigned			lds_size;
 	unsigned			spi_ps_input_ena;
 	unsigned			scratch_bytes_per_wave;
+	unsigned			scratch_waves;
 	unsigned			spi_shader_col_format;
 	unsigned			spi_shader_z_format;
 	unsigned			db_shader_control;
@@ -185,7 +186,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 		const struct radeon_shader_binary *binary);
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
+			const struct radeon_shader_binary *binary,
+			struct si_shader *shader, uint64_t scratch_va);
+void si_shader_binary_read_config(const struct si_screen *sscreen,
+				const struct radeon_shader_binary *binary,
 				struct si_shader *shader,
 				unsigned symbol_offset);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 817a990..b6a0f32 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,22 @@
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
+static void si_shader_setup_scratch_buffer(struct si_shader *shader,
+					struct si_pm4_state *pm4) {
+	if (shader->scratch_bytes_per_wave == 0) {
+		return;
+	}
+
+	assert(shader->scratch_bo);
+
+	si_pm4_set_reg(pm4, R_0286E8_SPI_TMPRING_SIZE,
+			S_0286E8_WAVES(shader->scratch_waves)
+			| S_0286E8_WAVESIZE(shader->scratch_bytes_per_wave >> 10));
+
+	si_pm4_add_bo(pm4, shader->scratch_bo, RADEON_USAGE_READWRITE,
+				RADEON_PRIO_SHADER_RESOURCE_RW);
+}
+
 static void si_shader_es(struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
@@ -67,7 +83,10 @@ static void si_shader_es(struct si_shader *shader)
 		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
-		       S_00B32C_USER_SGPR(num_user_sgprs));
+		       S_00B32C_USER_SGPR(num_user_sgprs) |
+		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+
+	si_shader_setup_scratch_buffer(shader, pm4);
 }
 
 static void si_shader_gs(struct si_shader *shader)
@@ -136,7 +155,10 @@ static void si_shader_gs(struct si_shader *shader)
 		       S_00B228_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
-		       S_00B22C_USER_SGPR(num_user_sgprs));
+		       S_00B22C_USER_SGPR(num_user_sgprs) |
+		       S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+
+	si_shader_setup_scratch_buffer(shader, pm4);
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -216,7 +238,8 @@ static void si_shader_vs(struct si_shader *shader)
 		       S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
 		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
 		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
-		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs));
+		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
+		       S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 	if (window_space)
 		si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 			       S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -226,6 +249,8 @@ static void si_shader_vs(struct si_shader *shader)
 			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
 			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
 			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+
+	si_shader_setup_scratch_buffer(shader, pm4);
 }
 
 static void si_shader_ps(struct si_shader *shader)
@@ -307,7 +332,10 @@ static void si_shader_ps(struct si_shader *shader)
 		       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
 		       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
-		       S_00B02C_USER_SGPR(num_user_sgprs));
+		       S_00B02C_USER_SGPR(num_user_sgprs) |
+		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+
+	si_shader_setup_scratch_buffer(shader, pm4);
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
-- 
1.8.5.5



More information about the mesa-dev mailing list