[Mesa-dev] [PATCH] radeonsi: pass the scratch buffer via user SGPRs on LLVM 4.0
Marek Olšák
maraeo at gmail.com
Fri Dec 9 17:17:35 UTC 2016
From: Marek Olšák <marek.olsak at amd.com>
TGSI compute shaders don't have RW_BUFFERS, so the scratch buffer pointer is passed to them in user SGPR[0:1].
Graphics shaders get the scratch buffer from the first slot of RW_BUFFERS (SI_SCRATCH_BUFFER).
TODO: Dave's patch only implements the latter; fix the attribute names.
UNTESTED
---
src/gallium/drivers/radeonsi/si_compute.c | 27 +++++--
src/gallium/drivers/radeonsi/si_shader.c | 34 +++++---
src/gallium/drivers/radeonsi/si_shader.h | 1 +
src/gallium/drivers/radeonsi/si_state.h | 1 +
src/gallium/drivers/radeonsi/si_state_draw.c | 8 ++
src/gallium/drivers/radeonsi/si_state_shaders.c | 102 +++++++++++++-----------
6 files changed, 111 insertions(+), 62 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 9d83cb3..8a4c02e 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -287,21 +287,23 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
sctx->compute_scratch_buffer = (struct r600_resource*)
pipe_buffer_create(&sctx->screen->b.b, 0,
PIPE_USAGE_DEFAULT, scratch_needed);
if (!sctx->compute_scratch_buffer)
return false;
}
- if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
+ if (HAVE_LLVM <= 0x0309 &&
+ scratch_needed &&
+ sctx->compute_scratch_buffer != shader->scratch_bo) {
uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va);
if (si_shader_binary_upload(sctx->screen, shader))
return false;
r600_resource_reference(&shader->scratch_bo,
sctx->compute_scratch_buffer);
}
@@ -351,30 +353,43 @@ static bool si_switch_compute_shader(struct si_context *sctx,
/* TODO: use si_multiwave_lds_size_workaround */
assert(lds_blocks <= 0xFF);
config->rsrc2 &= C_00B84C_LDS_SIZE;
config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
}
if (!si_setup_compute_scratch_buffer(sctx, shader, config))
return false;
- if (shader->scratch_bo) {
+ if (config->scratch_bytes_per_wave) {
COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
"Total Scratch: %u bytes\n", sctx->scratch_waves,
config->scratch_bytes_per_wave,
config->scratch_bytes_per_wave *
sctx->scratch_waves);
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
- shader->scratch_bo, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SCRATCH_BUFFER);
+ sctx->compute_scratch_buffer,
+ RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SCRATCH_BUFFER);
+
+ /* Write the scratch pointer to SGPR[0:1]. */
+ if (HAVE_LLVM >= 0x0400 &&
+ program->ir_type == PIPE_SHADER_IR_TGSI) {
+ uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+ radeon_emit(cs, scratch_va);
+ radeon_emit(cs,
+ S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+ S_008F04_SWIZZLE_ENABLE(1));
+ }
}
shader_va = shader->bo->gpu_address + offset;
if (program->use_code_object_v2) {
/* Shader code is placed after the amd_kernel_code_t
* struct. */
shader_va += sizeof(amd_kernel_code_t);
}
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
@@ -729,21 +744,23 @@ static void si_launch_grid(
si_upload_compute_shader_descriptors(sctx);
si_emit_compute_shader_userdata(sctx);
if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
sctx->atoms.s.render_cond->emit(&sctx->b,
sctx->atoms.s.render_cond);
si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
}
- if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
+ if (program->ir_type == PIPE_SHADER_IR_TGSI)
+ assert(program->input_size == 0);
+ else if (program->ir_type == PIPE_SHADER_IR_NATIVE)
si_upload_compute_input(sctx, code_object, info);
/* Global buffers */
for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
struct r600_resource *buffer =
(struct r600_resource*)program->global_buffers[i];
if (!buffer) {
continue;
}
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ed8eff4..507a44d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5321,20 +5321,28 @@ static void si_create_function(struct si_shader_context *ctx,
LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
"no-infs-fp-math",
"true");
LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
"no-nans-fp-math",
"true");
LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
"unsafe-fp-math",
"true");
}
+
+ if (ctx->type == PIPE_SHADER_COMPUTE) {
+ LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
+ "amdgpu-spill-bufsgpr01", "true");
+ } else {
+ LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
+ "amdgpu-spill-bufsgpr01-load", "true");
+ }
}
static void create_meta_data(struct si_shader_context *ctx)
{
struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm;
ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
"invariant.load", 14);
ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
"range", 5);
@@ -5762,32 +5770,36 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
struct si_shader_config *conf,
unsigned symbol_offset)
{
unsigned i;
const unsigned char *config =
radeon_shader_binary_config_start(binary, symbol_offset);
- bool really_needs_scratch = false;
+ bool may_need_scratch = true;
- /* LLVM adds SGPR spills to the scratch size.
- * Find out if we really need the scratch buffer.
- */
- for (i = 0; i < binary->reloc_count; i++) {
- const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+ if (HAVE_LLVM <= 0x0309) {
+ /* LLVM adds SGPR spills to the scratch size.
+ * Find out if we really need the scratch buffer.
+ */
+ may_need_scratch = false;
- if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
- !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
- really_needs_scratch = true;
- break;
+ for (i = 0; i < binary->reloc_count; i++) {
+ const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+
+ if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
+ !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+ may_need_scratch = true;
+ break;
+ }
}
}
/* XXX: We may be able to emit some of these values directly rather than
* extracting fields to be emitted later.
*/
for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
@@ -5810,21 +5822,21 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
break;
case R_0286CC_SPI_PS_INPUT_ENA:
conf->spi_ps_input_ena = value;
break;
case R_0286D0_SPI_PS_INPUT_ADDR:
conf->spi_ps_input_addr = value;
break;
case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
- if (really_needs_scratch)
+ if (may_need_scratch)
conf->scratch_bytes_per_wave =
G_00B860_WAVESIZE(value) * 256 * 4;
break;
case 0x4: /* SPILLED_SGPRS */
conf->spilled_sgprs = value;
break;
case 0x8: /* SPILLED_VGPRS */
conf->spilled_vgprs = value;
break;
default:
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 129e571..b30f61b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -483,20 +483,21 @@ struct si_shader_info {
struct si_shader {
struct si_shader_selector *selector;
struct si_shader *next_variant;
struct si_shader_part *prolog;
struct si_shader_part *epilog;
struct si_pm4_state *pm4;
struct r600_resource *bo;
+ /* for tracking which scratch address the binary contains (<= LLVM 3.9) */
struct r600_resource *scratch_bo;
struct si_shader_key key;
struct util_queue_fence optimized_ready;
bool compilation_failed;
bool is_monolithic;
bool is_optimized;
bool is_binary_shared;
bool is_gs_copy_shader;
/* The following data is all that's needed for binary shaders. */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index d8e6024..b6b089a 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -154,20 +154,21 @@ union si_state_atoms {
#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
struct si_shader_data {
struct r600_atom atom;
uint32_t sh_base[SI_NUM_SHADERS];
};
/* Private read-write buffer slots. */
enum {
+ SI_SCRATCH_BUFFER,
SI_HS_RING_TESS_FACTOR,
SI_HS_RING_TESS_OFFCHIP,
SI_ES_RING_ESGS,
SI_GS_RING_ESGS,
SI_GS_RING_GSVS0,
SI_GS_RING_GSVS1,
SI_GS_RING_GSVS2,
SI_GS_RING_GSVS3,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cae19dc..e447e32 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1016,20 +1016,28 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
sctx->do_update_shaders = true;
}
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
return;
+ /* Do it after si_update_shaders, but before
+ * si_upload_graphics_shader_descriptors. */
+ if (HAVE_LLVM >= 0x0400 && sctx->emit_scratch_reloc) {
+ si_set_ring_buffer(ctx, SI_SCRATCH_BUFFER,
+ &sctx->scratch_buffer->b.b,
+ 0, 0xffffffff, true, true, 4, 64, 0);
+ }
+
if (!si_upload_graphics_shader_descriptors(sctx))
return;
if (info->indexed) {
/* Initialize the index buffer struct. */
pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer);
ib.user_buffer = sctx->index_buffer.user_buffer;
ib.index_size = sctx->index_buffer.index_size;
ib.offset = sctx->index_buffer.offset;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 0afc3b4..bb9f3a8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2130,90 +2130,100 @@ static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
unsigned bytes = 0;
bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
return bytes;
}
+static bool si_update_scratch_relocs(struct si_context *sctx)
+{
+ int r;
+
+ /* Update the shaders, so they are using the latest scratch. The
+ * scratch buffer may have been changed since these shaders were
+ * last used, so we still need to try to update them, even if
+ * they require scratch buffers smaller than the current size.
+ */
+ r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1)
+ si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+ r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1)
+ si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+ r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1)
+ si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
+
+ /* VS can be bound as LS, ES, or VS. */
+ r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1) {
+ if (sctx->tes_shader.current)
+ si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+ else if (sctx->gs_shader.current)
+ si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+ else
+ si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+ }
+
+ /* TES can be bound as ES or VS. */
+ r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1) {
+ if (sctx->gs_shader.current)
+ si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+ else
+ si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+ }
+
+ return true;
+}
+
static bool si_update_spi_tmpring_size(struct si_context *sctx)
{
unsigned current_scratch_buffer_size =
si_get_current_scratch_buffer_size(sctx);
unsigned scratch_bytes_per_wave =
si_get_max_scratch_bytes_per_wave(sctx);
unsigned scratch_needed_size = scratch_bytes_per_wave *
sctx->scratch_waves;
unsigned spi_tmpring_size;
- int r;
if (scratch_needed_size > 0) {
if (scratch_needed_size > current_scratch_buffer_size) {
/* Create a bigger scratch buffer */
r600_resource_reference(&sctx->scratch_buffer, NULL);
sctx->scratch_buffer = (struct r600_resource*)
pipe_buffer_create(&sctx->screen->b.b, 0,
PIPE_USAGE_DEFAULT, scratch_needed_size);
if (!sctx->scratch_buffer)
return false;
sctx->emit_scratch_reloc = true;
}
- /* Update the shaders, so they are using the latest scratch. The
- * scratch buffer may have been changed since these shaders were
- * last used, so we still need to try to update them, even if
- * they require scratch buffers smaller than the current size.
- */
- r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
- if (r < 0)
- return false;
- if (r == 1)
- si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
- r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
- if (r < 0)
- return false;
- if (r == 1)
- si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-
- r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
- if (r < 0)
- return false;
- if (r == 1)
- si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
-
- /* VS can be bound as LS, ES, or VS. */
- r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
- if (r < 0)
- return false;
- if (r == 1) {
- if (sctx->tes_shader.current)
- si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
- else if (sctx->gs_shader.current)
- si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
- else
- si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
- }
-
- /* TES can be bound as ES or VS. */
- r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
- if (r < 0)
+ if (HAVE_LLVM <= 0x0309 &&
+ !si_update_scratch_relocs(sctx))
return false;
- if (r == 1) {
- if (sctx->gs_shader.current)
- si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
- else
- si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
- }
}
/* The LLVM shader backend should be reporting aligned scratch_sizes. */
assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
"scratch size should already be aligned correctly.");
spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
if (spi_tmpring_size != sctx->spi_tmpring_size) {
sctx->spi_tmpring_size = spi_tmpring_size;
--
2.7.4
More information about the mesa-dev mailing list