[Mesa-dev] [PATCH] radeonsi: reduce LDS stalls by 40% for tessellation
Marek Olšák
maraeo at gmail.com
Fri Jul 13 21:35:36 UTC 2018
From: Marek Olšák <marek.olsak at amd.com>
The 40% figure is the decrease in the LGKM counter (which also counts SMEM)
measured for the GFX9 LSHS stage.
This makes the LDS size slightly larger. I wasn't able to increase the patch
stride without corruption, so I'm increasing the vertex stride instead.
---
src/gallium/drivers/radeonsi/si_shader.c | 8 ++++----
src/gallium/drivers/radeonsi/si_shader.h | 3 ++-
src/gallium/drivers/radeonsi/si_state_draw.c | 2 +-
src/gallium/drivers/radeonsi/si_state_shaders.c | 7 +++++++
4 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5dc12d87243..43ba23ff494 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -410,28 +410,28 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
}
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
{
unsigned stride;
switch (ctx->type) {
case PIPE_SHADER_VERTEX:
- stride = util_last_bit64(ctx->shader->selector->outputs_written);
- return LLVMConstInt(ctx->i32, stride * 4, 0);
+ stride = ctx->shader->selector->lshs_vertex_stride / 4;
+ return LLVMConstInt(ctx->i32, stride, 0);
case PIPE_SHADER_TESS_CTRL:
if (ctx->screen->info.chip_class >= GFX9 &&
ctx->shader->is_monolithic) {
- stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
- return LLVMConstInt(ctx->i32, stride * 4, 0);
+ stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+ return LLVMConstInt(ctx->i32, stride, 0);
}
return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
default:
assert(0);
return NULL;
}
}
static LLVMValueRef get_instance_index_for_fetch(
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ffe13b761d9..3f3294eee37 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -361,21 +361,22 @@ struct si_shader_selector {
/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
unsigned type;
bool vs_needs_prolog;
bool force_correct_derivs_after_kill;
unsigned pa_cl_vs_out_cntl;
ubyte clipdist_mask;
ubyte culldist_mask;
/* ES parameters. */
- unsigned esgs_itemsize;
+ unsigned esgs_itemsize; /* vertex stride */
+ unsigned lshs_vertex_stride;
/* GS parameters. */
unsigned gs_input_verts_per_prim;
unsigned gs_output_prim;
unsigned gs_max_out_vertices;
unsigned gs_num_invocations;
unsigned max_gs_stream; /* count - 1 */
unsigned gsvs_vertex_size;
unsigned max_gsvs_emit_size;
unsigned enabled_streamout_buffer_mask;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index f35f73a37ce..d901401f0bb 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -127,21 +127,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
num_tcs_outputs = util_last_bit64(tcs->outputs_written);
num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
} else {
/* No TCS. Route varyings from LS to TES. */
num_tcs_outputs = num_tcs_inputs;
num_tcs_output_cp = num_tcs_input_cp;
num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
}
- input_vertex_size = num_tcs_inputs * 16;
+ input_vertex_size = ls->lshs_vertex_stride;
output_vertex_size = num_tcs_outputs * 16;
input_patch_size = num_tcs_input_cp * input_vertex_size;
pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
/* Ensure that we only need one wave per SIMD so we don't need to check
* resource usage. Also ensures that the number of tcs in and out
* vertices per threadgroup are at most 256.
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ffc8821df09..32d804c3fc5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2153,20 +2153,27 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->outputs_written |=
1ull << si_shader_io_get_unique_index(name, index, false);
sel->outputs_written_before_ps |=
1ull << si_shader_io_get_unique_index(name, index, true);
break;
case TGSI_SEMANTIC_EDGEFLAG:
break;
}
}
sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+ sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+ /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+ * starts on a different bank (except at the maximum stride, 32*16).
+ */
+ if (sel->lshs_vertex_stride < 32*16)
+ sel->lshs_vertex_stride += 4;
/* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
* conflicts, i.e. each vertex will start at a different bank.
*/
if (sctx->chip_class >= GFX9)
sel->esgs_itemsize += 4;
assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
break;
--
2.17.1
More information about the mesa-dev
mailing list