[Mesa-dev] [PATCH] radeonsi: reduce LDS stalls by 40% for tessellation

Fri Jul 13 21:35:36 UTC 2018

From: Marek Olšák <marek.olsak at amd.com>

40% is the decrease in the LGKM counter (which includes SMEM too)
for the GFX9 LSHS stage.

This will make the LDS size slightly larger, but I wasn't able to increase
the patch stride without corruption, so I'm increasing the vertex stride.
---
 src/gallium/drivers/radeonsi/si_shader.c        | 8 ++++----
 src/gallium/drivers/radeonsi/si_shader.h        | 3 ++-
 src/gallium/drivers/radeonsi/si_state_draw.c    | 2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 7 +++++++
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5dc12d87243..43ba23ff494 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -410,28 +410,28 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 
 	return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 }
 
 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 {
 	unsigned stride;
 
 	switch (ctx->type) {
 	case PIPE_SHADER_VERTEX:
-		stride = util_last_bit64(ctx->shader->selector->outputs_written);
-		return LLVMConstInt(ctx->i32, stride * 4, 0);
+		stride = ctx->shader->selector->lshs_vertex_stride / 4;
+		return LLVMConstInt(ctx->i32, stride, 0);
 
 	case PIPE_SHADER_TESS_CTRL:
 		if (ctx->screen->info.chip_class >= GFX9 &&
 		    ctx->shader->is_monolithic) {
-			stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
-			return LLVMConstInt(ctx->i32, stride * 4, 0);
+			stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+			return LLVMConstInt(ctx->i32, stride, 0);
 		}
 		return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
 	default:
 		assert(0);
 		return NULL;
 	}
 }
 
 static LLVMValueRef get_instance_index_for_fetch(
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ffe13b761d9..3f3294eee37 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -361,21 +361,22 @@ struct si_shader_selector {
 
 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;
 	bool		vs_needs_prolog;
 	bool		force_correct_derivs_after_kill;
 	unsigned	pa_cl_vs_out_cntl;
 	ubyte		clipdist_mask;
 	ubyte		culldist_mask;
 
 	/* ES parameters. */
-	unsigned	esgs_itemsize;
+	unsigned	esgs_itemsize; /* vertex stride */
+	unsigned	lshs_vertex_stride;
 
 	/* GS parameters. */
 	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
 	unsigned	max_gs_stream; /* count - 1 */
 	unsigned	gsvs_vertex_size;
 	unsigned	max_gsvs_emit_size;
 	unsigned	enabled_streamout_buffer_mask;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index f35f73a37ce..d901401f0bb 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -127,21 +127,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
 		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
 		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
 	} else {
 		/* No TCS. Route varyings from LS to TES. */
 		num_tcs_outputs = num_tcs_inputs;
 		num_tcs_output_cp = num_tcs_input_cp;
 		num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
 	}
 
-	input_vertex_size = num_tcs_inputs * 16;
+	input_vertex_size = ls->lshs_vertex_stride;
 	output_vertex_size = num_tcs_outputs * 16;
 
 	input_patch_size = num_tcs_input_cp * input_vertex_size;
 
 	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
 	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 
 	/* Ensure that we only need one wave per SIMD so we don't need to check
 	 * resource usage. Also ensures that the number of tcs in and out
 	 * vertices per threadgroup are at most 256.
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ffc8821df09..32d804c3fc5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2153,20 +2153,27 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 				sel->outputs_written |=
 					1ull << si_shader_io_get_unique_index(name, index, false);
 				sel->outputs_written_before_ps |=
 					1ull << si_shader_io_get_unique_index(name, index, true);
 				break;
 			case TGSI_SEMANTIC_EDGEFLAG:
 				break;
 			}
 		}
 		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+		sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+		/* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+		 * will start on a different bank. (except for the maximum 32*16).
+		 */
+		if (sel->lshs_vertex_stride < 32*16)
+			sel->lshs_vertex_stride += 4;
 
 		/* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
 		 * conflicts, i.e. each vertex will start at a different bank.
 		 */
 		if (sctx->chip_class >= GFX9)
 			sel->esgs_itemsize += 4;
 
 		assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
 		break;
 
-- 
2.17.1