[Mesa-dev] [PATCH 19/20] radeonsi: eliminate VS outputs that aren't used by PS at runtime

Marek Olšák maraeo at gmail.com
Wed Nov 16 18:38:42 UTC 2016


From: Marek Olšák <marek.olsak at amd.com>

A past commit added the ability to compile "optimized" shader variants
asynchronously (not stalling the app).

This commit builds upon that and adds what is basically a runtime shader
linker. If a VS output isn't used by the currently-bound PS, a new VS
compilation is started without that output. The new shader variant
is used when it's ready.

Every app I've seen that uses separate shader objects had unused VS outputs.

Eliminating unused/useless VS outputs also eliminates the corresponding
vertex attribute loads.
---
 src/gallium/drivers/radeonsi/si_shader.c        | 26 ++++++++++++++++-
 src/gallium/drivers/radeonsi/si_shader.h        |  7 ++---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 37 ++++++++++++++++++++++---
 3 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index f8de049..973750b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2274,20 +2274,40 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
 	unsigned pos_idx;
 	int i;
 
 	if (outputs && ctx->shader->selector->so.num_outputs) {
 		si_llvm_emit_streamout(ctx, outputs, noutput);
 	}
 
 	for (i = 0; i < noutput; i++) {
 		semantic_name = outputs[i].name;
 		semantic_index = outputs[i].sid;
+		bool export_param = true;
+
+		switch (semantic_name) {
+		case TGSI_SEMANTIC_POSITION: /* ignore these */
+		case TGSI_SEMANTIC_PSIZE:
+		case TGSI_SEMANTIC_CLIPVERTEX:
+		case TGSI_SEMANTIC_EDGEFLAG:
+			break;
+		case TGSI_SEMANTIC_GENERIC:
+		case TGSI_SEMANTIC_CLIPDIST:
+			if (shader->key.opt.hw_vs.kill_outputs &
+			    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
+				export_param = false;
+			break;
+		default:
+			if (shader->key.opt.hw_vs.kill_outputs2 &
+			    (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
+				export_param = false;
+			break;
+		}
 
 handle_semantic:
 		/* Select the correct target */
 		switch(semantic_name) {
 		case TGSI_SEMANTIC_PSIZE:
 			psize_value = outputs[i].values[0];
 			continue;
 		case TGSI_SEMANTIC_EDGEFLAG:
 			edgeflag_value = outputs[i].values[0];
 			continue;
@@ -2297,20 +2317,22 @@ handle_semantic:
 			goto handle_semantic;
 		case TGSI_SEMANTIC_VIEWPORT_INDEX:
 			viewport_index_value = outputs[i].values[0];
 			semantic_name = TGSI_SEMANTIC_GENERIC;
 			goto handle_semantic;
 		case TGSI_SEMANTIC_POSITION:
 			target = V_008DFC_SQ_EXP_POS;
 			break;
 		case TGSI_SEMANTIC_COLOR:
 		case TGSI_SEMANTIC_BCOLOR:
+			if (!export_param)
+				continue;
 			target = V_008DFC_SQ_EXP_PARAM + param_count;
 			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
 			shader->info.vs_output_param_offset[i] = param_count;
 			param_count++;
 			break;
 		case TGSI_SEMANTIC_CLIPDIST:
 			if (shader->key.opt.hw_vs.clip_disable) {
 				semantic_name = TGSI_SEMANTIC_GENERIC;
 				goto handle_semantic;
 			}
@@ -2318,20 +2340,22 @@ handle_semantic:
 			break;
 		case TGSI_SEMANTIC_CLIPVERTEX:
 			if (shader->key.opt.hw_vs.clip_disable)
 				continue;
 			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
 			continue;
 		case TGSI_SEMANTIC_PRIMID:
 		case TGSI_SEMANTIC_FOG:
 		case TGSI_SEMANTIC_TEXCOORD:
 		case TGSI_SEMANTIC_GENERIC:
+			if (!export_param)
+				continue;
 			target = V_008DFC_SQ_EXP_PARAM + param_count;
 			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
 			shader->info.vs_output_param_offset[i] = param_count;
 			param_count++;
 			break;
 		default:
 			target = 0;
 			fprintf(stderr,
 				"Warning: SI unhandled vs output type:%d\n",
 				semantic_name);
@@ -7070,21 +7094,21 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	 * conversion fails. */
 	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
 	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
 		tgsi_dump(sel->tokens, 0);
 		si_dump_streamout(&sel->so);
 	}
 
 	si_init_shader_ctx(&ctx, sscreen, shader, tm);
 	ctx.separate_prolog = !is_monolithic;
 
-	memset(shader->info.vs_output_param_offset, 0xff,
+	memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED,
 	       sizeof(shader->info.vs_output_param_offset));
 
 	shader->info.uses_instanceid = sel->info.uses_instanceid;
 
 	bld_base = &ctx.soa.bld_base;
 	ctx.load_system_value = declare_system_value;
 
 	if (!si_compile_tgsi_main(&ctx, shader)) {
 		si_llvm_dispose(&ctx);
 		return -1;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fc9c913..aa37676 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -315,24 +315,20 @@ struct si_shader_selector {
  */
 
 /* Common VS bits between the shader key and the prolog key. */
 struct si_vs_prolog_bits {
 	unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
 };
 
 /* Common VS bits between the shader key and the epilog key. */
 struct si_vs_epilog_bits {
 	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
-	/* TODO:
-	 * - skip layer, viewport, clipdist, and culldist parameter exports
-	 *   if PS doesn't read them
-	 */
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
 	unsigned	prim_mode:3;
 };
 
 struct si_gs_prolog_bits {
 	unsigned	tri_strip_adj_fix:1;
 };
@@ -433,20 +429,22 @@ struct si_shader_key {
 			uint32_t	fix_fetch;
 		} vs;
 		struct {
 			uint64_t	inputs_to_copy; /* for fixed-func TCS */
 		} tcs;
 	} mono;
 
 	/* Optimization flags for asynchronous compilation only. */
 	union {
 		struct {
+			uint64_t	kill_outputs; /* "get_unique_index" bits */
+			uint32_t	kill_outputs2; /* "get_unique_index2" bits */
 			unsigned	clip_disable:1;
 		} hw_vs; /* HW VS (it can be VS, TES, GS) */
 	} opt;
 };
 
 struct si_shader_config {
 	unsigned			num_sgprs;
 	unsigned			num_vgprs;
 	unsigned			spilled_sgprs;
 	unsigned			spilled_vgprs;
@@ -461,20 +459,21 @@ struct si_shader_config {
 
 enum {
 	/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
 	EXP_PARAM_OFFSET_0 = 0,
 	EXP_PARAM_OFFSET_31 = 31,
 	/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
 	EXP_PARAM_DEFAULT_VAL_0000 = 64,
 	EXP_PARAM_DEFAULT_VAL_0001,
 	EXP_PARAM_DEFAULT_VAL_1110,
 	EXP_PARAM_DEFAULT_VAL_1111,
+	EXP_PARAM_UNDEFINED = 255,
 };
 
 /* GCN-specific shader info. */
 struct si_shader_info {
 	ubyte			vs_output_param_offset[SI_MAX_VS_OUTPUTS];
 	ubyte			num_input_sgprs;
 	ubyte			num_input_vgprs;
 	char			face_vgpr_index;
 	bool			uses_instanceid;
 	ubyte			nr_pos_exports;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index e4d8747..7834f87 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -851,25 +851,49 @@ static unsigned si_get_alpha_test_func(struct si_context *sctx)
 	if (sctx->queued.named.dsa)
 		return sctx->queued.named.dsa->alpha_func;
 
 	return PIPE_FUNC_ALWAYS;
 }
 
 static void si_shader_selector_key_hw_vs(struct si_context *sctx,
 					 struct si_shader_selector *vs,
 					 struct si_shader_key *key)
 {
+	struct si_shader_selector *ps = sctx->ps_shader.cso;
+
 	key->opt.hw_vs.clip_disable =
 		sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
 		(vs->info.clipdist_writemask ||
 		 vs->info.writes_clipvertex) &&
 		!vs->info.culldist_writemask;
+
+	/* Find out if PS is disabled. */
+	bool ps_disabled = ps == NULL;
+
+	/* Find out which VS outputs aren't used by the PS. */
+	uint64_t outputs_written = vs->outputs_written;
+	uint32_t outputs_written2 = vs->outputs_written2;
+	uint64_t inputs_read = 0;
+	uint32_t inputs_read2 = 0;
+
+	outputs_written &= ~0x3; /* ignore POSITION, PSIZE */
+
+	if (!ps_disabled) {
+		inputs_read = ps->inputs_read;
+		inputs_read2 = ps->inputs_read2;
+	}
+
+	uint64_t linked = outputs_written & inputs_read;
+	uint32_t linked2 = outputs_written2 & inputs_read2;
+
+	key->opt.hw_vs.kill_outputs = ~linked & outputs_written;
+	key->opt.hw_vs.kill_outputs2 = ~linked2 & outputs_written2;
 }
 
 /* Compute the key for the hw shader variant */
 static inline void si_shader_selector_key(struct pipe_context *ctx,
 					  struct si_shader_selector *sel,
 					  struct si_shader_key *key)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	unsigned i;
 
@@ -1778,25 +1802,30 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx,
 
 	for (j = 0; j < vsinfo->num_outputs; j++) {
 		if (name == vsinfo->output_semantic_name[j] &&
 		    index == vsinfo->output_semantic_index[j]) {
 			offset = vs->info.vs_output_param_offset[j];
 
 			if (offset <= EXP_PARAM_OFFSET_31) {
 				/* The input is loaded from parameter memory. */
 				ps_input_cntl |= S_028644_OFFSET(offset);
 			} else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
-				/* The input is a DEFAULT_VAL constant. */
-				assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 &&
-				       offset <= EXP_PARAM_DEFAULT_VAL_1111);
+				if (offset == EXP_PARAM_UNDEFINED) {
+					/* This can happen with depth-only rendering. */
+					offset = 0;
+				} else {
+					/* The input is a DEFAULT_VAL constant. */
+					assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 &&
+					       offset <= EXP_PARAM_DEFAULT_VAL_1111);
+					offset -= EXP_PARAM_DEFAULT_VAL_0000;
+				}
 
-				offset -= EXP_PARAM_DEFAULT_VAL_0000;
 				ps_input_cntl = S_028644_OFFSET(0x20) |
 						S_028644_DEFAULT_VAL(offset);
 			}
 			break;
 		}
 	}
 
 	if (name == TGSI_SEMANTIC_PRIMID)
 		/* PrimID is written after the last output. */
 		ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
-- 
2.7.4



More information about the mesa-dev mailing list