[Mesa-dev] [PATCH v2] radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug

Nicolai Hähnle nhaehnle at gmail.com
Tue Sep 5 07:56:31 UTC 2017


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

When the HS wave is empty, the hardware writes the LS VGPRs starting at
v0 instead of v2. Workaround by shifting them back into place when
necessary. For simplicity, this is always done in the LS prolog.

According to the hardware team, this will be fixed in future chips,
so take that into account already.

Note that this is not a bug fix, as the bug was already worked
around by commit 166823bfd26 ("radeonsi/gfx9: add a temporary workaround
for a tessellation driver bug"). This change merely replaces the
workaround by one that should be better.

v2: add workaround code to shader only when necessary
---
 src/gallium/drivers/radeonsi/si_pipe.h          |  1 +
 src/gallium/drivers/radeonsi/si_shader.c        | 71 ++++++++++++++++++-------
 src/gallium/drivers/radeonsi/si_shader.h        |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c    | 27 ++++++++--
 src/gallium/drivers/radeonsi/si_state_shaders.c |  8 +++
 5 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 3ae06584427..9832fd19ff6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -382,20 +382,21 @@ struct si_context {
 	bool			db_flush_stencil_inplace:1;
 	bool			db_depth_clear:1;
 	bool			db_depth_disable_expclear:1;
 	bool			db_stencil_clear:1;
 	bool			db_stencil_disable_expclear:1;
 	bool			occlusion_queries_disabled:1;
 	bool			generate_mipmap_for_depth:1;
 
 	/* Emitted draw state. */
 	bool			gs_tri_strip_adj_fix:1;
+	bool			ls_vgpr_fix:1;
 	int			last_index_size;
 	int			last_base_vertex;
 	int			last_start_instance;
 	int			last_drawid;
 	int			last_sh_base_reg;
 	int			last_primitive_restart_en;
 	int			last_restart_index;
 	int			last_gs_out_prim;
 	int			last_prim;
 	int			last_multi_vgt_param;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 2cddfe97aa5..bae9b8384dd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5536,20 +5536,22 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 }
 
 static void si_dump_shader_key_vs(const struct si_shader_key *key,
 				  const struct si_vs_prolog_bits *prolog,
 				  const char *prefix, FILE *f)
 {
 	fprintf(f, "  %s.instance_divisor_is_one = %u\n",
 		prefix, prolog->instance_divisor_is_one);
 	fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
 		prefix, prolog->instance_divisor_is_fetched);
+	fprintf(f, "  %s.ls_vgpr_fix = %u\n",
+		prefix, prolog->ls_vgpr_fix);
 
 	fprintf(f, "  mono.vs.fix_fetch = {");
 	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
 		fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
 	fprintf(f, "}\n");
 }
 
 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
 			       FILE *f)
 {
@@ -5728,20 +5730,28 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
 {
 	LLVMValueRef args[] = {
 		LLVMGetParam(ctx->main_fn, param),
 		LLVMConstInt(ctx->i32, bitoffset, 0),
 	};
 	lp_build_intrinsic(ctx->gallivm.builder,
 			   "llvm.amdgcn.init.exec.from.input",
 			   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
 }
 
+static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
+			       const struct si_vs_prolog_bits *key)
+{
+	/* VGPR initialization fixup for Vega10 and Raven is always done in the
+	 * VS prolog. */
+	return sel->vs_needs_prolog || key->ls_vgpr_fix;
+}
+
 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 				 bool is_monolithic)
 {
 	struct si_shader *shader = ctx->shader;
 	struct si_shader_selector *sel = shader->selector;
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 
 	// TODO clean all this up!
 	switch (ctx->type) {
 	case PIPE_SHADER_VERTEX:
@@ -5804,21 +5814,21 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 	 *
 	 * For monolithic merged shaders, the first shader is wrapped in an
 	 * if-block together with its prolog in si_build_wrapper_function.
 	 */
 	if (ctx->screen->b.chip_class >= GFX9) {
 		if (!is_monolithic &&
 		    sel->info.num_instructions > 1 && /* not empty shader */
 		    (shader->key.as_es || shader->key.as_ls) &&
 		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
 		     (ctx->type == PIPE_SHADER_VERTEX &&
-		      !sel->vs_needs_prolog))) {
+		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
 			si_init_exec_from_input(ctx,
 						ctx->param_merged_wave_info, 0);
 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
 			   ctx->type == PIPE_SHADER_GEOMETRY) {
 			if (!is_monolithic)
 				si_init_exec_full_mask(ctx);
 
 			/* The barrier must execute for all shaders in a
 			 * threadgroup.
 			 */
@@ -6478,33 +6488,35 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 			si_build_vs_prolog_function(&ctx, &prolog_key);
 			parts[0] = ctx.main_fn;
 		}
 
 		si_build_wrapper_function(&ctx, parts + !need_prolog,
 					  1 + need_prolog, need_prolog, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
 		if (sscreen->b.chip_class >= GFX9) {
 			struct si_shader_selector *ls = shader->key.part.tcs.ls;
 			LLVMValueRef parts[4];
+			bool vs_needs_prolog =
+				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
 
 			/* TCS main part */
 			parts[2] = ctx.main_fn;
 
 			/* TCS epilog */
 			union si_shader_part_key tcs_epilog_key;
 			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
 			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
 			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
 			parts[3] = ctx.main_fn;
 
 			/* VS prolog */
-			if (ls->vs_needs_prolog) {
+			if (vs_needs_prolog) {
 				union si_shader_part_key vs_prolog_key;
 				si_get_vs_prolog_key(&ls->info,
 						     shader->info.num_input_sgprs,
 						     &shader->key.part.tcs.ls_prolog,
 						     shader, &vs_prolog_key);
 				vs_prolog_key.vs_prolog.is_monolithic = true;
 				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
 				parts[0] = ctx.main_fn;
 			}
 
@@ -6521,23 +6533,23 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 				return -1;
 			}
 			shader->info.uses_instanceid |= ls->info.uses_instanceid;
 			parts[1] = ctx.main_fn;
 
 			/* Reset the shader context. */
 			ctx.shader = shader;
 			ctx.type = PIPE_SHADER_TESS_CTRL;
 
 			si_build_wrapper_function(&ctx,
-						  parts + !ls->vs_needs_prolog,
-						  4 - !ls->vs_needs_prolog, 0,
-						  ls->vs_needs_prolog ? 2 : 1);
+						  parts + !vs_needs_prolog,
+						  4 - !vs_needs_prolog, 0,
+						  vs_needs_prolog ? 2 : 1);
 		} else {
 			LLVMValueRef parts[2];
 			union si_shader_part_key epilog_key;
 
 			parts[0] = ctx.main_fn;
 
 			memset(&epilog_key, 0, sizeof(epilog_key));
 			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
 			si_build_tcs_epilog_function(&ctx, &epilog_key);
 			parts[1] = ctx.main_fn;
@@ -6860,73 +6872,95 @@ static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
  *   (InstanceID / 2 + StartInstance)
  */
 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	struct si_function_info fninfo;
 	LLVMTypeRef *returns;
 	LLVMValueRef ret, func;
 	int num_returns, i;
-	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
-				 key->vs_prolog.num_merged_next_stage_vgprs;
+	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
 	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+	LLVMValueRef input_vgprs[9];
 	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
 				      num_input_vgprs;
 	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
 
 	si_init_function_info(&fninfo);
 
 	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
 	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
 			 sizeof(LLVMTypeRef));
 	num_returns = 0;
 
 	/* Declare input and output SGPRs. */
 	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
 		add_arg(&fninfo, ARG_SGPR, ctx->i32);
 		returns[num_returns++] = ctx->i32;
 	}
 
 	/* Preloaded VGPRs (outputs must be floats) */
 	for (i = 0; i < num_input_vgprs; i++) {
-		add_arg(&fninfo, ARG_VGPR, ctx->i32);
+		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
 		returns[num_returns++] = ctx->f32;
 	}
 
-	fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
-	fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
-
 	/* Vertex load indices. */
 	for (i = 0; i <= key->vs_prolog.last_input; i++)
 		returns[num_returns++] = ctx->f32;
 
 	/* Create the function. */
 	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
 	func = ctx->main_fn;
 
-	if (key->vs_prolog.num_merged_next_stage_vgprs &&
-	    !key->vs_prolog.is_monolithic)
-		si_init_exec_from_input(ctx, 3, 0);
+	if (key->vs_prolog.num_merged_next_stage_vgprs) {
+		if (!key->vs_prolog.is_monolithic)
+			si_init_exec_from_input(ctx, 3, 0);
+
+		if (key->vs_prolog.as_ls &&
+		    (ctx->screen->b.family == CHIP_VEGA10 ||
+		     ctx->screen->b.family == CHIP_RAVEN)) {
+			/* If there are no HS threads, SPI loads the LS VGPRs
+			 * starting at VGPR 0. Shift them back to where they
+			 * belong.
+			 */
+			LLVMValueRef has_hs_threads =
+				LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+				    unpack_param(ctx, 3, 8, 8),
+				    ctx->i32_0, "");
+
+			for (i = 4; i > 0; --i) {
+				input_vgprs[i + 1] =
+					LLVMBuildSelect(gallivm->builder, has_hs_threads,
+						        input_vgprs[i + 1],
+						        input_vgprs[i - 1], "");
+			}
+		}
+	}
+
+	ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
+	ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
 	ret = ctx->return_value;
 	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
 		LLVMValueRef p = LLVMGetParam(func, i);
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
-	for (; i < fninfo.num_params; i++) {
-		LLVMValueRef p = LLVMGetParam(func, i);
+	for (i = 0; i < num_input_vgprs; i++) {
+		LLVMValueRef p = input_vgprs[i];
 		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
-		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
+					   key->vs_prolog.num_input_sgprs + i, "");
 	}
 
 	/* Compute vertex load indices from instance divisors. */
 	LLVMValueRef instance_divisor_constbuf = NULL;
 
 	if (key->vs_prolog.states.instance_divisor_is_fetched) {
 		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
 		LLVMValueRef buf_index =
 			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
 		instance_divisor_constbuf =
@@ -6973,22 +7007,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 
 static bool si_get_vs_prolog(struct si_screen *sscreen,
 			     LLVMTargetMachineRef tm,
 			     struct si_shader *shader,
 			     struct pipe_debug_callback *debug,
 			     struct si_shader *main_part,
 			     const struct si_vs_prolog_bits *key)
 {
 	struct si_shader_selector *vs = main_part->selector;
 
-	/* The prolog is a no-op if there are no inputs. */
-	if (!vs->vs_needs_prolog)
+	if (!si_vs_needs_prolog(vs, key))
 		return true;
 
 	/* Get the prolog. */
 	union si_shader_part_key prolog_key;
 	si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
 			     key, shader, &prolog_key);
 
 	shader->prolog =
 		si_get_shader_part(sscreen, &sscreen->vs_prologs,
 				   PIPE_SHADER_VERTEX, true, &prolog_key, tm,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 0c0fa10f40f..ee6b0c167f9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -391,20 +391,21 @@ struct si_shader_selector {
 /* Common VS bits between the shader key and the prolog key. */
 struct si_vs_prolog_bits {
 	/* - If neither "is_one" nor "is_fetched" has a bit set, the instance
 	 *   divisor is 0.
 	 * - If "is_one" has a bit set, the instance divisor is 1.
 	 * - If "is_fetched" has a bit set, the instance divisor will be loaded
 	 *   from the constant buffer.
 	 */
 	uint16_t	instance_divisor_is_one;     /* bitmask of inputs */
 	uint16_t	instance_divisor_is_fetched; /* bitmask of inputs */
+	unsigned	ls_vgpr_fix:1;
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
 	unsigned	prim_mode:3;
 	unsigned	tes_reads_tess_factors:1;
 };
 
 struct si_gs_prolog_bits {
 	unsigned	tri_strip_adj_fix:1;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 1092da403c7..20fd128e533 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -195,25 +195,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	/* Make sure the output data fits in the offchip buffer */
 	*num_patches = MIN2(*num_patches,
 			    (sctx->screen->tess_offchip_block_dw_size * 4) /
 			    output_patch_size);
 
 	/* Not necessary for correctness, but improves performance. The
 	 * specific value is taken from the proprietary driver.
 	 */
 	*num_patches = MIN2(*num_patches, 40);
 
-	if (sctx->b.chip_class == SI ||
-	    /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
-	     * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
-	    (sctx->b.chip_class == GFX9 &&
-	     num_tcs_input_cp > num_tcs_output_cp)) {
+	if (sctx->b.chip_class == SI) {
 		/* SI bug workaround, related to power management. Limit LS-HS
 		 * threadgroups to only one wave.
 		 */
 		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 		*num_patches = MIN2(*num_patches, one_wave);
 	}
 
 	/* The VGT HS block increments the patch ID unconditionally
 	 * within a single threadgroup. This results in incorrect
 	 * patch IDs when instanced draws are used.
@@ -1266,20 +1262,41 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	else if (sctx->tes_shader.cso)
 		rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	else
 		rast_prim = info->mode;
 
 	if (rast_prim != sctx->current_rast_prim) {
 		sctx->current_rast_prim = rast_prim;
 		sctx->do_update_shaders = true;
 	}
 
+	if (sctx->tes_shader.cso &&
+	    (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) {
+		/* Determine whether the LS VGPR fix should be applied.
+		 *
+		 * It is only required when num input CPs > num output CPs,
+		 * which cannot happen with the fixed function TCS. We should
+		 * also update this bit when switching from TCS to fixed
+		 * function TCS.
+		 */
+		struct si_shader_selector *tcs = sctx->tcs_shader.cso;
+		bool ls_vgpr_fix =
+			tcs &&
+			info->vertices_per_patch >
+			tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+		if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+			sctx->ls_vgpr_fix = ls_vgpr_fix;
+			sctx->do_update_shaders = true;
+		}
+	}
+
 	if (sctx->gs_shader.cso) {
 		/* Determine whether the GS triangle strip adjacency fix should
 		 * be applied. Rotate every other triangle if
 		 * - triangle strips with adjacency are fed to the GS and
 		 * - primitive restart is disabled (the rotation doesn't help
 		 *   when the restart occurs after an odd number of triangles).
 		 */
 		bool gs_tri_strip_adj_fix =
 			!sctx->tes_shader.cso &&
 			info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 7898c9196c6..acfccb96168 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1276,20 +1276,28 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 				key->mono.u.vs_export_prim_id = 1;
 		}
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		if (sctx->b.chip_class >= GFX9) {
 			si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
 						  key, &key->part.tcs.ls_prolog);
 			key->part.tcs.ls = sctx->vs_shader.cso;
+
+			/* When the LS VGPR fix is needed, monolithic shaders
+			 * can save:
+			 *  - redundant EXEC initialization
+			 *  - selection of unused input VGPRs
+			 */
+			key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+			key->opt.prefer_mono = sctx->ls_vgpr_fix;
 		}
 
 		key->part.tcs.epilog.prim_mode =
 			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		key->part.tcs.epilog.tes_reads_tess_factors =
 			sctx->tes_shader.cso->info.reads_tess_factors;
 
 		if (sel == sctx->fixed_func_tcs_shader.cso)
 			key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
 		break;
-- 
2.11.0



More information about the mesa-dev mailing list