[Mesa-dev] [PATCH] radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug

Nicolai Hähnle nhaehnle at gmail.com
Mon Sep 4 18:11:57 UTC 2017


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

When the HS wave is empty, the hardware writes the LS VGPRs starting at
v0 instead of v2. Workaround by shifting them back into place when
necessary. For simplicity, this is always done in the LS prolog.

According to the hardware team, this will be fixed in future chips,
so take that into account already.

Note that this is not a bug fix, as the bug was already worked
around by commit 166823bfd26 ("radeonsi/gfx9: add a temporary workaround
for a tessellation driver bug"). This change merely replaces the
workaround by one that should be better.
---
 src/gallium/drivers/radeonsi/si_shader.c     | 71 ++++++++++++++++++++--------
 src/gallium/drivers/radeonsi/si_state_draw.c |  6 +--
 2 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0e89ccac09d..6840b8e4b65 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5629,20 +5629,31 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
 {
 	LLVMValueRef args[] = {
 		LLVMGetParam(ctx->main_fn, param),
 		LLVMConstInt(ctx->i32, bitoffset, 0),
 	};
 	lp_build_intrinsic(ctx->gallivm.builder,
 			   "llvm.amdgcn.init.exec.from.input",
 			   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
 }
 
+static bool si_vs_needs_prolog(struct si_screen *sscreen,
+			       struct si_shader_selector *sel, bool as_ls)
+{
+	/* VGPR initialization fixup for Vega10 and Raven is always done in the
+	 * VS prolog. */
+	return sel->vs_needs_prolog ||
+	       (as_ls &&
+	        (sscreen->b.family == CHIP_VEGA10 ||
+	         sscreen->b.family == CHIP_RAVEN));
+}
+
 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 				 bool is_monolithic)
 {
 	struct si_shader *shader = ctx->shader;
 	struct si_shader_selector *sel = shader->selector;
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 
 	// TODO clean all this up!
 	switch (ctx->type) {
 	case PIPE_SHADER_VERTEX:
@@ -5705,21 +5716,21 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 	 *
 	 * For monolithic merged shaders, the first shader is wrapped in an
 	 * if-block together with its prolog in si_build_wrapper_function.
 	 */
 	if (ctx->screen->b.chip_class >= GFX9) {
 		if (!is_monolithic &&
 		    sel->info.num_instructions > 1 && /* not empty shader */
 		    (shader->key.as_es || shader->key.as_ls) &&
 		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
 		     (ctx->type == PIPE_SHADER_VERTEX &&
-		      !sel->vs_needs_prolog))) {
+		      !si_vs_needs_prolog(ctx->screen, sel, shader->key.as_ls)))) {
 			si_init_exec_from_input(ctx,
 						ctx->param_merged_wave_info, 0);
 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
 			   ctx->type == PIPE_SHADER_GEOMETRY) {
 			if (!is_monolithic)
 				si_init_exec_full_mask(ctx);
 
 			/* The barrier must execute for all shaders in a
 			 * threadgroup.
 			 */
@@ -6357,33 +6368,34 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 			si_build_vs_prolog_function(&ctx, &prolog_key);
 			parts[0] = ctx.main_fn;
 		}
 
 		si_build_wrapper_function(&ctx, parts + !need_prolog,
 					  1 + need_prolog, need_prolog, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
 		if (sscreen->b.chip_class >= GFX9) {
 			struct si_shader_selector *ls = shader->key.part.tcs.ls;
 			LLVMValueRef parts[4];
+			bool vs_needs_prolog = si_vs_needs_prolog(sscreen, ls, true);
 
 			/* TCS main part */
 			parts[2] = ctx.main_fn;
 
 			/* TCS epilog */
 			union si_shader_part_key tcs_epilog_key;
 			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
 			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
 			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
 			parts[3] = ctx.main_fn;
 
 			/* VS prolog */
-			if (ls->vs_needs_prolog) {
+			if (vs_needs_prolog) {
 				union si_shader_part_key vs_prolog_key;
 				si_get_vs_prolog_key(&ls->info,
 						     shader->info.num_input_sgprs,
 						     &shader->key.part.tcs.ls_prolog,
 						     shader, &vs_prolog_key);
 				vs_prolog_key.vs_prolog.is_monolithic = true;
 				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
 				parts[0] = ctx.main_fn;
 			}
 
@@ -6400,23 +6412,23 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 				return -1;
 			}
 			shader->info.uses_instanceid |= ls->info.uses_instanceid;
 			parts[1] = ctx.main_fn;
 
 			/* Reset the shader context. */
 			ctx.shader = shader;
 			ctx.type = PIPE_SHADER_TESS_CTRL;
 
 			si_build_wrapper_function(&ctx,
-						  parts + !ls->vs_needs_prolog,
-						  4 - !ls->vs_needs_prolog, 0,
-						  ls->vs_needs_prolog ? 2 : 1);
+						  parts + !vs_needs_prolog,
+						  4 - !vs_needs_prolog, 0,
+						  vs_needs_prolog ? 2 : 1);
 		} else {
 			LLVMValueRef parts[2];
 			union si_shader_part_key epilog_key;
 
 			parts[0] = ctx.main_fn;
 
 			memset(&epilog_key, 0, sizeof(epilog_key));
 			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
 			si_build_tcs_epilog_function(&ctx, &epilog_key);
 			parts[1] = ctx.main_fn;
@@ -6739,73 +6751,95 @@ static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
  *   (InstanceID / 2 + StartInstance)
  */
 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	struct si_function_info fninfo;
 	LLVMTypeRef *returns;
 	LLVMValueRef ret, func;
 	int num_returns, i;
-	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
-				 key->vs_prolog.num_merged_next_stage_vgprs;
+	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
 	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+	LLVMValueRef input_vgprs[9];
 	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
 				      num_input_vgprs;
 	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
 
 	si_init_function_info(&fninfo);
 
 	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
 	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
 			 sizeof(LLVMTypeRef));
 	num_returns = 0;
 
 	/* Declare input and output SGPRs. */
 	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
 		add_arg(&fninfo, ARG_SGPR, ctx->i32);
 		returns[num_returns++] = ctx->i32;
 	}
 
 	/* Preloaded VGPRs (outputs must be floats) */
 	for (i = 0; i < num_input_vgprs; i++) {
-		add_arg(&fninfo, ARG_VGPR, ctx->i32);
+		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
 		returns[num_returns++] = ctx->f32;
 	}
 
-	fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
-	fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
-
 	/* Vertex load indices. */
 	for (i = 0; i <= key->vs_prolog.last_input; i++)
 		returns[num_returns++] = ctx->f32;
 
 	/* Create the function. */
 	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
 	func = ctx->main_fn;
 
-	if (key->vs_prolog.num_merged_next_stage_vgprs &&
-	    !key->vs_prolog.is_monolithic)
-		si_init_exec_from_input(ctx, 3, 0);
+	if (key->vs_prolog.num_merged_next_stage_vgprs) {
+		if (!key->vs_prolog.is_monolithic)
+			si_init_exec_from_input(ctx, 3, 0);
+
+		if (key->vs_prolog.as_ls &&
+		    (ctx->screen->b.family == CHIP_VEGA10 ||
+		     ctx->screen->b.family == CHIP_RAVEN)) {
+			/* If there are no HS threads, SPI loads the LS VGPRs
+			 * starting at VGPR 0. Shift them back to where they
+			 * belong.
+			 */
+			LLVMValueRef has_hs_threads =
+				LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+				    unpack_param(ctx, 3, 8, 8),
+				    ctx->i32_0, "");
+
+			for (i = 4; i > 0; --i) {
+				input_vgprs[i + 1] =
+					LLVMBuildSelect(gallivm->builder, has_hs_threads,
+						        input_vgprs[i + 1],
+						        input_vgprs[i - 1], "");
+			}
+		}
+	}
+
+	ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
+	ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
 	ret = ctx->return_value;
 	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
 		LLVMValueRef p = LLVMGetParam(func, i);
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
-	for (; i < fninfo.num_params; i++) {
-		LLVMValueRef p = LLVMGetParam(func, i);
+	for (i = 0; i < num_input_vgprs; i++) {
+		LLVMValueRef p = input_vgprs[i];
 		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
-		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
+					   key->vs_prolog.num_input_sgprs + i, "");
 	}
 
 	/* Compute vertex load indices from instance divisors. */
 	LLVMValueRef instance_divisor_constbuf = NULL;
 
 	if (key->vs_prolog.states.instance_divisor_is_fetched) {
 		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
 		LLVMValueRef buf_index =
 			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
 		instance_divisor_constbuf =
@@ -6852,22 +6886,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 
 static bool si_get_vs_prolog(struct si_screen *sscreen,
 			     LLVMTargetMachineRef tm,
 			     struct si_shader *shader,
 			     struct pipe_debug_callback *debug,
 			     struct si_shader *main_part,
 			     const struct si_vs_prolog_bits *key)
 {
 	struct si_shader_selector *vs = main_part->selector;
 
-	/* The prolog is a no-op if there are no inputs. */
-	if (!vs->vs_needs_prolog)
+	if (!si_vs_needs_prolog(sscreen, vs, main_part->key.as_ls))
 		return true;
 
 	/* Get the prolog. */
 	union si_shader_part_key prolog_key;
 	si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
 			     key, shader, &prolog_key);
 
 	shader->prolog =
 		si_get_shader_part(sscreen, &sscreen->vs_prologs,
 				   PIPE_SHADER_VERTEX, true, &prolog_key, tm,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 81751d2186e..28c396bd6c5 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -188,25 +188,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	/* Make sure the output data fits in the offchip buffer */
 	*num_patches = MIN2(*num_patches,
 			    (sctx->screen->tess_offchip_block_dw_size * 4) /
 			    output_patch_size);
 
 	/* Not necessary for correctness, but improves performance. The
 	 * specific value is taken from the proprietary driver.
 	 */
 	*num_patches = MIN2(*num_patches, 40);
 
-	if (sctx->b.chip_class == SI ||
-	    /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
-	     * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
-	    (sctx->b.chip_class == GFX9 &&
-	     num_tcs_input_cp > num_tcs_output_cp)) {
+	if (sctx->b.chip_class == SI) {
 		/* SI bug workaround, related to power management. Limit LS-HS
 		 * threadgroups to only one wave.
 		 */
 		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 		*num_patches = MIN2(*num_patches, one_wave);
 	}
 
 	/* The VGT HS block increments the patch ID unconditionally
 	 * within a single threadgroup. This results in incorrect
 	 * patch IDs when instanced draws are used.
-- 
2.11.0



More information about the mesa-dev mailing list