[Mesa-dev] [PATCH v2] radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug
Marek Olšák
maraeo at gmail.com
Tue Sep 5 18:23:49 UTC 2017
On Tue, Sep 5, 2017 at 9:56 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> When the HS wave is empty, the hardware writes the LS VGPRs starting at
> v0 instead of v2. Workaround by shifting them back into place when
> necessary. For simplicity, this is always done in the LS prolog.
>
> According to the hardware team, this will be fixed in future chips,
> so take that into account already.
>
> Note that this is not a bug fix, as the bug was already worked
> around by commit 166823bfd26 ("radeonsi/gfx9: add a temporary workaround
> for a tessellation driver bug"). This change merely replaces the
> workaround by one that should be better.
>
> v2: add workaround code to shader only when necessary
> ---
> src/gallium/drivers/radeonsi/si_pipe.h | 1 +
> src/gallium/drivers/radeonsi/si_shader.c | 71 ++++++++++++++++++-------
> src/gallium/drivers/radeonsi/si_shader.h | 1 +
> src/gallium/drivers/radeonsi/si_state_draw.c | 27 ++++++++--
> src/gallium/drivers/radeonsi/si_state_shaders.c | 8 +++
> 5 files changed, 84 insertions(+), 24 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 3ae06584427..9832fd19ff6 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -382,20 +382,21 @@ struct si_context {
> bool db_flush_stencil_inplace:1;
> bool db_depth_clear:1;
> bool db_depth_disable_expclear:1;
> bool db_stencil_clear:1;
> bool db_stencil_disable_expclear:1;
> bool occlusion_queries_disabled:1;
> bool generate_mipmap_for_depth:1;
>
> /* Emitted draw state. */
> bool gs_tri_strip_adj_fix:1;
> + bool ls_vgpr_fix:1;
> int last_index_size;
> int last_base_vertex;
> int last_start_instance;
> int last_drawid;
> int last_sh_base_reg;
> int last_primitive_restart_en;
> int last_restart_index;
> int last_gs_out_prim;
> int last_prim;
> int last_multi_vgt_param;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 2cddfe97aa5..bae9b8384dd 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5536,20 +5536,22 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
> }
>
> static void si_dump_shader_key_vs(const struct si_shader_key *key,
> const struct si_vs_prolog_bits *prolog,
> const char *prefix, FILE *f)
> {
> fprintf(f, " %s.instance_divisor_is_one = %u\n",
> prefix, prolog->instance_divisor_is_one);
> fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
> prefix, prolog->instance_divisor_is_fetched);
> + fprintf(f, " %s.ls_vgpr_fix = %u\n",
> + prefix, prolog->ls_vgpr_fix);
>
> fprintf(f, " mono.vs.fix_fetch = {");
> for (int i = 0; i < SI_MAX_ATTRIBS; i++)
> fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
> fprintf(f, "}\n");
> }
>
> static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
> FILE *f)
> {
> @@ -5728,20 +5730,28 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
> {
> LLVMValueRef args[] = {
> LLVMGetParam(ctx->main_fn, param),
> LLVMConstInt(ctx->i32, bitoffset, 0),
> };
> lp_build_intrinsic(ctx->gallivm.builder,
> "llvm.amdgcn.init.exec.from.input",
> ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
> }
>
> +static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
> + const struct si_vs_prolog_bits *key)
> +{
> + /* VGPR initialization fixup for Vega10 and Raven is always done in the
> + * VS prolog. */
> + return sel->vs_needs_prolog || key->ls_vgpr_fix;
> +}
> +
> static bool si_compile_tgsi_main(struct si_shader_context *ctx,
> bool is_monolithic)
> {
> struct si_shader *shader = ctx->shader;
> struct si_shader_selector *sel = shader->selector;
> struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
>
> // TODO clean all this up!
> switch (ctx->type) {
> case PIPE_SHADER_VERTEX:
> @@ -5804,21 +5814,21 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
> *
> * For monolithic merged shaders, the first shader is wrapped in an
> * if-block together with its prolog in si_build_wrapper_function.
> */
> if (ctx->screen->b.chip_class >= GFX9) {
> if (!is_monolithic &&
> sel->info.num_instructions > 1 && /* not empty shader */
> (shader->key.as_es || shader->key.as_ls) &&
> (ctx->type == PIPE_SHADER_TESS_EVAL ||
> (ctx->type == PIPE_SHADER_VERTEX &&
> - !sel->vs_needs_prolog))) {
> + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
> si_init_exec_from_input(ctx,
> ctx->param_merged_wave_info, 0);
> } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
> ctx->type == PIPE_SHADER_GEOMETRY) {
> if (!is_monolithic)
> si_init_exec_full_mask(ctx);
>
> /* The barrier must execute for all shaders in a
> * threadgroup.
> */
> @@ -6478,33 +6488,35 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
> si_build_vs_prolog_function(&ctx, &prolog_key);
> parts[0] = ctx.main_fn;
> }
>
> si_build_wrapper_function(&ctx, parts + !need_prolog,
> 1 + need_prolog, need_prolog, 0);
> } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
> if (sscreen->b.chip_class >= GFX9) {
> struct si_shader_selector *ls = shader->key.part.tcs.ls;
> LLVMValueRef parts[4];
> + bool vs_needs_prolog =
> + si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
>
> /* TCS main part */
> parts[2] = ctx.main_fn;
>
> /* TCS epilog */
> union si_shader_part_key tcs_epilog_key;
> memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
> tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
> si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
> parts[3] = ctx.main_fn;
>
> /* VS prolog */
> - if (ls->vs_needs_prolog) {
> + if (vs_needs_prolog) {
> union si_shader_part_key vs_prolog_key;
> si_get_vs_prolog_key(&ls->info,
> shader->info.num_input_sgprs,
> &shader->key.part.tcs.ls_prolog,
> shader, &vs_prolog_key);
> vs_prolog_key.vs_prolog.is_monolithic = true;
> si_build_vs_prolog_function(&ctx, &vs_prolog_key);
> parts[0] = ctx.main_fn;
> }
>
> @@ -6521,23 +6533,23 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
> return -1;
> }
> shader->info.uses_instanceid |= ls->info.uses_instanceid;
> parts[1] = ctx.main_fn;
>
> /* Reset the shader context. */
> ctx.shader = shader;
> ctx.type = PIPE_SHADER_TESS_CTRL;
>
> si_build_wrapper_function(&ctx,
> - parts + !ls->vs_needs_prolog,
> - 4 - !ls->vs_needs_prolog, 0,
> - ls->vs_needs_prolog ? 2 : 1);
> + parts + !vs_needs_prolog,
> + 4 - !vs_needs_prolog, 0,
> + vs_needs_prolog ? 2 : 1);
> } else {
> LLVMValueRef parts[2];
> union si_shader_part_key epilog_key;
>
> parts[0] = ctx.main_fn;
>
> memset(&epilog_key, 0, sizeof(epilog_key));
> epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
> si_build_tcs_epilog_function(&ctx, &epilog_key);
> parts[1] = ctx.main_fn;
> @@ -6860,73 +6872,95 @@ static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
> * (InstanceID / 2 + StartInstance)
> */
> static void si_build_vs_prolog_function(struct si_shader_context *ctx,
> union si_shader_part_key *key)
> {
> struct gallivm_state *gallivm = &ctx->gallivm;
> struct si_function_info fninfo;
> LLVMTypeRef *returns;
> LLVMValueRef ret, func;
> int num_returns, i;
> - unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
> - key->vs_prolog.num_merged_next_stage_vgprs;
> + unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
> unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
> + LLVMValueRef input_vgprs[9];
> unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
> num_input_vgprs;
> unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
>
> si_init_function_info(&fninfo);
>
> /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
> returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
> sizeof(LLVMTypeRef));
> num_returns = 0;
>
> /* Declare input and output SGPRs. */
> for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
> add_arg(&fninfo, ARG_SGPR, ctx->i32);
> returns[num_returns++] = ctx->i32;
> }
>
> /* Preloaded VGPRs (outputs must be floats) */
> for (i = 0; i < num_input_vgprs; i++) {
> - add_arg(&fninfo, ARG_VGPR, ctx->i32);
> + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
> returns[num_returns++] = ctx->f32;
> }
>
> - fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
> - fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
> -
> /* Vertex load indices. */
> for (i = 0; i <= key->vs_prolog.last_input; i++)
> returns[num_returns++] = ctx->f32;
>
> /* Create the function. */
> si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
> func = ctx->main_fn;
>
> - if (key->vs_prolog.num_merged_next_stage_vgprs &&
> - !key->vs_prolog.is_monolithic)
> - si_init_exec_from_input(ctx, 3, 0);
> + if (key->vs_prolog.num_merged_next_stage_vgprs) {
> + if (!key->vs_prolog.is_monolithic)
> + si_init_exec_from_input(ctx, 3, 0);
> +
> + if (key->vs_prolog.as_ls &&
> + (ctx->screen->b.family == CHIP_VEGA10 ||
> + ctx->screen->b.family == CHIP_RAVEN)) {
> + /* If there are no HS threads, SPI loads the LS VGPRs
> + * starting at VGPR 0. Shift them back to where they
> + * belong.
> + */
> + LLVMValueRef has_hs_threads =
> + LLVMBuildICmp(gallivm->builder, LLVMIntNE,
> + unpack_param(ctx, 3, 8, 8),
> + ctx->i32_0, "");
> +
> + for (i = 4; i > 0; --i) {
> + input_vgprs[i + 1] =
> + LLVMBuildSelect(gallivm->builder, has_hs_threads,
> + input_vgprs[i + 1],
> + input_vgprs[i - 1], "");
> + }
> + }
> + }
> +
> + ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
> + ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
>
> /* Copy inputs to outputs. This should be no-op, as the registers match,
> * but it will prevent the compiler from overwriting them unintentionally.
> */
> ret = ctx->return_value;
> for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
> LLVMValueRef p = LLVMGetParam(func, i);
> ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
> }
> - for (; i < fninfo.num_params; i++) {
> - LLVMValueRef p = LLVMGetParam(func, i);
> + for (i = 0; i < num_input_vgprs; i++) {
> + LLVMValueRef p = input_vgprs[i];
> p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
> - ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
> + ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
> + key->vs_prolog.num_input_sgprs + i, "");
> }
>
> /* Compute vertex load indices from instance divisors. */
> LLVMValueRef instance_divisor_constbuf = NULL;
>
> if (key->vs_prolog.states.instance_divisor_is_fetched) {
> LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
> LLVMValueRef buf_index =
> LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
> instance_divisor_constbuf =
> @@ -6973,22 +7007,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
>
> static bool si_get_vs_prolog(struct si_screen *sscreen,
> LLVMTargetMachineRef tm,
> struct si_shader *shader,
> struct pipe_debug_callback *debug,
> struct si_shader *main_part,
> const struct si_vs_prolog_bits *key)
> {
> struct si_shader_selector *vs = main_part->selector;
>
> - /* The prolog is a no-op if there are no inputs. */
> - if (!vs->vs_needs_prolog)
> + if (!si_vs_needs_prolog(vs, key))
> return true;
>
> /* Get the prolog. */
> union si_shader_part_key prolog_key;
> si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
> key, shader, &prolog_key);
>
> shader->prolog =
> si_get_shader_part(sscreen, &sscreen->vs_prologs,
> PIPE_SHADER_VERTEX, true, &prolog_key, tm,
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 0c0fa10f40f..ee6b0c167f9 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -391,20 +391,21 @@ struct si_shader_selector {
> /* Common VS bits between the shader key and the prolog key. */
> struct si_vs_prolog_bits {
> /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
> * divisor is 0.
> * - If "is_one" has a bit set, the instance divisor is 1.
> * - If "is_fetched" has a bit set, the instance divisor will be loaded
> * from the constant buffer.
> */
> uint16_t instance_divisor_is_one; /* bitmask of inputs */
> uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
> + unsigned ls_vgpr_fix:1;
> };
>
> /* Common TCS bits between the shader key and the epilog key. */
> struct si_tcs_epilog_bits {
> unsigned prim_mode:3;
> unsigned tes_reads_tess_factors:1;
> };
>
> struct si_gs_prolog_bits {
> unsigned tri_strip_adj_fix:1;
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 1092da403c7..20fd128e533 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -195,25 +195,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
> /* Make sure the output data fits in the offchip buffer */
> *num_patches = MIN2(*num_patches,
> (sctx->screen->tess_offchip_block_dw_size * 4) /
> output_patch_size);
>
> /* Not necessary for correctness, but improves performance. The
> * specific value is taken from the proprietary driver.
> */
> *num_patches = MIN2(*num_patches, 40);
>
> - if (sctx->b.chip_class == SI ||
> - /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
> - * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
> - (sctx->b.chip_class == GFX9 &&
> - num_tcs_input_cp > num_tcs_output_cp)) {
> + if (sctx->b.chip_class == SI) {
> /* SI bug workaround, related to power management. Limit LS-HS
> * threadgroups to only one wave.
> */
> unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
> *num_patches = MIN2(*num_patches, one_wave);
> }
>
> /* The VGT HS block increments the patch ID unconditionally
> * within a single threadgroup. This results in incorrect
> * patch IDs when instanced draws are used.
> @@ -1266,20 +1262,41 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> else if (sctx->tes_shader.cso)
> rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
> else
> rast_prim = info->mode;
>
> if (rast_prim != sctx->current_rast_prim) {
> sctx->current_rast_prim = rast_prim;
> sctx->do_update_shaders = true;
> }
>
> + if (sctx->tes_shader.cso &&
> + (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) {
> + /* Determine whether the LS VGPR fix should be applied.
> + *
> + * It is only required when num input CPs > num output CPs,
> + * which cannot happen with the fixed function TCS. We should
> + * also update this bit when switching from TCS to fixed
> + * function TCS.
> + */
> + struct si_shader_selector *tcs = sctx->tcs_shader.cso;
> + bool ls_vgpr_fix =
> + tcs &&
> + info->vertices_per_patch >
> + tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
> +
> + if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
> + sctx->ls_vgpr_fix = ls_vgpr_fix;
> + sctx->do_update_shaders = true;
> + }
> + }
> +
> if (sctx->gs_shader.cso) {
> /* Determine whether the GS triangle strip adjacency fix should
> * be applied. Rotate every other triangle if
> * - triangle strips with adjacency are fed to the GS and
> * - primitive restart is disabled (the rotation doesn't help
> * when the restart occurs after an odd number of triangles).
> */
> bool gs_tri_strip_adj_fix =
> !sctx->tes_shader.cso &&
> info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 7898c9196c6..acfccb96168 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -1276,20 +1276,28 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
>
> if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
> key->mono.u.vs_export_prim_id = 1;
> }
> break;
> case PIPE_SHADER_TESS_CTRL:
> if (sctx->b.chip_class >= GFX9) {
> si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
> key, &key->part.tcs.ls_prolog);
> key->part.tcs.ls = sctx->vs_shader.cso;
> +
> + /* When the LS VGPR fix is needed, monolithic shaders
> + * can save:
> + * - redundant EXEC initialization
The redundant EXEC initialization only seems to happen when
(!sel->vs_needs_prolog && ls_vgpr_fix). If sel->vs_needs_prolog ==
true, the main part doesn't initialize EXEC. I think the comment
should clarify that because "redundant EXEC initialization" seems more
confusing than explanatory to me.
Other than that:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
More information about the mesa-dev
mailing list