[Mesa-dev] [PATCH v2] radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug

Tue Sep 5 18:23:49 UTC 2017

On Tue, Sep 5, 2017 at 9:56 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> When the HS wave is empty, the hardware writes the LS VGPRs starting at
> v0 instead of v2. Workaround by shifting them back into place when
> necessary. For simplicity, this is always done in the LS prolog.
>
> According to the hardware team, this will be fixed in future chips,
> so take that into account already.
>
> Note that this is not a bug fix, as the bug was already worked
> around by commit 166823bfd26 ("radeonsi/gfx9: add a temporary workaround
> for a tessellation driver bug"). This change merely replaces the
> workaround by one that should be better.
>
> v2: add workaround code to shader only when necessary
> ---
>  src/gallium/drivers/radeonsi/si_pipe.h          |  1 +
>  src/gallium/drivers/radeonsi/si_shader.c        | 71 ++++++++++++++++++-------
>  src/gallium/drivers/radeonsi/si_shader.h        |  1 +
>  src/gallium/drivers/radeonsi/si_state_draw.c    | 27 ++++++++--
>  src/gallium/drivers/radeonsi/si_state_shaders.c |  8 +++
>  5 files changed, 84 insertions(+), 24 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 3ae06584427..9832fd19ff6 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -382,20 +382,21 @@ struct si_context {
>         bool                    db_flush_stencil_inplace:1;
>         bool                    db_depth_clear:1;
>         bool                    db_depth_disable_expclear:1;
>         bool                    db_stencil_clear:1;
>         bool                    db_stencil_disable_expclear:1;
>         bool                    occlusion_queries_disabled:1;
>         bool                    generate_mipmap_for_depth:1;
>
>         /* Emitted draw state. */
>         bool                    gs_tri_strip_adj_fix:1;
> +       bool                    ls_vgpr_fix:1;
>         int                     last_index_size;
>         int                     last_base_vertex;
>         int                     last_start_instance;
>         int                     last_drawid;
>         int                     last_sh_base_reg;
>         int                     last_primitive_restart_en;
>         int                     last_restart_index;
>         int                     last_gs_out_prim;
>         int                     last_prim;
>         int                     last_multi_vgt_param;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 2cddfe97aa5..bae9b8384dd 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5536,20 +5536,22 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
>  }
>
>  static void si_dump_shader_key_vs(const struct si_shader_key *key,
>                                   const struct si_vs_prolog_bits *prolog,
>                                   const char *prefix, FILE *f)
>  {
>         fprintf(f, "  %s.instance_divisor_is_one = %u\n",
>                 prefix, prolog->instance_divisor_is_one);
>         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
>                 prefix, prolog->instance_divisor_is_fetched);
> +       fprintf(f, "  %s.ls_vgpr_fix = %u\n",
> +               prefix, prolog->ls_vgpr_fix);
>
>         fprintf(f, "  mono.vs.fix_fetch = {");
>         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
>                 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
>         fprintf(f, "}\n");
>  }
>
>  static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
>                                FILE *f)
>  {
> @@ -5728,20 +5730,28 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
>  {
>         LLVMValueRef args[] = {
>                 LLVMGetParam(ctx->main_fn, param),
>                 LLVMConstInt(ctx->i32, bitoffset, 0),
>         };
>         lp_build_intrinsic(ctx->gallivm.builder,
>                            "llvm.amdgcn.init.exec.from.input",
>                            ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
>  }
>
> +static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
> +                              const struct si_vs_prolog_bits *key)
> +{
> +       /* VGPR initialization fixup for Vega10 and Raven is always done in the
> +        * VS prolog. */
> +       return sel->vs_needs_prolog || key->ls_vgpr_fix;
> +}
> +
>  static bool si_compile_tgsi_main(struct si_shader_context *ctx,
>                                  bool is_monolithic)
>  {
>         struct si_shader *shader = ctx->shader;
>         struct si_shader_selector *sel = shader->selector;
>         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
>
>         // TODO clean all this up!
>         switch (ctx->type) {
>         case PIPE_SHADER_VERTEX:
> @@ -5804,21 +5814,21 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
>          *
>          * For monolithic merged shaders, the first shader is wrapped in an
>          * if-block together with its prolog in si_build_wrapper_function.
>          */
>         if (ctx->screen->b.chip_class >= GFX9) {
>                 if (!is_monolithic &&
>                     sel->info.num_instructions > 1 && /* not empty shader */
>                     (shader->key.as_es || shader->key.as_ls) &&
>                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
>                      (ctx->type == PIPE_SHADER_VERTEX &&
> -                     !sel->vs_needs_prolog))) {
> +                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
>                         si_init_exec_from_input(ctx,
>                                                 ctx->param_merged_wave_info, 0);
>                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
>                            ctx->type == PIPE_SHADER_GEOMETRY) {
>                         if (!is_monolithic)
>                                 si_init_exec_full_mask(ctx);
>
>                         /* The barrier must execute for all shaders in a
>                          * threadgroup.
>                          */
> @@ -6478,33 +6488,35 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
>                         si_build_vs_prolog_function(&ctx, &prolog_key);
>                         parts[0] = ctx.main_fn;
>                 }
>
>                 si_build_wrapper_function(&ctx, parts + !need_prolog,
>                                           1 + need_prolog, need_prolog, 0);
>         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
>                 if (sscreen->b.chip_class >= GFX9) {
>                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
>                         LLVMValueRef parts[4];
> +                       bool vs_needs_prolog =
> +                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
>
>                         /* TCS main part */
>                         parts[2] = ctx.main_fn;
>
>                         /* TCS epilog */
>                         union si_shader_part_key tcs_epilog_key;
>                         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
>                         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
>                         si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
>                         parts[3] = ctx.main_fn;
>
>                         /* VS prolog */
> -                       if (ls->vs_needs_prolog) {
> +                       if (vs_needs_prolog) {
>                                 union si_shader_part_key vs_prolog_key;
>                                 si_get_vs_prolog_key(&ls->info,
>                                                      shader->info.num_input_sgprs,
>                                                      &shader->key.part.tcs.ls_prolog,
>                                                      shader, &vs_prolog_key);
>                                 vs_prolog_key.vs_prolog.is_monolithic = true;
>                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
>                                 parts[0] = ctx.main_fn;
>                         }
>
> @@ -6521,23 +6533,23 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
>                                 return -1;
>                         }
>                         shader->info.uses_instanceid |= ls->info.uses_instanceid;
>                         parts[1] = ctx.main_fn;
>
>                         /* Reset the shader context. */
>                         ctx.shader = shader;
>                         ctx.type = PIPE_SHADER_TESS_CTRL;
>
>                         si_build_wrapper_function(&ctx,
> -                                                 parts + !ls->vs_needs_prolog,
> -                                                 4 - !ls->vs_needs_prolog, 0,
> -                                                 ls->vs_needs_prolog ? 2 : 1);
> +                                                 parts + !vs_needs_prolog,
> +                                                 4 - !vs_needs_prolog, 0,
> +                                                 vs_needs_prolog ? 2 : 1);
>                 } else {
>                         LLVMValueRef parts[2];
>                         union si_shader_part_key epilog_key;
>
>                         parts[0] = ctx.main_fn;
>
>                         memset(&epilog_key, 0, sizeof(epilog_key));
>                         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
>                         si_build_tcs_epilog_function(&ctx, &epilog_key);
>                         parts[1] = ctx.main_fn;
> @@ -6860,73 +6872,95 @@ static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
>   *   (InstanceID / 2 + StartInstance)
>   */
>  static void si_build_vs_prolog_function(struct si_shader_context *ctx,
>                                         union si_shader_part_key *key)
>  {
>         struct gallivm_state *gallivm = &ctx->gallivm;
>         struct si_function_info fninfo;
>         LLVMTypeRef *returns;
>         LLVMValueRef ret, func;
>         int num_returns, i;
> -       unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
> -                                key->vs_prolog.num_merged_next_stage_vgprs;
> +       unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
>         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
> +       LLVMValueRef input_vgprs[9];
>         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
>                                       num_input_vgprs;
>         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
>
>         si_init_function_info(&fninfo);
>
>         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
>         returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
>                          sizeof(LLVMTypeRef));
>         num_returns = 0;
>
>         /* Declare input and output SGPRs. */
>         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
>                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
>                 returns[num_returns++] = ctx->i32;
>         }
>
>         /* Preloaded VGPRs (outputs must be floats) */
>         for (i = 0; i < num_input_vgprs; i++) {
> -               add_arg(&fninfo, ARG_VGPR, ctx->i32);
> +               add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
>                 returns[num_returns++] = ctx->f32;
>         }
>
> -       fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
> -       fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
> -
>         /* Vertex load indices. */
>         for (i = 0; i <= key->vs_prolog.last_input; i++)
>                 returns[num_returns++] = ctx->f32;
>
>         /* Create the function. */
>         si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
>         func = ctx->main_fn;
>
> -       if (key->vs_prolog.num_merged_next_stage_vgprs &&
> -           !key->vs_prolog.is_monolithic)
> -               si_init_exec_from_input(ctx, 3, 0);
> +       if (key->vs_prolog.num_merged_next_stage_vgprs) {
> +               if (!key->vs_prolog.is_monolithic)
> +                       si_init_exec_from_input(ctx, 3, 0);
> +
> +               if (key->vs_prolog.as_ls &&
> +                   (ctx->screen->b.family == CHIP_VEGA10 ||
> +                    ctx->screen->b.family == CHIP_RAVEN)) {
> +                       /* If there are no HS threads, SPI loads the LS VGPRs
> +                        * starting at VGPR 0. Shift them back to where they
> +                        * belong.
> +                        */
> +                       LLVMValueRef has_hs_threads =
> +                               LLVMBuildICmp(gallivm->builder, LLVMIntNE,
> +                                   unpack_param(ctx, 3, 8, 8),
> +                                   ctx->i32_0, "");
> +
> +                       for (i = 4; i > 0; --i) {
> +                               input_vgprs[i + 1] =
> +                                       LLVMBuildSelect(gallivm->builder, has_hs_threads,
> +                                                       input_vgprs[i + 1],
> +                                                       input_vgprs[i - 1], "");
> +                       }
> +               }
> +       }
> +
> +       ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
> +       ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
>
>         /* Copy inputs to outputs. This should be no-op, as the registers match,
>          * but it will prevent the compiler from overwriting them unintentionally.
>          */
>         ret = ctx->return_value;
>         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
>                 LLVMValueRef p = LLVMGetParam(func, i);
>                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
>         }
> -       for (; i < fninfo.num_params; i++) {
> -               LLVMValueRef p = LLVMGetParam(func, i);
> +       for (i = 0; i < num_input_vgprs; i++) {
> +               LLVMValueRef p = input_vgprs[i];
>                 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
> -               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
> +               ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
> +                                          key->vs_prolog.num_input_sgprs + i, "");
>         }
>
>         /* Compute vertex load indices from instance divisors. */
>         LLVMValueRef instance_divisor_constbuf = NULL;
>
>         if (key->vs_prolog.states.instance_divisor_is_fetched) {
>                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
>                 LLVMValueRef buf_index =
>                         LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
>                 instance_divisor_constbuf =
> @@ -6973,22 +7007,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
>
>  static bool si_get_vs_prolog(struct si_screen *sscreen,
>                              LLVMTargetMachineRef tm,
>                              struct si_shader *shader,
>                              struct pipe_debug_callback *debug,
>                              struct si_shader *main_part,
>                              const struct si_vs_prolog_bits *key)
>  {
>         struct si_shader_selector *vs = main_part->selector;
>
> -       /* The prolog is a no-op if there are no inputs. */
> -       if (!vs->vs_needs_prolog)
> +       if (!si_vs_needs_prolog(vs, key))
>                 return true;
>
>         /* Get the prolog. */
>         union si_shader_part_key prolog_key;
>         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
>                              key, shader, &prolog_key);
>
>         shader->prolog =
>                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
>                                    PIPE_SHADER_VERTEX, true, &prolog_key, tm,
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 0c0fa10f40f..ee6b0c167f9 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -391,20 +391,21 @@ struct si_shader_selector {
>  /* Common VS bits between the shader key and the prolog key. */
>  struct si_vs_prolog_bits {
>         /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
>          *   divisor is 0.
>          * - If "is_one" has a bit set, the instance divisor is 1.
>          * - If "is_fetched" has a bit set, the instance divisor will be loaded
>          *   from the constant buffer.
>          */
>         uint16_t        instance_divisor_is_one;     /* bitmask of inputs */
>         uint16_t        instance_divisor_is_fetched; /* bitmask of inputs */
> +       unsigned        ls_vgpr_fix:1;
>  };
>
>  /* Common TCS bits between the shader key and the epilog key. */
>  struct si_tcs_epilog_bits {
>         unsigned        prim_mode:3;
>         unsigned        tes_reads_tess_factors:1;
>  };
>
>  struct si_gs_prolog_bits {
>         unsigned        tri_strip_adj_fix:1;
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 1092da403c7..20fd128e533 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -195,25 +195,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
>         /* Make sure the output data fits in the offchip buffer */
>         *num_patches = MIN2(*num_patches,
>                             (sctx->screen->tess_offchip_block_dw_size * 4) /
>                             output_patch_size);
>
>         /* Not necessary for correctness, but improves performance. The
>          * specific value is taken from the proprietary driver.
>          */
>         *num_patches = MIN2(*num_patches, 40);
>
> -       if (sctx->b.chip_class == SI ||
> -           /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
> -            * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
> -           (sctx->b.chip_class == GFX9 &&
> -            num_tcs_input_cp > num_tcs_output_cp)) {
> +       if (sctx->b.chip_class == SI) {
>                 /* SI bug workaround, related to power management. Limit LS-HS
>                  * threadgroups to only one wave.
>                  */
>                 unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
>                 *num_patches = MIN2(*num_patches, one_wave);
>         }
>
>         /* The VGT HS block increments the patch ID unconditionally
>          * within a single threadgroup. This results in incorrect
>          * patch IDs when instanced draws are used.
> @@ -1266,20 +1262,41 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>         else if (sctx->tes_shader.cso)
>                 rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
>         else
>                 rast_prim = info->mode;
>
>         if (rast_prim != sctx->current_rast_prim) {
>                 sctx->current_rast_prim = rast_prim;
>                 sctx->do_update_shaders = true;
>         }
>
> +       if (sctx->tes_shader.cso &&
> +           (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) {
> +               /* Determine whether the LS VGPR fix should be applied.
> +                *
> +                * It is only required when num input CPs > num output CPs,
> +                * which cannot happen with the fixed function TCS. We should
> +                * also update this bit when switching from TCS to fixed
> +                * function TCS.
> +                */
> +               struct si_shader_selector *tcs = sctx->tcs_shader.cso;
> +               bool ls_vgpr_fix =
> +                       tcs &&
> +                       info->vertices_per_patch >
> +                       tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
> +
> +               if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
> +                       sctx->ls_vgpr_fix = ls_vgpr_fix;
> +                       sctx->do_update_shaders = true;
> +               }
> +       }
> +
>         if (sctx->gs_shader.cso) {
>                 /* Determine whether the GS triangle strip adjacency fix should
>                  * be applied. Rotate every other triangle if
>                  * - triangle strips with adjacency are fed to the GS and
>                  * - primitive restart is disabled (the rotation doesn't help
>                  *   when the restart occurs after an odd number of triangles).
>                  */
>                 bool gs_tri_strip_adj_fix =
>                         !sctx->tes_shader.cso &&
>                         info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 7898c9196c6..acfccb96168 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -1276,20 +1276,28 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
>
>                         if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
>                                 key->mono.u.vs_export_prim_id = 1;
>                 }
>                 break;
>         case PIPE_SHADER_TESS_CTRL:
>                 if (sctx->b.chip_class >= GFX9) {
>                         si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
>                                                   key, &key->part.tcs.ls_prolog);
>                         key->part.tcs.ls = sctx->vs_shader.cso;
> +
> +                       /* When the LS VGPR fix is needed, monolithic shaders
> +                        * can save:
> +                        *  - redundant EXEC initialization

The redundant EXEC initialization only seems to happen when
(!sel->vs_needs_prolog && ls_vgpr_fix). If sel->vs_needs_prolog ==
true, the main part doesn't initialize EXEC. I think the comment
should clarify that because "redundant EXEC initialization" seems more
confusing than explanatory to me.

Other than that:

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek