[Mesa-stable] [PATCH 2/5] radeonsi/gfx9: always wrap GS and TCS in an if-block

Tue Jul 25 18:47:54 UTC 2017

On Mon, Jul 17, 2017 at 12:57 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> With merged ESGS shaders, the GS part of a wave may be empty, and the
> hardware gets confused if any GS messages are sent from that wave. Since
> S_SENDMSG is executed even when EXEC = 0, we have to wrap even
> non-monolithic GS shaders in an if-block, so that the entire shader and
> hence the S_SENDMSG instructions are skipped in empty waves.
>
> This change is not required for TCS/HS, but applying it there as well
> simplifies the code a bit.
>
> Fixes GL45-CTS.geometry_shader.rendering.rendering.*
>
> Cc: mesa-stable at lists.freedesktop.org
> ---
>  src/gallium/drivers/radeonsi/si_shader.c          | 74 +++++++++++++----------
>  src/gallium/drivers/radeonsi/si_shader_internal.h |  3 +
>  2 files changed, 45 insertions(+), 32 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 7a44e61..9aeda49 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -2713,20 +2713,23 @@ si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
>  }
>
>  /* This only writes the tessellation factor levels. */
>  static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
>  {
>         struct si_shader_context *ctx = si_shader_context(bld_base);
>         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
>
>         si_copy_tcs_inputs(bld_base);
>
> +       if (ctx->screen->b.chip_class >= GFX9)
> +               lp_build_endif(&ctx->merged_wrap_if_state);
> +
>         rel_patch_id = get_rel_patch_id(ctx);
>         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
>         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
>
>         /* Return epilog parameters from this function. */
>         LLVMBuilderRef builder = ctx->gallivm.builder;
>         LLVMValueRef ret = ctx->return_value;
>         unsigned vgpr;
>
>         if (ctx->screen->b.chip_class >= GFX9) {
> @@ -2946,20 +2949,23 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
>         else
>                 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
>  }
>
>  static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
>  {
>         struct si_shader_context *ctx = si_shader_context(bld_base);
>
>         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
>                          si_get_gs_wave_id(ctx));
> +
> +       if (ctx->screen->b.chip_class >= GFX9)
> +               lp_build_endif(&ctx->merged_wrap_if_state);
>  }
>
>  static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
>  {
>         struct si_shader_context *ctx = si_shader_context(bld_base);
>         struct gallivm_state *gallivm = &ctx->gallivm;
>         struct tgsi_shader_info *info = &ctx->shader->selector->info;
>         struct si_shader_output_values *outputs = NULL;
>         int i,j;
>
> @@ -5523,39 +5529,55 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
>                 break;
>         default:
>                 assert(!"Unsupported shader type");
>                 return false;
>         }
>
>         create_function(ctx);
>         preload_ring_buffers(ctx);
>
>         /* For GFX9 merged shaders:
> -        * - Set EXEC. If the prolog is present, set EXEC there instead.
> +        * - Set EXEC for the first shader. If the prolog is present, set
> +        *   EXEC there instead.
>          * - Add a barrier before the second shader.
> +        * - In the second shader, reset EXEC to ~0 and wrap the main part in
> +        *   an if-statement. This is required for correctness in geometry
> +        *   shaders, to ensure that empty GS waves do not send GS_EMIT and
> +        *   GS_CUT messages.
>          *
> -        * The same thing for monolithic shaders is done in
> -        * si_build_wrapper_function.
> +        * For monolithic merged shaders, the first shader is wrapped in an
> +        * if-block together with its prolog in si_build_wrapper_function.
>          */
> -       if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
> -               if (sel->info.num_instructions > 1 && /* not empty shader */
> +       if (ctx->screen->b.chip_class >= GFX9) {
> +               if (!is_monolithic &&
> +                   sel->info.num_instructions > 1 && /* not empty shader */
>                     (shader->key.as_es || shader->key.as_ls) &&
>                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
>                      (ctx->type == PIPE_SHADER_VERTEX &&
>                       !sel->vs_needs_prolog))) {
>                         si_init_exec_from_input(ctx,
>                                                 ctx->param_merged_wave_info, 0);
>                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
>                            ctx->type == PIPE_SHADER_GEOMETRY) {
> -                       si_init_exec_from_input(ctx,
> -                                               ctx->param_merged_wave_info, 8);
> +                       if (!is_monolithic)
> +                               si_init_exec_full_mask(ctx);

The TCS epilog will then execute with the full EXEC mask (EXEC is reset to ~0 here and the if-block only wraps the main part), which might cause issues.

Marek