[Mesa-dev] [Mesa-stable] [PATCH v2 2/5] radeonsi/gfx9: always wrap GS and TCS in an if-block (v2)

Sat Jul 29 07:22:32 UTC 2017

Thanks. I think I know what the issue is, testing a fix now.

Cheers,
Nicolai

On 28.07.2017 20:16, Marek Olšák wrote:
> Hi Nicolai,
> 
> FYI, this patch breaks 30 piglit tests on GFX9. Non-monolithic and 
> monolithic variants fail in different ways:
> 
> 1) Non-monolithic:
> 
> R600_DEBUG=nooptvariant ../piglit/bin/shader_runner 
> /home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test 
> -auto
> ATTENTION: default value of option vblank_mode overridden by environment.
> LLVM triggered Diagnostic Handler: <unknown>:0:0: in function main <{ 
> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, 
> float, float, float }> ([12 x <4 x i32>] addrspace(2)*, i32, i32, i32, 
> i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x 
> i32>] addrspace(2)*, [16 x <4 x i32>] addrspace(2)*, i32, i32, i32, i32, 
> i32, i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x 
> i32>] addrspace(2)*, i32, i32): unsupported dynamic alloca
> 
> LLVM failed to compile shader
> radeonsi: can't compile a main shader part
> 
> 2) Monolithic:
> 
> R600_DEBUG=mono ../piglit/bin/shader_runner 
> /home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test 
> -auto
> ATTENTION: default value of option vblank_mode overridden by environment.
> LLVM ERROR: Cannot select: t5: i32,ch = stacksave t4
> In function: wrapper
> Segmentation fault
> 
> 
> Marek
> 
> 
> On Wed, Jul 26, 2017 at 1:56 PM, Nicolai Hähnle <nhaehnle at gmail.com 
> <mailto:nhaehnle at gmail.com>> wrote:
> 
>     From: Nicolai Hähnle <nicolai.haehnle at amd.com
>     <mailto:nicolai.haehnle at amd.com>>
> 
>     With merged ESGS shaders, the GS part of a wave may be empty, and the
>     hardware gets confused if any GS messages are sent from that wave. Since
>     S_SENDMSG is executed even when EXEC = 0, we have to wrap even
>     non-monolithic GS shaders in an if-block, so that the entire shader and
>     hence the S_SENDMSG instructions are skipped in empty waves.
> 
>     This change is not required for TCS/HS, but applying it there as well
>     simplifies the logic a bit.
> 
>     Fixes GL45-CTS.geometry_shader.rendering.rendering.*
> 
>     v2: ensure that the TCS epilog doesn't run for non-existing patches
> 
>     Cc: mesa-stable at lists.freedesktop.org
>     <mailto:mesa-stable at lists.freedesktop.org>
>     ---
>       src/gallium/drivers/radeonsi/si_shader.c          | 109
>     +++++++++++++++-------
>       src/gallium/drivers/radeonsi/si_shader_internal.h |   3 +
>       2 files changed, 79 insertions(+), 33 deletions(-)
> 
>     diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>     b/src/gallium/drivers/radeonsi/si_shader.c
>     index a153cb7..9376d90 100644
>     --- a/src/gallium/drivers/radeonsi/si_shader.c
>     +++ b/src/gallium/drivers/radeonsi/si_shader.c
>     @@ -175,6 +175,20 @@ unsigned si_shader_io_get_unique_index(unsigned
>     semantic_name, unsigned index)
>       }
> 
>       /**
>     + * Helper function that builds an LLVM IR PHI node and immediately adds
>     + * incoming edges.
>     + */
>     +static LLVMValueRef
>     +build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
>     +         unsigned count_incoming, LLVMValueRef *values,
>     +         LLVMBasicBlockRef *blocks)
>     +{
>     +       LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
>     +       LLVMAddIncoming(phi, values, blocks, count_incoming);
>     +       return phi;
>     +}
>     +
>     +/**
>        * Get the value of a shader input parameter and extract a bitfield.
>        */
>       static LLVMValueRef unpack_param(struct si_shader_context *ctx,
>     @@ -2698,6 +2712,7 @@ si_insert_input_ptr_as_2xi32(struct
>     si_shader_context *ctx, LLVMValueRef ret,
>       static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context
>     *bld_base)
>       {
>              struct si_shader_context *ctx = si_shader_context(bld_base);
>     +       LLVMBuilderRef builder = ctx->gallivm.builder;
>              LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
> 
>              si_copy_tcs_inputs(bld_base);
>     @@ -2706,8 +2721,29 @@ static void si_llvm_emit_tcs_epilogue(struct
>     lp_build_tgsi_context *bld_base)
>              invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids,
>     8, 5);
>              tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
> 
>     +       if (ctx->screen->b.chip_class >= GFX9) {
>     +               LLVMBasicBlockRef blocks[2] = {
>     +                       LLVMGetInsertBlock(builder),
>     +                       ctx->merged_wrap_if_state.entry_block
>     +               };
>     +               LLVMValueRef values[2];
>     +
>     +               lp_build_endif(&ctx->merged_wrap_if_state);
>     +
>     +               values[0] = rel_patch_id;
>     +               values[1] = LLVMGetUndef(ctx->i32);
>     +               rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2,
>     values, blocks);
>     +
>     +               values[0] = tf_lds_offset;
>     +               values[1] = LLVMGetUndef(ctx->i32);
>     +               tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2,
>     values, blocks);
>     +
>     +               values[0] = invocation_id;
>     +               values[1] = ctx->i32_1; /* cause the epilog to skip
>     threads */
>     +               invocation_id = build_phi(&ctx->ac, ctx->i32, 2,
>     values, blocks);
>     +       }
>     +
>              /* Return epilog parameters from this function. */
>     -       LLVMBuilderRef builder = ctx->gallivm.builder;
>              LLVMValueRef ret = ctx->return_value;
>              unsigned vgpr;
> 
>     @@ -2935,6 +2971,9 @@ static void si_llvm_emit_gs_epilogue(struct
>     lp_build_tgsi_context *bld_base)
> 
>              ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP |
>     AC_SENDMSG_GS_DONE,
>                               si_get_gs_wave_id(ctx));
>     +
>     +       if (ctx->screen->b.chip_class >= GFX9)
>     +               lp_build_endif(&ctx->merged_wrap_if_state);
>       }
> 
>       static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context
>     *bld_base)
>     @@ -5502,14 +5541,20 @@ static bool si_compile_tgsi_main(struct
>     si_shader_context *ctx,
>              preload_ring_buffers(ctx);
> 
>              /* For GFX9 merged shaders:
>     -        * - Set EXEC. If the prolog is present, set EXEC there instead.
>     +        * - Set EXEC for the first shader. If the prolog is
>     present, set
>     +        *   EXEC there instead.
>               * - Add a barrier before the second shader.
>     +        * - In the second shader, reset EXEC to ~0 and wrap the
>     main part in
>     +        *   an if-statement. This is required for correctness in
>     geometry
>     +        *   shaders, to ensure that empty GS waves do not send
>     GS_EMIT and
>     +        *   GS_CUT messages.
>               *
>     -        * The same thing for monolithic shaders is done in
>     -        * si_build_wrapper_function.
>     +        * For monolithic merged shaders, the first shader is
>     wrapped in an
>     +        * if-block together with its prolog in
>     si_build_wrapper_function.
>               */
>     -       if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
>     -               if (sel->info.num_instructions > 1 && /* not empty
>     shader */
>     +       if (ctx->screen->b.chip_class >= GFX9) {
>     +               if (!is_monolithic &&
>     +                   sel->info.num_instructions > 1 && /* not empty
>     shader */
>                          (shader->key.as_es || shader->key.as_ls) &&
>                          (ctx->type == PIPE_SHADER_TESS_EVAL ||
>                           (ctx->type == PIPE_SHADER_VERTEX &&
>     @@ -5518,9 +5563,19 @@ static bool si_compile_tgsi_main(struct
>     si_shader_context *ctx,
>                                                     
>     ctx->param_merged_wave_info, 0);
>                      } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
>                                 ctx->type == PIPE_SHADER_GEOMETRY) {
>     -                       si_init_exec_from_input(ctx,
>     -                                             
>       ctx->param_merged_wave_info, 8);
>     +                       if (!is_monolithic)
>     +                               si_init_exec_full_mask(ctx);
>     +
>     +                       /* The barrier must execute for all shaders in a
>     +                        * threadgroup.
>     +                        */
>                              si_llvm_emit_barrier(NULL, bld_base, NULL);
>     +
>     +                       LLVMValueRef num_threads = unpack_param(ctx,
>     ctx->param_merged_wave_info, 8, 8);
>     +                       LLVMValueRef ena =
>     +                               LLVMBuildICmp(ctx->ac.builder,
>     LLVMIntULT,
>     +                                         
>       ac_get_thread_id(&ctx->ac), num_threads, "");
>     +                       lp_build_if(&ctx->merged_wrap_if_state,
>     &ctx->gallivm, ena);
>                      }
>              }
> 
>     @@ -5991,15 +6046,9 @@ static void si_build_wrapper_function(struct
>     si_shader_context *ctx,
> 
>                      /* Merged shaders are executed conditionally depending
>                       * on the number of enabled threads passed in the
>     input SGPRs. */
>     -               if (is_merged_shader(ctx->shader) &&
>     -                   (part == 0 || part == next_shader_first_part)) {
>     +               if (is_merged_shader(ctx->shader) && part == 0) {
>                              LLVMValueRef ena, count = initial[3];
> 
>     -                       /* The thread count for the 2nd shader is at
>     bit-offset 8. */
>     -                       if (part == next_shader_first_part) {
>     -                               count = LLVMBuildLShr(builder, count,
>     -                                                   
>       LLVMConstInt(ctx->i32, 8, 0), "");
>     -                       }
>                              count = LLVMBuildAnd(builder, count,
>                                                   LLVMConstInt(ctx->i32,
>     0x7f, 0), "");
>                              ena = LLVMBuildICmp(builder, LLVMIntULT,
>     @@ -6056,26 +6105,20 @@ static void si_build_wrapper_function(struct
>     si_shader_context *ctx,
>                      ret = LLVMBuildCall(builder, parts[part], in,
>     num_params, "");
> 
>                      if (is_merged_shader(ctx->shader) &&
>     -                   (part + 1 == next_shader_first_part ||
>     -                    part + 1 == num_parts)) {
>     +                   part + 1 == next_shader_first_part) {
>                              lp_build_endif(&if_state);
> 
>     -                       if (part + 1 == next_shader_first_part) {
>     -                               /* A barrier is required between 2
>     merged shaders. */
>     -                               si_llvm_emit_barrier(NULL,
>     &ctx->bld_base, NULL);
>     -
>     -                               /* The second half of the merged
>     shader should use
>     -                                * the inputs from the toplevel
>     (wrapper) function,
>     -                                * not the return value from the
>     last call.
>     -                                *
>     -                                * That's because the last call was
>     executed condi-
>     -                                * tionally, so we can't consume it
>     in the main
>     -                                * block.
>     -                                */
>     -                               memcpy(out, initial, sizeof(initial));
>     -                               num_out = initial_num_out;
>     -                               num_out_sgpr = initial_num_out_sgpr;
>     -                       }
>     +                       /* The second half of the merged shader
>     should use
>     +                        * the inputs from the toplevel (wrapper)
>     function,
>     +                        * not the return value from the last call.
>     +                        *
>     +                        * That's because the last call was executed
>     condi-
>     +                        * tionally, so we can't consume it in the main
>     +                        * block.
>     +                        */
>     +                       memcpy(out, initial, sizeof(initial));
>     +                       num_out = initial_num_out;
>     +                       num_out_sgpr = initial_num_out_sgpr;
>                              continue;
>                      }
> 
>     diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h
>     b/src/gallium/drivers/radeonsi/si_shader_internal.h
>     index 6e86e0b..6b98bca 100644
>     --- a/src/gallium/drivers/radeonsi/si_shader_internal.h
>     +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
>     @@ -25,6 +25,7 @@
>       #define SI_SHADER_PRIVATE_H
> 
>       #include "si_shader.h"
>     +#include "gallivm/lp_bld_flow.h"
>       #include "gallivm/lp_bld_init.h"
>       #include "gallivm/lp_bld_tgsi.h"
>       #include "tgsi/tgsi_parse.h"
>     @@ -105,6 +106,8 @@ struct si_shader_context {
>              unsigned flow_depth;
>              unsigned flow_depth_max;
> 
>     +       struct lp_build_if_state merged_wrap_if_state;
>     +
>              struct tgsi_array_info *temp_arrays;
>              LLVMValueRef *temp_array_allocas;
> 
>     --
>     2.9.3
> 
>     _______________________________________________
>     mesa-stable mailing list
>     mesa-stable at lists.freedesktop.org
>     <mailto:mesa-stable at lists.freedesktop.org>
>     https://lists.freedesktop.org/mailman/listinfo/mesa-stable
>     <https://lists.freedesktop.org/mailman/listinfo/mesa-stable>
> 
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.