[Mesa-dev] [Mesa-stable] [PATCH v2 2/5] radeonsi/gfx9: always wrap GS and TCS in an if-block (v2)
Nicolai Hähnle
nhaehnle at gmail.com
Sat Jul 29 07:22:32 UTC 2017
Thanks. I think I know what the issue is, testing a fix now.
Cheers,
Nicolai
On 28.07.2017 20:16, Marek Olšák wrote:
> Hi Nicolai,
>
> FYI, this patch breaks 30 piglit tests on GFX9. Non-monolithic and
> monolithic variants fail in different ways:
>
> 1) Non-monolithic:
>
> R600_DEBUG=nooptvariant ../piglit/bin/shader_runner
> /home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test
> -auto
> ATTENTION: default value of option vblank_mode overridden by environment.
> LLVM triggered Diagnostic Handler: <unknown>:0:0: in function main <{
> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float,
> float, float, float }> ([12 x <4 x i32>] addrspace(2)*, i32, i32, i32,
> i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x
> i32>] addrspace(2)*, [16 x <4 x i32>] addrspace(2)*, i32, i32, i32, i32,
> i32, i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x
> i32>] addrspace(2)*, i32, i32): unsupported dynamic alloca
>
> LLVM failed to compile shader
> radeonsi: can't compile a main shader part
>
> 2) Monolithic:
>
> R600_DEBUG=mono ../piglit/bin/shader_runner
> /home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test
> -auto
> ATTENTION: default value of option vblank_mode overridden by environment.
> LLVM ERROR: Cannot select: t5: i32,ch = stacksave t4
> In function: wrapper
> Segmentation fault
>
>
> Marek
>
>
> On Wed, Jul 26, 2017 at 1:56 PM, Nicolai Hähnle <nhaehnle at gmail.com
> <mailto:nhaehnle at gmail.com>> wrote:
>
> From: Nicolai Hähnle <nicolai.haehnle at amd.com
> <mailto:nicolai.haehnle at amd.com>>
>
> With merged ESGS shaders, the GS part of a wave may be empty, and the
> hardware gets confused if any GS messages are sent from that wave. Since
> S_SENDMSG is executed even when EXEC = 0, we have to wrap even
> non-monolithic GS shaders in an if-block, so that the entire shader and
> hence the S_SENDMSG instructions are skipped in empty waves.
>
> This change is not required for TCS/HS, but applying it there as well
> simplifies the logic a bit.
>
> Fixes GL45-CTS.geometry_shader.rendering.rendering.*
>
> v2: ensure that the TCS epilog doesn't run for non-existing patches
>
> Cc: mesa-stable at lists.freedesktop.org
> <mailto:mesa-stable at lists.freedesktop.org>
> ---
> src/gallium/drivers/radeonsi/si_shader.c | 109
> +++++++++++++++-------
> src/gallium/drivers/radeonsi/si_shader_internal.h | 3 +
> 2 files changed, 79 insertions(+), 33 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
> b/src/gallium/drivers/radeonsi/si_shader.c
> index a153cb7..9376d90 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -175,6 +175,20 @@ unsigned si_shader_io_get_unique_index(unsigned
> semantic_name, unsigned index)
> }
>
> /**
> + * Helper function that builds an LLVM IR PHI node and immediately adds
> + * incoming edges.
> + */
> +static LLVMValueRef
> +build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
> + unsigned count_incoming, LLVMValueRef *values,
> + LLVMBasicBlockRef *blocks)
> +{
> + LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
> + LLVMAddIncoming(phi, values, blocks, count_incoming);
> + return phi;
> +}
> +
> +/**
> * Get the value of a shader input parameter and extract a bitfield.
> */
> static LLVMValueRef unpack_param(struct si_shader_context *ctx,
> @@ -2698,6 +2712,7 @@ si_insert_input_ptr_as_2xi32(struct
> si_shader_context *ctx, LLVMValueRef ret,
> static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context
> *bld_base)
> {
> struct si_shader_context *ctx = si_shader_context(bld_base);
> + LLVMBuilderRef builder = ctx->gallivm.builder;
> LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
>
> si_copy_tcs_inputs(bld_base);
> @@ -2706,8 +2721,29 @@ static void si_llvm_emit_tcs_epilogue(struct
> lp_build_tgsi_context *bld_base)
> invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids,
> 8, 5);
> tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
>
> + if (ctx->screen->b.chip_class >= GFX9) {
> + LLVMBasicBlockRef blocks[2] = {
> + LLVMGetInsertBlock(builder),
> + ctx->merged_wrap_if_state.entry_block
> + };
> + LLVMValueRef values[2];
> +
> + lp_build_endif(&ctx->merged_wrap_if_state);
> +
> + values[0] = rel_patch_id;
> + values[1] = LLVMGetUndef(ctx->i32);
> + rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2,
> values, blocks);
> +
> + values[0] = tf_lds_offset;
> + values[1] = LLVMGetUndef(ctx->i32);
> + tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2,
> values, blocks);
> +
> + values[0] = invocation_id;
> + values[1] = ctx->i32_1; /* cause the epilog to skip
> threads */
> + invocation_id = build_phi(&ctx->ac, ctx->i32, 2,
> values, blocks);
> + }
> +
> /* Return epilog parameters from this function. */
> - LLVMBuilderRef builder = ctx->gallivm.builder;
> LLVMValueRef ret = ctx->return_value;
> unsigned vgpr;
>
> @@ -2935,6 +2971,9 @@ static void si_llvm_emit_gs_epilogue(struct
> lp_build_tgsi_context *bld_base)
>
> ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP |
> AC_SENDMSG_GS_DONE,
> si_get_gs_wave_id(ctx));
> +
> + if (ctx->screen->b.chip_class >= GFX9)
> + lp_build_endif(&ctx->merged_wrap_if_state);
> }
>
> static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context
> *bld_base)
> @@ -5502,14 +5541,20 @@ static bool si_compile_tgsi_main(struct
> si_shader_context *ctx,
> preload_ring_buffers(ctx);
>
> /* For GFX9 merged shaders:
> - * - Set EXEC. If the prolog is present, set EXEC there instead.
> + * - Set EXEC for the first shader. If the prolog is
> present, set
> + * EXEC there instead.
> * - Add a barrier before the second shader.
> + * - In the second shader, reset EXEC to ~0 and wrap the
> main part in
> + * an if-statement. This is required for correctness in
> geometry
> + * shaders, to ensure that empty GS waves do not send
> GS_EMIT and
> + * GS_CUT messages.
> *
> - * The same thing for monolithic shaders is done in
> - * si_build_wrapper_function.
> + * For monolithic merged shaders, the first shader is
> wrapped in an
> + * if-block together with its prolog in
> si_build_wrapper_function.
> */
> - if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
> - if (sel->info.num_instructions > 1 && /* not empty
> shader */
> + if (ctx->screen->b.chip_class >= GFX9) {
> + if (!is_monolithic &&
> + sel->info.num_instructions > 1 && /* not empty
> shader */
> (shader->key.as_es || shader->key.as_ls) &&
> (ctx->type == PIPE_SHADER_TESS_EVAL ||
> (ctx->type == PIPE_SHADER_VERTEX &&
> @@ -5518,9 +5563,19 @@ static bool si_compile_tgsi_main(struct
> si_shader_context *ctx,
>
> ctx->param_merged_wave_info, 0);
> } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
> ctx->type == PIPE_SHADER_GEOMETRY) {
> - si_init_exec_from_input(ctx,
> -
> ctx->param_merged_wave_info, 8);
> + if (!is_monolithic)
> + si_init_exec_full_mask(ctx);
> +
> + /* The barrier must execute for all shaders in a
> + * threadgroup.
> + */
> si_llvm_emit_barrier(NULL, bld_base, NULL);
> +
> + LLVMValueRef num_threads = unpack_param(ctx,
> ctx->param_merged_wave_info, 8, 8);
> + LLVMValueRef ena =
> + LLVMBuildICmp(ctx->ac.builder,
> LLVMIntULT,
> +
> ac_get_thread_id(&ctx->ac), num_threads, "");
> + lp_build_if(&ctx->merged_wrap_if_state,
> &ctx->gallivm, ena);
> }
> }
>
> @@ -5991,15 +6046,9 @@ static void si_build_wrapper_function(struct
> si_shader_context *ctx,
>
> /* Merged shaders are executed conditionally depending
> * on the number of enabled threads passed in the
> input SGPRs. */
> - if (is_merged_shader(ctx->shader) &&
> - (part == 0 || part == next_shader_first_part)) {
> + if (is_merged_shader(ctx->shader) && part == 0) {
> LLVMValueRef ena, count = initial[3];
>
> - /* The thread count for the 2nd shader is at
> bit-offset 8. */
> - if (part == next_shader_first_part) {
> - count = LLVMBuildLShr(builder, count,
> -
> LLVMConstInt(ctx->i32, 8, 0), "");
> - }
> count = LLVMBuildAnd(builder, count,
> LLVMConstInt(ctx->i32,
> 0x7f, 0), "");
> ena = LLVMBuildICmp(builder, LLVMIntULT,
> @@ -6056,26 +6105,20 @@ static void si_build_wrapper_function(struct
> si_shader_context *ctx,
> ret = LLVMBuildCall(builder, parts[part], in,
> num_params, "");
>
> if (is_merged_shader(ctx->shader) &&
> - (part + 1 == next_shader_first_part ||
> - part + 1 == num_parts)) {
> + part + 1 == next_shader_first_part) {
> lp_build_endif(&if_state);
>
> - if (part + 1 == next_shader_first_part) {
> - /* A barrier is required between 2
> merged shaders. */
> - si_llvm_emit_barrier(NULL,
> &ctx->bld_base, NULL);
> -
> - /* The second half of the merged
> shader should use
> - * the inputs from the toplevel
> (wrapper) function,
> - * not the return value from the
> last call.
> - *
> - * That's because the last call was
> executed condi-
> - * tionally, so we can't consume it
> in the main
> - * block.
> - */
> - memcpy(out, initial, sizeof(initial));
> - num_out = initial_num_out;
> - num_out_sgpr = initial_num_out_sgpr;
> - }
> + /* The second half of the merged shader
> should use
> + * the inputs from the toplevel (wrapper)
> function,
> + * not the return value from the last call.
> + *
> + * That's because the last call was executed
> condi-
> + * tionally, so we can't consume it in the main
> + * block.
> + */
> + memcpy(out, initial, sizeof(initial));
> + num_out = initial_num_out;
> + num_out_sgpr = initial_num_out_sgpr;
> continue;
> }
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h
> b/src/gallium/drivers/radeonsi/si_shader_internal.h
> index 6e86e0b..6b98bca 100644
> --- a/src/gallium/drivers/radeonsi/si_shader_internal.h
> +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
> @@ -25,6 +25,7 @@
> #define SI_SHADER_PRIVATE_H
>
> #include "si_shader.h"
> +#include "gallivm/lp_bld_flow.h"
> #include "gallivm/lp_bld_init.h"
> #include "gallivm/lp_bld_tgsi.h"
> #include "tgsi/tgsi_parse.h"
> @@ -105,6 +106,8 @@ struct si_shader_context {
> unsigned flow_depth;
> unsigned flow_depth_max;
>
> + struct lp_build_if_state merged_wrap_if_state;
> +
> struct tgsi_array_info *temp_arrays;
> LLVMValueRef *temp_array_allocas;
>
> --
> 2.9.3
>
> _______________________________________________
> mesa-stable mailing list
> mesa-stable at lists.freedesktop.org
> <mailto:mesa-stable at lists.freedesktop.org>
> https://lists.freedesktop.org/mailman/listinfo/mesa-stable
> <https://lists.freedesktop.org/mailman/listinfo/mesa-stable>
>
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list