<div dir="ltr"><div>Hi Nicolai,<br><br>FYI, this patch breaks 30 piglit tests on GFX9. Non-monolithic and monolithic variants fail in different ways:<br><br></div>1) Non-monolithic:<br><div><br>R600_DEBUG=nooptvariant ../piglit/bin/shader_runner /home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test -auto<br>ATTENTION: default value of option vblank_mode overridden by environment.<br>LLVM triggered Diagnostic Handler: <unknown>:0:0: in function main <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> ([12 x <4 x i32>] addrspace(2)*, i32, i32, i32, i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x i32>] addrspace(2)*, [16 x <4 x i32>] addrspace(2)*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x i32>] addrspace(2)*, i32, i32): unsupported dynamic alloca<br><br>LLVM failed to compile shader<br>radeonsi: can't compile a main shader part<br><br></div><div>2) Monolithic:<br><br>R600_DEBUG=mono ../piglit/bin/shader_runner /home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test -auto<br>ATTENTION: default value of option vblank_mode overridden by environment.<br>LLVM ERROR: Cannot select: t5: i32,ch = stacksave t4<br>In function: wrapper<br>Segmentation fault<br><br><br></div><div>Marek<br></div><div><br></div></div><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Jul 26, 2017 at 1:56 PM, Nicolai Hähnle <span dir="ltr"><<a href="mailto:nhaehnle@gmail.com" target="_blank">nhaehnle@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Nicolai Hähnle <<a href="mailto:nicolai.haehnle@amd.com">nicolai.haehnle@amd.com</a>><br>
<br>
With merged ESGS shaders, the GS part of a wave may be empty, and the<br>
hardware gets confused if any GS messages are sent from that wave. Since<br>
S_SENDMSG is executed even when EXEC = 0, we have to wrap even<br>
non-monolithic GS shaders in an if-block, so that the entire shader and<br>
hence the S_SENDMSG instructions are skipped in empty waves.<br>
<br>
This change is not required for TCS/HS, but applying it there as well<br>
simplifies the logic a bit.<br>
<br>
Fixes GL45-CTS.geometry_shader.<wbr>rendering.rendering.*<br>
<br>
v2: ensure that the TCS epilog doesn't run for non-existing patches<br>
<br>
Cc: <a href="mailto:mesa-stable@lists.freedesktop.org">mesa-stable@lists.freedesktop.<wbr>org</a><br>
---<br>
 src/gallium/drivers/radeonsi/<wbr>si_shader.c          | 109 +++++++++++++++-------<br>
 src/gallium/drivers/radeonsi/<wbr>si_shader_internal.h |   3 +<br>
 2 files changed, 79 insertions(+), 33 deletions(-)<br>
<br>
diff --git a/src/gallium/drivers/<wbr>radeonsi/si_shader.c b/src/gallium/drivers/<wbr>radeonsi/si_shader.c<br>
index a153cb7..9376d90 100644<br>
--- a/src/gallium/drivers/<wbr>radeonsi/si_shader.c<br>
+++ b/src/gallium/drivers/<wbr>radeonsi/si_shader.c<br>
@@ -175,6 +175,20 @@ unsigned si_shader_io_get_unique_index(<wbr>unsigned semantic_name, unsigned index)<br>
 }<br>
<br>
 /**<br>
+ * Helper function that builds an LLVM IR PHI node and immediately adds<br>
+ * incoming edges.<br>
+ */<br>
+static LLVMValueRef<br>
+build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,<br>
+         unsigned count_incoming, LLVMValueRef *values,<br>
+         LLVMBasicBlockRef *blocks)<br>
+{<br>
+       LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");<br>
+       LLVMAddIncoming(phi, values, blocks, count_incoming);<br>
+       return phi;<br>
+}<br>
+<br>
+/**<br>
  * Get the value of a shader input parameter and extract a bitfield.<br>
  */<br>
 static LLVMValueRef unpack_param(struct si_shader_context *ctx,<br>
@@ -2698,6 +2712,7 @@ si_insert_input_ptr_as_2xi32(<wbr>struct si_shader_context *ctx, LLVMValueRef ret,<br>
 static void si_llvm_emit_tcs_epilogue(<wbr>struct lp_build_tgsi_context *bld_base)<br>
 {<br>
        struct si_shader_context *ctx = si_shader_context(bld_base);<br>
+       LLVMBuilderRef builder = ctx->gallivm.builder;<br>
        LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;<br>
<br>
        si_copy_tcs_inputs(bld_base);<br>
@@ -2706,8 +2721,29 @@ static void si_llvm_emit_tcs_epilogue(<wbr>struct lp_build_tgsi_context *bld_base)<br>
        invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);<br>
        tf_lds_offset = get_tcs_out_current_patch_<wbr>data_offset(ctx);<br>
<br>
+       if (ctx->screen->b.chip_class >= GFX9) {<br>
+               LLVMBasicBlockRef blocks[2] = {<br>
+                       LLVMGetInsertBlock(builder),<br>
+                       ctx->merged_wrap_if_state.<wbr>entry_block<br>
+               };<br>
+               LLVMValueRef values[2];<br>
+<br>
+               lp_build_endif(&ctx->merged_<wbr>wrap_if_state);<br>
+<br>
+               values[0] = rel_patch_id;<br>
+               values[1] = LLVMGetUndef(ctx->i32);<br>
+               rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);<br>
+<br>
+               values[0] = tf_lds_offset;<br>
+               values[1] = LLVMGetUndef(ctx->i32);<br>
+               tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);<br>
+<br>
+               values[0] = invocation_id;<br>
+               values[1] = ctx->i32_1; /* cause the epilog to skip threads */<br>
+               invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);<br>
+       }<br>
+<br>
        /* Return epilog parameters from this function. */<br>
-       LLVMBuilderRef builder = ctx->gallivm.builder;<br>
        LLVMValueRef ret = ctx->return_value;<br>
        unsigned vgpr;<br>
<br>
@@ -2935,6 +2971,9 @@ static void si_llvm_emit_gs_epilogue(<wbr>struct lp_build_tgsi_context *bld_base)<br>
<br>
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,<br>
                         si_get_gs_wave_id(ctx));<br>
+<br>
+       if (ctx->screen->b.chip_class >= GFX9)<br>
+               lp_build_endif(&ctx->merged_<wbr>wrap_if_state);<br>
 }<br>
<br>
 static void si_llvm_emit_vs_epilogue(<wbr>struct lp_build_tgsi_context *bld_base)<br>
@@ -5502,14 +5541,20 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,<br>
        preload_ring_buffers(ctx);<br>
<br>
        /* For GFX9 merged shaders:<br>
-        * - Set EXEC. If the prolog is present, set EXEC there instead.<br>
+        * - Set EXEC for the first shader. If the prolog is present, set<br>
+        *   EXEC there instead.<br>
         * - Add a barrier before the second shader.<br>
+        * - In the second shader, reset EXEC to ~0 and wrap the main part in<br>
+        *   an if-statement. This is required for correctness in geometry<br>
+        *   shaders, to ensure that empty GS waves do not send GS_EMIT and<br>
+        *   GS_CUT messages.<br>
         *<br>
-        * The same thing for monolithic shaders is done in<br>
-        * si_build_wrapper_function.<br>
+        * For monolithic merged shaders, the first shader is wrapped in an<br>
+        * if-block together with its prolog in si_build_wrapper_function.<br>
         */<br>
-       if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {<br>
-               if (sel->info.num_instructions > 1 && /* not empty shader */<br>
+       if (ctx->screen->b.chip_class >= GFX9) {<br>
+               if (!is_monolithic &&<br>
+                   sel->info.num_instructions > 1 && /* not empty shader */<br>
                    (shader->key.as_es || shader->key.as_ls) &&<br>
                    (ctx->type == PIPE_SHADER_TESS_EVAL ||<br>
                     (ctx->type == PIPE_SHADER_VERTEX &&<br>
@@ -5518,9 +5563,19 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,<br>
                                                ctx->param_merged_wave_info, 0);<br>
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||<br>
                           ctx->type == PIPE_SHADER_GEOMETRY) {<br>
-                       si_init_exec_from_input(ctx,<br>
-                                               ctx->param_merged_wave_info, 8);<br>
+                       if (!is_monolithic)<br>
+                               si_init_exec_full_mask(ctx);<br>
+<br>
+                       /* The barrier must execute for all shaders in a<br>
+                        * threadgroup.<br>
+                        */<br>
                        si_llvm_emit_barrier(NULL, bld_base, NULL);<br>
+<br>
+                       LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);<br>
+                       LLVMValueRef ena =<br>
+                               LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,<br>
+                                           ac_get_thread_id(&ctx->ac), num_threads, "");<br>
+                       lp_build_if(&ctx->merged_wrap_<wbr>if_state, &ctx->gallivm, ena);<br>
                }<br>
        }<br>
<br>
@@ -5991,15 +6046,9 @@ static void si_build_wrapper_function(<wbr>struct si_shader_context *ctx,<br>
<br>
                /* Merged shaders are executed conditionally depending<br>
                 * on the number of enabled threads passed in the input SGPRs. */<br>
-               if (is_merged_shader(ctx->shader) &&<br>
-                   (part == 0 || part == next_shader_first_part)) {<br>
+               if (is_merged_shader(ctx->shader) && part == 0) {<br>
                        LLVMValueRef ena, count = initial[3];<br>
<br>
-                       /* The thread count for the 2nd shader is at bit-offset 8. */<br>
-                       if (part == next_shader_first_part) {<br>
-                               count = LLVMBuildLShr(builder, count,<br>
-                                                     LLVMConstInt(ctx->i32, 8, 0), "");<br>
-                       }<br>
                        count = LLVMBuildAnd(builder, count,<br>
                                             LLVMConstInt(ctx->i32, 0x7f, 0), "");<br>
                        ena = LLVMBuildICmp(builder, LLVMIntULT,<br>
@@ -6056,26 +6105,20 @@ static void si_build_wrapper_function(<wbr>struct si_shader_context *ctx,<br>
                ret = LLVMBuildCall(builder, parts[part], in, num_params, "");<br>
<br>
                if (is_merged_shader(ctx->shader) &&<br>
-                   (part + 1 == next_shader_first_part ||<br>
-                    part + 1 == num_parts)) {<br>
+                   part + 1 == next_shader_first_part) {<br>
                        lp_build_endif(&if_state);<br>
<br>
-                       if (part + 1 == next_shader_first_part) {<br>
-                               /* A barrier is required between 2 merged shaders. */<br>
-                               si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);<br>
-<br>
-                               /* The second half of the merged shader should use<br>
-                                * the inputs from the toplevel (wrapper) function,<br>
-                                * not the return value from the last call.<br>
-                                *<br>
-                                * That's because the last call was executed condi-<br>
-                                * tionally, so we can't consume it in the main<br>
-                                * block.<br>
-                                */<br>
-                               memcpy(out, initial, sizeof(initial));<br>
-                               num_out = initial_num_out;<br>
-                               num_out_sgpr = initial_num_out_sgpr;<br>
-                       }<br>
+                       /* The second half of the merged shader should use<br>
+                        * the inputs from the toplevel (wrapper) function,<br>
+                        * not the return value from the last call.<br>
+                        *<br>
+                        * That's because the last call was executed condi-<br>
+                        * tionally, so we can't consume it in the main<br>
+                        * block.<br>
+                        */<br>
+                       memcpy(out, initial, sizeof(initial));<br>
+                       num_out = initial_num_out;<br>
+                       num_out_sgpr = initial_num_out_sgpr;<br>
                        continue;<br>
                }<br>
<br>
diff --git a/src/gallium/drivers/<wbr>radeonsi/si_shader_internal.h b/src/gallium/drivers/<wbr>radeonsi/si_shader_internal.h<br>
index 6e86e0b..6b98bca 100644<br>
--- a/src/gallium/drivers/<wbr>radeonsi/si_shader_internal.h<br>
+++ b/src/gallium/drivers/<wbr>radeonsi/si_shader_internal.h<br>
@@ -25,6 +25,7 @@<br>
 #define SI_SHADER_PRIVATE_H<br>
<br>
 #include "si_shader.h"<br>
+#include "gallivm/lp_bld_flow.h"<br>
 #include "gallivm/lp_bld_init.h"<br>
 #include "gallivm/lp_bld_tgsi.h"<br>
 #include "tgsi/tgsi_parse.h"<br>
@@ -105,6 +106,8 @@ struct si_shader_context {<br>
        unsigned flow_depth;<br>
        unsigned flow_depth_max;<br>
<br>
+       struct lp_build_if_state merged_wrap_if_state;<br>
+<br>
        struct tgsi_array_info *temp_arrays;<br>
        LLVMValueRef *temp_array_allocas;<br>
<span class="HOEnZb"><font color="#888888"><br>
--<br>
2.9.3<br>
<br>
______________________________<wbr>_________________<br>
mesa-stable mailing list<br>
<a href="mailto:mesa-stable@lists.freedesktop.org">mesa-stable@lists.freedesktop.<wbr>org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-stable" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-stable</a><br>
</font></span></blockquote></div><br></div>