Mesa (main): pan/bi: Lower fragment output with <4 components

Thu Jul 29 23:53:26 UTC 2021

Module: Mesa
Branch: main
Commit: c4f8b52e064b393765615ef1d79697604a354925
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c4f8b52e064b393765615ef1d79697604a354925

Author: Alyssa Rosenzweig <alyssa at collabora.com>
Date:   Tue Jul 27 19:46:24 2021 -0400

pan/bi: Lower fragment output with <4 components

This avoids undefined behaviour in the shader, which will fail
validation added later in the series. shader-db results are neglible --
the extra moves required in a few cases are cancelled out by the extra
moves eliminated by allowing register allocation to work properly.

total instructions in shared programs: 146903 -> 146907 (<.01%)
instructions in affected programs: 33 -> 37 (12.12%)
helped: 0
HURT: 1

total tuples in shared programs: 123616 -> 123613 (<.01%)
tuples in affected programs: 764 -> 761 (-0.39%)
helped: 6
HURT: 4
helped stats (abs) min: 1.0 max: 4.0 x̄: 1.67 x̃: 1
helped stats (rel) min: 0.54% max: 5.88% x̄: 2.64% x̃: 1.86%
HURT stats (abs)   min: 1.0 max: 2.0 x̄: 1.75 x̃: 2
HURT stats (rel)   min: 4.55% max: 13.33% x̄: 8.57% x̃: 8.19%
95% mean confidence interval for tuples value: -1.73 1.13
95% mean confidence interval for tuples %-change: -2.72% 6.41%
Inconclusive result (value mean confidence interval includes 0).

total clauses in shared programs: 25656 -> 25654 (<.01%)
clauses in affected programs: 43 -> 41 (-4.65%)
helped: 2
HURT: 1
helped stats (abs) min: 1.0 max: 2.0 x̄: 1.50 x̃: 1
helped stats (rel) min: 6.25% max: 12.50% x̄: 9.38% x̃: 9.38%
HURT stats (abs)   min: 1.0 max: 1.0 x̄: 1.00 x̃: 1
HURT stats (rel)   min: 33.33% max: 33.33% x̄: 33.33% x̃: 33.33%

total cycles in shared programs: 12114.21 -> 12114.12 (<.01%)
cycles in affected programs: 27.42 -> 27.33 (-0.30%)
helped: 4
HURT: 3
helped stats (abs) min: 0.04166700000000034 max: 0.08333299999999966 x̄: 0.06 x̃: 0
helped stats (rel) min: 0.57% max: 1.59% x̄: 1.02% x̃: 0.96%
HURT stats (abs)   min: 0.0416669999999999 max: 0.08333299999999999 x̄: 0.06 x̃: 0
HURT stats (rel)   min: 4.17% max: 16.67% x̄: 8.80% x̃: 5.56%
95% mean confidence interval for cycles value: -0.07 0.05
95% mean confidence interval for cycles %-change: -2.90% 9.27%
Inconclusive result (value mean confidence interval includes 0).

total arith in shared programs: 4601.08 -> 4601.04 (<.01%)
arith in affected programs: 29 -> 28.96 (-0.14%)
helped: 6
HURT: 4
helped stats (abs) min: 0.04166700000000001 max: 0.08333299999999966 x̄: 0.06 x̃: 0
helped stats (rel) min: 0.57% max: 10.00% x̄: 3.63% x̃: 1.39%
HURT stats (abs)   min: 0.04166700000000001 max: 0.08333399999999991 x̄: 0.07 x̃: 0
HURT stats (rel)   min: 5.56% max: 16.67% x̄: 10.85% x̃: 10.60%
95% mean confidence interval for arith value: -0.05 0.05
95% mean confidence interval for arith %-change: -3.95% 8.28%
Inconclusive result (value mean confidence interval includes 0).

total quadwords in shared programs: 110008 -> 110002 (<.01%)
quadwords in affected programs: 1090 -> 1084 (-0.55%)
helped: 11
HURT: 8
helped stats (abs) min: 1.0 max: 7.0 x̄: 2.18 x̃: 1
helped stats (rel) min: 0.61% max: 13.16% x̄: 4.07% x̃: 1.82%
HURT stats (abs)   min: 1.0 max: 6.0 x̄: 2.25 x̃: 1
HURT stats (rel)   min: 3.70% max: 42.86% x̄: 12.55% x̃: 7.50%
95% mean confidence interval for quadwords value: -1.76 1.13
95% mean confidence interval for quadwords %-change: -2.95% 8.81%
Inconclusive result (value mean confidence interval includes 0).

Signed-off-by: Alyssa Rosenzweig <alyssa at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12130>

---

 src/panfrost/bifrost/bifrost_compile.c | 63 ++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index a386999c507..cc6bb8741fb 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -3179,6 +3179,62 @@ nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
         return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
 }
 
+/* Ensure we write exactly 4 components */
+static nir_ssa_def *
+bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
+                          unsigned channel, unsigned first, unsigned mask)
+{
+        if (!(mask & BITFIELD_BIT(channel)))
+                channel = first;
+
+        return nir_channel(b, in, channel);
+}
+
+/* Lower fragment store_output instructions to always write 4 components,
+ * matching the hardware semantic. This may require additional moves. Skipping
+ * these moves is possible in theory, but invokes undefined behaviour in the
+ * compiler. The DDK inserts these moves, so we will as well. */
+
+static bool
+bifrost_nir_lower_blend_components(struct nir_builder *b,
+                                   nir_instr *instr, void *data)
+{
+        if (instr->type != nir_instr_type_intrinsic)
+                return false;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+        if (intr->intrinsic != nir_intrinsic_store_output)
+                return false;
+
+        nir_ssa_def *in = intr->src[0].ssa;
+        unsigned first = nir_intrinsic_component(intr);
+        unsigned mask = nir_intrinsic_write_mask(intr);
+
+        assert(first == 0 && "shouldn't get nonzero components");
+
+        /* Nothing to do */
+        if (mask == BITFIELD_MASK(4))
+                return false;
+
+        b->cursor = nir_before_instr(&intr->instr);
+
+        /* Replicate the first valid component instead */
+        nir_ssa_def *replicated =
+                nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
+                            bifrost_nir_valid_channel(b, in, 1, first, mask),
+                            bifrost_nir_valid_channel(b, in, 2, first, mask),
+                            bifrost_nir_valid_channel(b, in, 3, first, mask));
+
+        /* Rewrite to use our replicated version */
+        nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
+        nir_intrinsic_set_component(intr, 0);
+        nir_intrinsic_set_write_mask(intr, 0xF);
+        intr->num_components = 4;
+
+        return true;
+}
+
 static void
 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
 {
@@ -3281,6 +3337,13 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
         NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);
         NIR_PASS(progress, nir, nir_opt_dce);
 
+        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+                NIR_PASS_V(nir, nir_shader_instructions_pass,
+                           bifrost_nir_lower_blend_components,
+                           nir_metadata_block_index | nir_metadata_dominance,
+                           NULL);
+        }
+
         /* Backend scheduler is purely local, so do some global optimizations
          * to reduce register pressure. */
         nir_move_options move_all =