Mesa (master): broadcom/compiler: simplify ldvary pipelining

Wed Mar 10 08:06:19 UTC 2021

Module: Mesa
Branch: master
Commit: 947e9e42cc27481adc9a8626bbc9d5f8c15ad4c3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=947e9e42cc27481adc9a8626bbc9d5f8c15ad4c3

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Fri Mar  5 12:36:51 2021 +0100

broadcom/compiler: simplify ldvary pipelining

We get optimal ldvary pipelining by doing the following:

1) Carefully merge a paired ldvary into the previous instruction when
   possible.
2) When the above succeeds, flag the ldvary as scheduled immediately so
   we can merge one of its children into the current instruction.
3) When scheduling ldvary sequences, only pick up instructions that are
   part of the sequence to avoid picking up something that prevents
   successful pipelining.

This patch skips 3), accepting some hurt shaders in exchange for better
scheduling flexibility during ldvary sequences. Besides eliminating most
of the code dedicated to special handling of ldvary sequences, this also
usually allows us to produce better code by merging instructions that are
unrelated to ldvary sequences into the ldvary sequences, which is
particularly effective to fill up the gaps produced when scheduling the
first and last ldvary sequences as well as the gaps produced by flat
and noperspective varying sequences that don't have both mul and add
instructions.

Notice that there are some hurt shaders, because sometimes the extra
scheduler flexibility can lead to picking up instructions that will
break a sequence without compensating for that, typically an ldunif
that prevents us from doing the fixup for a follow-up ldvary. We will
try to correct some of these cases with the next patch.

total instructions in shared programs: 13786037 -> 13760415 (-0.19%)
instructions in affected programs: 3201387 -> 3175765 (-0.80%)
helped: 16155
HURT: 4146
Instructions are helped.

total max-temps in shared programs: 2324834 -> 2322991 (-0.08%)
max-temps in affected programs: 22160 -> 20317 (-8.32%)
helped: 1340
HURT: 103
Max-temps are helped.

total sfu-stalls in shared programs: 30685 -> 31827 (3.72%)
sfu-stalls in affected programs: 782 -> 1924 (146.04%)
helped: 253
HURT: 1416
Inconclusive result.

total inst-and-stalls in shared programs: 13816722 -> 13792242 (-0.18%)
inst-and-stalls in affected programs: 3171642 -> 3147162 (-0.77%)
helped: 15331
HURT: 4179
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9471>

---

 src/broadcom/compiler/nir_to_vir.c   | 19 ++--------
 src/broadcom/compiler/qpu_schedule.c | 72 +++---------------------------------
 src/broadcom/compiler/v3d_compiler.h |  3 --
 3 files changed, 8 insertions(+), 86 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 3e13835100d..96bfd86e475 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -921,30 +921,18 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
         c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
 }
 
-static struct qreg
-ldvary_sequence_inst(struct v3d_compile *c, struct qreg result)
-{
-        struct qinst *producer =
-                   (struct qinst *) c->cur_block->instructions.prev;
-        assert(producer);
-        producer->is_ldvary_sequence = true;
-        return result;
-}
-
 static struct qreg
 emit_smooth_varying(struct v3d_compile *c,
                     struct qreg vary, struct qreg w, struct qreg r5)
 {
-        return ldvary_sequence_inst(c, vir_FADD(c,
-               ldvary_sequence_inst(c, vir_FMUL(c, vary, w)), r5));
+        return vir_FADD(c, vir_FMUL(c, vary, w), r5);
 }
 
 static struct qreg
 emit_noperspective_varying(struct v3d_compile *c,
                            struct qreg vary, struct qreg r5)
 {
-        return ldvary_sequence_inst(c, vir_FADD(c,
-               ldvary_sequence_inst(c, vir_MOV(c, vary)), r5));
+        return vir_FADD(c, vir_MOV(c, vary), r5);
 }
 
 static struct qreg
@@ -952,7 +940,7 @@ emit_flat_varying(struct v3d_compile *c,
                   struct qreg vary, struct qreg r5)
 {
         vir_MOV_dest(c, c->undef, vary);
-        return ldvary_sequence_inst(c, vir_MOV(c, r5));
+        return vir_MOV(c, r5);
 }
 
 static struct qreg
@@ -968,7 +956,6 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                       c->undef, c->undef);
                 ldvary->qpu.sig.ldvary = true;
-                ldvary->is_ldvary_sequence = true;
                 vary = vir_emit_def(c, ldvary);
         } else {
                 vir_NOP(c)->qpu.sig.ldvary = true;
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 7f6ac5af0b4..092b9252f83 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -459,9 +459,7 @@ struct choose_scoreboard {
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
         bool tlb_locked;
-        bool ldvary_pipelining;
         bool fixup_ldvary;
-        int ldvary_count;
 };
 
 static bool
@@ -893,14 +891,6 @@ choose_instruction_to_schedule(struct v3d_compile *c,
 
         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                             dag.link) {
-                /* If we are scheduling a pipelined varying sequence then
-                 * we want to pick up the next instruction in the sequence.
-                 */
-                if (scoreboard->ldvary_pipelining &&
-                    !n->inst->is_ldvary_sequence) {
-                        continue;
-                }
-
                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
 
 
@@ -991,17 +981,6 @@ choose_instruction_to_schedule(struct v3d_compile *c,
                                             &prev_inst->inst->qpu, inst)) {
                                 continue;
                         }
-
-                        /* If we find an ldvary inside an ongoing pipelineable
-                         * ldvary sequence we want to pick that and start
-                         * pipelining the new sequence into the previous one.
-                         */
-                        if (scoreboard->ldvary_pipelining && inst->sig.ldvary) {
-                                assert(n->inst->is_ldvary_sequence);
-                                scoreboard->ldvary_count++;
-                                scoreboard->fixup_ldvary = true;
-                                return n;
-                        }
                 }
 
                 int prio = get_instruction_priority(c->devinfo, inst);
@@ -1042,51 +1021,11 @@ choose_instruction_to_schedule(struct v3d_compile *c,
                 }
         }
 
-        /* Update ldvary pipelining state */
-        if (chosen) {
-                if (chosen->inst->qpu.sig.ldvary &&
-                    chosen->inst->is_ldvary_sequence) {
-                        scoreboard->ldvary_pipelining =
-                            c->num_inputs > ++scoreboard->ldvary_count;
-                }
-        } else if (scoreboard->ldvary_pipelining) {
-                /* If we are in the middle of an ldvary sequence we only pick
-                 * up instructions that can continue the sequence so we can
-                 * pipeline them, however, if we failed to find anything to
-                 * schedule (!prev_inst) then we can't possibly continue the
-                 * sequence and we need to stop the pipelining process and try
-                 * again.
-                 *
-                 * There is one exception to the above: noperspective or flat
-                 * varyings can cause us to not be able to pick an instruction
-                 * because they need a nop between the ldvary and the next
-                 * instruction to account for the ldvary r5 write latency. We
-                 * can try to detect this by checking if we are also unable to
-                 * schedule an instruction after disabling pipelining.
-                 *
-                 * FIXME: dropping pipelining and picking up another instruction
-                 * could break the sequence for flat/noperspective varyings we
-                 * could've been able to continue if we returned NULL here and
-                 * scheduled a NOP as a result, but detecting this case would
-                 * require us to know in advance that emitting the next NOP will
-                 * guarantee that we will be able to continue the sequence.
-                 *
-                 * If we failed to pair up (prev_inst != NULL), then we disable
-                 * pipelining if we have already scheduled the last ldvary. This
-                 * may allow any other instruction that is not part of an ldvary
-                 * sequence to be merged into the last instruction of the last
-                 * ldvary sequence for optimal results.
-                 */
-                if (!prev_inst) {
-                        scoreboard->ldvary_pipelining = false;
-                        chosen = choose_instruction_to_schedule(c, scoreboard,
-                                                                prev_inst);
-                        scoreboard->ldvary_pipelining = !chosen;
-                } else {
-                        scoreboard->ldvary_pipelining =
-                                c->num_inputs > scoreboard->ldvary_count;
-                }
-        }
+        /* If we are pairing an ldvary, flag it so we can fix it up for optimal
+         * pipelining of ldvary sequences.
+         */
+        if (prev_inst && chosen && chosen->inst->qpu.sig.ldvary)
+                scoreboard->fixup_ldvary = true;
 
         return chosen;
 }
@@ -1741,7 +1680,6 @@ schedule_instructions(struct v3d_compile *c,
                                 }
 
                                 if (scoreboard->fixup_ldvary) {
-                                        assert(scoreboard->ldvary_pipelining);
                                         scoreboard->fixup_ldvary = false;
                                         if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                 /* Flag the ldvary as scheduled
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index fafdf5a208f..12fbb64841f 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -162,9 +162,6 @@ struct qinst {
          * otherwise.
          */
         int uniform;
-
-        /* Set if this instruction participates in a varying setup. */
-        bool is_ldvary_sequence;
 };
 
 enum quniform_contents {