Mesa (master): broadcom/compiler: pipeline smooth ldvary sequences

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Mar 2 07:17:53 UTC 2021


Module: Mesa
Branch: master
Commit: 1784dd22a32dccff0fee0428f7cf7fea8dccc574
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=1784dd22a32dccff0fee0428f7cf7fea8dccc574

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Fri Feb 26 12:31:52 2021 +0100

broadcom/compiler: pipeline smooth ldvary sequences

Typically, we would schedule smooth varyings like this:

nop                  ; nop               ; ldvary.r4
nop                  ; fmul  r0, r4, rf0
fadd  rf13, r0, r5   ; nop               ; ldvary.r1
nop                  ; fmul  r2, r1, rf0
fadd  rf12, r2, r5   ; nop               ; ldvary.r3
nop                  ; fmul  r4, r3, rf0
fadd  rf11, r4, r5   ; nop               ; ldvary.r0

where we pair up an ldvary with the fadd of the previous sequence
instead of the fmul. This is because ldvary has an implicit write to
r5 which is read by the fadd of the previous sequence, so our
dependency tracking doesn't allow us to move the ldvary before that
fadd. However, the r5 write of an ldvary actually lands one
instruction after the one that emits the signal, so we can move the
ldvary up to the fmul and its r5 write still happens in the same
instruction as the fadd, which is fine.
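
To make that timing concrete, here is an illustrative sketch
(hypothetical helpers, not part of the patch) of the one-tick delay on
the implicit r5 write that makes the move legal:

#include <stdbool.h>

/* ldvary's implicit r5 update lands one tick after the instruction
 * carrying the signal.
 */
static int
ldvary_r5_write_tick(int ldvary_tick)
{
        return ldvary_tick + 1;
}

/* Originally the ldvary shares a tick with the fadd that reads r5, so
 * the write lands one tick later.  Moving the signal up one tick puts
 * the write in the fadd's own tick, which (per the reasoning above) is
 * still fine.
 */
static bool
move_up_keeps_r5_write_at_fadd(int fadd_tick)
{
        return ldvary_r5_write_tick(fadd_tick - 1) == fadd_tick;
}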

This patch allows us to pipeline these sequences optimally. To do so,
after merging an ldvary into a previous instruction in the middle of
a pipelineable ldvary sequence, we check whether we can manually move
it up to the last scheduled instruction instead (the one before the
instruction we are currently scheduling).

If we are successful at moving the ldvary to the previous instruction,
we flag the ldvary as scheduled immediately, which may promote its
children (the follow-up fmul instruction for that ldvary) to DAG
heads, and we continue the merge loop so that the fmul can be picked
and merged into the final fadd of the previous sequence (where we had
originally merged the ldvary); a sketch of this head promotion follows
the listing below. This leads to a result that looks like this:

nop                  ; nop               ; ldvary.r4
nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ; ldvary.r3
fadd  rf12, r2, r5   ; fmul  r4, r3, rf0 ; ldvary.r0
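
As a minimal illustration of the DAG-head promotion mentioned above
(hypothetical types and names; the real scheduler uses Mesa's util/dag
helpers), marking a node scheduled makes any child whose last
unscheduled parent it was become a head, i.e. a candidate for the next
pick or merge:

struct sketch_node {
        int unscheduled_parents;
        struct sketch_node **children;
        int num_children;
};

static void
sketch_mark_scheduled(struct sketch_node *n,
                      struct sketch_node **heads, int *num_heads)
{
        for (int i = 0; i < n->num_children; i++) {
                struct sketch_node *child = n->children[i];
                /* Last unscheduled parent gone: child is now a head. */
                if (--child->unscheduled_parents == 0)
                        heads[(*num_heads)++] = child;
        }
}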

Shader-db results:

total instructions in shared programs: 14071591 -> 13820690 (-1.78%)
instructions in affected programs: 7809692 -> 7558791 (-3.21%)
helped: 41209
HURT: 4528
Instructions are helped.

total max-temps in shared programs: 2335784 -> 2326435 (-0.40%)
max-temps in affected programs: 84302 -> 74953 (-11.09%)
helped: 4561
HURT: 293
Max-temps are helped.

total sfu-stalls in shared programs: 31537 -> 30683 (-2.71%)
sfu-stalls in affected programs: 3551 -> 2697 (-24.05%)
helped: 1713
HURT: 750
Sfu-stalls are helped.

total inst-and-stalls in shared programs: 14103128 -> 13851373 (-1.79%)
inst-and-stalls in affected programs: 7820726 -> 7568971 (-3.22%)
helped: 41411
HURT: 4535
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9304>

---

 src/broadcom/compiler/qpu_schedule.c | 205 ++++++++++++++++++++++++++++++++++-
 1 file changed, 199 insertions(+), 6 deletions(-)

diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 4d23a80c15a..571a89fb7be 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -459,6 +459,8 @@ struct choose_scoreboard {
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
         bool tlb_locked;
+        bool ldvary_pipelining;
+        bool fixup_ldvary;
 };
 
 static bool
@@ -890,6 +892,20 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
 
         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                             dag.link) {
+                /* If we are scheduling a pipelined smooth varying sequence then
+                 * we want to pick up the next instruction in the sequence.
+                 */
+                if (scoreboard->ldvary_pipelining &&
+                    !n->inst->ldvary_pipelining) {
+                        continue;
+                }
+
+                /* Sanity check: if we are scheduling a smooth ldvary sequence
+                 * we cannot be starting another sequence in the middle of it.
+                 */
+                assert(!scoreboard->ldvary_pipelining ||
+                       !n->inst->ldvary_pipelining_start);
+
                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
 
                 /* Simulator complains if we have two uniforms loaded in the
@@ -946,12 +962,6 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                  * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
                  * r5 in the same tick.
-                 *
-                 * XXX perf: To get good pipelining of a sequence of varying
-                 * loads, we need to figure out how to pair the ldvary signal
-                 * up to the instruction before the last r5 user in the
-                 * previous ldvary sequence.  Currently, it usually pairs with
-                 * the last r5 user.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -985,6 +995,16 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                                             &prev_inst->inst->qpu, inst)) {
                                 continue;
                         }
+
+                        /* If we find an ldvary inside an ongoing pipelineable
+                         * ldvary sequence we want to pick that and start
+                         * pipelining the new sequence into the previous one.
+                         */
+                        if (scoreboard->ldvary_pipelining && inst->sig.ldvary) {
+                                assert(n->inst->ldvary_pipelining);
+                                scoreboard->fixup_ldvary = true;
+                                return n;
+                        }
                 }
 
                 int prio = get_instruction_priority(devinfo, inst);
@@ -1025,6 +1045,26 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                 }
         }
 
+        /* If we are in the middle of an ldvary sequence we only pick up
+         * instructions that can continue the sequence so we can pipeline
+         * them.  However, if we failed to find anything to schedule, then
+         * we can't possibly continue the sequence, so we stop the
+         * pipelining process and try again.
+         */
+        if (scoreboard->ldvary_pipelining && !prev_inst && !chosen) {
+                scoreboard->ldvary_pipelining = false;
+                chosen = choose_instruction_to_schedule(devinfo, scoreboard, prev_inst);
+        } else if (chosen) {
+                if (scoreboard->ldvary_pipelining) {
+                        assert(chosen->inst->ldvary_pipelining);
+                        if (chosen->inst->ldvary_pipelining_end)
+                                scoreboard->ldvary_pipelining = false;
+                } else if (chosen->inst->ldvary_pipelining_start) {
+                        assert(chosen->inst->qpu.sig.ldvary);
+                        scoreboard->ldvary_pipelining = true;
+                }
+        }
+
         return chosen;
 }
 
@@ -1460,6 +1500,144 @@ emit_thrsw(struct v3d_compile *c,
         return time;
 }
 
+static bool
+alu_reads_register(struct v3d_qpu_instr *inst,
+                   bool add, bool magic, uint32_t index)
+{
+        uint32_t num_src;
+        enum v3d_qpu_mux mux_a, mux_b;
+
+        if (add) {
+                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
+                mux_a = inst->alu.add.a;
+                mux_b = inst->alu.add.b;
+        } else {
+                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+                mux_a = inst->alu.mul.a;
+                mux_b = inst->alu.mul.b;
+        }
+
+        for (int i = 0; i < num_src; i++) {
+                if (magic) {
+                        if (i == 0 && mux_a == index)
+                                return true;
+                        if (i == 1 && mux_b == index)
+                                return true;
+                } else {
+                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+                            inst->raddr_a == index) {
+                                return true;
+                        }
+                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+                            inst->raddr_b == index) {
+                                return true;
+                        }
+                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+                            inst->raddr_a == index) {
+                                return true;
+                        }
+                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+                            inst->raddr_b == index) {
+                                return true;
+                        }
+                }
+        }
+
+        return false;
+}
+
+/**
+ * This takes an ldvary signal merged into 'inst' and tries to move it up to
+ * the previous instruction to get good pipelining of ldvary sequences,
+ * transforming this:
+ *
+ * nop                  ; nop               ; ldvary.r4
+ * nop                  ; fmul  r0, r4, rf0 ;
+ * fadd  rf13, r0, r5   ; nop               ; ldvary.r1  <-- inst
+ *
+ * into:
+ *
+ * nop                  ; nop               ; ldvary.r4
+ * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
+ * fadd  rf13, r0, r5   ; nop               ;            <-- inst
+ *
+ * If we manage to do this successfully (we return true here), then flagging
+ * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
+ * we will be able to pick up to merge into 'inst', leading to code like this:
+ *
+ * nop                  ; nop               ; ldvary.r4
+ * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
+ * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
+ */
+static bool
+fixup_pipelined_ldvary(struct v3d_compile *c,
+                       struct choose_scoreboard *scoreboard,
+                       struct qblock *block,
+                       struct v3d_qpu_instr *inst)
+{
+        /* We only call this if we have successfully merged an ldvary into a
+         * previous instruction.
+         */
+        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+        assert(inst->sig.ldvary);
+        uint32_t ldvary_magic = inst->sig_magic;
+        uint32_t ldvary_index = inst->sig_addr;
+
+        /* The instruction in which we merged the ldvary cannot read
+         * the ldvary destination: if it did, moving the ldvary before
+         * it would overwrite the value being read.
+         */
+        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+                return false;
+        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+                return false;
+
+        /* The previous instruction can't write to the same destination as the
+         * ldvary.
+         */
+        struct qinst *prev = (struct qinst *) block->instructions.prev;
+        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                return false;
+
+        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
+                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
+                    prev->qpu.alu.add.waddr == ldvary_index) {
+                        return false;
+                }
+        }
+
+        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
+                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
+                    prev->qpu.alu.mul.waddr == ldvary_index) {
+                        return false;
+                }
+        }
+
+        /* The previous instruction cannot have a conflicting signal */
+        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+                return false;
+
+        /* The previous instruction cannot use flags since ldvary uses the
+         * 'cond' instruction field to store the destination.
+         */
+        if (v3d_qpu_writes_flags(&prev->qpu))
+                return false;
+
+        /* Move the ldvary to the previous instruction and remove it from the
+         * current one.
+         */
+        prev->qpu.sig.ldvary = true;
+        prev->qpu.sig_magic = ldvary_magic;
+        prev->qpu.sig_addr = ldvary_index;
+        scoreboard->last_ldvary_tick = scoreboard->tick - 1;
+
+        inst->sig.ldvary = false;
+        inst->sig_magic = false;
+        inst->sig_addr = 0;
+
+        return true;
+}
+
 static uint32_t
 schedule_instructions(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
@@ -1530,6 +1708,21 @@ schedule_instructions(struct v3d_compile *c,
                                         v3d_qpu_dump(devinfo, inst);
                                         fprintf(stderr, "\n");
                                 }
+
+                                if (scoreboard->fixup_ldvary) {
+                                        assert(scoreboard->ldvary_pipelining);
+                                        scoreboard->fixup_ldvary = false;
+                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
+                                                /* Flag the ldvary as scheduled
+                                                 * now so we can try to merge the
+                                                 * follow-up fmul into the current
+                                                 * instruction.
+                                                 */
+                                                mark_instruction_scheduled(
+                                                        devinfo, scoreboard->dag,
+                                                        time, merge);
+                                        }
+                                }
                         }
                         if (mux_read_stalls(scoreboard, inst))
                                 c->qpu_inst_stalled_count++;


