Mesa (master): broadcom/compiler: try to fill up delay slots after a branch instruction

Wed Mar 31 06:09:24 UTC 2021

Module: Mesa
Branch: master
Commit: e266e6c634aa04172484a98466f173c1bda9671c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e266e6c634aa04172484a98466f173c1bda9671c

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Mon Mar 29 14:28:14 2021 +0200

broadcom/compiler: try to fill up delay slots after a branch instruction

To do this, we take an approach similar to the one we use for thrsw: we try
to move the branch instruction earlier in the instruction stream, so that the
instructions that preceded it execute in the delay slots of the branch.

Generally, we can move the branch past any instruction, except when:
 - The instruction reads a uniform, since our branches do as well and
   uniforms come from an ordered FIFO stream.
 - The instruction writes flags, since our branch instruction will
   probably read them.
 - Moving the branch would place it in the delay slots of another thread
   switch, branch, or unifa write, which is disallowed.

total instructions in shared programs: 13648140 -> 13613972 (-0.25%)
instructions in affected programs: 2209552 -> 2175384 (-1.55%)
helped: 6765
HURT: 0
Instructions are helped.

total max-temps in shared programs: 2318687 -> 2318436 (-0.01%)
max-temps in affected programs: 5046 -> 4795 (-4.97%)
helped: 152
HURT: 0
Max-temps are helped.

total inst-and-stalls in shared programs: 13680494 -> 13646326 (-0.25%)
inst-and-stalls in affected programs: 2220394 -> 2186226 (-1.54%)
helped: 6765
HURT: 0
Inst-and-stalls are helped.

total nops in shared programs: 399818 -> 365640 (-8.55%)
nops in affected programs: 127311 -> 93133 (-26.85%)
helped: 6765
HURT: 0
Nops are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9918>

---

 src/broadcom/compiler/qpu_schedule.c | 137 ++++++++++++++++++++++++++++++-----
 1 file changed, 119 insertions(+), 18 deletions(-)

diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index e6a07723618..3dd5d246a45 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -490,6 +490,7 @@ struct choose_scoreboard {
         int last_unifa_write_tick;
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
+        int last_branch_tick;
         bool tlb_locked;
         bool fixup_ldvary;
         int ldvary_count;
@@ -1078,6 +1079,16 @@ retry:
                         continue;
                 }
 
+                /* Don't try to put a branch in the delay slots of another
+                 * branch or a unifa write.
+                 */
+                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
+                                continue;
+                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
+                                continue;
+                }
+
                 /* If we're trying to pair with another instruction, check
                  * that they're compatible.
                  */
@@ -1674,11 +1685,17 @@ emit_thrsw(struct v3d_compile *c,
         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
 
-        /* Don't try to emit a thrsw in the delay slots of a previous thrsw */
+        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
+         * or branch.
+         */
         while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
                 emit_nop(c, block, scoreboard);
                 time++;
         }
+        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
+                emit_nop(c, block, scoreboard);
+                time++;
+        }
 
         /* Find how far back into previous instructions we can put the THRSW. */
         int slots_filled = 0;
@@ -1745,6 +1762,97 @@ emit_thrsw(struct v3d_compile *c,
         return time;
 }
 
+static bool
+qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
+{
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return false;
+
+        if (inst->qpu.sig.thrsw)
+                return false;
+
+        if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
+                return false;
+
+        if (vir_has_uniform(inst))
+                return false;
+
+        return true;
+}
+
+static void
+emit_branch(struct v3d_compile *c,
+           struct qblock *block,
+           struct choose_scoreboard *scoreboard,
+           struct qinst *inst)
+{
+        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+
+        /* We shouldn't have picked up a branch for the delay slots of a
+         * previous thrsw, branch or unifa write instruction.
+         */
+        int branch_tick = scoreboard->tick;
+        assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
+        assert(scoreboard->last_branch_tick + 3 < branch_tick);
+        assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
+
+        /* Insert the branch instruction */
+        insert_scheduled_instruction(c, block, scoreboard, inst);
+
+        /* Now see if we can move the branch instruction back into the
+         * instruction stream to fill its delay slots
+         */
+        int slots_filled = 0;
+        while (slots_filled < 3 && block->instructions.next != &inst->link) {
+                struct qinst *prev_inst = (struct qinst *) inst->link.prev;
+                assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
+
+                /* Can't move the branch instruction if that would place it
+                 * in the delay slots of other instructions.
+                 */
+                if (scoreboard->last_branch_tick + 3 >=
+                    branch_tick - slots_filled - 1) {
+                        break;
+                }
+
+                if (scoreboard->last_thrsw_tick + 2 >=
+                    branch_tick - slots_filled - 1) {
+                        break;
+                }
+
+                if (scoreboard->last_unifa_write_tick + 3 >=
+                    branch_tick - slots_filled - 1) {
+                        break;
+                }
+
+                /* Can't move a conditional branch before the instruction
+                 * that writes the flags for its condition.
+                 */
+                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
+                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
+                        break;
+                }
+
+                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
+                        break;
+
+                list_del(&prev_inst->link);
+                list_add(&prev_inst->link, &inst->link);
+                slots_filled++;
+        }
+
+        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
+        scoreboard->last_branch_tick = branch_tick - slots_filled;
+
+        /* Fill any remaining delay slots.
+         *
+         * FIXME: For unconditional branches we could fill these with the
+         * first instructions in the successor block.
+         */
+        for (int i = 0; i < 3 - slots_filled; i++)
+                emit_nop(c, block, scoreboard);
+}
+
 static bool
 alu_reads_register(struct v3d_qpu_instr *inst,
                    bool add, bool magic, uint32_t index)
@@ -2025,23 +2133,11 @@ schedule_instructions(struct v3d_compile *c,
 
                 if (inst->sig.thrsw) {
                         time += emit_thrsw(c, block, scoreboard, qinst, false);
+                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                        emit_branch(c, block, scoreboard, qinst);
                 } else {
                         insert_scheduled_instruction(c, block,
                                                      scoreboard, qinst);
-
-                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
-                                block->branch_qpu_ip = c->qpu_inst_count - 1;
-                                /* Fill the delay slots.
-                                 *
-                                 * We should fill these with actual instructions,
-                                 * instead, but that will probably need to be done
-                                 * after this, once we know what the leading
-                                 * instructions of the successors are (so we can
-                                 * handle A/B register file write latency)
-                                 */
-                                for (int i = 0; i < 3; i++)
-                                        emit_nop(c, block, scoreboard);
-                        }
                 }
         }
 
@@ -2111,11 +2207,15 @@ qpu_set_branch_targets(struct v3d_compile *c)
                 /* Walk back through the delay slots to find the branch
                  * instr.
                  */
+                struct qinst *branch = NULL;
                 struct list_head *entry = block->instructions.prev;
-                for (int i = 0; i < 3; i++)
+                for (int i = 0; i < 3; i++) {
                         entry = entry->prev;
-                struct qinst *branch = container_of(entry, struct qinst, link);
-                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+                        branch = container_of(entry, struct qinst, link);
+                        if (branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                                break;
+                }
+                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
 
                 /* Make sure that the if-we-don't-jump
                  * successor was scheduled just after the
@@ -2169,6 +2269,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
         scoreboard.last_magic_sfu_write_tick = -10;
         scoreboard.last_uniforms_reset_tick = -10;
         scoreboard.last_thrsw_tick = -10;
+        scoreboard.last_branch_tick = -10;
         scoreboard.last_stallable_sfu_tick = -10;
 
         if (debug) {