Mesa (main): ir3/sched: Rewrite delay handling

Wed Nov 17 14:16:15 UTC 2021

Module: Mesa
Branch: main
Commit: d9a91318b1cee4d3c0a4b2b02c5c2968f3bdae31
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d9a91318b1cee4d3c0a4b2b02c5c2968f3bdae31

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Fri Oct 29 22:56:05 2021 +0200

ir3/sched: Rewrite delay handling

The old code walked the instructions between each ready instruction and
each of its parents for every instruction, which can quickly become
accidentally quadratic. Instead we keep track of the current
"instruction pointer" of the to-be-scheduled instruction, and for each
ready instruction calculate an "earliest possible IP" which is the IP
that needs to be reached before we can schedule it. Because this stays
constant as soon as an instruction becomes ready, we never have to
recompute it and each call to ir3_delay_calc_prera() becomes a simple
comparison and subtract. We only need to iterate over the children and
update their earliest_ip when scheduling an instruction, and we already
do that in util_day_prune_head() so it should be cheap.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13722>

---

 src/freedreno/ir3/ir3.h       |  2 --
 src/freedreno/ir3/ir3_delay.c | 68 -------------------------------------------
 src/freedreno/ir3/ir3_sched.c | 52 ++++++++++++++++++++++++++++-----
 3 files changed, 44 insertions(+), 78 deletions(-)

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 5cdf85d2970..0c3b54d7ba9 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1632,8 +1632,6 @@ void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *i
 /* delay calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
                    struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delay_calc_prera(struct ir3_block *block,
-                              struct ir3_instruction *instr);
 unsigned ir3_delay_calc_postra(struct ir3_block *block,
                                struct ir3_instruction *instr, bool soft,
                                bool mergedregs);
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index d06347f85c6..8a5a57dc756 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -119,74 +119,6 @@ count_instruction(struct ir3_instruction *n)
           (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
 }
 
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
-{
-   unsigned d = 0;
-
-   /* Note that this relies on incrementally building up the block's
-    * instruction list.. but this is how scheduling and nopsched
-    * work.
-    */
-   foreach_instr_rev (n, &block->instr_list) {
-      if ((n == instr) || (d >= maxd))
-         return MIN2(maxd, d + n->nop);
-      if (count_instruction(n))
-         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
-   }
-
-   return maxd;
-}
-
-static unsigned
-delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
-                      struct ir3_instruction *consumer, unsigned srcn)
-{
-   unsigned delay = 0;
-
-   if (assigner->opc == OPC_META_PHI)
-      return 0;
-
-   if (is_meta(assigner)) {
-      foreach_src_n (src, n, assigner) {
-         unsigned d;
-
-         if (!src->def)
-            continue;
-
-         d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
-         delay = MAX2(delay, d);
-      }
-   } else {
-      delay = ir3_delayslots(assigner, consumer, srcn, false);
-      delay -= distance(block, assigner, delay);
-   }
-
-   return delay;
-}
-
-/**
- * Calculate delay for instruction before register allocation, using SSA
- * source pointers. This can't handle inter-block dependencies.
- */
-unsigned
-ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
-{
-   unsigned delay = 0;
-
-   foreach_src_n (src, i, instr) {
-      unsigned d = 0;
-
-      if (src->def && src->def->instr->block == block) {
-         d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
-      }
-
-      delay = MAX2(delay, d);
-   }
-
-   return delay;
-}
-
 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
  * and have to handle relative accesses specially.
  */
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 6be356d17b0..a3ad5b93b7a 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -106,6 +106,8 @@ struct ir3_sched_ctx {
 
    bool error;
 
+   unsigned ip;
+
    int sfu_delay;
    int tex_delay;
 
@@ -128,6 +130,11 @@ struct ir3_sched_node {
    unsigned tex_index;
    unsigned sfu_index;
 
+   /* For ready instructions, the earliest possible ip that it could be
+    * scheduled.
+    */
+   unsigned earliest_ip;
+
    /* For instructions that are a meta:collect src, once we schedule
     * the first src of the collect, the entire vecN is live (at least
     * from the PoV of the first RA pass.. the 2nd scalar pass can fill
@@ -302,6 +309,23 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
       }
    }
 
+   bool counts_for_delay = is_alu(instr) || is_flow(instr);
+
+   /* TODO: switch to "cycles". For now try to match ir3_delay. */
+   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;
+
+   /* We insert any nop's needed to get to earliest_ip, then advance
+    * delay_cycles by scheduling the instruction.
+    */
+   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;
+
+   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+      unsigned delay = (unsigned)(uintptr_t)edge->data;
+      struct ir3_sched_node *child =
+         container_of(edge->child, struct ir3_sched_node, dag);
+      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
+   }
+
    dag_prune_head(ctx->dag, &n->dag);
 
    unsigned cycles = cycle_count(instr);
@@ -335,6 +359,7 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
    } else if (ctx->tex_delay > 0) {
       ctx->tex_delay -= MIN2(cycles, ctx->tex_delay);
    }
+
 }
 
 struct ir3_sched_notes {
@@ -619,6 +644,12 @@ dec_rank_name(enum choose_instr_dec_rank rank)
    }
 }
 
+static unsigned
+node_delay(struct ir3_sched_ctx *ctx, struct ir3_sched_node *n)
+{
+   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
+}
+
 /**
  * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
  * Scheduling for Register pressure) heuristic.
@@ -638,8 +669,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
       if (defer && should_defer(ctx, n->instr))
          continue;
 
-      /* Note: mergedregs is only used post-RA, just set it to false */
-      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
+      unsigned d = node_delay(ctx, n);
 
       int live = live_effect(n->instr);
       if (live > 0)
@@ -737,7 +767,7 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
       if (!check_instr(ctx, notes, n->instr))
          continue;
 
-      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
+      unsigned d = node_delay(ctx, n);
 
       enum choose_instr_inc_rank rank;
       if (d == 0)
@@ -804,7 +834,7 @@ dump_state(struct ir3_sched_ctx *ctx)
 
    foreach_sched_node (n, &ctx->dag->heads) {
       di(n->instr, "maxdel=%3d le=%d del=%u ", n->max_delay,
-         live_effect(n->instr), ir3_delay_calc_prera(ctx->block, n->instr));
+         live_effect(n->instr), node_delay(ctx, n));
 
       util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
          struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
@@ -994,11 +1024,17 @@ sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
    if (instr->opc == OPC_META_COLLECT)
       sn->collect = instr;
 
-   dag_add_edge(&sn->dag, &n->dag, 0);
+   unsigned d_soft = ir3_delayslots(src, instr, i, true);
+   unsigned d = ir3_delayslots(src, instr, i, false);
 
-   unsigned d = ir3_delayslots(src, instr, i, true);
+   /* delays from (ss) and (sy) are considered separately and more accurately in
+    * the scheduling heuristic, so ignore it when calculating the ip of
+    * instructions, but do consider it when prioritizing which instructions to
+    * schedule.
+    */
+   dag_add_edge_max_data(&sn->dag, &n->dag, (uintptr_t)d);
 
-   n->delay = MAX2(n->delay, d);
+   n->delay = MAX2(n->delay, d_soft);
 }
 
 static void
@@ -1172,7 +1208,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
       instr = choose_instr(ctx, &notes);
       if (instr) {
-         unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
+         unsigned delay = node_delay(ctx, instr->data);
          d("delay=%u", delay);
 
          /* and if we run out of instructions that can be scheduled,