Mesa (main): ir3/postsched: Handle sync dependencies better

Wed Nov 17 14:16:15 UTC 2021

Module: Mesa
Branch: main
Commit: a54e7baa65ad32345c839170dc9726bdae06975e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=a54e7baa65ad32345c839170dc9726bdae06975e

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Wed Nov  3 18:00:51 2021 +0100

ir3/postsched: Handle sync dependencies better

We want to model soft dependencies, but because of how there's only a
single bit to wait on all of them, there may be unnecessary delays
inserted when a (sy)-consumer follows an unrelated (sy)-producer.
Previously there was some code to try to work around this, but we can
just model it directly using the sfu_delay and tex_delay cycle counts
that we have to maintain anyway and delete it.

This also gets rid of the calls to ir3_delay_calc_postra with soft=true,
which would be more complicated to handle in the next commit.

There is a functional change here: the idea of preferring fewer nops
over critical path length (max_delay) up to 3 nops is kept (and we
delete the TODO which is already sort-of resolved by it), but delays due
to (ss)/(sy) and nops are now treated equally, rather than always
preferring nops over syncs. So if our estimate indicates that scheduling
an (ss) consumer will result in a wait of one cycle and there's another
instruction that will require one nop, we will treat them as otherwise
equal and choose based on max_delay instead. This results in more
sstall, but the decrease in nops is much greater.

total nops in shared programs: 376613 -> 345482 (-8.27%)
nops in affected programs: 275483 -> 244352 (-11.30%)
helped: 3226
HURT: 110
helped stats (abs) min: 1 max: 78 x̄: 9.73 x̃: 7
helped stats (rel) min: 0.19% max: 100.00% x̄: 19.48% x̃: 13.68%
HURT stats (abs)   min: 1 max: 16 x̄: 2.43 x̃: 2
HURT stats (rel)   min: 0.00% max: 150.00% x̄: 13.34% x̃: 4.36%
95% mean confidence interval for nops value: -9.61 -9.06
95% mean confidence interval for nops %-change: -19.01% -17.78%
Nops are helped.

total sstall in shared programs: 126195 -> 133806 (6.03%)
sstall in affected programs: 79440 -> 87051 (9.58%)
helped: 300
HURT: 1922
helped stats (abs) min: 1 max: 15 x̄: 4.72 x̃: 4
helped stats (rel) min: 1.05% max: 100.00% x̄: 17.15% x̃: 14.55%
HURT stats (abs)   min: 1 max: 29 x̄: 4.70 x̃: 4
HURT stats (rel)   min: 0.00% max: 900.00% x̄: 25.38% x̃: 10.53%
95% mean confidence interval for sstall value: 3.22 3.63
95% mean confidence interval for sstall %-change: 17.50% 21.78%
Sstall are HURT.

total (ss) in shared programs: 35190 -> 35472 (0.80%)
(ss) in affected programs: 6433 -> 6715 (4.38%)
helped: 163
HURT: 401
helped stats (abs) min: 1 max: 2 x̄: 1.06 x̃: 1
helped stats (rel) min: 1.92% max: 33.33% x̄: 11.53% x̃: 10.00%
HURT stats (abs)   min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel)   min: 1.56% max: 100.00% x̄: 15.33% x̃: 12.50%
95% mean confidence interval for (ss) value: 0.41 0.59
95% mean confidence interval for (ss) %-change: 6.22% 8.93%
(ss) are HURT.

total (sy) in shared programs: 13476 -> 13521 (0.33%)
(sy) in affected programs: 669 -> 714 (6.73%)
helped: 30
HURT: 78
helped stats (abs) min: 1 max: 2 x̄: 1.13 x̃: 1
helped stats (rel) min: 4.00% max: 50.00% x̄: 21.22% x̃: 21.11%
HURT stats (abs)   min: 1 max: 2 x̄: 1.01 x̃: 1
HURT stats (rel)   min: 3.45% max: 100.00% x̄: 31.93% x̃: 25.00%
95% mean confidence interval for (sy) value: 0.23 0.60
95% mean confidence interval for (sy) %-change: 11.19% 23.15%
(sy) are HURT.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13722>

---

 src/freedreno/ir3/ir3_postsched.c | 96 +++++++++++++++------------------------
 1 file changed, 37 insertions(+), 59 deletions(-)

diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index f13a864ec7e..fa2b7100c16 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -154,25 +154,26 @@ dump_state(struct ir3_postsched_ctx *ctx)
    }
 }
 
-/* Determine if this is an instruction that we'd prefer not to schedule
- * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
- * counter, ie. the more cycles it has been since the last SFU, the less
- * costly a sync would be.
- */
-static bool
-would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+static unsigned
+node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
 {
-   if (ctx->sfu_delay) {
-      if (has_sfu_src(instr))
-         return true;
-   }
+   return ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+}
 
-   if (ctx->tex_delay) {
-      if (has_tex_src(instr))
-         return true;
-   }
+static unsigned
+node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
+{
+   unsigned delay = node_delay(ctx, n);
+
+   /* This takes into account that as when we schedule multiple tex or sfu, the
+    * first user has to wait for all of them to complete.
+    */
+   if (n->has_sfu_src)
+      delay = MAX2(delay, ctx->sfu_delay);
+   if (n->has_tex_src)
+      delay = MAX2(delay, ctx->tex_delay);
 
-   return false;
+   return delay;
 }
 
 /* find instruction to schedule: */
@@ -215,8 +216,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 
    /* Next prioritize discards: */
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+      unsigned d = node_delay(ctx, n);
 
       if (d > 0)
          continue;
@@ -235,8 +235,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 
    /* Next prioritize expensive instructions: */
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+      unsigned d = node_delay(ctx, n);
 
       if (d > 0)
          continue;
@@ -253,49 +252,32 @@ choose_instr(struct ir3_postsched_ctx *ctx)
       return chosen->instr;
    }
 
-   /*
-    * Sometimes be better to take a nop, rather than scheduling an
-    * instruction that would require an (ss) shortly after another
-    * SFU..  ie. if last SFU was just one or two instr ago, and we
-    * could choose between taking a nop and then scheduling
-    * something else, vs scheduling the immed avail instruction that
-    * would require (ss), we are better with the nop.
-    */
-   for (unsigned delay = 0; delay < 4; delay++) {
-      foreach_sched_node (n, &ctx->dag->heads) {
-         if (would_sync(ctx, n->instr))
-            continue;
-
-         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
-                                            ctx->v->mergedregs);
-
-         if (d > delay)
-            continue;
-
-         if (!chosen || (chosen->max_delay < n->max_delay))
-            chosen = n;
-      }
-
-      if (chosen) {
-         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
-         return chosen->instr;
-      }
-   }
-
    /* Next try to find a ready leader w/ soft delay (ie. including extra
     * delay for things like tex fetch which can be synchronized w/ sync
     * bit (but we probably do want to schedule some other instructions
-    * while we wait)
+    * while we wait). We also allow a small amount of nops, to prefer now-nops
+    * over future-nops up to a point, as that gives better results.
     */
+   unsigned chosen_delay = 0;
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
+      unsigned d = node_delay_soft(ctx, n);
 
-      if (d > 0)
+      if (d > 3)
          continue;
 
-      if (!chosen || (chosen->max_delay < n->max_delay))
+      if (!chosen || d < chosen_delay) {
          chosen = n;
+         chosen_delay = d;
+         continue;
+      }
+
+      if (d > chosen_delay)
+         continue;
+
+      if (chosen->max_delay < n->max_delay) {
+         chosen = n;
+         chosen_delay = d;
+      }
    }
 
    if (chosen) {
@@ -308,8 +290,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
     * stalls.. but we've already decided there is not a better option.
     */
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+      unsigned d = node_delay(ctx, n);
 
       if (d > 0)
          continue;
@@ -324,9 +305,6 @@ choose_instr(struct ir3_postsched_ctx *ctx)
    }
 
    /* Otherwise choose leader with maximum cost:
-    *
-    * TODO should we try to balance cost and delays?  I guess it is
-    * a balance between now-nop's and future-nop's?
     */
    foreach_sched_node (n, &ctx->dag->heads) {
       if (!chosen || chosen->max_delay < n->max_delay)