Mesa (main): freedreno/ir3: Reduce choose_instr_dec() and _inc() overhead.

Mon Jul 19 23:33:08 UTC 2021

Module: Mesa
Branch: main
Commit: bda26dfcfc5f9012ab1bd22f2bbaa664315e2671
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bda26dfcfc5f9012ab1bd22f2bbaa664315e2671

Author: Emma Anholt <emma at anholt.net>
Date:   Tue Jul 13 13:39:36 2021 -0700

freedreno/ir3: Reduce choose_instr_dec() and _inc() overhead.

Previously, if there was no freed+ready instruction, the live_effect() and
check_instr() logic was redone multiple times per instruction, once in each
successive fallback loop.  Replace the multiple loops in each function with
a single loop that ranks the candidates, which I think is more readable and
reduces the overhead in the process.

With a debugoptimized build, the runtime of
dEQP-GLES31.functional.ubo.random.all_per_block_buffers.20 goes from ~3.5s
to ~3.0s on my lazor.  No shader-db change.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11855>

---

 src/freedreno/ir3/ir3_sched.c | 185 +++++++++++++++++++-----------------------
 1 file changed, 82 insertions(+), 103 deletions(-)

diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 7b452d630cc..12ef6954821 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -593,6 +593,30 @@ static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx,
                                                struct ir3_sched_notes *notes,
                                                bool defer, bool avoid_output);
 
+enum choose_instr_dec_rank {
+   DEC_NEUTRAL,
+   DEC_NEUTRAL_READY,
+   DEC_FREED,
+   DEC_FREED_READY,
+};
+
+static const char *
+dec_rank_name(enum choose_instr_dec_rank rank)
+{
+   switch (rank) {
+   case DEC_NEUTRAL:
+      return "neutral";
+   case DEC_NEUTRAL_READY:
+      return "neutral+ready";
+   case DEC_FREED:
+      return "freed";
+   case DEC_FREED_READY:
+      return "freed+ready";
+   default:
+      return NULL;
+   }
+}
+
 /**
  * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
  * Scheduling for Register pressure) heuristic.
@@ -606,8 +630,8 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 {
    const char *mode = defer ? "-d" : "";
    struct ir3_sched_node *chosen = NULL;
+   enum choose_instr_dec_rank chosen_rank = DEC_NEUTRAL;
 
-   /* Find a ready inst with regs freed and pick the one with max cost. */
    foreach_sched_node (n, &ctx->dag->heads) {
       if (defer && should_defer(ctx, n->instr))
          continue;
@@ -615,97 +639,70 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
       /* Note: mergedregs is only used post-RA, just set it to false */
       unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
-      if (d > 0)
-         continue;
-
-      if (live_effect(n->instr) > -1)
+      int live = live_effect(n->instr);
+      if (live > 0)
          continue;
 
       if (!check_instr(ctx, notes, n->instr))
          continue;
 
-      if (!chosen || chosen->max_delay < n->max_delay) {
-         chosen = n;
+      enum choose_instr_dec_rank rank;
+      if (live < 0) {
+         /* Prioritize instrs which free up regs and can be scheduled with no
+          * delay.
+          */
+         if (d == 0)
+            rank = DEC_FREED_READY;
+         else
+            rank = DEC_FREED;
+      } else {
+         /* Contra the paper, pick a leader with no effect on used regs.  This
+          * may open up new opportunities, as otherwise a single-operand instr
+          * consuming a value will tend to block finding freeing that value.
+          * This had a massive effect on reducing spilling on V3D.
+          *
+          * XXX: Should this prioritize ready?
+          */
+         if (d == 0)
+            rank = DEC_NEUTRAL_READY;
+         else
+            rank = DEC_NEUTRAL;
       }
-   }
-
-   if (chosen) {
-      di(chosen->instr, "dec%s: chose (freed+ready)", mode);
-      return chosen;
-   }
-
-   /* Find a leader with regs freed and pick the one with max cost. */
-   foreach_sched_node (n, &ctx->dag->heads) {
-      if (defer && should_defer(ctx, n->instr))
-         continue;
-
-      if (live_effect(n->instr) > -1)
-         continue;
 
-      if (!check_instr(ctx, notes, n->instr))
-         continue;
-
-      if (!chosen || chosen->max_delay < n->max_delay) {
+      /* Prefer higher-ranked instructions, or in the case of a rank tie, the
+       * highest latency-to-end-of-program instruction.
+       */
+      if (!chosen || rank > chosen_rank ||
+          (rank == chosen_rank && chosen->max_delay < n->max_delay)) {
          chosen = n;
+         chosen_rank = rank;
       }
    }
 
    if (chosen) {
-      di(chosen->instr, "dec%s: chose (freed)", mode);
+      di(chosen->instr, "dec%s: chose (%s)", mode, dec_rank_name(chosen_rank));
       return chosen;
    }
 
-   /* Contra the paper, pick a leader with no effect on used regs.  This may
-    * open up new opportunities, as otherwise a single-operand instr consuming
-    * a value will tend to block finding freeing that value.  This had a
-    * massive effect on reducing spilling on V3D.
-    *
-    * XXX: Should this prioritize ready?
-    */
-   foreach_sched_node (n, &ctx->dag->heads) {
-      if (defer && should_defer(ctx, n->instr))
-         continue;
-
-      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
-
-      if (d > 0)
-         continue;
-
-      if (live_effect(n->instr) > 0)
-         continue;
-
-      if (!check_instr(ctx, notes, n->instr))
-         continue;
-
-      if (!chosen || chosen->max_delay < n->max_delay)
-         chosen = n;
-   }
-
-   if (chosen) {
-      di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
-      return chosen;
-   }
-
-   foreach_sched_node (n, &ctx->dag->heads) {
-      if (defer && should_defer(ctx, n->instr))
-         continue;
-
-      if (live_effect(n->instr) > 0)
-         continue;
-
-      if (!check_instr(ctx, notes, n->instr))
-         continue;
+   return choose_instr_inc(ctx, notes, defer, true);
+}
 
-      if (!chosen || chosen->max_delay < n->max_delay)
-         chosen = n;
-   }
+enum choose_instr_inc_rank {
+   INC_DISTANCE,
+   INC_DISTANCE_READY,
+};
 
-   if (chosen) {
-      di(chosen->instr, "dec%s: chose (neutral)", mode);
-      return chosen;
+static const char *
+inc_rank_name(enum choose_instr_inc_rank rank)
+{
+   switch (rank) {
+   case INC_DISTANCE:
+      return "distance";
+   case INC_DISTANCE_READY:
+      return "distance+ready";
+   default:
+      return NULL;
    }
-
-   return choose_instr_inc(ctx, notes, defer, true);
 }
 
 /**
@@ -718,6 +715,7 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 {
    const char *mode = defer ? "-d" : "";
    struct ir3_sched_node *chosen = NULL;
+   enum choose_instr_inc_rank chosen_rank = INC_DISTANCE;
 
    /*
     * From hear on out, we are picking something that increases
@@ -734,48 +732,29 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
       if (defer && should_defer(ctx, n->instr))
          continue;
 
-      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
-
-      if (d > 0)
-         continue;
-
       if (!check_instr(ctx, notes, n->instr))
          continue;
 
-      unsigned distance = nearest_use(n->instr);
-
-      if (!chosen || distance < chosen_distance) {
-         chosen = n;
-         chosen_distance = distance;
-      }
-   }
-
-   if (chosen) {
-      di(chosen->instr, "inc%s: chose (distance+ready)", mode);
-      return chosen;
-   }
-
-   /* Pick the max delay of the remaining leaders. */
-   foreach_sched_node (n, &ctx->dag->heads) {
-      if (avoid_output && n->output)
-         continue;
-
-      if (defer && should_defer(ctx, n->instr))
-         continue;
+      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
-      if (!check_instr(ctx, notes, n->instr))
-         continue;
+      enum choose_instr_inc_rank rank;
+      if (d == 0)
+         rank = INC_DISTANCE_READY;
+      else
+         rank = INC_DISTANCE;
 
       unsigned distance = nearest_use(n->instr);
 
-      if (!chosen || distance < chosen_distance) {
+      if (!chosen || rank > chosen_rank ||
+          (rank == chosen_rank && distance < chosen_distance)) {
          chosen = n;
          chosen_distance = distance;
+         chosen_rank = rank;
       }
    }
 
    if (chosen) {
-      di(chosen->instr, "inc%s: chose (distance)", mode);
+      di(chosen->instr, "inc%s: chose (%s)", mode, inc_rank_name(chosen_rank));
       return chosen;
    }