Mesa (master): freedreno/ir3: scheduler improvements

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Jun 3 19:44:33 UTC 2019


Module: Mesa
Branch: master
Commit: 771d04c82d007dae289b5429b4549f006140993d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=771d04c82d007dae289b5429b4549f006140993d

Author: Rob Clark <robdclark at chromium.org>
Date:   Thu May 30 10:44:16 2019 -0700

freedreno/ir3: scheduler improvements

For instructions that increase the # of live values, apply a threshold
to avoid scheduling them too early.  And factor in the net change to
the # of live values that would result from scheduling an instruction,
so that instructions which reduce the number of live values get
prioritized as register pressure increases.
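
Schematically, the selection logic in the diff below boils down to the
following (a condensed sketch; the real code also caches the result of
find_instr_recursive() and handles fanin/fanout in live_effect()):

	int le = live_effect(candidate);

	/* defer live-increasing instructions that are much shallower
	 * than the deepest eligible instruction (threshold is 6, or 4
	 * once live_values exceeds 4*4):
	 */
	if ((le >= 1) && ((deepest - candidate->depth) > threshold))
		continue;

	int rank = delay_calc(ctx->block, candidate, soft, false);

	if (ctx->live_values > 16*4)
		rank = le;         /* heavy pressure: only live values matter */
	else if (ctx->live_values > 4*4)
		rank += le;        /* moderate pressure: bias rank by le */

	if (rank < best_rank) {    /* lower rank is better */
		best_instr = candidate;
		best_rank = rank;
	}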

For manhattan:

  total instructions in shared programs: 27869 -> 28413 (1.95%)
  instructions in affected programs: 26756 -> 27300 (2.03%)
  helped: 102
  HURT: 87

  total full in shared programs: 1903 -> 1719 (-9.67%)
  full in affected programs: 1390 -> 1206 (-13.24%)
  helped: 124
  HURT: 9

The reduction in register usage nets a ~20% gain in manhattan.  (So
getting mediump support should be a huge win for gles gfxbench.)

Also significantly helps some of the more complex shadertoy shaders,
like IQ's Piano (32 to 18 regs, doubles fps).

The effect is less pronounced on smaller shaders.

Signed-off-by: Rob Clark <robdclark at chromium.org>
Reviewed-by: Eric Anholt <eric at anholt.net>

---

 src/freedreno/ir3/ir3.h       |   5 ++
 src/freedreno/ir3/ir3_sched.c | 123 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 115 insertions(+), 13 deletions(-)

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index f3c25ea2792..ccd102b8e44 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -292,6 +292,11 @@ struct ir3_instruction {
 	};
 
 	/* used for per-pass extra instruction data.
+	 *
+	 * TODO we should remove the per-pass data like this and 'use_count'
+	 * and do something similar to what RA does w/ ir3_ra_instr_data..
+	 * ie. use the ir3_count_instructions pass, and then use instr->ip
+	 * to index into a table of pass-private data.
 	 */
 	void *data;
 
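The scheme that TODO alludes to would look roughly like this (a
hypothetical sketch, not part of this commit; ir3_count_instructions()
already numbers every instruction via instr->ip, which can then index
a table of pass-private data):

	/* hypothetical per-pass data, indexed by instr->ip: */
	struct sched_instr_data {
		unsigned depth;
		unsigned use_count;
	};

	static void
	sched_setup_data(struct ir3 *ir, struct ir3_sched_ctx *ctx)
	{
		/* ir3_count_instructions() assigns instr->ip as it counts: */
		unsigned n = ir3_count_instructions(ir);

		/* assumed ctx field, analogous to RA's ir3_ra_instr_data: */
		ctx->instr_data = calloc(n, sizeof(struct sched_instr_data));
	}

	/* lookups become table indexing instead of chasing instr->data: */
	#define sched_data(ctx, instr)  (&(ctx)->instr_data[(instr)->ip])
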
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 16199ca3fb9..1b07bf8c1dd 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -216,7 +216,7 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 		return NULL;
 
 	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->sun > d->sun))
+		if (srcs[i] && (srcs[i]->depth > d->depth))
 			d = srcs[id = i];
 
 	srcs[id] = NULL;
@@ -500,13 +500,63 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	return NULL;
 }
 
+/* find net change to live values if instruction were scheduled: */
+static int
+live_effect(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+	int new_live = dest_regs(instr);
+	int old_live = 0;
+
+	foreach_ssa_src_n(src, n, instr) {
+		if (__is_false_dep(instr, n))
+			continue;
+
+		if (instr->block != src->block)
+			continue;
+
+		/* for fanout/split, just pass things along to the real src: */
+		if (src->opc == OPC_META_FO)
+			src = ssa(src->regs[1]);
+
+		/* for fanin/collect, if this is the last use of *each* src,
+		 * then it will decrease the live values, since RA treats
+		 * them as a whole:
+		 */
+		if (src->opc == OPC_META_FI) {
+			struct ir3_instruction *src2;
+			bool last_use = true;
+
+			foreach_ssa_src(src2, src) {
+				if (src2->use_count > 1) {
+					last_use = false;
+					break;
+				}
+			}
+
+			if (last_use)
+				old_live += dest_regs(src);
+
+		} else {
+			debug_assert(src->use_count > 0);
+
+			if (src->use_count == 1) {
+				old_live += dest_regs(src);
+			}
+		}
+	}
+
+	return new_live - old_live;
+}
+
 /* find instruction to schedule: */
 static struct ir3_instruction *
 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		bool soft)
 {
 	struct ir3_instruction *best_instr = NULL;
-	unsigned min_delay = ~0;
+	int best_rank = INT_MAX;      /* lower is better */
+	unsigned deepest = 0;
 
 	/* TODO we'd really rather use the list/array of block outputs.  But we
 	 * don't have such a thing.  Recursing *every* instruction in the list
@@ -516,23 +566,70 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	 */
 	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
 		struct ir3_instruction *candidate;
-		unsigned delay;
 
 		candidate = find_instr_recursive(ctx, notes, instr);
 		if (!candidate)
 			continue;
 
-		if (ctx->live_values > 16*4) {
-			/* under register pressure, only care about reducing live values: */
-			if (!best_instr || (candidate->sun > best_instr->sun))
-				best_instr = candidate;
-		} else {
-			delay = delay_calc(ctx->block, candidate, soft, false);
-			if ((delay < min_delay) ||
-					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
-				best_instr = candidate;
-				min_delay = delay;
+		deepest = MAX2(deepest, candidate->depth);
+	}
+
+	/* traverse the list a second time.. but since we cache the result of
+	 * find_instr_recursive() it isn't as bad as it looks.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+		struct ir3_instruction *candidate;
+
+		candidate = find_instr_recursive(ctx, notes, instr);
+		if (!candidate)
+			continue;
+
+		/* determine net change to # of live values: */
+		int le = live_effect(candidate);
+
+		/* if there is a net increase in # of live values, then apply some
+		 * threshold to avoid instructions getting scheduled *too* early
+		 * and increasing register pressure.
+		 */
+		if (le >= 1) {
+			unsigned threshold;
+
+			if (ctx->live_values > 4*4) {
+				threshold = 4;
+			} else {
+				threshold = 6;
 			}
+
+			/* Filter out any "shallow" instructions which would otherwise
+			 * tend to get scheduled too early to fill delay slots even
+			 * when they are not needed for a while.  There will probably
+			 * be later delay slots that they could just as easily fill.
+			 *
+			 * A classic case where this comes up is frag shaders that
+			 * write a constant value (like 1.0f) to one of the channels
+			 * of the output color(s).  Since the mov from immed has no
+			 * dependencies, it would otherwise get scheduled early to
+			 * fill delay slots, occupying a register until the end of
+			 * the program.
+			 */
+			if ((deepest - candidate->depth) > threshold)
+				continue;
+		}
+
+		int rank = delay_calc(ctx->block, candidate, soft, false);
+
+		/* if too many live values, prioritize instructions that reduce the
+		 * number of live values:
+		 */
+		if (ctx->live_values > 16*4) {
+			rank = le;
+		} else if (ctx->live_values > 4*4) {
+			rank += le;
+		}
+
+		if (rank < best_rank) {
+			best_instr = candidate;
+			best_rank = rank;
 		}
 	}
 
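
To make the thresholds concrete: live_values is accumulated in units
of dest_regs(), i.e. destination components, so 4*4 and 16*4
correspond roughly to four and sixteen full (vec4) registers' worth.
A toy model of the ranking rule (standalone illustration, not code
from the commit):

	#include <stdio.h>

	/* mirrors the rank computation in find_eligible_instr();
	 * lower rank is better:
	 */
	static int
	rank(int delay, int le, int live_values)
	{
		if (live_values > 16*4)
			return le;          /* only reducing pressure matters */
		else if (live_values > 4*4)
			return delay + le;  /* bias delay by live effect */
		return delay;
	}

	int
	main(void)
	{
		/* at moderate pressure (live_values = 40), an instruction
		 * that frees a vec4 (le = -4) at delay 3 outranks a
		 * zero-delay instruction that adds one live value:
		 */
		printf("%d vs %d\n", rank(3, -4, 40), rank(0, 1, 40)); /* -1 vs 1 */
		return 0;
	}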