Mesa (master): freedreno/ir3: track register pressure in sched

Sun Mar 3 18:32:44 UTC 2019

Module: Mesa
Branch: master
Commit: 00f838fa730f5c765902fe2e5ce9754df5276e91
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=00f838fa730f5c765902fe2e5ce9754df5276e91

Author: Rob Clark <robdclark at gmail.com>
Date:   Wed Feb 27 09:56:18 2019 -0500

freedreno/ir3: track register pressure in sched

Not a perfect solution, and the "pressure" target is hard-coded.  But it
doesn't really seem to hurt much in the common case, and avoids exploding
register usage in dEQP ssbo tests.

So this should serve as a stop-gap solution until I have time to re-
write the scheduler.

Hurts slightly in instruction count, but gains (reduces) slightly the
register usage in shader-db.  Fixes ~150 dEQP-GLES31.functional.ssbo.*
that were failing due to RA fail.

Signed-off-by: Rob Clark <robdclark at gmail.com>

---

 src/freedreno/ir3/ir3_depth.c |  2 +-
 src/freedreno/ir3/ir3_sched.c | 97 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c
index e0500b43161..071b5e3c528 100644
--- a/src/freedreno/ir3/ir3_depth.c
+++ b/src/freedreno/ir3/ir3_depth.c
@@ -90,7 +90,7 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 	 * handled with sync bits
 	 */
 
-	if (is_meta(assigner))
+	if (is_meta(assigner) || is_meta(consumer))
 		return 0;
 
 	if (writes_addr(assigner))
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 6552980d90c..4380fdf2f9a 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -55,6 +55,7 @@ struct ir3_sched_ctx {
 	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
+	int live_values;                   /* estimate of current live values */
 	bool error;
 };
 
@@ -63,6 +64,77 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr)
 	return is_sfu(instr) || is_mem(instr);
 }
 
+static void
+unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+
+	foreach_ssa_src_n(src, n, instr) {
+		if (__is_false_dep(instr, n))
+			continue;
+		if (instr->block != src->block)
+			continue;
+		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+			unuse_each_src(ctx, src);
+		} else {
+			debug_assert(src->use_count > 0);
+
+			if (--src->use_count == 0) {
+				ctx->live_values -= dest_regs(src);
+				debug_assert(ctx->live_values >= 0);
+			}
+		}
+	}
+}
+
+static void
+use_each_src(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+
+	foreach_ssa_src_n(src, n, instr) {
+		if (__is_false_dep(instr, n))
+			continue;
+		if (instr->block != src->block)
+			continue;
+		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+			use_each_src(src);
+		} else {
+			src->use_count++;
+		}
+	}
+}
+
+static void
+update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+		return;
+
+	ctx->live_values += dest_regs(instr);
+	unuse_each_src(ctx, instr);
+}
+
+/* This is *slightly* different than how ir3_cp uses use_count, in that
+ * we just track it per block (because we schedule a block at a time) and
+ * because we don't track meta instructions and false dependencies (since
+ * they don't contribute real register pressure).
+ */
+static void
+update_use_count(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		instr->use_count = 0;
+	}
+
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+			continue;
+
+		use_each_src(instr);
+	}
+}
+
 #define NULL_INSTR ((void *)~0)
 
 static void
@@ -105,6 +177,8 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 	list_addtail(&instr->node, &instr->block->instr_list);
 	ctx->scheduled = instr;
 
+	update_live_values(ctx, instr);
+
 	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
 		clear_cache(ctx, NULL);
 	} else {
@@ -126,7 +200,7 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 		return NULL;
 
 	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->depth > d->depth))
+		if (srcs[i] && (srcs[i]->sun > d->sun))
 			d = srcs[id = i];
 
 	srcs[id] = NULL;
@@ -432,14 +506,18 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		if (!candidate)
 			continue;
 
-		delay = delay_calc(ctx->block, candidate, soft, false);
-		if (delay < min_delay) {
-			best_instr = candidate;
-			min_delay = delay;
+		if (ctx->live_values > 16*4) {
+			/* under register pressure, only care about reducing live values: */
+			if (!best_instr || (candidate->sun > best_instr->sun))
+				best_instr = candidate;
+		} else {
+			delay = delay_calc(ctx->block, candidate, soft, false);
+			if ((delay < min_delay) ||
+					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
+				best_instr = candidate;
+				min_delay = delay;
+			}
 		}
-
-		if (min_delay == 0)
-			break;
 	}
 
 	return best_instr;
@@ -714,6 +792,8 @@ int ir3_sched(struct ir3 *ir)
 	ir3_clear_mark(ir);
 
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		ctx.live_values = 0;
+		update_use_count(block);
 		sched_block(&ctx, block);
 	}
 
@@ -723,6 +803,7 @@ int ir3_sched(struct ir3 *ir)
 
 	if (ctx.error)
 		return -1;
+
 	return 0;
 }