Mesa (master): freedreno/ir3: new pre-RA scheduler

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Apr 13 21:06:28 UTC 2020


Module: Mesa
Branch: master
Commit: d2f4d332dbb552af62fe5caabe67664d98f32229
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d2f4d332dbb552af62fe5caabe67664d98f32229

Author: Rob Clark <robdclark at chromium.org>
Date:   Fri Nov 22 11:14:25 2019 -0800

freedreno/ir3: new pre-RA scheduler

This replaces the depth-first search scheduler with a more traditional
ready-list scheduler.  It primarily tries to reduce register pressure
(the number of live values), with the exception that it tries to
schedule kills as early as possible.  (Earlier iterations of this
scheduler had a tendency to push kills later, and in particular to move
texture fetches, which may turn out to be unnecessary, ahead of kills.)

Signed-off-by: Rob Clark <robdclark at chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4440>

---

 src/freedreno/ir3/ir3.h       |   1 -
 src/freedreno/ir3/ir3_depth.c |  19 -
 src/freedreno/ir3/ir3_sched.c | 802 ++++++++++++++++++++++--------------------
 3 files changed, 426 insertions(+), 396 deletions(-)
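
For readers unfamiliar with this style of scheduler, the selection order
implemented below (in choose_instr_csr()) can be summarized as: prefer
candidates that free live values, then pressure-neutral ones, then fall back
to whatever is consumed soonest, breaking ties by critical-path length.  The
following is a minimal, self-contained sketch of that ordering only; the
struct, field and helper names (node, live_effect, ready, max_delay,
nearest_use, choose) are made up for illustration and are not part of the
ir3 code in the diff below.  (The real code additionally prefers ready
candidates in the fallback, which the sketch omits.)

/*
 * Illustrative only: a generic Goodman/Hsu-style CSR pick among DAG
 * heads.  All names here are hypothetical; this is not the ir3
 * implementation below.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	const char *name;
	int live_effect;    /* net change in live values if scheduled */
	bool ready;         /* true if no nop delay slots are needed  */
	int max_delay;      /* critical-path length to end of block   */
	int nearest_use;    /* pre-sched ip of the closest consumer   */
};

static struct node *
choose(struct node **heads, size_t n)
{
	struct node *best = NULL;

	/* passes 0-3: freeing+ready, freeing, neutral+ready, neutral;
	 * within a pass, break ties by longest critical path:
	 */
	for (int pass = 0; pass < 4; pass++) {
		int limit = (pass < 2) ? -1 : 0;
		bool need_ready = !(pass & 1);

		for (size_t i = 0; i < n; i++) {
			struct node *c = heads[i];
			if (c->live_effect > limit)
				continue;
			if (need_ready && !c->ready)
				continue;
			if (!best || c->max_delay > best->max_delay)
				best = c;
		}
		if (best)
			return best;
	}

	/* everything left increases pressure: prefer the candidate
	 * whose result is consumed soonest in pre-sched order:
	 */
	for (size_t i = 0; i < n; i++) {
		if (!best || heads[i]->nearest_use < best->nearest_use)
			best = heads[i];
	}
	return best;
}

int main(void)
{
	struct node mov = { "mov.imm", +1, true,  1, 40 };
	struct node mul = { "mul.f",   -1, false, 5, 12 };
	struct node *heads[] = { &mov, &mul };

	/* mul.f wins: it frees a value, even though it is not ready */
	printf("chose %s\n", choose(heads, 2)->name);
	return 0;
}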

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 98146f158a8..4dfcdf0da51 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1190,7 +1190,6 @@ void ir3_remove_nops(struct ir3 *ir);
 
 /* depth calculation: */
 struct ir3_shader_variant;
-void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
 void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so);
 
 /* fp16 conversion folding */
diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c
index 6d389772570..00d0c54fac5 100644
--- a/src/freedreno/ir3/ir3_depth.c
+++ b/src/freedreno/ir3/ir3_depth.c
@@ -48,23 +48,6 @@
  * blocks depth sorted list, which is used by the scheduling pass.
  */
 
-void
-ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
-{
-	/* remove from existing spot in list: */
-	list_delinit(&instr->node);
-
-	/* find where to re-insert instruction: */
-	foreach_instr (pos, list) {
-		if (pos->depth > instr->depth) {
-			list_add(&instr->node, &pos->node);
-			return;
-		}
-	}
-	/* if we get here, we didn't find an insertion spot: */
-	list_addtail(&instr->node, list);
-}
-
 static void
 ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
 {
@@ -97,8 +80,6 @@ ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
 
 	if (!is_meta(instr))
 		instr->depth++;
-
-	ir3_insert_by_depth(instr, &instr->block->instr_list);
 }
 
 static bool
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 9d0bf69d193..95c89ae8461 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -25,6 +25,7 @@
  */
 
 
+#include "util/dag.h"
 #include "util/u_math.h"
 
 #include "ir3.h"
@@ -47,12 +48,17 @@
 /*
  * Instruction Scheduling:
  *
- * A recursive depth based scheduling algo.  Recursively find an eligible
- * instruction to schedule from the deepest instruction (recursing through
- * it's unscheduled src instructions).  Normally this would result in a
- * lot of re-traversal of the same instructions, so we cache results in
- * instr->data (and clear cached results that would be no longer valid
- * after scheduling an instruction).
+ * A block-level pre-RA scheduler, which works by creating a DAG of
+ * instruction dependencies, and heuristically picking a DAG head
+ * (instruction with no unscheduled dependencies).
+ *
+ * Where possible, it tries to pick instructions that avoid nop delay
+ * slots, but it will prefer to pick instructions that reduce (or do
+ * not increase) the number of live values.
+ *
+ * If the only possible choices are instructions that increase the
+ * number of live values, it will try to pick the one with the earliest
+ * consumer (based on pre-sched program order).
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -62,159 +68,57 @@
  * To solve this, when we are in such a scheduling "critical section",
  * and we encounter a conflicting write to a special register, we try
  * to schedule any remaining instructions that use that value first.
+ *
+ * TODO we can detect too-large live_values here.. would be a good place
+ * to "spill" cheap things, like move from uniform/immed.  (Constructing
+ * list of ssa def consumers before sched pass would make this easier.
+ * Also, in general it might be best not to re-use load_immed across
+ * blocks.)
+ *
+ * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
+ * the # of immediates in play (or at least that would help with
+ * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
+ * do this in a nir pass that inserts fneg/etc?  The cp pass should fold
+ * these into src modifiers..
  */
 
 struct ir3_sched_ctx {
 	struct ir3_block *block;           /* the current block */
-	struct list_head depth_list;       /* depth sorted unscheduled instrs */
-	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
+	struct dag *dag;
+
+	struct list_head unscheduled_list; /* unscheduled instructions */
+	struct ir3_instruction *scheduled; /* last scheduled instr */
 	struct ir3_instruction *addr0;     /* current a0.x user, if any */
 	struct ir3_instruction *addr1;     /* current a1.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
-	int live_values;                   /* estimate of current live values */
-	int half_live_values;              /* estimate of current half precision live values */
-	bool error;
-
-	unsigned live_threshold_hi;
-	unsigned live_threshold_lo;
-	unsigned depth_threshold_hi;
-	unsigned depth_threshold_lo;
-};
-
-static bool is_scheduled(struct ir3_instruction *instr)
-{
-	return !!(instr->flags & IR3_INSTR_MARK);
-}
-
-static void
-unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-
-	foreach_ssa_src_n (src, n, instr) {
-		if (__is_false_dep(instr, n))
-			continue;
-		if (instr->block != src->block)
-			continue;
-		if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
-			unuse_each_src(ctx, src);
-		} else {
-			debug_assert(src->use_count > 0);
-
-			if (--src->use_count == 0) {
-				if (is_half(src)) {
-					ctx->half_live_values -= dest_regs(src);
-					debug_assert(ctx->half_live_values >= 0);
-				} else {
-					ctx->live_values -= dest_regs(src);
-					debug_assert(ctx->live_values >= 0);
-				}
-			}
-		}
-	}
-}
 
-static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
-static void use_instr(struct ir3_instruction *instr);
+	int remaining_kills;
 
-/* transfers a use-count to new instruction, for cases where we
- * "spill" address or predicate.  Note this might cause the
- * previous instruction that loaded a0.x/p0.x to become live
- * again, when we previously thought it was dead.
- */
-static void
-transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
-		struct ir3_instruction *new_instr)
-{
-	struct ir3_instruction *src;
-
-	debug_assert(is_scheduled(orig_instr));
-
-	foreach_ssa_src_n (src, n, new_instr) {
-		if (__is_false_dep(new_instr, n))
-			continue;
-		if (is_half(new_instr)) {
-			ctx->half_live_values += dest_regs(src);
-		} else {
-			ctx->live_values += dest_regs(src);
-		}
-		use_instr(src);
-	}
-
-	clear_cache(ctx, orig_instr);
-}
-
-static void
-use_each_src(struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-
-	foreach_ssa_src_n (src, n, instr) {
-		if (__is_false_dep(instr, n))
-			continue;
-		use_instr(src);
-	}
-}
-
-static void
-use_instr(struct ir3_instruction *instr)
-{
-	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
-		use_each_src(instr);
-	} else {
-		instr->use_count++;
-	}
-}
-
-static void
-update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *scheduled)
-{
-	if ((scheduled->opc == OPC_META_COLLECT) || (scheduled->opc == OPC_META_SPLIT))
-		return;
-
-	if ((scheduled->regs_count > 0) && is_half(scheduled)) {
-		ctx->half_live_values += dest_regs(scheduled);
-	} else {
-		ctx->live_values += dest_regs(scheduled);
-	}
-
-	unuse_each_src(ctx, scheduled);
-}
-
-static void
-update_use_count(struct ir3 *ir)
-{
-	foreach_block (block, &ir->block_list) {
-		foreach_instr (instr, &block->instr_list) {
-			instr->use_count = 0;
-		}
-	}
+	bool error;
+};
 
-	foreach_block (block, &ir->block_list) {
-		foreach_instr (instr, &block->instr_list) {
-			if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
-				continue;
+struct ir3_sched_node {
+	struct dag_node dag;     /* must be first for util_dynarray_foreach */
+	struct ir3_instruction *instr;
 
-			use_each_src(instr);
-		}
-	}
+	unsigned delay;
+	unsigned max_delay;
 
-	/* Shader outputs are also used:
+	/* Is this instruction a direct or indirect dependency for a kill?
+	 * If so, we should prioritize it when possible
 	 */
-	struct ir3_instruction *out;
-	foreach_output (out, ir)
-		use_instr(out);
-}
+	bool kill_path;
+};
 
-#define NULL_INSTR ((void *)~0)
+#define foreach_sched_node(__n, __list) \
+	list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
 
-static void
-clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
+static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
+
+static bool is_scheduled(struct ir3_instruction *instr)
 {
-	foreach_instr (instr2, &ctx->depth_list) {
-		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
-			instr2->data = NULL;
-	}
+	return !!(instr->flags & IR3_INSTR_MARK);
 }
 
 static void
@@ -248,35 +152,13 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 	list_addtail(&instr->node, &instr->block->instr_list);
 	ctx->scheduled = instr;
 
-	update_live_values(ctx, instr);
-
-	if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr) || is_input(instr)) {
-		clear_cache(ctx, NULL);
-	} else {
-		/* invalidate only the necessary entries.. */
-		clear_cache(ctx, instr);
+	if (is_kill(instr)){
+		ctx->remaining_kills--;
 	}
-}
-
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-	struct ir3_instruction *d = NULL;
-	unsigned i = 0, id = 0;
 
-	while ((i < nsrcs) && !(d = srcs[id = i]))
-		i++;
+	struct ir3_sched_node *n = instr->data;
 
-	if (!d)
-		return NULL;
-
-	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->depth > d->depth))
-			d = srcs[id = i];
-
-	srcs[id] = NULL;
-
-	return d;
+	dag_prune_head(ctx->dag, &n->dag);
 }
 
 struct ir3_sched_notes {
@@ -313,11 +195,23 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 {
 	debug_assert(!is_scheduled(instr));
 
+	if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
+		/* avoid texture/memory access if we have unscheduled kills
+		 * that could make the expensive operation unnecessary.  By
+		 * definition, if there are remaining kills, and this instr
+		 * is not a dependency of a kill, there are other instructions
+		 * that we can choose from.
+		 */
+		struct ir3_sched_node *n = instr->data;
+		if (!n->kill_path)
+			return false;
+	}
+
 	/* For instructions that write address register we need to
 	 * make sure there is at least one instruction that uses the
 	 * addr value which is otherwise ready.
 	 *
-	 * TODO if any instructions use pred register and have other
+	 * NOTE if any instructions use pred register and have other
 	 * src args, we would need to do the same for writes_pred()..
 	 */
 	if (writes_addr0(instr)) {
@@ -384,10 +278,7 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	 * virtual ssa src for the kill instruction.  But we have
 	 * fixed length instr->regs[].
 	 *
-	 * TODO this wouldn't be quite right if we had multiple
-	 * basic blocks, if any block was conditional.  We'd need
-	 * to schedule the bary.f's outside of any block which
-	 * was conditional that contained a kill.. I think..
+	 * TODO we could handle this by false-deps now, probably.
 	 */
 	if (is_kill(instr)) {
 		struct ir3 *ir = instr->block->shader;
@@ -406,63 +297,43 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	return true;
 }
 
-/* Find the best instruction to schedule from specified instruction or
- * recursively it's ssa sources.
+/* Find the instr->ip of the closest use of an instruction, in
+ * pre-sched order.  This isn't going to be the same as post-sched
+ * order, but it is a reasonable approximation to limit scheduling
+ * instructions *too* early.  This is mostly to prevent bad behavior
+ * in cases where we have a large number of possible instructions
+ * to choose from, to avoid creating too much parallelism (ie. blowing
+ * up register pressure)
+ *
+ * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
  */
-static struct ir3_instruction *
-find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct ir3_instruction *instr)
+static int
+nearest_use(struct ir3_instruction *instr)
 {
-	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
-	struct ir3_instruction *src;
-	unsigned nsrcs = 0;
-
-	if (is_scheduled(instr))
-		return NULL;
-
-	/* use instr->data to cache the results of recursing up the
-	 * instr src's.  Otherwise the recursive algo can scale quite
-	 * badly w/ shader size.  But this takes some care to clear
-	 * the cache appropriately when instructions are scheduled.
+	unsigned nearest = ~0;
+	foreach_ssa_use (use, instr)
+		if (!is_scheduled(use))
+			nearest = MIN2(nearest, use->ip);
+
+	/* slight hack.. this heuristic tends to push bary.f's to later
+	 * in the shader, closer to their uses.  But we actually would
+	 * prefer to get these scheduled earlier, to unlock varying
+	 * storage for more VS jobs:
 	 */
-	if (instr->data) {
-		if (instr->data == NULL_INSTR)
-			return NULL;
-		return instr->data;
-	}
-
-	/* find unscheduled srcs: */
-	foreach_ssa_src (src, instr) {
-		if (!is_scheduled(src) && (src->block == instr->block)) {
-			debug_assert(nsrcs < ARRAY_SIZE(srcs));
-			srcs[nsrcs++] = src;
-		}
-	}
-
-	/* if all our src's are already scheduled: */
-	if (nsrcs == 0) {
-		if (check_instr(ctx, notes, instr)) {
-			instr->data = instr;
-			return instr;
-		}
-		return NULL;
-	}
+	if (is_input(instr))
+		nearest /= 2;
 
-	while ((src = deepest(srcs, nsrcs))) {
-		struct ir3_instruction *candidate;
-
-		candidate = find_instr_recursive(ctx, notes, src);
-		if (!candidate)
-			continue;
-
-		if (check_instr(ctx, notes, candidate)) {
-			instr->data = candidate;
-			return candidate;
-		}
-	}
+	return nearest;
+}
 
-	instr->data = NULL_INSTR;
-	return NULL;
+static int
+use_count(struct ir3_instruction *instr)
+{
+	unsigned cnt = 0;
+	foreach_ssa_use (use, instr)
+		if (!is_scheduled(use))
+			cnt++;
+	return cnt;
 }
 
 /* find net change to live values if instruction were scheduled: */
@@ -471,7 +342,7 @@ live_effect(struct ir3_instruction *instr)
 {
 	struct ir3_instruction *src;
 	int new_live = dest_regs(instr);
-	int old_live = 0;
+	int freed_live = 0;
 
 	foreach_ssa_src_n (src, n, instr) {
 		if (__is_false_dep(instr, n))
@@ -480,137 +351,226 @@ live_effect(struct ir3_instruction *instr)
 		if (instr->block != src->block)
 			continue;
 
-		/* for split, just pass things along to the real src: */
-		if (src->opc == OPC_META_SPLIT)
-			src = ssa(src->regs[1]);
+		if (use_count(src) == 1)
+			freed_live += dest_regs(src);
+	}
 
-		/* for collect, if this is the last use of *each* src,
-		 * then it will decrease the live values, since RA treats
-		 * them as a whole:
-		 */
-		if (src->opc == OPC_META_COLLECT) {
-			struct ir3_instruction *src2;
-			bool last_use = true;
-
-			foreach_ssa_src (src2, src) {
-				if (src2->use_count > 1) {
-					last_use = false;
-					break;
-				}
-			}
+	return new_live - freed_live;
+}
 
-			if (last_use)
-				old_live += dest_regs(src);
+/**
+ * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
+ * Scheduling for Register pressure) heuristic.
+ */
+static struct ir3_sched_node *
+choose_instr_csr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
+{
+	struct ir3_sched_node *chosen = NULL;
 
-		} else {
-			debug_assert(src->use_count > 0);
+	/* Find a ready inst with regs freed and pick the one with max cost. */
+	foreach_sched_node (n, &ctx->dag->heads) {
+		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
 
-			if (src->use_count == 1) {
-				old_live += dest_regs(src);
-			}
+		if (d > 0)
+			continue;
+
+		if (live_effect(n->instr) > -1)
+			continue;
+
+		if (!check_instr(ctx, notes, n->instr))
+			continue;
+
+		if (!chosen || chosen->max_delay < n->max_delay) {
+			chosen = n;
 		}
 	}
 
-	return new_live - old_live;
-}
+	if (chosen) {
+		di(chosen->instr, "csr: chose (freed+ready)");
+		return chosen;
+	}
 
-/* find instruction to schedule: */
-static struct ir3_instruction *
-find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		bool soft)
-{
-	struct ir3_instruction *best_instr = NULL;
-	int best_rank = INT_MAX;      /* lower is better */
-	unsigned deepest = 0;
-
-	/* TODO we'd really rather use the list/array of block outputs.  But we
-	 * don't have such a thing.  Recursing *every* instruction in the list
-	 * will result in a lot of repeated traversal, since instructions will
-	 * get traversed both when they appear as ssa src to a later instruction
-	 * as well as where they appear in the depth_list.
-	 */
-	foreach_instr_rev (instr, &ctx->depth_list) {
-		struct ir3_instruction *candidate;
+	/* Find a leader with regs freed and pick the one with max cost. */
+	foreach_sched_node (n, &ctx->dag->heads) {
+		if (live_effect(n->instr) > -1)
+			continue;
 
-		candidate = find_instr_recursive(ctx, notes, instr);
-		if (!candidate)
+		if (!check_instr(ctx, notes, n->instr))
 			continue;
 
-		if (is_meta(candidate))
-			return candidate;
+		if (!chosen || chosen->max_delay < n->max_delay) {
+			chosen = n;
+		}
+	}
 
-		deepest = MAX2(deepest, candidate->depth);
+	if (chosen) {
+		di(chosen->instr, "csr: chose (freed)");
+		return chosen;
 	}
 
-	/* traverse the list a second time.. but since we cache the result of
-	 * find_instr_recursive() it isn't as bad as it looks.
+	/* Contra the paper, pick a leader with no effect on used regs.  This may
+	 * open up new opportunities, as otherwise a single-operand instr consuming
+	 * a value will tend to block freeing that value.  This had a
+	 * massive effect on reducing spilling on V3D.
+	 *
+	 * XXX: Should this prioritize ready?
 	 */
-	foreach_instr_rev (instr, &ctx->depth_list) {
-		struct ir3_instruction *candidate;
+	foreach_sched_node (n, &ctx->dag->heads) {
+		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
 
-		candidate = find_instr_recursive(ctx, notes, instr);
-		if (!candidate)
+		if (d > 0)
 			continue;
 
-		/* determine net change to # of live values: */
-		int le = live_effect(candidate);
-		unsigned live_values = (2 * ctx->live_values) + ctx->half_live_values;
+		if (live_effect(n->instr) > 0)
+			continue;
 
-		/* if there is a net increase in # of live values, then apply some
-		 * threshold to avoid instructions getting scheduled *too* early
-		 * and increasing register pressure.
-		 */
-		if (le >= 1) {
-			unsigned threshold;
+		if (!check_instr(ctx, notes, n->instr))
+			continue;
 
-			if (live_values > ctx->live_threshold_lo) {
-				threshold = ctx->depth_threshold_lo;
-			} else {
-				threshold = ctx->depth_threshold_hi;
-			}
+		if (!chosen || chosen->max_delay < n->max_delay)
+			chosen = n;
+	}
 
-			/* Filter out any "shallow" instructions which would otherwise
-			 * tend to get scheduled too early to fill delay slots even
-			 * when they are not needed for a while.  There will probably
-			 * be later delay slots that they could just as easily fill.
-			 *
-			 * A classic case where this comes up is frag shaders that
-			 * write a constant value (like 1.0f) to one of the channels
-			 * of the output color(s).  Since the mov from immed has no
-			 * dependencies, it would otherwise get scheduled early to
-			 * fill delay slots, occupying a register until the end of
-			 * the program.
-			 */
-			if ((deepest - candidate->depth) > threshold)
-				continue;
+	if (chosen) {
+		di(chosen->instr, "csr: chose (neutral+ready)");
+		return chosen;
+	}
+
+	foreach_sched_node (n, &ctx->dag->heads) {
+		if (live_effect(n->instr) > 0)
+			continue;
+
+		if (!check_instr(ctx, notes, n->instr))
+			continue;
+
+		if (!chosen || chosen->max_delay < n->max_delay)
+			chosen = n;
+	}
+
+	if (chosen) {
+		di(chosen->instr, "csr: chose (neutral)");
+		return chosen;
+	}
+
+	/*
+	 * From here on out, we are picking something that increases
+	 * register pressure.  So try to pick something which will
+	 * be consumed soon:
+	 */
+	unsigned chosen_distance = 0;
+
+	/* Pick the ready instruction with the nearest unscheduled use. */
+	foreach_sched_node (n, &ctx->dag->heads) {
+		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+
+		if (d > 0)
+			continue;
+
+		if (!check_instr(ctx, notes, n->instr))
+			continue;
+
+		unsigned distance = nearest_use(n->instr);
+
+		if (!chosen || distance < chosen_distance) {
+			chosen = n;
+			chosen_distance = distance;
 		}
+	}
+
+	if (chosen) {
+		di(chosen->instr, "csr: chose (distance+ready)");
+		return chosen;
+	}
+
+	/* Pick the remaining leader with the nearest unscheduled use. */
+	foreach_sched_node (n, &ctx->dag->heads) {
+		if (!check_instr(ctx, notes, n->instr))
+			continue;
 
-		int rank = ir3_delay_calc(ctx->block, candidate, soft, false);
+		unsigned distance = nearest_use(n->instr);
 
-		/* if too many live values, prioritize instructions that reduce the
-		 * number of live values:
-		 */
-		if (live_values > ctx->live_threshold_hi) {
-			rank = le;
-		} else if (live_values > ctx->live_threshold_lo) {
-			rank += le;
+		if (!chosen || distance < chosen_distance) {
+			chosen = n;
+			chosen_distance = distance;
 		}
+	}
+
+	if (chosen) {
+		di(chosen->instr, "csr: chose (distance)");
+		return chosen;
+	}
+
+	return NULL;
+}
+
+/* Handles instruction selection for instructions we want to prioritize
+ * even if csp/csr would not pick them.
+ */
+static struct ir3_sched_node *
+choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
+{
+	struct ir3_sched_node *chosen = NULL;
+
+	foreach_sched_node (n, &ctx->dag->heads) {
+		if (!is_meta(n->instr))
+			continue;
+
+		if (!chosen || (chosen->max_delay < n->max_delay))
+			chosen = n;
+	}
+
+	if (chosen) {
+		di(chosen->instr, "prio: chose (meta)");
+		return chosen;
+	}
+
+	return NULL;
+}
 
-		if (rank < best_rank) {
-			best_instr = candidate;
-			best_rank = rank;
+static void
+dump_state(struct ir3_sched_ctx *ctx)
+{
+	if (!SCHED_DEBUG)
+		return;
+
+	foreach_sched_node (n, &ctx->dag->heads) {
+		di(n->instr, "maxdel=%3d le=%d del=%u ",
+				n->max_delay, live_effect(n->instr),
+				ir3_delay_calc(ctx->block, n->instr, false, false));
+
+		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+			struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
+
+			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
 		}
 	}
+}
 
-	return best_instr;
+/* find instruction to schedule: */
+static struct ir3_instruction *
+choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
+{
+	struct ir3_sched_node *chosen;
+
+	dump_state(ctx);
+
+	chosen = choose_instr_prio(ctx, notes);
+	if (chosen)
+		return chosen->instr;
+
+	chosen = choose_instr_csr(ctx, notes);
+	if (chosen)
+		return chosen->instr;
+
+	return NULL;
 }
 
 static struct ir3_instruction *
 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
 {
 	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
-	ir3_insert_by_depth(new_instr, &ctx->depth_list);
-	transfer_use(ctx, orig_instr, new_instr);
+	di(new_instr, "split instruction");
+	sched_node_init(ctx, new_instr);
 	return new_instr;
 }
 
@@ -646,8 +606,12 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
 				/* original addr is scheduled, but new one isn't: */
 				new_addr->flags &= ~IR3_INSTR_MARK;
 			}
-			indirect->address = NULL;
-			ir3_instr_set_address(indirect, new_addr);
+			indirect->address = new_addr;
+			/* don't need to remove old dag edge since old addr is
+			 * already scheduled:
+			 */
+			sched_node_add_dep(indirect, new_addr, 0);
+			di(indirect, "new address");
 		}
 	}
 
@@ -692,6 +656,11 @@ split_pred(struct ir3_sched_ctx *ctx)
 				new_pred->flags &= ~IR3_INSTR_MARK;
 			}
 			predicated->regs[1]->instr = new_pred;
+			/* don't need to remove old dag edge since old pred is
+			 * already scheduled:
+			 */
+			sched_node_add_dep(predicated, new_pred, 0);
+			di(predicated, "new predicate");
 		}
 	}
 
@@ -702,10 +671,108 @@ split_pred(struct ir3_sched_ctx *ctx)
 }
 
 static void
-sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
+
+	dag_init_node(ctx->dag, &n->dag);
+
+	n->instr = instr;
+	instr->data = n;
+}
+
+static void
+sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
+{
+	/* don't consider dependencies in other blocks: */
+	if (src->block != instr->block)
+		return;
+
+	/* we could have false-dep's that end up unused: */
+	if (src->flags & IR3_INSTR_UNUSED) {
+		debug_assert(__is_false_dep(instr, i));
+		return;
+	}
+
+	struct ir3_sched_node *n = instr->data;
+	struct ir3_sched_node *sn = src->data;
+
+	dag_add_edge(&sn->dag, &n->dag, NULL);
+
+	unsigned d = ir3_delayslots(src, instr, i, true);
+	n->delay = MAX2(n->delay, d);
+}
+
+static void
+mark_kill_path(struct ir3_instruction *instr)
+{
+	struct ir3_sched_node *n = instr->data;
+	n->kill_path = true;
+	struct ir3_instruction *src;
+	foreach_ssa_src (src, instr) {
+		if (src->block != instr->block)
+			continue;
+		mark_kill_path(src);
+	}
+}
+
+static void
+sched_node_add_deps(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+
+	/* Since foreach_ssa_src() already handles false-dep's we can construct
+	 * the DAG easily in a single pass.
+	 */
+	foreach_ssa_src_n (src, i, instr) {
+		sched_node_add_dep(instr, src, i);
+	}
+
+	/* NOTE that all inputs must be scheduled before a kill, so
+	 * mark these to be prioritized as well:
+	 */
+	if (is_kill(instr) || is_input(instr)) {
+		mark_kill_path(instr);
+	}
+}
+
+static void
+sched_dag_max_delay_cb(struct dag_node *node, void *state)
+{
+	struct ir3_sched_node *n = (struct ir3_sched_node *)node;
+	uint32_t max_delay = 0;
+
+	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+		struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
+		max_delay = MAX2(child->max_delay, max_delay);
+	}
+
+	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
+}
+
+static void
+sched_dag_init(struct ir3_sched_ctx *ctx)
 {
-	struct list_head unscheduled_list;
+	ctx->dag = dag_create(ctx);
 
+	foreach_instr (instr, &ctx->unscheduled_list) {
+		sched_node_init(ctx, instr);
+		sched_node_add_deps(instr);
+	}
+
+	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
+}
+
+static void
+sched_dag_destroy(struct ir3_sched_ctx *ctx)
+{
+	ralloc_free(ctx->dag);
+	ctx->dag = NULL;
+}
+
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
 	ctx->block = block;
 
 	/* addr/pred writes are per-block: */
@@ -717,9 +784,16 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	 * empty the block's instruction list (to which we will
 	 * be inserting).
 	 */
-	list_replace(&block->instr_list, &unscheduled_list);
+	list_replace(&block->instr_list, &ctx->unscheduled_list);
 	list_inithead(&block->instr_list);
-	list_inithead(&ctx->depth_list);
+
+	sched_dag_init(ctx);
+
+	ctx->remaining_kills = 0;
+	foreach_instr_safe (instr, &ctx->unscheduled_list) {
+		if (is_kill(instr))
+			ctx->remaining_kills++;
+	}
 
 	/* First schedule all meta:input instructions, followed by
 	 * tex-prefetch.  We want all of the instructions that load
@@ -729,29 +803,20 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	 * a FS's bary_ij input may not actually be live in the
 	 * shader, but it should not be scheduled on top of any
 	 * other input (but can be overwritten by a tex prefetch)
-	 *
-	 * Finally, move all the remaining instructions to the depth-
-	 * list
 	 */
-	foreach_instr_safe (instr, &unscheduled_list)
+	foreach_instr_safe (instr, &ctx->unscheduled_list)
 		if (instr->opc == OPC_META_INPUT)
 			schedule(ctx, instr);
 
-	foreach_instr_safe (instr, &unscheduled_list)
+	foreach_instr_safe (instr, &ctx->unscheduled_list)
 		if (instr->opc == OPC_META_TEX_PREFETCH)
 			schedule(ctx, instr);
 
-	foreach_instr_safe (instr, &unscheduled_list)
-		ir3_insert_by_depth(instr, &ctx->depth_list);
-
-	while (!list_is_empty(&ctx->depth_list)) {
+	while (!list_is_empty(&ctx->unscheduled_list)) {
 		struct ir3_sched_notes notes = {0};
 		struct ir3_instruction *instr;
 
-		instr = find_eligible_instr(ctx, &notes, true);
-		if (!instr)
-			instr = find_eligible_instr(ctx, &notes, false);
-
+		instr = choose_instr(ctx, &notes);
 		if (instr) {
 			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
 			d("delay=%u", delay);
@@ -784,62 +849,47 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 			} else if (notes.pred_conflict) {
 				new_instr = split_pred(ctx);
 			} else {
+				d("unscheduled_list:");
+				foreach_instr (instr, &ctx->unscheduled_list)
+					di(instr, "unscheduled: ");
 				debug_assert(0);
 				ctx->error = true;
 				return;
 			}
 
 			if (new_instr) {
-				/* clearing current addr/pred can change what is
-				 * available to schedule, so clear cache..
-				 */
-				clear_cache(ctx, NULL);
-
-				ir3_insert_by_depth(new_instr, &ctx->depth_list);
-				/* the original instr that wrote addr/pred may have
-				 * originated from a different block:
-				 */
-				new_instr->block = block;
+				list_delinit(&new_instr->node);
+				list_addtail(&new_instr->node, &ctx->unscheduled_list);
 			}
 		}
 	}
-}
 
-static void
-setup_thresholds(struct ir3_sched_ctx *ctx, struct ir3 *ir)
-{
-	if (ir3_has_latency_to_hide(ir)) {
-		ctx->live_threshold_hi = 2 * 16 * 4;
-		ctx->live_threshold_lo = 2 * 4 * 4;
-		ctx->depth_threshold_hi = 6;
-		ctx->depth_threshold_lo = 4;
-	} else {
-		ctx->live_threshold_hi = 2 * 16 * 4;
-		ctx->live_threshold_lo = 2 * 12 * 4;
-		ctx->depth_threshold_hi = 16;
-		ctx->depth_threshold_lo = 16;
-	}
+	sched_dag_destroy(ctx);
 }
 
 int ir3_sched(struct ir3 *ir)
 {
-	struct ir3_sched_ctx ctx = {0};
+	struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
 
-	setup_thresholds(&ctx, ir);
+	foreach_block (block, &ir->block_list) {
+		foreach_instr (instr, &block->instr_list) {
+			instr->data = NULL;
+		}
+	}
 
+	ir3_count_instructions(ir);
 	ir3_clear_mark(ir);
-	update_use_count(ir);
+	ir3_find_ssa_uses(ir, ctx, false);
 
 	foreach_block (block, &ir->block_list) {
-		ctx.live_values = 0;
-		ctx.half_live_values = 0;
-		sched_block(&ctx, block);
+		sched_block(ctx, block);
 	}
 
-	if (ctx.error)
-		return -1;
+	int ret = ctx->error ? -1 : 0;
+
+	ralloc_free(ctx);
 
-	return 0;
+	return ret;
 }
 
 static unsigned
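
A note on the max_delay metric used throughout the new pass: it is simply the
longest accumulated delay from a node to the end of the block, computed
bottom-up over the dependency DAG (sched_dag_max_delay_cb() run via
dag_traverse_bottom_up() in the diff above).  The toy program below recomputes
the same quantity over a hand-built DAG; the types, names and delay numbers
are made up for illustration and do not use Mesa's util/dag.

#include <stdio.h>

#define MAX_CHILDREN 4

/* toy DAG node: children are the consumers of this instruction */
struct toy_node {
	const char *name;
	unsigned delay;      /* delay slots this node needs after its sources */
	unsigned max_delay;  /* computed: delay + longest child max_delay     */
	struct toy_node *children[MAX_CHILDREN];
};

/* simple recursive version (no memoization; fine for a toy tree) */
static unsigned
compute_max_delay(struct toy_node *n)
{
	unsigned max_child = 0;
	for (int i = 0; i < MAX_CHILDREN && n->children[i]; i++) {
		unsigned d = compute_max_delay(n->children[i]);
		if (d > max_child)
			max_child = d;
	}
	n->max_delay = max_child + n->delay;
	return n->max_delay;
}

int main(void)
{
	/* tex -> mul -> add chain, plus an independent mov */
	struct toy_node add = { "add", 3, 0, { NULL } };
	struct toy_node mul = { "mul", 5, 0, { &add } };
	struct toy_node tex = { "tex", 0, 0, { &mul } };
	struct toy_node mov = { "mov", 0, 0, { NULL } };

	struct toy_node *heads[] = { &tex, &mov };
	for (int i = 0; i < 2; i++) {
		compute_max_delay(heads[i]);
		printf("%s: max_delay=%u\n", heads[i]->name, heads[i]->max_delay);
	}
	/* prints tex: max_delay=8 and mov: max_delay=0, so among otherwise
	 * equivalent candidates the long tex->mul->add chain is started first.
	 */
	return 0;
}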


