Mesa (master): ir3: do not move varying inputs that depend on unmovable instrs

Fri Apr 30 15:10:00 UTC 2021

Module: Mesa
Branch: master
Commit: 1201aa9332a9b74c231bd7733b0aef5af42dc6fe
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=1201aa9332a9b74c231bd7733b0aef5af42dc6fe

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Tue Apr 27 14:51:26 2021 +0300

ir3: do not move varying inputs that depend on unmovable instrs

Not all varying fetches could be pulled into the start block.
If there are fetches we couldn't pull, like load_interpolated_input
with offset which depends on a non-reorderable ssbo load or on a
phi node, this pass is skipped since it would be hard to find a place
to set (ei) flag (beside at the very end).

We also don't have to manually set (ei) in such cases since a5xx and
a6xx do automatically release varying storage at the end.
Earlier gens need further testing, however they do not support
interpolateAt* functions at moment, so unless we would like to support
sample shading on them - they are fine.

Fixes crash in GTA V.

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10483>

---

 src/freedreno/ir3/ir3_legalize.c                | 30 +++++++-
 src/freedreno/ir3/ir3_nir_move_varying_inputs.c | 97 +++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 4 deletions(-)

diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index fa193b5791e..1defd2874fb 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -50,6 +50,7 @@ struct ir3_legalize_ctx {
 	struct ir3_shader_variant *so;
 	gl_shader_stage type;
 	int max_bary;
+	bool early_input_release;
 };
 
 struct ir3_legalize_state {
@@ -122,8 +123,10 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 
 	unsigned inputs_remaining = input_count;
 
-	/* Inputs can only be in the first block */
-	assert(input_count == 0 ||
+	/* Either inputs are in the first block or we expect inputs to be released
+	 * with the end of the program.
+	 */
+	assert(input_count == 0 || !ctx->early_input_release ||
 		   block == list_first_entry(&block->shader->block_list, struct ir3_block, node));
 
 	/* remove all the instructions from the list, we'll be adding
@@ -292,7 +295,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			}
 		}
 
-		if (is_input(n)) {
+		if (ctx->early_input_release && is_input(n)) {
 			last_input_needs_ss |= (n->opc == OPC_LDLV);
 
 			assert(inputs_remaining > 0);
@@ -330,7 +333,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		last_n = n;
 	}
 
-	assert(inputs_remaining == 0);
+	assert(inputs_remaining == 0 || !ctx->early_input_release);
 
 	if (has_tex_prefetch && input_count == 0) {
 		/* texture prefetch, but *no* inputs.. we need to insert a
@@ -785,6 +788,25 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 
 	ir3_remove_nops(ir);
 
+	/* We may have failed to pull all input loads into the first block.
+	 * In such case at the moment we aren't able to find a better place
+	 * to for (ei) than the end of the program.
+	 * a5xx and a6xx do automatically release varying storage at the end.
+	 */
+	ctx->early_input_release = true;
+	struct ir3_block *first_block =
+		list_first_entry(&ir->block_list, struct ir3_block, node);
+	foreach_block (block, &ir->block_list) {
+		foreach_instr (instr, &block->instr_list) {
+			if (is_input(instr) && block != first_block) {
+				ctx->early_input_release = false;
+				break;
+			}
+		}
+	}
+
+	assert(ctx->early_input_release || ctx->compiler->gpu_id > 500);
+
 	/* process each block: */
 	do {
 		progress = false;
diff --git a/src/freedreno/ir3/ir3_nir_move_varying_inputs.c b/src/freedreno/ir3/ir3_nir_move_varying_inputs.c
index 91590393c78..9c4ec75d24d 100644
--- a/src/freedreno/ir3/ir3_nir_move_varying_inputs.c
+++ b/src/freedreno/ir3/ir3_nir_move_varying_inputs.c
@@ -36,16 +36,93 @@
  *
  * This may come at the cost of additional register usage.  OTOH setting
  * the (ei) flag earlier probably frees up more VS to run.
+ *
+ * Not all varying fetches could be pulled into the start block.
+ * If there are fetches we couldn't pull, like load_interpolated_input
+ * with offset which depends on a non-reorderable ssbo load or on a
+ * phi node, this pass is skipped since it would be hard to find a place
+ * to set (ei) flag (beside at the very end).
+ * a5xx and a6xx do automatically release varying storage at the end.
  */
 
+typedef struct {
+	nir_block *start_block;
+	bool precondition_failed;
+} precond_state;
 
 typedef struct {
 	nir_shader *shader;
 	nir_block *start_block;
 } state;
 
+
+
+static void check_precondition_instr(precond_state *state, nir_instr *instr);
 static void move_instruction_to_start_block(state *state, nir_instr *instr);
 
+static bool
+check_precondition_src(nir_src *src, void *state)
+{
+	check_precondition_instr(state, src->ssa->parent_instr);
+	return true;
+}
+
+/* Recursively check if there is even a single dependency which
+ * cannot be moved.
+ */
+static void
+check_precondition_instr(precond_state *state, nir_instr *instr)
+{
+	if (instr->block == state->start_block)
+		return;
+
+	switch (instr->type) {
+		case nir_instr_type_alu:
+		case nir_instr_type_deref:
+		case nir_instr_type_load_const:
+		case nir_instr_type_ssa_undef:
+			/* These could be safely moved around */
+			break;
+		case nir_instr_type_intrinsic: {
+			nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+			if (!nir_intrinsic_can_reorder(intr)) {
+				state->precondition_failed = true;
+				return;
+			}
+			break;
+		}
+		default:
+			state->precondition_failed = true;
+			return;
+	}
+
+	nir_foreach_src(instr, check_precondition_src, state);
+}
+
+static void
+check_precondition_block(precond_state *state, nir_block *block)
+{
+	nir_foreach_instr_safe (instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+		switch (intr->intrinsic) {
+		case nir_intrinsic_load_interpolated_input:
+		case nir_intrinsic_load_input:
+			break;
+		default:
+			continue;
+		}
+
+		check_precondition_instr(state, instr);
+
+		if (state->precondition_failed)
+			return;
+	}
+}
+
 static bool
 move_src(nir_src *src, void *state)
 {
@@ -111,6 +188,26 @@ ir3_nir_move_varying_inputs(nir_shader *shader)
 
 	debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
 
+	nir_foreach_function (function, shader) {
+		precond_state state;
+
+		if (!function->impl)
+			continue;
+
+		state.precondition_failed = false;
+		state.start_block = nir_start_block(function->impl);
+
+		nir_foreach_block (block, function->impl) {
+			if (block == state.start_block)
+				continue;
+
+			check_precondition_block(&state, block);
+
+			if (state.precondition_failed)
+				return false;
+		}
+	}
+
 	nir_foreach_function (function, shader) {
 		state state;