Mesa (master): freedreno/ir3: relative dst

Sun Mar 8 21:57:11 UTC 2015

Module: Mesa
Branch: master
Commit: 060d3499202c339a27fbc366335f2122ed4ff7bc
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=060d3499202c339a27fbc366335f2122ed4ff7bc

Author: Rob Clark <robclark at freedesktop.org>
Date:   Fri Jan 23 15:04:46 2015 -0500

freedreno/ir3: relative dst

To simplify RA, assign arrays that are written to first.  Since enough
dependency information is in the graph to preserve the order of reads
and writes of an array, all SSA names for the array collapse into one,
so just assign the entire thing by array-id.

Signed-off-by: Rob Clark <robclark at freedesktop.org>

---

 src/gallium/drivers/freedreno/ir3/ir3.h          |   22 ++++
 src/gallium/drivers/freedreno/ir3/ir3_compiler.c |   99 +++++++++++++--
 src/gallium/drivers/freedreno/ir3/ir3_cp.c       |    2 +-
 src/gallium/drivers/freedreno/ir3/ir3_dump.c     |   16 ++-
 src/gallium/drivers/freedreno/ir3/ir3_ra.c       |  147 +++++++++++++++++-----
 5 files changed, 244 insertions(+), 42 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 3093285..430bcf2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -205,6 +205,9 @@ struct ir3_instruction {
 			int off;              /* component/offset */
 		} fo;
 		struct {
+			int aid;
+		} fi;
+		struct {
 			struct ir3_block *if_block, *else_block;
 		} flow;
 		struct {
@@ -264,6 +267,19 @@ struct ir3_instruction {
 	 */
 	struct ir3_instruction *address;
 
+	/* in case of an instruction with a relative dst, we need to
+	 * capture the dependency on the fanin for the previous values of
+	 * the array elements.  Since we don't know at compile time actually
+	 * which array elements are written, this serves to preserve the
+	 * unconditional write to array elements prior to the conditional
+	 * write.
+	 *
+	 * TODO only cat1 can do indirect write.. we could maybe move this
+	 * into instr->cat1.fanin (but would require the frontend to insert
+	 * the extra mov)
+	 */
+	struct ir3_instruction *fanin;
+
 	struct ir3_instruction *next;
 #ifdef DEBUG
 	uint32_t serialno;
@@ -373,6 +389,8 @@ static inline int ir3_instr_regno(struct ir3_instruction *instr,
 }
 
 
+#define MAX_ARRAYS 16
+
 /* comp:
  *   0 - x
  *   1 - y
@@ -498,6 +516,8 @@ static inline bool reg_gpr(struct ir3_register *r)
 
 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 {
+	if (instr->fanin)
+		return instr->regs_count + 2;
 	if (instr->address)
 		return instr->regs_count + 1;
 	return instr->regs_count;
@@ -505,6 +525,8 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 
 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
 {
+	if (n == (instr->regs_count + 1))
+		return instr->fanin;
 	if (n == (instr->regs_count + 0))
 		return instr->address;
 	return ssa(instr->regs[n]);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index d755bab..df428eb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -64,7 +64,7 @@ struct ir3_compile_context {
 	 */
 	struct {
 		struct ir3_instruction *instr, **instrp;
-	} output_updates[16];
+	} output_updates[64];
 	unsigned num_output_updates;
 
 	/* are we in a sequence of "atomic" instructions?
@@ -97,7 +97,7 @@ struct ir3_compile_context {
 	struct {
 		unsigned first, last;
 		struct ir3_instruction *fanin;
-	} array[16];
+	} array[MAX_ARRAYS];
 	uint32_t array_dirty;
 	/* offset into array[], per file, of first array info */
 	uint8_t array_offsets[TGSI_FILE_COUNT];
@@ -247,10 +247,6 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
 	memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets));
 
 #define FM(x) (1 << TGSI_FILE_##x)
-	/* optimize can't deal with relative addressing: */
-	if (info->indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
-		return TGSI_PARSE_ERROR;
-
 	/* NOTE: if relative addressing is used, we set constlen in
 	 * the compiler (to worst-case value) since we don't know in
 	 * the assembler what the max addr reg value can be:
@@ -595,6 +591,16 @@ ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n)
 	return instr;
 }
 
+static int dst_array_id(struct ir3_compile_context *ctx,
+		const struct tgsi_dst_register *dst)
+{
+	// XXX complete hack to recover tgsi_full_dst_register...
+	// nothing that isn't wrapped in a tgsi_full_dst_register
+	// should be indirect
+	const struct tgsi_full_dst_register *fdst = (const void *)dst;
+	return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File];
+}
+
 static int src_array_id(struct ir3_compile_context *ctx,
 		const struct tgsi_src_register *src)
 {
@@ -639,7 +645,56 @@ static void
 ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
 		const struct tgsi_dst_register *dst, unsigned chan)
 {
-	ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
+	if (dst->Indirect) {
+		struct ir3_register *reg = instr->regs[0];
+		unsigned i, aid = dst_array_id(ctx, dst);
+		unsigned first = ctx->array[aid].first;
+		unsigned last  = ctx->array[aid].last;
+		unsigned off   = dst->Index - first; /* vec4 offset */
+
+		reg->size = 4 * (1 + last - first);
+		reg->offset = regid(off, chan);
+
+		instr->fanin = array_fanin(ctx, aid, dst->File);
+
+		/* annotate with the array-id, to help out the register-
+		 * assignment stage.  At least for the case of indirect
+		 * writes, we should capture enough dependencies to
+		 * preserve the order of reads/writes of the array, so
+		 * the multiple "names" for the array should end up all
+		 * assigned to the same registers.
+		 */
+		instr->fanin->fi.aid = aid;
+
+		/* Since we are scalarizing vec4 tgsi instructions/regs, we
+		 * run into a slight complication here.  To do the naive thing
+		 * and setup a fanout for each scalar array element would end
+		 * up with the result that the instructions generated for each
+		 * component of the vec4 would end up clobbering each other.
+		 * So we take advantage here of knowing that the array index
+		 * (after the shl.b) will be a multiple of four, and only set
+		 * every fourth scalar component in the array.  See also
+		 * fixup_ssa_dst_array()
+		 */
+		for (i = first; i <= last; i++) {
+			struct ir3_instruction *split;
+			unsigned n = regid(i, chan);
+			int off = (4 * (i - first)) + chan;
+
+			if (is_meta(instr) && (instr->opc == OPC_META_FO))
+				off -= instr->fo.off;
+
+			split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
+			split->fo.off = off;
+			ir3_reg_create(split, 0, 0);
+			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
+
+			ssa_instr_set(ctx, dst->File, n, split);
+		}
+	} else {
+		/* normal case (not relative addressed GPR) */
+		ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
+	}
 }
 
 static void
@@ -705,12 +760,22 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
 		break;
 	}
 
-	if (dst->Indirect)
+	if (dst->Indirect) {
 		flags |= IR3_REG_RELATIV;
 
-	reg = ir3_reg_create(instr, regid(num, chan), flags);
+		/* shouldn't happen, and we can't cope with it below: */
+		compile_assert(ctx, wrmask == 0x1);
+
+		compile_assert(ctx, ctx->block->address);
+		if (instr->address)
+			compile_assert(ctx, ctx->block->address == instr->address);
+
+		instr->address = ctx->block->address;
+	}
 
+	reg = ir3_reg_create(instr, regid(num, chan), flags);
 	reg->wrmask = wrmask;
+
 	if (wrmask == 0x1) {
 		/* normal case */
 		ssa_dst(ctx, instr, dst, chan);
@@ -720,6 +785,8 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
 		struct ir3_instruction *prev = NULL;
 		unsigned i;
 
+		compile_assert(ctx, !dst->Indirect);
+
 		/* if instruction writes multiple, we need to create
 		 * some place-holder collect the registers:
 		 */
@@ -2539,10 +2606,16 @@ instr_cat1(const struct instr_translater *t,
 		struct ir3_compile_context *ctx,
 		struct tgsi_full_instruction *inst)
 {
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
+	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
 	struct tgsi_src_register *src = &inst->Src[0].Register;
+
+	/* NOTE: atomic start/end, rather than in create_mov() since
+	 * create_mov() is used already w/in atomic sequences (and
+	 * we aren't clever enough to deal with the nesting)
+	 */
+	instr_atomic_start(ctx);
 	create_mov(ctx, dst, src);
-	put_dst(ctx, inst, dst);
+	instr_atomic_end(ctx);
 }
 
 static void
@@ -3322,6 +3395,10 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		goto out;
 	}
 
+	/* for now, until the edge cases are worked out: */
+	if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
+		cp = false;
+
 	compile_instructions(&ctx);
 
 	block = ctx.block;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 898ed70..8ad1894 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -49,7 +49,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr)
 		struct ir3_register *dst = instr->regs[0];
 		struct ir3_register *src = instr->regs[1];
 		struct ir3_instruction *src_instr = ssa(src);
-		if (dst->flags & IR3_REG_ADDR)
+		if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV))
 			return false;
 		/* TODO: propagate abs/neg modifiers if possible */
 		if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
index a846777..dc25165 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
@@ -414,8 +414,20 @@ ir3_dump_instr_single(struct ir3_instruction *instr)
 		fprintf(ctx.f, "]");
 	}
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FO))
-		printf(", off=%d", instr->fo.off);
+	if (instr->fanin) {
+		fprintf(ctx.f, ", fanin=_");
+		fprintf(ctx.f, "[");
+		dump_instr_name(&ctx, instr->fanin);
+		fprintf(ctx.f, "]");
+	}
+
+	if (is_meta(instr)) {
+		if (instr->opc == OPC_META_FO) {
+			printf(", off=%d", instr->fo.off);
+		} else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
+			printf(", aid=%d", instr->fi.aid);
+		}
+	}
 
 	printf("\n");
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 0f6d40f..a4235a7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -47,6 +47,12 @@
  * I'm not really sure a sane way for the CP stage to realize when it
  * cannot remove a mov due to multi-register constraints..
  *
+ * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
+ * some ideas to handle array allocation with a more conventional
+ * graph coloring algorithm for register assignment, which might be
+ * a good alternative to the current algo.  However afaict it cannot
+ * handle overlapping arrays, which is a scenario that we have to
+ * deal with
  */
 
 struct ir3_ra_ctx {
@@ -56,6 +62,10 @@ struct ir3_ra_ctx {
 	bool frag_face;
 	int cnt;
 	bool error;
+	struct {
+		unsigned base;
+		unsigned size;
+	} arrays[MAX_ARRAYS];
 };
 
 #ifdef DEBUG
@@ -141,6 +151,26 @@ static bool instr_is_output(struct ir3_instruction *instr)
 	return false;
 }
 
+static void mark_sources(struct ir3_instruction *instr,
+		struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
+{
+	unsigned i;
+
+	for (i = 1; i < n->regs_count; i++) {
+		struct ir3_register *r = reg_check(n, i);
+		if (r)
+			regmask_set_if_not(liveregs, r, written);
+
+		/* if any src points back to the instruction(s) in
+		 * the block of neighbors that we are assigning then
+		 * mark any written (clobbered) registers as live:
+		 */
+		if (instr_used_by(instr, n->regs[i]))
+			regmask_or(liveregs, liveregs, written);
+	}
+
+}
+
 /* live means read before written */
 static void compute_liveregs(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, regmask_t *liveregs)
@@ -159,18 +189,13 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx,
 			continue;
 
 		/* check first src's read: */
-		for (i = 1; i < n->regs_count; i++) {
-			r = reg_check(n, i);
-			if (r)
-				regmask_set_if_not(liveregs, r, &written);
-
-			/* if any src points back to the instruction(s) in
-			 * the block of neighbors that we are assigning then
-			 * mark any written (clobbered) registers as live:
-			 */
-			if (instr_used_by(instr, n->regs[i]))
-				regmask_or(liveregs, liveregs, &written);
-		}
+		mark_sources(instr, n, liveregs, &written);
+
+		/* for instructions that write to an array, we need to
+		 * capture the dependency on the array elements:
+		 */
+		if (n->fanin)
+			mark_sources(instr, n->fanin, liveregs, &written);
 
 		/* meta-instructions don't actually get scheduled,
 		 * so don't let it's write confuse us.. what we
@@ -383,14 +408,32 @@ static void instr_assign_src(struct ir3_ra_ctx *ctx,
 	}
 }
 
-static void instr_assign(struct ir3_ra_ctx *ctx,
+static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, unsigned name)
 {
 	struct ir3_instruction *n, *src;
+
+	for (n = instr->next; n && !ctx->error; n = n->next) {
+		foreach_ssa_src_n(src, i, n) {
+			unsigned r = i + 1;
+
+			/* skip address / etc (non real sources): */
+			if (r >= n->regs_count)
+				continue;
+
+			if (src == instr)
+				instr_assign_src(ctx, n, r, name);
+		}
+	}
+}
+
+static void instr_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned name)
+{
 	struct ir3_register *reg = instr->regs[0];
 
-	if ((reg->flags & IR3_REG_RELATIV))
-		name += reg->offset;
+	if (reg->flags & IR3_REG_RELATIV)
+		return;
 
 	/* check if already assigned: */
 	if (!(reg->flags & IR3_REG_SSA)) {
@@ -403,18 +446,7 @@ static void instr_assign(struct ir3_ra_ctx *ctx,
 	reg_assign(instr, 0, name);
 
 	/* and rename any subsequent use of result of this instr: */
-	for (n = instr->next; n && !ctx->error; n = n->next) {
-		foreach_ssa_src_n(src, i, n) {
-			unsigned r = i + 1;
-
-			/* skip address / etc (non real sources): */
-			if (r >= n->regs_count)
-				continue;
-
-			if (src == instr)
-				instr_assign_src(ctx, n, r, name);
-		}
-	}
+	instr_assign_srcs(ctx, instr, name);
 
 	/* To simplify the neighbor logic, and to "avoid" dealing with
 	 * instructions which write more than one output, we actually
@@ -423,6 +455,8 @@ static void instr_assign(struct ir3_ra_ctx *ctx,
 	 * to the actual instruction:
 	 */
 	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
+		struct ir3_instruction *src;
+
 		debug_assert(name >= instr->fo.off);
 
 		foreach_ssa_src(src, instr)
@@ -444,7 +478,8 @@ static int check_partial_assignment(struct ir3_ra_ctx *ctx,
 
 	for (n = instr; n; n = n->cp.right) {
 		struct ir3_register *dst = n->regs[0];
-		if (!(dst->flags & IR3_REG_SSA)) {
+		if ((n->depth != DEPTH_UNUSED) &&
+				!(dst->flags & IR3_REG_SSA)) {
 			int name = dst->num - off;
 			debug_assert(name >= 0);
 			return name;
@@ -472,6 +507,23 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
 
 	dst = instr->regs[0];
 
+	/* For indirect dst, take the register assignment from the
+	 * fanin and propagate it forward.
+	 */
+	if (dst->flags & IR3_REG_RELATIV) {
+		/* NOTE can be grouped, if for example outputs:
+		 * for now disable cp if indirect writes
+		 */
+		instr_alloc_and_assign(ctx, instr->fanin);
+
+		dst->num += instr->fanin->regs[0]->num;
+		dst->flags &= ~IR3_REG_SSA;
+
+		instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
+
+		return;
+	}
+
 	/* for instructions w/ fanouts, do the actual register assignment
 	 * on the group of fanout neighbor nodes and propagate the reg
 	 * name back up to the texture instruction.
@@ -510,6 +562,33 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
 	}
 }
 
+static void instr_assign_array(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+	int name, aid = instr->fi.aid;
+
+	if (ctx->arrays[aid].base == ~0) {
+		int size = instr->regs_count - 1;
+		ctx->arrays[aid].base = alloc_block(ctx, instr, size);
+		ctx->arrays[aid].size = size;
+	}
+
+	name = ctx->arrays[aid].base;
+
+	foreach_ssa_src_n(src, i, instr) {
+		unsigned r = i + 1;
+
+		/* skip address / etc (non real sources): */
+		if (r >= instr->regs_count)
+			break;
+
+		instr_assign(ctx, src, name);
+		name++;
+	}
+
+}
+
 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
 	struct ir3_instruction *n;
@@ -531,6 +610,16 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 	ra_dump_list("-------\n", block->head);
 
+	/* first pass, assign arrays: */
+	for (n = block->head; n && !ctx->error; n = n->next) {
+		if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
+			debug_assert(!n->cp.left);  /* don't think this should happen */
+			ra_dump_instr("ASSIGN ARRAY: ", n);
+			instr_assign_array(ctx, n);
+			ra_dump_list("-------\n", block->head);
+		}
+	}
+
 	for (n = block->head; n && !ctx->error; n = n->next) {
 		ra_dump_instr("ASSIGN: ", n);
 		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
@@ -552,6 +641,8 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 	};
 	int ret;
 
+	memset(&ctx.arrays, ~0, sizeof(ctx.arrays));
+
 	/* mark dst registers w/ SSA flag so we can see which
 	 * have been assigned so far:
 	 * NOTE: we really should set SSA flag consistently on