Mesa (main): ir3/lower_parallelcopy: Use SWZ

Tue Jun 29 08:25:32 UTC 2021

Module: Mesa
Branch: main
Commit: 42b3d83dd4356b55825f332450804c5775432764
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=42b3d83dd4356b55825f332450804c5775432764

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Thu Apr 22 19:49:29 2021 +0200

ir3/lower_parallelcopy: Use SWZ

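Lower register swaps to a single SWZ instruction instead of the three-XOR
sequence. SWZ is only emitted when gpu_id >= 500 (a5xx+); older GPUs keep
the XOR fallback. The compiler is now passed down through handle_copies()
so that do_swap() can check the GPU generation.

shader-db results on a650: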

total instructions in shared programs: 1575484 -> 1574866 (-0.04%)
instructions in affected programs: 32579 -> 31961 (-1.90%)
helped: 75
HURT: 0
helped stats (abs) min: 1 max: 98 x̄: 8.24 x̃: 7
helped stats (rel) min: 0.41% max: 30.12% x̄: 2.47% x̃: 1.13%
95% mean confidence interval for instructions value: -10.97 -5.51
95% mean confidence interval for instructions %-change: -3.44% -1.51%
Instructions are helped.

total nops in shared programs: 355742 -> 355628 (-0.03%)
nops in affected programs: 18635 -> 18521 (-0.61%)
helped: 55
HURT: 147
helped stats (abs) min: 1 max: 14 x̄: 4.76 x̃: 6
helped stats (rel) min: 1.41% max: 100.00% x̄: 8.13% x̃: 4.76%
HURT stats (abs)   min: 1 max: 2 x̄: 1.01 x̃: 1
HURT stats (rel)   min: 0.56% max: 25.00% x̄: 2.09% x̃: 1.20%
95% mean confidence interval for nops value: -0.98 -0.15
95% mean confidence interval for nops %-change: -1.93% 0.55%
Inconclusive result (%-change mean confidence interval includes 0).

total non-nops in shared programs: 1219742 -> 1219238 (-0.04%)
non-nops in affected programs: 61125 -> 60621 (-0.82%)
helped: 220
HURT: 0
helped stats (abs) min: 1 max: 99 x̄: 2.29 x̃: 1
helped stats (rel) min: 0.19% max: 29.17% x̄: 0.90% x̃: 0.40%
95% mean confidence interval for non-nops value: -3.26 -1.32
95% mean confidence interval for non-nops %-change: -1.24% -0.56%
Non-nops are helped.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11565>

---

 src/freedreno/ir3/ir3_lower_parallelcopy.c | 66 +++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/src/freedreno/ir3/ir3_lower_parallelcopy.c b/src/freedreno/ir3/ir3_lower_parallelcopy.c
index 1144fa056c6..10c512e7e01 100644
--- a/src/freedreno/ir3/ir3_lower_parallelcopy.c
+++ b/src/freedreno/ir3/ir3_lower_parallelcopy.c
@@ -83,7 +83,8 @@ do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsig
 }
 
 static void
-do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
+do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
+		const struct copy_entry *entry)
 {
 	assert(!entry->src.flags);
 	/* TODO implement shared swaps */
@@ -104,21 +105,21 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
 			physreg_t tmp = entry->dst < 2 ? 2 : 0;
 
 			/* Swap src and the temporary */
-			do_swap(instr, &(struct copy_entry) {
+			do_swap(compiler, instr, &(struct copy_entry) {
 				.src = { .reg = entry->src.reg & ~1u },
 				.dst = tmp,
 				.flags = entry->flags & ~IR3_REG_HALF,
 			});
 
 			/* Do the original swap with src replaced with tmp */
-			do_swap(instr, &(struct copy_entry) {
+			do_swap(compiler, instr, &(struct copy_entry) {
 				.src = { .reg = tmp + (entry->src.reg & 1) },
 				.dst = entry->dst,
 				.flags = entry->flags,
 			});
 
 			/* Swap src and the temporary back */
-			do_swap(instr, &(struct copy_entry) {
+			do_swap(compiler, instr, &(struct copy_entry) {
 				.src = { .reg = entry->src.reg & ~1u },
 				.dst = tmp,
 				.flags = entry->flags & ~IR3_REG_HALF,
@@ -130,7 +131,7 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
 		 * let the case above handle it.
 		 */
 		if (entry->dst >= RA_HALF_SIZE) {
-			do_swap(instr, &(struct copy_entry) {
+			do_swap(compiler, instr, &(struct copy_entry) {
 				.src = { .reg = entry->dst },
 				.dst = entry->src.reg,
 				.flags = entry->flags,
@@ -142,13 +143,29 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
 	unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
 	unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
 
-	do_xor(instr, dst_num, dst_num, src_num, entry->flags);
-	do_xor(instr, src_num, src_num, dst_num, entry->flags);
-	do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+	/* a5xx+ is known to support swz, which enables us to swap two registers
+	 * in-place. If unsupported we emulate it using the xor trick.
+	 */
+	if (compiler->gpu_id < 500) {
+		do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+		do_xor(instr, src_num, src_num, dst_num, entry->flags);
+		do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+	} else {
+		struct ir3_instruction *swz = ir3_instr_create(instr->block, OPC_SWZ, 2, 2);
+		ir3_dst_create(swz, dst_num, entry->flags)->wrmask = 1;
+		ir3_dst_create(swz, src_num, entry->flags)->wrmask = 1;
+		ir3_src_create(swz, src_num, entry->flags)->wrmask = 1;
+		ir3_src_create(swz, dst_num, entry->flags)->wrmask = 1;
+		swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+		swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+		swz->repeat = 1;
+		ir3_instr_move_before(swz, instr);
+	}
 }
 
 static void
-do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
+do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
+		const struct copy_entry *entry)
 {
 	/* TODO implement shared copies */
 	assert(!(entry->flags & IR3_REG_SHARED));
@@ -159,19 +176,19 @@ do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
 			/* TODO: is there a hw instruction we can use for this case? */
 			physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
 
-			do_swap(instr, &(struct copy_entry) {
+			do_swap(compiler, instr, &(struct copy_entry) {
 				.src = { .reg = entry->dst & ~1u },
 				.dst = tmp,
 				.flags = entry->flags & ~IR3_REG_HALF,
 			});
 
-			do_copy(instr, &(struct copy_entry) {
+			do_copy(compiler, instr, &(struct copy_entry) {
 				.src = entry->src,
 				.dst = tmp + (entry->dst & 1),
 				.flags = entry->flags,
 			});
 
-			do_swap(instr, &(struct copy_entry) {
+			do_swap(compiler, instr, &(struct copy_entry) {
 				.src = { .reg = entry->dst & ~1u },
 				.dst = tmp,
 				.flags = entry->flags & ~IR3_REG_HALF,
@@ -262,7 +279,8 @@ split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
 }
 
 static void
-_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
+_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
+			   struct copy_ctx *ctx)
 {
 	/* Set up the bookkeeping */
 	memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
@@ -298,7 +316,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
 			if (!entry->done && !entry_blocked(entry, ctx)) {
 				entry->done = true;
 				progress = true;
-				do_copy(instr, entry);
+				do_copy(compiler, instr, entry);
 				for (unsigned j = 0; j < copy_entry_size(entry); j++) {
 					if (!entry->src.flags)
 						ctx->physreg_use_count[entry->src.reg + j]--;
@@ -383,7 +401,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
 			continue;
 		}
 
-		do_swap(instr, entry);
+		do_swap(compiler, instr, entry);
 
 		/* Split any blocking copies whose sources are only partially
 		 * contained within our destination.
@@ -419,18 +437,18 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
 }
 
 static void
-handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
-			  unsigned entry_count, bool mergedregs)
+handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
+			  struct copy_entry *entries, unsigned entry_count)
 {
 	struct copy_ctx ctx;	
 
-	if (mergedregs) {
+	if (v->mergedregs) {
 		/* Half regs and full regs are in the same file, so handle everything
 		 * at once.
 		 */
 		memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count);
 		ctx.entry_count = entry_count;
-		_handle_copies(instr, &ctx);
+		_handle_copies(v->shader->compiler, instr, &ctx);
 	} else {
 		/* There may be both half copies and full copies, so we have to split
 		 * them up since they don't interfere.
@@ -440,14 +458,14 @@ handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
 			if (entries[i].flags & IR3_REG_HALF)
 				ctx.entries[ctx.entry_count++] = entries[i];
 		}
-		_handle_copies(instr, &ctx);
+		_handle_copies(v->shader->compiler, instr, &ctx);
 
 		ctx.entry_count = 0;
 		for (unsigned i = 0; i < entry_count; i++) {
 			if (!(entries[i].flags & IR3_REG_HALF))
 				ctx.entries[ctx.entry_count++] = entries[i];
 		}
-		_handle_copies(instr, &ctx);
+		_handle_copies(v->shader->compiler, instr, &ctx);
 	}
 }
 
@@ -475,7 +493,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
 						});
 					}
 				}
-				handle_copies(instr, copies, copies_count, v->mergedregs);
+				handle_copies(v, instr, copies, copies_count);
 				list_del(&instr->node);
 			} else if (instr->opc == OPC_META_COLLECT) {
 				copies_count = 0;
@@ -489,7 +507,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
 						.flags = flags,
 					});
 				}
-				handle_copies(instr, copies, copies_count, v->mergedregs);
+				handle_copies(v, instr, copies, copies_count);
 				list_del(&instr->node);
 			} else if (instr->opc == OPC_META_SPLIT) {
 				copies_count = 0;
@@ -501,7 +519,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
 					.src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
 					.flags = flags,
 				});
-				handle_copies(instr, copies, copies_count, v->mergedregs);
+				handle_copies(v, instr, copies, copies_count);
 				list_del(&instr->node);
 			} else if (instr->opc == OPC_META_PHI) {
 				list_del(&instr->node);