Mesa (main): ir3/lower_parallelcopy: Use SWZ
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Jun 29 08:25:32 UTC 2021
Module: Mesa
Branch: main
Commit: 42b3d83dd4356b55825f332450804c5775432764
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=42b3d83dd4356b55825f332450804c5775432764
Author: Connor Abbott <cwabbott0 at gmail.com>
Date: Thu Apr 22 19:49:29 2021 +0200
ir3/lower_parallelcopy: Use SWZ
shader-db results on a650:
total instructions in shared programs: 1575484 -> 1574866 (-0.04%)
instructions in affected programs: 32579 -> 31961 (-1.90%)
helped: 75
HURT: 0
helped stats (abs) min: 1 max: 98 x̄: 8.24 x̃: 7
helped stats (rel) min: 0.41% max: 30.12% x̄: 2.47% x̃: 1.13%
95% mean confidence interval for instructions value: -10.97 -5.51
95% mean confidence interval for instructions %-change: -3.44% -1.51%
Instructions are helped.
total nops in shared programs: 355742 -> 355628 (-0.03%)
nops in affected programs: 18635 -> 18521 (-0.61%)
helped: 55
HURT: 147
helped stats (abs) min: 1 max: 14 x̄: 4.76 x̃: 6
helped stats (rel) min: 1.41% max: 100.00% x̄: 8.13% x̃: 4.76%
HURT stats (abs) min: 1 max: 2 x̄: 1.01 x̃: 1
HURT stats (rel) min: 0.56% max: 25.00% x̄: 2.09% x̃: 1.20%
95% mean confidence interval for nops value: -0.98 -0.15
95% mean confidence interval for nops %-change: -1.93% 0.55%
Inconclusive result (%-change mean confidence interval includes 0).
total non-nops in shared programs: 1219742 -> 1219238 (-0.04%)
non-nops in affected programs: 61125 -> 60621 (-0.82%)
helped: 220
HURT: 0
helped stats (abs) min: 1 max: 99 x̄: 2.29 x̃: 1
helped stats (rel) min: 0.19% max: 29.17% x̄: 0.90% x̃: 0.40%
95% mean confidence interval for non-nops value: -3.26 -1.32
95% mean confidence interval for non-nops %-change: -1.24% -0.56%
Non-nops are helped.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11565>
---
src/freedreno/ir3/ir3_lower_parallelcopy.c | 66 +++++++++++++++++++-----------
1 file changed, 42 insertions(+), 24 deletions(-)
diff --git a/src/freedreno/ir3/ir3_lower_parallelcopy.c b/src/freedreno/ir3/ir3_lower_parallelcopy.c
index 1144fa056c6..10c512e7e01 100644
--- a/src/freedreno/ir3/ir3_lower_parallelcopy.c
+++ b/src/freedreno/ir3/ir3_lower_parallelcopy.c
@@ -83,7 +83,8 @@ do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsig
}
static void
-do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
+do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
+ const struct copy_entry *entry)
{
assert(!entry->src.flags);
/* TODO implement shared swaps */
@@ -104,21 +105,21 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
physreg_t tmp = entry->dst < 2 ? 2 : 0;
/* Swap src and the temporary */
- do_swap(instr, &(struct copy_entry) {
+ do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->src.reg & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
/* Do the original swap with src replaced with tmp */
- do_swap(instr, &(struct copy_entry) {
+ do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = tmp + (entry->src.reg & 1) },
.dst = entry->dst,
.flags = entry->flags,
});
/* Swap src and the temporary back */
- do_swap(instr, &(struct copy_entry) {
+ do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->src.reg & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
@@ -130,7 +131,7 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
* let the case above handle it.
*/
if (entry->dst >= RA_HALF_SIZE) {
- do_swap(instr, &(struct copy_entry) {
+ do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->dst },
.dst = entry->src.reg,
.flags = entry->flags,
@@ -142,13 +143,29 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
- do_xor(instr, dst_num, dst_num, src_num, entry->flags);
- do_xor(instr, src_num, src_num, dst_num, entry->flags);
- do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+ /* a5xx+ is known to support swz, which enables us to swap two registers
+ * in-place. If unsupported we emulate it using the xor trick.
+ */
+ if (compiler->gpu_id < 500) {
+ do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+ do_xor(instr, src_num, src_num, dst_num, entry->flags);
+ do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+ } else {
+ struct ir3_instruction *swz = ir3_instr_create(instr->block, OPC_SWZ, 2, 2);
+ ir3_dst_create(swz, dst_num, entry->flags)->wrmask = 1;
+ ir3_dst_create(swz, src_num, entry->flags)->wrmask = 1;
+ ir3_src_create(swz, src_num, entry->flags)->wrmask = 1;
+ ir3_src_create(swz, dst_num, entry->flags)->wrmask = 1;
+ swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+ swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+ swz->repeat = 1;
+ ir3_instr_move_before(swz, instr);
+ }
}
static void
-do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
+do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
+ const struct copy_entry *entry)
{
/* TODO implement shared copies */
assert(!(entry->flags & IR3_REG_SHARED));
@@ -159,19 +176,19 @@ do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
/* TODO: is there a hw instruction we can use for this case? */
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
- do_swap(instr, &(struct copy_entry) {
+ do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->dst & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
- do_copy(instr, &(struct copy_entry) {
+ do_copy(compiler, instr, &(struct copy_entry) {
.src = entry->src,
.dst = tmp + (entry->dst & 1),
.flags = entry->flags,
});
- do_swap(instr, &(struct copy_entry) {
+ do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->dst & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
@@ -262,7 +279,8 @@ split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
}
static void
-_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
+_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
+ struct copy_ctx *ctx)
{
/* Set up the bookkeeping */
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
@@ -298,7 +316,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
if (!entry->done && !entry_blocked(entry, ctx)) {
entry->done = true;
progress = true;
- do_copy(instr, entry);
+ do_copy(compiler, instr, entry);
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
if (!entry->src.flags)
ctx->physreg_use_count[entry->src.reg + j]--;
@@ -383,7 +401,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
continue;
}
- do_swap(instr, entry);
+ do_swap(compiler, instr, entry);
/* Split any blocking copies whose sources are only partially
* contained within our destination.
@@ -419,18 +437,18 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
}
static void
-handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
- unsigned entry_count, bool mergedregs)
+handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
+ struct copy_entry *entries, unsigned entry_count)
{
struct copy_ctx ctx;
- if (mergedregs) {
+ if (v->mergedregs) {
/* Half regs and full regs are in the same file, so handle everything
* at once.
*/
memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count);
ctx.entry_count = entry_count;
- _handle_copies(instr, &ctx);
+ _handle_copies(v->shader->compiler, instr, &ctx);
} else {
/* There may be both half copies and full copies, so we have to split
* them up since they don't interfere.
@@ -440,14 +458,14 @@ handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
if (entries[i].flags & IR3_REG_HALF)
ctx.entries[ctx.entry_count++] = entries[i];
}
- _handle_copies(instr, &ctx);
+ _handle_copies(v->shader->compiler, instr, &ctx);
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (!(entries[i].flags & IR3_REG_HALF))
ctx.entries[ctx.entry_count++] = entries[i];
}
- _handle_copies(instr, &ctx);
+ _handle_copies(v->shader->compiler, instr, &ctx);
}
}
@@ -475,7 +493,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
});
}
}
- handle_copies(instr, copies, copies_count, v->mergedregs);
+ handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_COLLECT) {
copies_count = 0;
@@ -489,7 +507,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
.flags = flags,
});
}
- handle_copies(instr, copies, copies_count, v->mergedregs);
+ handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_SPLIT) {
copies_count = 0;
@@ -501,7 +519,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
.src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
.flags = flags,
});
- handle_copies(instr, copies, copies_count, v->mergedregs);
+ handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_PHI) {
list_del(&instr->node);
More information about the mesa-commit mailing list