Mesa (master): aco: select v_mul_lo_u16 for 16-bit multiplications that can't overflow
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Thu Nov 12 12:42:17 UTC 2020
Module: Mesa
Branch: master
Commit: 20e48551acff92f07109630e134f59c81107e6bc
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=20e48551acff92f07109630e134f59c81107e6bc
Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date: Mon Nov 2 15:15:04 2020 +0100
aco: select v_mul_lo_u16 for 16-bit multiplications that can't overflow
Only on GFX8-9 because GFX10 doesn't zero the upper 16 bits.
No fossils-db changes.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7425>
---
src/amd/compiler/aco_instruction_selection.cpp | 26 ++++++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 3bae0a8f44f..ddf94bfec4f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -732,7 +732,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
}
void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
- bool commutative, bool swap_srcs=false, bool flush_denorms = false)
+ bool commutative, bool swap_srcs=false,
+ bool flush_denorms = false, bool nuw = false)
{
Builder bld(ctx->program, ctx->block);
bld.is_precise = instr->exact;
@@ -754,7 +755,11 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
} else {
- bld.vop2(op, Definition(dst), src0, src1);
+ if (nuw) {
+ bld.nuw().vop2(op, Definition(dst), src0, src1);
+ } else {
+ bld.vop2(op, Definition(dst), src0, src1);
+ }
}
}
@@ -1721,16 +1726,29 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
} else if (dst.type() == RegType::vgpr) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
if (src0_ub <= 0xffff && src1_ub <= 0xffff &&
+ src0_ub * src1_ub <= 0xffff &&
+ (ctx->options->chip_class == GFX8 ||
+ ctx->options->chip_class == GFX9)) {
+ /* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16
+ * but only on GFX8-9 because GFX10 doesn't zero the upper 16
+ * bits.
+ */
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst,
+ true /* commutative */, false, false,
+ true /* nuw */);
+ } else if (src0_ub <= 0xffff && src1_ub <= 0xffff &&
ctx->options->chip_class >= GFX9) {
/* Initialize the accumulator to 0 to allow further combinations
* in the optimizer.
*/
- Operand op0(get_alu_src(ctx, instr->src[0]));
- Operand op1(get_alu_src(ctx, instr->src[1]));
+ Operand op0(src0);
+ Operand op1(src1);
bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), bld.set16bit(op1), Operand(0u));
} else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true);
More information about the mesa-commit mailing list