Mesa (master): radeonsi: clean up ffma handling
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Sep 16 02:53:06 UTC 2020
Module: Mesa
Branch: master
Commit: 758ab39d25e10d585929b87a8a2891c5a68b7c55
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=758ab39d25e10d585929b87a8a2891c5a68b7c55
Author: Marek Olšák <marek.olsak at amd.com>
Date: Fri Sep 4 06:25:47 2020 -0400
radeonsi: clean up ffma handling
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Reviewed-by: Connor Abbott <cwabbott0 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6596>
---
src/amd/llvm/ac_nir_to_llvm.c | 8 ++++----
src/gallium/drivers/radeonsi/si_get.c | 11 ++++++++++-
2 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index d227358322d..b50314dec47 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -853,10 +853,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
}
break;
case nir_op_ffma:
- /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
- result =
- emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd",
- ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
+ /* FMA is slow on gfx6-8, so it shouldn't be used. */
+ assert(ctx->ac.chip_class >= GFX9);
+ result = emit_intrin_3f_param(&ctx->ac, "llvm.fma", ac_to_float_type(&ctx->ac, def_type),
+ src[0], src[1], src[2]);
break;
case nir_op_ldexp:
src[0] = ac_to_float(&ctx->ac, src[0]);
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 059bd8d00e8..e7479012ea9 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -937,7 +937,16 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
.lower_bitfield_insert_to_bitfield_select = true,
.lower_bitfield_extract = true,
.lower_sub = true,
- .fuse_ffma = true,
+ /* gfx6-8: use MAD (FMA is 4x slower)
+ * gfx9-10: either is OK (MAD and FMA have the same performance)
+ * gfx10.3: use FMA (MAD doesn't exist, separate MUL+ADD are 2x slower)
+ *
+ * FMA has no advantage on gfx9-10 and MAD allows more algebraic optimizations.
+ * Keep FMA enabled on gfx10 to test it, which helps us validate correctness
+ * for gfx10.3 on gfx10.
+ */
+ .lower_ffma = sscreen->info.chip_class <= GFX9,
+ .fuse_ffma = sscreen->info.chip_class >= GFX10,
.lower_fmod = true,
.lower_pack_snorm_4x8 = true,
.lower_pack_unorm_4x8 = true,
More information about the mesa-commit mailing list