Mesa (master): ac/llvm: add better code for fsign

Sun Sep 6 16:37:51 UTC 2020

Module: Mesa
Branch: master
Commit: d9a77f9ca36177c3295036588f204a9e49e81a09
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d9a77f9ca36177c3295036588f204a9e49e81a09

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Thu Sep  3 05:31:36 2020 -0400

ac/llvm: add better code for fsign

There are 2 improvements:
- better code for 16, 32, and 64 bits
- vector support for 16 and 32 bits

Totals:
SGPRS: 2639738 -> 2625882 (-0.52 %)
VGPRS: 1534120 -> 1533916 (-0.01 %)
Spilled SGPRs: 3541 -> 3557 (0.45 %)
Spilled VGPRs: 33 -> 33 (0.00 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 292 -> 292 (0.00 %) dwords per thread
Code Size: 55640332 -> 55384892 (-0.46 %) bytes
Max Waves: 964785 -> 964857 (0.01 %)

Totals from affected shaders:
SGPRS: 377352 -> 363496 (-3.67 %)
VGPRS: 209800 -> 209596 (-0.10 %)
Spilled SGPRs: 1979 -> 1995 (0.81 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 256 -> 256 (0.00 %) dwords per thread
Code Size: 12549300 -> 12293860 (-2.04 %) bytes
Max Waves: 105762 -> 105834 (0.07 %)

Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6284>

---

 src/amd/llvm/ac_llvm_build.c    | 64 +++++++++++++++++++++++++++--------------
 src/amd/llvm/ac_llvm_build.h    |  4 +--
 src/amd/llvm/ac_llvm_helper.cpp | 25 ++++++++++++++++
 src/amd/llvm/ac_llvm_util.h     |  3 ++
 src/amd/llvm/ac_nir_to_llvm.c   |  3 +-
 5 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index a3fa30fc003..861db0d889a 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -2776,31 +2776,53 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
 	return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
 }
 
-LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
-			    unsigned bitsize)
+static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
 {
-	LLVMValueRef cmp, val, zero, one;
-	LLVMTypeRef type;
+	ac_enable_signed_zeros(ctx);
+	/* (val + 0) converts negative zero to positive zero. */
+	val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
+	ac_disable_signed_zeros(ctx);
+	return val;
+}
 
-	if (bitsize == 16) {
-		type = ctx->f16;
-		zero = ctx->f16_0;
-		one = ctx->f16_1;
-	} else if (bitsize == 32) {
-		type = ctx->f32;
-		zero = ctx->f32_0;
-		one = ctx->f32_1;
-	} else {
-		type = ctx->f64;
-		zero = ctx->f64_0;
-		one = ctx->f64_1;
+LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
+{
+	LLVMTypeRef type = LLVMTypeOf(src);
+	LLVMValueRef pos, neg, dw[2], val;
+	unsigned bitsize = ac_get_elem_bits(ctx, type);
+
+	/* The standard version leads to this:
+	 *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
+	 *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
+	 *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
+	 *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
+	 *
+	 * The isign version:
+	 *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
+	 *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
+	 *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
+	 *
+	 * (src + 0) converts negative zero to positive zero.
+	 * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
+	 *
+	 * For FP64, use the standard version, which doesn't suffer from the huge DP rate
+	 * reduction. (FP64 comparisons are as fast as int64 comparisons)
+	 */
+	if (bitsize == 16 || bitsize == 32) {
+		val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
+		val = ac_build_isign(ctx, val);
+		return LLVMBuildSIToFP(ctx->builder, val, type, "");
 	}
 
-	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
-	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
-	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
-	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
-	return val;
+	assert(bitsize == 64);
+	pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
+	neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
+	dw[0] = ctx->i32_0;
+	dw[1] = LLVMBuildSelect(ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
+				LLVMBuildSelect(ctx->builder, neg,
+						LLVMConstInt(ctx->i32, 0xBFF00000, 0),
+						ctx->i32_0, ""), "");
+	return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
 }
 
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 7cb98d06a6e..6491843cd78 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -598,9 +598,7 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags);
 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
 			   unsigned bitsize);
 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0);
-LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
-			    unsigned bitsize);
-
+LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src);
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0);
 
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
diff --git a/src/amd/llvm/ac_llvm_helper.cpp b/src/amd/llvm/ac_llvm_helper.cpp
index 10cf301131a..184f76a7968 100644
--- a/src/amd/llvm/ac_llvm_helper.cpp
+++ b/src/amd/llvm/ac_llvm_helper.cpp
@@ -121,6 +121,31 @@ LLVMBuilderRef ac_create_builder(LLVMContextRef ctx,
 	return builder;
 }
 
+void ac_enable_signed_zeros(struct ac_llvm_context *ctx)
+{
+	if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
+		auto *b = llvm::unwrap(ctx->builder);
+		llvm::FastMathFlags flags = b->getFastMathFlags();
+
+		/* Clear the no-signed-zeros flag so that (x + 0) is not optimized
+		 * away; that addition converts negative zero to positive zero.
+		 */
+		flags.setNoSignedZeros(false);
+		b->setFastMathFlags(flags);
+	}
+}
+
+void ac_disable_signed_zeros(struct ac_llvm_context *ctx)
+{
+	if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
+		auto *b = llvm::unwrap(ctx->builder);
+		llvm::FastMathFlags flags = b->getFastMathFlags();
+
+		flags.setNoSignedZeros();
+		b->setFastMathFlags(flags);
+	}
+}
+
 LLVMTargetLibraryInfoRef
 ac_create_target_library_info(const char *triple)
 {
diff --git a/src/amd/llvm/ac_llvm_util.h b/src/amd/llvm/ac_llvm_util.h
index 8039c99bfbd..d44d4deab87 100644
--- a/src/amd/llvm/ac_llvm_util.h
+++ b/src/amd/llvm/ac_llvm_util.h
@@ -37,6 +37,7 @@ extern "C" {
 #endif
 
 struct ac_compiler_passes;
+struct ac_llvm_context;
 
 enum ac_func_attr {
 	AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0),
@@ -109,6 +110,8 @@ LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx);
 
 LLVMBuilderRef ac_create_builder(LLVMContextRef ctx,
 				 enum ac_float_mode float_mode);
+void ac_enable_signed_zeros(struct ac_llvm_context *ctx);
+void ac_disable_signed_zeros(struct ac_llvm_context *ctx);
 
 void
 ac_llvm_add_target_dep_function_attr(LLVMValueRef F,
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index ddea78180b1..4b696f28f12 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -826,8 +826,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 		break;
 	case nir_op_fsign:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
-		result = ac_build_fsign(&ctx->ac, src[0],
-					instr->dest.dest.ssa.bit_size);
+		result = ac_build_fsign(&ctx->ac, src[0]);
 		break;
 	case nir_op_ffloor:
 		result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",