[Mesa-dev] [PATCH 09/10] radeonsi: don't emit AMDGPU intrinsics for RSQ opcodes

Sat Oct 10 18:29:49 PDT 2015

From: Marek Olšák <marek.olsak at amd.com>

Intel and Nouveau use IEEE opcodes, so we should too.
If there is a bug caused by not using the clamped RSQ variant, there must
be another way to fix it. I don't think the RSQ behavior matters much now
that NaNs are disabled.

Nine and Wine should implement the necessary workarounds for DX9 games
(they probably already do).

Not many shaders are affected.

Totals:
SGPRS: 345104 -> 344944 (-0.05 %)
VGPRS: 197420 -> 197024 (-0.20 %)
Code Size: 7324692 -> 7325688 (0.01 %) bytes
LDS: 91 -> 91 (0.00 %) blocks
Scratch: 1524736 -> 1510400 (-0.94 %) bytes per wave

Totals from affected shaders:
SGPRS: 25160 -> 25000 (-0.64 %)
VGPRS: 17336 -> 16940 (-2.28 %)
Code Size: 843412 -> 844408 (0.12 %) bytes
LDS: 6 -> 6 (0.00 %) blocks
Scratch: 139264 -> 124928 (-10.29 %) bytes per wave
---
 .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 28 ++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index ac99e73..1172244 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1452,6 +1452,28 @@ static void emit_minmax_int(const struct lp_build_tgsi_action *action,
 				emit_data->args[1], "");
 }
 
+/* Emit 1.0 / sqrt(src). LLVM needs "unsafe-fp-math" to convert it to RSQ. */
+static void emit_rsq(const struct lp_build_tgsi_action *action,
+		     struct lp_build_tgsi_context *bld_base,
+		     struct lp_build_emit_data *emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMValueRef src = emit_data->args[0];
+	bool is_f64 = LLVMGetTypeKind(LLVMTypeOf(src)) == LLVMDoubleTypeKind;
+
+	LLVMValueRef sqrt =
+		lp_build_emit_llvm_unary(bld_base,
+					 is_f64 ? TGSI_OPCODE_DSQRT
+						: TGSI_OPCODE_SQRT,
+					 src);
+
+	emit_data->output[emit_data->chan] =
+		LLVMBuildFDiv(builder,
+			      is_f64 ? bld_base->dbl_bld.one
+				     : bld_base->base.one,
+			      sqrt, "");
+}
+
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 {
 	struct lp_type type;
@@ -1531,8 +1553,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp;
-	bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.AMDGPU.rsq.f64";
+	bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = emit_rsq;
 	bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64";
 	bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
@@ -1584,8 +1605,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32";
 	bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.rint.f32";
-	bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = "llvm.AMDGPU.rsq.clamped.f32";
-	bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_RSQ].emit = emit_rsq;
 	bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_set_cond;
 	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_set_cond;
 	bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl;
-- 
2.1.4