Mesa (master): radeonsi: reload PS inputs with direct indexing at each use (v2)

Wed Sep 14 10:37:24 UTC 2016

Module: Mesa
Branch: master
Commit: ab29788250a705eb0dd517cb3d38f37f944eb8ad
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ab29788250a705eb0dd517cb3d38f37f944eb8ad

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Tue Sep 13 17:33:23 2016 +0200

radeonsi: reload PS inputs with direct indexing at each use (v2)

Emitting the input loads again at each use is fine, because the LLVM
compiler can CSE the interp intrinsics thanks to LLVMReadNoneAttribute.

26011 shaders in 14651 tests
Totals:
SGPRS: 1146340 -> 1132676 (-1.19 %)
VGPRS: 727371 -> 711730 (-2.15 %)
Spilled SGPRs: 2218 -> 2078 (-6.31 %)
Spilled VGPRs: 369 -> 369 (0.00 %)
Scratch VGPRs: 1344 -> 1344 (0.00 %) dwords per thread
Code Size: 35841268 -> 36009732 (0.47 %) bytes
LDS: 767 -> 767 (0.00 %) blocks
Max Waves: 222559 -> 224779 (1.00 %)
Wait states: 0 -> 0 (0.00 %)

v2: don't call load_input for fragment shaders in emit_declaration

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

---

 src/gallium/drivers/radeon/radeon_llvm.h           |  6 ++++-
 .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 30 ++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_shader.c           | 27 ++++++++-----------
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index da5b7f5..f508d32 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -30,7 +30,9 @@
 #include <llvm-c/Core.h>
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_tgsi.h"
+#include "tgsi/tgsi_parse.h"
 
+#define RADEON_LLVM_MAX_INPUT_SLOTS 32
 #define RADEON_LLVM_MAX_INPUTS 32 * 4
 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4
 
@@ -62,7 +64,8 @@ struct radeon_llvm_context {
 	  */
 	void (*load_input)(struct radeon_llvm_context *,
 			   unsigned input_index,
-			   const struct tgsi_full_declaration *decl);
+			   const struct tgsi_full_declaration *decl,
+			   LLVMValueRef out[4]);
 
 	void (*load_system_value)(struct radeon_llvm_context *,
 				  unsigned index,
@@ -75,6 +78,7 @@ struct radeon_llvm_context {
 	  * values will be in the form of a target intrinsic that will inform the
 	  * backend how to load the actual inputs to the shader. 
 	  */
+	struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS];
 	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
 	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 4643e6d..4fa43cd 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -446,14 +446,29 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
 		}
 	}
 
-	case TGSI_FILE_INPUT:
-		result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+	case TGSI_FILE_INPUT: {
+		unsigned index = reg->Register.Index;
+		LLVMValueRef input[4];
+
+		/* I don't think doing this for vertex shaders is beneficial.
+		 * For those, we want to make sure the VMEM loads are executed
+		 * only once. Fragment shaders don't care much, because
+		 * v_interp instructions are much cheaper than VMEM loads.
+		 */
+		if (ctx->soa.bld_base.info->processor == PIPE_SHADER_FRAGMENT)
+			ctx->load_input(ctx, index, &ctx->input_decls[index], input);
+		else
+			memcpy(input, &ctx->inputs[index * 4], sizeof(input));
+
+		result = input[swizzle];
+
 		if (tgsi_type_is_64bit(type)) {
 			ptr = result;
-			ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)];
+			ptr2 = input[swizzle + 1];
 			return radeon_llvm_emit_fetch_64bit(bld_base, type, ptr, ptr2);
 		}
 		break;
+	}
 
 	case TGSI_FILE_TEMPORARY:
 		if (reg->Register.Index >= ctx->temps_count)
@@ -626,8 +641,13 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 	{
 		unsigned idx;
 		for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
-			if (ctx->load_input)
-				ctx->load_input(ctx, idx, decl);
+			if (ctx->load_input) {
+				ctx->input_decls[idx] = *decl;
+
+				if (bld_base->info->processor != PIPE_SHADER_FRAGMENT)
+					ctx->load_input(ctx, idx, decl,
+							&ctx->inputs[idx * 4]);
+			}
 		}
 	}
 	break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b034837..3ccff7a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -440,7 +440,8 @@ static LLVMValueRef get_instance_index_for_fetch(
 static void declare_input_vs(
 	struct radeon_llvm_context *radeon_bld,
 	unsigned input_index,
-	const struct tgsi_full_declaration *decl)
+	const struct tgsi_full_declaration *decl,
+	LLVMValueRef out[4])
 {
 	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
 	struct gallivm_state *gallivm = base->gallivm;
@@ -498,11 +499,8 @@ static void declare_input_vs(
 	/* Break up the vec4 into individual components */
 	for (chan = 0; chan < 4; chan++) {
 		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
-		/* XXX: Use a helper function for this.  There is one in
- 		 * tgsi_llvm.c. */
-		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
-				LLVMBuildExtractElement(gallivm->builder,
-				input, llvm_chan, "");
+		out[chan] = LLVMBuildExtractElement(gallivm->builder,
+						    input, llvm_chan, "");
 	}
 }
 
@@ -1463,7 +1461,8 @@ static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
 static void declare_input_fs(
 	struct radeon_llvm_context *radeon_bld,
 	unsigned input_index,
-	const struct tgsi_full_declaration *decl)
+	const struct tgsi_full_declaration *decl,
+	LLVMValueRef out[4])
 {
 	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
 	struct si_shader_context *ctx =
@@ -1482,14 +1481,10 @@ static void declare_input_fs(
 		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
 				  (i ? util_bitcount(colors_read & 0xf) : 0);
 
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
-			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
-			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
-			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
 		return;
 	}
 
@@ -1513,7 +1508,7 @@ static void declare_input_fs(
 			shader->selector->info.colors_read, interp_param,
 			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
 			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
-			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
+			&out[0]);
 }
 
 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)