[Mesa-dev] [RFC PATCH 9/9] ac/nir: do not always preload PS inputs at beginning

Samuel Pitoiset samuel.pitoiset at gmail.com
Thu Mar 8 14:08:11 UTC 2018


RadeonSI does something similar; the VGPR decrease is a win,
but I am not sure if we really want to implement this.

Polaris10:
Totals from affected shaders:
SGPRS: 116376 -> 116768 (0.34 %)
VGPRS: 76556 -> 74868 (-2.20 %)
Spilled SGPRs: 10347 -> 10466 (1.15 %)
Code Size: 5555072 -> 5569024 (0.25 %) bytes
Max Waves: 9854 -> 9951 (0.98 %)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
 src/amd/common/ac_nir_to_llvm.c | 118 +++++++++++++++++++++++++++++++---------
 src/amd/common/ac_shader_abi.h  |   7 +++
 2 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 644c85e2eb..eb0935972d 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 				   nir_intrinsic_instr *instr)
 {
 	LLVMValueRef values[8];
+	int location = instr->variables[0]->var->data.location;
 	int idx = instr->variables[0]->var->data.driver_location;
 	int ve = instr->dest.ssa.num_components;
 	unsigned comp = instr->variables[0]->var->data.location_frac;
@@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 						     instr->num_components, vertex_index, const_index, type);
 		}
 
+		LLVMValueRef inputs[4];
+
+		if (ctx->stage == MESA_SHADER_FRAGMENT) {
+			ctx->abi->load_fs_inputs(ctx->abi, location,
+						 indir_index, const_index,
+						 stride, inputs);
+		} else {
+			unsigned index = idx +
+				(indir_index ? 0 : const_index * stride);
+
+			memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
+		}
+
 		for (unsigned chan = comp; chan < ve + comp; chan++) {
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
@@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 						ctx->stage == MESA_SHADER_VERTEX);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
-						&ctx->ac, ctx->abi->inputs + idx + chan, count,
+						&ctx->ac, inputs + chan, count,
 						stride, false, true);
 
 				values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
 								       tmp_vec,
 								       indir_index, "");
-			} else
-				values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
+			} else {
+				values[chan] = inputs[chan];
+			}
 		}
 		break;
 	case nir_var_local:
@@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
 	}
 }
 
+static unsigned
+get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
+{
+	struct ac_shader_info *info = &ctx->shader_info->info;
+	uint64_t mask = info->input_mask & ((1ull << idx) - 1);
+
+	mask &= ~(1ull << VARYING_SLOT_POS);
+
+	return util_bitcount64(mask);
+}
+
+/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
+ * reload them at each use. This must be true if the shader is using
+ * derivatives and KILL, because KILL can leave the WQM and then a lazy
+ * input load isn't in the WQM anymore.
+ */
+static bool
+radv_preload_fs_inputs(struct radv_shader_context *ctx)
+{
+	return ctx->shader_info->info.ps.uses_derivatives &&
+	       ctx->shader_info->info.ps.uses_kill;
+}
+
 static void
-handle_fs_inputs(struct radv_shader_context *ctx,
-                 struct nir_shader *nir)
+radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
+		    LLVMValueRef out[4])
 {
 	struct ac_shader_info *info = &ctx->shader_info->info;
 
+	if (idx >= VARYING_SLOT_VAR0 ||
+	    idx == VARYING_SLOT_PNTC ||
+	    idx == VARYING_SLOT_PRIMITIVE_ID ||
+	    idx == VARYING_SLOT_LAYER) {
+		unsigned interp_mode = info->ps.input_interp_mode[idx];
+		unsigned interp_loc = info->ps.input_interp_loc[idx];
+		unsigned hw_index = get_input_hw_index(ctx, idx);
+		LLVMValueRef interp_param =
+			lookup_interp_param(&ctx->abi, interp_mode, interp_loc);
+
+		interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,
+				&out[0]);
+	} else if (idx == VARYING_SLOT_POS) {
+		for (int i = 0; i < 3; ++i)
+			out[i] = ctx->abi.frag_pos[i];
+
+		out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
+				       ctx->abi.frag_pos[3]);
+	}
+}
+
+static void
+load_fs_inputs(struct ac_shader_abi *abi,
+	       unsigned location,
+	       LLVMValueRef indir_index,
+	       unsigned const_index,
+	       unsigned stride,
+	       LLVMValueRef out[4])
+{
+	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
+
+	if (!radv_preload_fs_inputs(ctx)) {
+		radv_load_fs_inputs(ctx, location, out);
+	} else {
+		unsigned index = radeon_llvm_reg_index_soa(location, 0);
+
+		index += (indir_index ? 0 : const_index * stride);
+
+		memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
+	}
+}
+
+static void
+handle_fs_inputs(struct radv_shader_context *ctx,
+                 struct nir_shader *nir)
+{
 	prepare_interp_optimize(ctx, nir);
 
 	nir_foreach_variable(variable, &nir->inputs)
 		handle_fs_input_decl(ctx, variable);
 
-	unsigned index = 0;
-
 	for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
-		LLVMValueRef interp_param;
 		LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0);
 
 		if (!(ctx->shader_info->info.input_mask & (1ull << i)))
 			continue;
 
-		if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
-		    i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
-			unsigned interp_mode = info->ps.input_interp_mode[i];
-			unsigned interp_loc = info->ps.input_interp_loc[i];
-
-			interp_param = lookup_interp_param(&ctx->abi, interp_mode,
-							   interp_loc);
-
-			interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
-					inputs);
-
-			++index;
-		} else if (i == VARYING_SLOT_POS) {
-			for(int i = 0; i < 3; ++i)
-				inputs[i] = ctx->abi.frag_pos[i];
-
-			inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
-						  ctx->abi.frag_pos[3]);
-		}
+		radv_load_fs_inputs(ctx, i, inputs);
 	}
 
 	if (ctx->shader_info->info.needs_multiview_view_index)
@@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 			ctx.abi.load_base_vertex = radv_load_base_vertex;
 		} else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
 			shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
+			ctx.abi.load_fs_inputs = load_fs_inputs;
 			ctx.abi.lookup_interp_param = lookup_interp_param;
 			ctx.abi.load_sample_position = load_sample_position;
 			ctx.abi.load_sample_mask_in = load_sample_mask_in;
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index 901e49b1f9..8e51ce9fdd 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -97,6 +97,13 @@ struct ac_shader_abi {
 				    unsigned const_index,
 				    LLVMTypeRef type);
 
+	void (*load_fs_inputs)(struct ac_shader_abi *abi,
+			       unsigned location,
+			       LLVMValueRef indir_index,
+			       unsigned const_index,
+			       unsigned stride,
+			       LLVMValueRef out[4]);
+
 	LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
 					   LLVMTypeRef type,
 					   LLVMValueRef vertex_index,
-- 
2.16.2



More information about the mesa-dev mailing list