[Mesa-dev] [RFC PATCH 9/9] ac/nir: do not always preload PS inputs at beginning
Samuel Pitoiset
samuel.pitoiset at gmail.com
Thu Mar 8 14:08:11 UTC 2018
RadeonSI does something similar; the VGPR decrease is a win, but I'm
not sure if we really want to implement this.
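For context, here is a minimal standalone sketch (not Mesa code; the struct
and the sample values are made up) of the decision this patch introduces:
only keep preloading all FS inputs in the prolog when the shader uses both
derivatives and KILL, and otherwise load each input lazily at its use, since
KILL can leave WQM and a lazy load issued afterwards would no longer be in
WQM:

  #include <stdbool.h>
  #include <stdio.h>

  struct ps_info {
          bool uses_derivatives;
          bool uses_kill;
  };

  /* Same condition as radv_preload_fs_inputs() in the patch. */
  static bool must_preload_fs_inputs(const struct ps_info *ps)
  {
          return ps->uses_derivatives && ps->uses_kill;
  }

  int main(void)
  {
          struct ps_info shaders[] = {
                  { .uses_derivatives = true,  .uses_kill = true  },
                  { .uses_derivatives = true,  .uses_kill = false },
                  { .uses_derivatives = false, .uses_kill = true  },
          };

          for (unsigned i = 0; i < 3; i++)
                  printf("shader %u: %s\n", i,
                         must_preload_fs_inputs(&shaders[i]) ?
                         "preload in the prolog" : "load lazily at each use");
          return 0;
  }

The rest of the patch is plumbing so that visit_load_var() can go through the
new abi->load_fs_inputs hook, which either performs the lazy interpolation or
falls back to the values preloaded into abi->inputs.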
Polaris10:
Totals from affected shaders:
SGPRS: 116376 -> 116768 (0.34 %)
VGPRS: 76556 -> 74868 (-2.20 %)
Spilled SGPRs: 10347 -> 10466 (1.15 %)
Code Size: 5555072 -> 5569024 (0.25 %) bytes
Max Waves: 9854 -> 9951 (0.98 %)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
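Note for reviewers: the new get_input_hw_index() helper just counts the
enabled input slots below the queried one, skipping VARYING_SLOT_POS because
the position does not consume an interpolated parameter slot (the old
handle_fs_inputs() loop likewise did not advance its index for it). A small
standalone sketch of that mapping (slot numbers are made up and
__builtin_popcountll stands in for util_bitcount64):

  #include <stdint.h>
  #include <stdio.h>

  #define FAKE_SLOT_POS   0   /* stands in for VARYING_SLOT_POS (made up) */
  #define FAKE_SLOT_VAR0  4   /* stands in for VARYING_SLOT_VAR0 (made up) */

  static unsigned hw_index(uint64_t input_mask, unsigned slot)
  {
          /* Count the enabled inputs below 'slot'... */
          uint64_t mask = input_mask & ((1ull << slot) - 1);

          /* ...but never count the position. */
          mask &= ~(1ull << FAKE_SLOT_POS);

          return (unsigned)__builtin_popcountll(mask);
  }

  int main(void)
  {
          /* Inputs enabled at POS, VAR0 and VAR0+2. */
          uint64_t input_mask = (1ull << FAKE_SLOT_POS) |
                                (1ull << FAKE_SLOT_VAR0) |
                                (1ull << (FAKE_SLOT_VAR0 + 2));

          printf("VAR0 -> hw index %u\n",
                 hw_index(input_mask, FAKE_SLOT_VAR0));      /* 0 */
          printf("VAR2 -> hw index %u\n",
                 hw_index(input_mask, FAKE_SLOT_VAR0 + 2));  /* 1 */
          return 0;
  }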
src/amd/common/ac_nir_to_llvm.c | 118 +++++++++++++++++++++++++++++++---------
src/amd/common/ac_shader_abi.h | 7 +++
2 files changed, 98 insertions(+), 27 deletions(-)
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 644c85e2eb..eb0935972d 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
nir_intrinsic_instr *instr)
{
LLVMValueRef values[8];
+ int location = instr->variables[0]->var->data.location;
int idx = instr->variables[0]->var->data.driver_location;
int ve = instr->dest.ssa.num_components;
unsigned comp = instr->variables[0]->var->data.location_frac;
@@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
instr->num_components, vertex_index, const_index, type);
}
+ LLVMValueRef inputs[4];
+
+ if (ctx->stage == MESA_SHADER_FRAGMENT) {
+ ctx->abi->load_fs_inputs(ctx->abi, location,
+ indir_index, const_index,
+ stride, inputs);
+ } else {
+ unsigned index = idx +
+ (indir_index ? 0 : const_index * stride);
+
+ memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
+ }
+
for (unsigned chan = comp; chan < ve + comp; chan++) {
if (indir_index) {
unsigned count = glsl_count_attribute_slots(
@@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
ctx->stage == MESA_SHADER_VERTEX);
count -= chan / 4;
LLVMValueRef tmp_vec = ac_build_gather_values_extended(
- &ctx->ac, ctx->abi->inputs + idx + chan, count,
+ &ctx->ac, inputs + chan, count,
stride, false, true);
values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
tmp_vec,
indir_index, "");
- } else
- values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
+ } else {
+ values[chan] = inputs[chan];
+ }
}
break;
case nir_var_local:
@@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
}
}
+static unsigned
+get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
+{
+ struct ac_shader_info *info = &ctx->shader_info->info;
+ uint64_t mask = info->input_mask & ((1ull << idx) - 1);
+
+ mask &= ~(1ull << VARYING_SLOT_POS);
+
+ return util_bitcount64(mask);
+}
+
+/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
+ * reload them at each use. This must be true if the shader is using
+ * derivatives and KILL, because KILL can leave the WQM and then a lazy
+ * input load isn't in the WQM anymore.
+ */
+static bool
+radv_preload_fs_inputs(struct radv_shader_context *ctx)
+{
+ return ctx->shader_info->info.ps.uses_derivatives &&
+ ctx->shader_info->info.ps.uses_kill;
+}
+
static void
-handle_fs_inputs(struct radv_shader_context *ctx,
- struct nir_shader *nir)
+radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
+ LLVMValueRef out[4])
{
struct ac_shader_info *info = &ctx->shader_info->info;
+ if (idx >= VARYING_SLOT_VAR0 ||
+ idx == VARYING_SLOT_PNTC ||
+ idx == VARYING_SLOT_PRIMITIVE_ID ||
+ idx == VARYING_SLOT_LAYER) {
+ unsigned interp_mode = info->ps.input_interp_mode[idx];
+ unsigned interp_loc = info->ps.input_interp_loc[idx];
+ unsigned hw_index = get_input_hw_index(ctx, idx);
+ LLVMValueRef interp_param =
+ lookup_interp_param(&ctx->abi, interp_mode, interp_loc);
+
+ interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,
+ &out[0]);
+ } else if (idx == VARYING_SLOT_POS) {
+ for (int i = 0; i < 3; ++i)
+ out[i] = ctx->abi.frag_pos[i];
+
+ out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
+ ctx->abi.frag_pos[3]);
+ }
+}
+
+static void
+load_fs_inputs(struct ac_shader_abi *abi,
+ unsigned location,
+ LLVMValueRef indir_index,
+ unsigned const_index,
+ unsigned stride,
+ LLVMValueRef out[4])
+{
+ struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
+
+ if (!radv_preload_fs_inputs(ctx)) {
+ radv_load_fs_inputs(ctx, location, out);
+ } else {
+ unsigned index = radeon_llvm_reg_index_soa(location, 0);
+
+ index += (indir_index ? 0 : const_index * stride);
+
+ memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
+ }
+}
+
+static void
+handle_fs_inputs(struct radv_shader_context *ctx,
+ struct nir_shader *nir)
+{
prepare_interp_optimize(ctx, nir);
nir_foreach_variable(variable, &nir->inputs)
handle_fs_input_decl(ctx, variable);
- unsigned index = 0;
-
for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
- LLVMValueRef interp_param;
LLVMValueRef *inputs = ctx->inputs + radeon_llvm_reg_index_soa(i, 0);
if (!(ctx->shader_info->info.input_mask & (1ull << i)))
continue;
- if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
- i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
- unsigned interp_mode = info->ps.input_interp_mode[i];
- unsigned interp_loc = info->ps.input_interp_loc[i];
-
- interp_param = lookup_interp_param(&ctx->abi, interp_mode,
- interp_loc);
-
- interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
- inputs);
-
- ++index;
- } else if (i == VARYING_SLOT_POS) {
- for(int i = 0; i < 3; ++i)
- inputs[i] = ctx->abi.frag_pos[i];
-
- inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
- ctx->abi.frag_pos[3]);
- }
+ radv_load_fs_inputs(ctx, i, inputs);
}
if (ctx->shader_info->info.needs_multiview_view_index)
@@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
ctx.abi.load_base_vertex = radv_load_base_vertex;
} else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
+ ctx.abi.load_fs_inputs = load_fs_inputs;
ctx.abi.lookup_interp_param = lookup_interp_param;
ctx.abi.load_sample_position = load_sample_position;
ctx.abi.load_sample_mask_in = load_sample_mask_in;
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index 901e49b1f9..8e51ce9fdd 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -97,6 +97,13 @@ struct ac_shader_abi {
unsigned const_index,
LLVMTypeRef type);
+ void (*load_fs_inputs)(struct ac_shader_abi *abi,
+ unsigned location,
+ LLVMValueRef indir_index,
+ unsigned const_index,
+ unsigned stride,
+ LLVMValueRef out[4]);
+
LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
LLVMTypeRef type,
LLVMValueRef vertex_index,
--
2.16.2