[Mesa-dev] [RFC PATCH 9/9] ac/nir: do not always preload PS inputs at beginning
Samuel Pitoiset
samuel.pitoiset at gmail.com
Mon Mar 12 10:13:37 UTC 2018
On 03/11/2018 04:07 PM, Marek Olšák wrote:
> On Thu, Mar 8, 2018 at 9:08 AM, Samuel Pitoiset
> <samuel.pitoiset at gmail.com <mailto:samuel.pitoiset at gmail.com>> wrote:
>
> RadeonSI does something similar. The VGPR decrease is a win,
> but I am not sure we really want to implement this.
>
> Polaris10:
> Totals from affected shaders:
> SGPRS: 116376 -> 116768 (0.34 %)
> VGPRS: 76556 -> 74868 (-2.20 %)
> Spilled SGPRs: 10347 -> 10466 (1.15 %)
> Code Size: 5555072 -> 5569024 (0.25 %) bytes
> Max Waves: 9854 -> 9951 (0.98 %)
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com
> <mailto:samuel.pitoiset at gmail.com>>
> ---
> src/amd/common/ac_nir_to_llvm.c | 118
> +++++++++++++++++++++++++++++++---------
> src/amd/common/ac_shader_abi.h | 7 +++
> 2 files changed, 98 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c
> b/src/amd/common/ac_nir_to_llvm.c
> index 644c85e2eb..eb0935972d 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct
> ac_nir_context *ctx,
> nir_intrinsic_instr *instr)
> {
> LLVMValueRef values[8];
> + int location = instr->variables[0]->var->data.location;
> int idx = instr->variables[0]->var->data.driver_location;
> int ve = instr->dest.ssa.num_components;
> unsigned comp = instr->variables[0]->var->data.location_frac;
> @@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct
> ac_nir_context *ctx,
>
> instr->num_components, vertex_index, const_index, type);
> }
>
> + LLVMValueRef inputs[4];
> +
> + if (ctx->stage == MESA_SHADER_FRAGMENT) {
> + ctx->abi->load_fs_inputs(ctx->abi, location,
> + indir_index,
> const_index,
> + stride, inputs);
>
>
> load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't
> get here?
Yes, I missed that; RadeonSI does actually reach this code path.
>
> Marek
>
> + } else {
> + unsigned index = idx +
> + (indir_index ? 0 : const_index *
> stride);
> +
> + memcpy(inputs, &ctx->abi->inputs[index],
> sizeof(inputs));
> + }
> +
> for (unsigned chan = comp; chan < ve + comp; chan++) {
> if (indir_index) {
> unsigned count =
> glsl_count_attribute_slots(
> @@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct
> ac_nir_context *ctx,
> ctx->stage ==
> MESA_SHADER_VERTEX);
> count -= chan / 4;
> LLVMValueRef tmp_vec =
> ac_build_gather_values_extended(
> - &ctx->ac,
> ctx->abi->inputs + idx + chan, count,
> + &ctx->ac, inputs +
> chan, count,
> stride, false, true);
>
> values[chan] =
> LLVMBuildExtractElement(ctx->ac.builder,
>
> tmp_vec,
>
> indir_index, "");
> - } else
> - values[chan] = ctx->abi->inputs[idx
> + chan + const_index * stride];
> + } else {
> + values[chan] = inputs[chan];
> + }
> }
> break;
> case nir_var_local:
> @@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct
> radv_shader_context *ctx,
> }
> }
>
> +static unsigned
> +get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
> +{
> + struct ac_shader_info *info = &ctx->shader_info->info;
> + uint64_t mask = info->input_mask & ((1ull << idx) - 1);
> +
> + mask &= ~(1ull << VARYING_SLOT_POS);
> +
> + return util_bitcount64(mask);
> +}
> +
> +/* If this is true, preload FS inputs at the beginning of shaders.
> Otherwise,
> + * reload them at each use. This must be true if the shader is using
> + * derivatives and KILL, because KILL can leave the WQM and then a lazy
> + * input load isn't in the WQM anymore.
> + */
> +static bool
> +radv_preload_fs_inputs(struct radv_shader_context *ctx)
> +{
> + return ctx->shader_info->info.ps.uses_derivatives &&
> + ctx->shader_info->info.ps.uses_kill;
> +}
> +
> static void
> -handle_fs_inputs(struct radv_shader_context *ctx,
> - struct nir_shader *nir)
> +radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
> + LLVMValueRef out[4])
> {
> struct ac_shader_info *info = &ctx->shader_info->info;
>
> + if (idx >= VARYING_SLOT_VAR0 ||
> + idx == VARYING_SLOT_PNTC ||
> + idx == VARYING_SLOT_PRIMITIVE_ID ||
> + idx == VARYING_SLOT_LAYER) {
> + unsigned interp_mode = info->ps.input_interp_mode[idx];
> + unsigned interp_loc = info->ps.input_interp_loc[idx];
> + unsigned hw_index = get_input_hw_index(ctx, idx);
> + LLVMValueRef interp_param =
> + lookup_interp_param(&ctx->abi, interp_mode,
> interp_loc);
> +
> + interp_fs_input(ctx, hw_index, interp_param,
> ctx->abi.prim_mask,
> + &out[0]);
> + } else if (idx == VARYING_SLOT_POS) {
> + for (int i = 0; i < 3; ++i)
> + out[i] = ctx->abi.frag_pos[i];
> +
> + out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
> + ctx->abi.frag_pos[3]);
> + }
> +}
> +
> +static void
> +load_fs_inputs(struct ac_shader_abi *abi,
> + unsigned location,
> + LLVMValueRef indir_index,
> + unsigned const_index,
> + unsigned stride,
> + LLVMValueRef out[4])
> +{
> + struct radv_shader_context *ctx =
> radv_shader_context_from_abi(abi);
> +
> + if (!radv_preload_fs_inputs(ctx)) {
> + radv_load_fs_inputs(ctx, location, out);
> + } else {
> + unsigned index = radeon_llvm_reg_index_soa(location, 0);
> +
> + index += (indir_index ? 0 : const_index * stride);
> +
> + memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
> + }
> +}
> +
> +static void
> +handle_fs_inputs(struct radv_shader_context *ctx,
> + struct nir_shader *nir)
> +{
> prepare_interp_optimize(ctx, nir);
>
> nir_foreach_variable(variable, &nir->inputs)
> handle_fs_input_decl(ctx, variable);
>
> - unsigned index = 0;
> -
> for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
> - LLVMValueRef interp_param;
> LLVMValueRef *inputs = ctx->inputs
> +radeon_llvm_reg_index_soa(i, 0);
>
> if (!(ctx->shader_info->info.input_mask & (1ull << i)))
> continue;
>
> - if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
> - i == VARYING_SLOT_PRIMITIVE_ID || i ==
> VARYING_SLOT_LAYER) {
> - unsigned interp_mode =
> info->ps.input_interp_mode[i];
> - unsigned interp_loc =
> info->ps.input_interp_loc[i];
> -
> - interp_param =
> lookup_interp_param(&ctx->abi, interp_mode,
> - interp_loc);
> -
> - interp_fs_input(ctx, index, interp_param,
> ctx->abi.prim_mask,
> - inputs);
> -
> - ++index;
> - } else if (i == VARYING_SLOT_POS) {
> - for(int i = 0; i < 3; ++i)
> - inputs[i] = ctx->abi.frag_pos[i];
> -
> - inputs[3] = ac_build_fdiv(&ctx->ac,
> ctx->ac.f32_1,
> - ctx->abi.frag_pos[3]);
> - }
> + radv_load_fs_inputs(ctx, i, inputs);
> }
>
> if (ctx->shader_info->info.needs_multiview_view_index)
> @@ -6924,6 +6987,7 @@ LLVMModuleRef
> ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
> ctx.abi.load_base_vertex =
> radv_load_base_vertex;
> } else if (shaders[i]->info.stage ==
> MESA_SHADER_FRAGMENT) {
> shader_info->fs.can_discard =
> shaders[i]->info.fs.uses_discard;
> + ctx.abi.load_fs_inputs = load_fs_inputs;
> ctx.abi.lookup_interp_param =
> lookup_interp_param;
> ctx.abi.load_sample_position =
> load_sample_position;
> ctx.abi.load_sample_mask_in =
> load_sample_mask_in;
> diff --git a/src/amd/common/ac_shader_abi.h
> b/src/amd/common/ac_shader_abi.h
> index 901e49b1f9..8e51ce9fdd 100644
> --- a/src/amd/common/ac_shader_abi.h
> +++ b/src/amd/common/ac_shader_abi.h
> @@ -97,6 +97,13 @@ struct ac_shader_abi {
> unsigned const_index,
> LLVMTypeRef type);
>
> + void (*load_fs_inputs)(struct ac_shader_abi *abi,
> + unsigned location,
> + LLVMValueRef indir_index,
> + unsigned const_index,
> + unsigned stride,
> + LLVMValueRef out[4]);
> +
> LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
> LLVMTypeRef type,
> LLVMValueRef vertex_index,
> --
> 2.16.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org <mailto:mesa-dev at lists.freedesktop.org>
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> <https://lists.freedesktop.org/mailman/listinfo/mesa-dev>
>
>
More information about the mesa-dev
mailing list