[Mesa-dev] [RFC PATCH 9/9] ac/nir: do not always preload PS inputs at beginning

Mon Mar 12 10:13:37 UTC 2018

On 03/11/2018 04:07 PM, Marek Olšák wrote:
> On Thu, Mar 8, 2018 at 9:08 AM, Samuel Pitoiset 
> <samuel.pitoiset at gmail.com <mailto:samuel.pitoiset at gmail.com>> wrote:
> 
>     RadeonSI does something similar. The VGPR decrease is a win,
>     but I'm not sure if we really want to implement that.
> 
>     Polaris10:
>     Totals from affected shaders:
>     SGPRS: 116376 -> 116768 (0.34 %)
>     VGPRS: 76556 -> 74868 (-2.20 %)
>     Spilled SGPRs: 10347 -> 10466 (1.15 %)
>     Code Size: 5555072 -> 5569024 (0.25 %) bytes
>     Max Waves: 9854 -> 9951 (0.98 %)
> 
>     Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com
>     <mailto:samuel.pitoiset at gmail.com>>
>     ---
>       src/amd/common/ac_nir_to_llvm.c | 118
>     +++++++++++++++++++++++++++++++---------
>       src/amd/common/ac_shader_abi.h  |   7 +++
>       2 files changed, 98 insertions(+), 27 deletions(-)
> 
>     diff --git a/src/amd/common/ac_nir_to_llvm.c
>     b/src/amd/common/ac_nir_to_llvm.c
>     index 644c85e2eb..eb0935972d 100644
>     --- a/src/amd/common/ac_nir_to_llvm.c
>     +++ b/src/amd/common/ac_nir_to_llvm.c
>     @@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct
>     ac_nir_context *ctx,
>                                         nir_intrinsic_instr *instr)
>       {
>              LLVMValueRef values[8];
>     +       int location = instr->variables[0]->var->data.location;
>              int idx = instr->variables[0]->var->data.driver_location;
>              int ve = instr->dest.ssa.num_components;
>              unsigned comp = instr->variables[0]->var->data.location_frac;
>     @@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct
>     ac_nir_context *ctx,
>                                                         
>       instr->num_components, vertex_index, const_index, type);
>                      }
> 
>     +               LLVMValueRef inputs[4];
>     +
>     +               if (ctx->stage == MESA_SHADER_FRAGMENT) {
>     +                       ctx->abi->load_fs_inputs(ctx->abi, location,
>     +                                                indir_index,
>     const_index,
>     +                                                stride, inputs);
> 
> 
> load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't 
> get here?

Yes, I missed that. RadeonSI should actually get there.

> 
> Marek
> 
>     +               } else {
>     +                       unsigned index = idx +
>     +                               (indir_index ? 0 : const_index *
>     stride);
>     +
>     +                       memcpy(inputs, &ctx->abi->inputs[index],
>     sizeof(inputs));
>     +               }
>     +
>                      for (unsigned chan = comp; chan < ve + comp; chan++) {
>                              if (indir_index) {
>                                      unsigned count =
>     glsl_count_attribute_slots(
>     @@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct
>     ac_nir_context *ctx,
>                                                      ctx->stage ==
>     MESA_SHADER_VERTEX);
>                                      count -= chan / 4;
>                                      LLVMValueRef tmp_vec =
>     ac_build_gather_values_extended(
>     -                                               &ctx->ac,
>     ctx->abi->inputs + idx + chan, count,
>     +                                               &ctx->ac, inputs +
>     chan, count,
>                                                      stride, false, true);
> 
>                                      values[chan] =
>     LLVMBuildExtractElement(ctx->ac.builder,
>                                                                         
>         tmp_vec,
>                                                                         
>         indir_index, "");
>     -                       } else
>     -                               values[chan] = ctx->abi->inputs[idx
>     + chan + const_index * stride];
>     +                       } else {
>     +                               values[chan] = inputs[chan];
>     +                       }
>                      }
>                      break;
>              case nir_var_local:
>     @@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct
>     radv_shader_context *ctx,
>              }
>       }
> 
>     +static unsigned
>     +get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
>     +{
>     +       struct ac_shader_info *info = &ctx->shader_info->info;
>     +       uint64_t mask = info->input_mask & ((1ull << idx) - 1);
>     +
>     +       mask &= ~(1ull << VARYING_SLOT_POS);
>     +
>     +       return util_bitcount64(mask);
>     +}
>     +
>     +/* If this is true, preload FS inputs at the beginning of shaders.
>     Otherwise,
>     + * reload them at each use. This must be true if the shader is using
>     + * derivatives and KILL, because KILL can leave the WQM and then a lazy
>     + * input load isn't in the WQM anymore.
>     + */
>     +static bool
>     +radv_preload_fs_inputs(struct radv_shader_context *ctx)
>     +{
>     +       return ctx->shader_info->info.ps.uses_derivatives &&
>     +              ctx->shader_info->info.ps.uses_kill;
>     +}
>     +
>       static void
>     -handle_fs_inputs(struct radv_shader_context *ctx,
>     -                 struct nir_shader *nir)
>     +radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
>     +                   LLVMValueRef out[4])
>       {
>              struct ac_shader_info *info = &ctx->shader_info->info;
> 
>     +       if (idx >= VARYING_SLOT_VAR0 ||
>     +           idx == VARYING_SLOT_PNTC ||
>     +           idx == VARYING_SLOT_PRIMITIVE_ID ||
>     +           idx == VARYING_SLOT_LAYER) {
>     +               unsigned interp_mode = info->ps.input_interp_mode[idx];
>     +               unsigned interp_loc = info->ps.input_interp_loc[idx];
>     +               unsigned hw_index = get_input_hw_index(ctx, idx);
>     +               LLVMValueRef interp_param =
>     +                       lookup_interp_param(&ctx->abi, interp_mode,
>     interp_loc);
>     +
>     +               interp_fs_input(ctx, hw_index, interp_param,
>     ctx->abi.prim_mask,
>     +                               &out[0]);
>     +       } else if (idx == VARYING_SLOT_POS) {
>     +               for (int i = 0; i < 3; ++i)
>     +                       out[i] = ctx->abi.frag_pos[i];
>     +
>     +               out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
>     +                                      ctx->abi.frag_pos[3]);
>     +       }
>     +}
>     +
>     +static void
>     +load_fs_inputs(struct ac_shader_abi *abi,
>     +              unsigned location,
>     +              LLVMValueRef indir_index,
>     +              unsigned const_index,
>     +              unsigned stride,
>     +              LLVMValueRef out[4])
>     +{
>     +       struct radv_shader_context *ctx =
>     radv_shader_context_from_abi(abi);
>     +
>     +       if (!radv_preload_fs_inputs(ctx)) {
>     +               radv_load_fs_inputs(ctx, location, out);
>     +       } else {
>     +               unsigned index = radeon_llvm_reg_index_soa(location, 0);
>     +
>     +               index += (indir_index ? 0 : const_index * stride);
>     +
>     +               memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
>     +       }
>     +}
>     +
>     +static void
>     +handle_fs_inputs(struct radv_shader_context *ctx,
>     +                 struct nir_shader *nir)
>     +{
>              prepare_interp_optimize(ctx, nir);
> 
>              nir_foreach_variable(variable, &nir->inputs)
>                      handle_fs_input_decl(ctx, variable);
> 
>     -       unsigned index = 0;
>     -
>              for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
>     -               LLVMValueRef interp_param;
>                      LLVMValueRef *inputs = ctx->inputs
>     +radeon_llvm_reg_index_soa(i, 0);
> 
>                      if (!(ctx->shader_info->info.input_mask & (1ull << i)))
>                              continue;
> 
>     -               if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
>     -                   i == VARYING_SLOT_PRIMITIVE_ID || i ==
>     VARYING_SLOT_LAYER) {
>     -                       unsigned interp_mode =
>     info->ps.input_interp_mode[i];
>     -                       unsigned interp_loc =
>     info->ps.input_interp_loc[i];
>     -
>     -                       interp_param =
>     lookup_interp_param(&ctx->abi, interp_mode,
>     -                                                          interp_loc);
>     -
>     -                       interp_fs_input(ctx, index, interp_param,
>     ctx->abi.prim_mask,
>     -                                       inputs);
>     -
>     -                       ++index;
>     -               } else if (i == VARYING_SLOT_POS) {
>     -                       for(int i = 0; i < 3; ++i)
>     -                               inputs[i] = ctx->abi.frag_pos[i];
>     -
>     -                       inputs[3] = ac_build_fdiv(&ctx->ac,
>     ctx->ac.f32_1,
>     -                                                 ctx->abi.frag_pos[3]);
>     -               }
>     +               radv_load_fs_inputs(ctx, i, inputs);
>              }
> 
>              if (ctx->shader_info->info.needs_multiview_view_index)
>     @@ -6924,6 +6987,7 @@ LLVMModuleRef
>     ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>                              ctx.abi.load_base_vertex =
>     radv_load_base_vertex;
>                      } else if (shaders[i]->info.stage ==
>     MESA_SHADER_FRAGMENT) {
>                              shader_info->fs.can_discard =
>     shaders[i]->info.fs.uses_discard;
>     +                       ctx.abi.load_fs_inputs = load_fs_inputs;
>                              ctx.abi.lookup_interp_param =
>     lookup_interp_param;
>                              ctx.abi.load_sample_position =
>     load_sample_position;
>                              ctx.abi.load_sample_mask_in =
>     load_sample_mask_in;
>     diff --git a/src/amd/common/ac_shader_abi.h
>     b/src/amd/common/ac_shader_abi.h
>     index 901e49b1f9..8e51ce9fdd 100644
>     --- a/src/amd/common/ac_shader_abi.h
>     +++ b/src/amd/common/ac_shader_abi.h
>     @@ -97,6 +97,13 @@ struct ac_shader_abi {
>                                          unsigned const_index,
>                                          LLVMTypeRef type);
> 
>     +       void (*load_fs_inputs)(struct ac_shader_abi *abi,
>     +                              unsigned location,
>     +                              LLVMValueRef indir_index,
>     +                              unsigned const_index,
>     +                              unsigned stride,
>     +                              LLVMValueRef out[4]);
>     +
>              LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
>                                                 LLVMTypeRef type,
>                                                 LLVMValueRef vertex_index,
>     --
>     2.16.2
> 
>     _______________________________________________
>     mesa-dev mailing list
>     mesa-dev at lists.freedesktop.org <mailto:mesa-dev at lists.freedesktop.org>
>     https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>     <https://lists.freedesktop.org/mailman/listinfo/mesa-dev>
> 
>