[Mesa-dev] [RFC PATCH 4/4] radv: add support for push constants inlining when possible

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Tue Feb 5 10:29:53 UTC 2019


On Tue, Feb 5, 2019 at 11:07 AM Samuel Pitoiset
<samuel.pitoiset at gmail.com> wrote:
>
>
> On 2/5/19 10:58 AM, Bas Nieuwenhuizen wrote:
> > On Fri, Jan 25, 2019 at 5:27 PM Samuel Pitoiset
> > <samuel.pitoiset at gmail.com> wrote:
> >> This removes some scalar loads from shaders, but it increases
> >> the number of SET_SH_REG packets. This is currently basic but
> >> it could be improved if needed. Inlining dynamic offsets might
> >> also help.
> >>
> >> Original idea from Dave Airlie.
> >>
> >> 29164 shaders in 15096 tests
> >> Totals:
> >> SGPRS: 1336072 -> 1365241 (2.18 %)
> >> VGPRS: 937784 -> 934592 (-0.34 %)
> >> Spilled SGPRs: 24751 -> 24796 (0.18 %)
> >> Code Size: 50001672 -> 49815524 (-0.37 %) bytes
> >> Max Waves: 208755 -> 208830 (0.04 %)
> >>
> >> Totals from affected shaders:
> >> SGPRS: 295018 -> 324187 (9.89 %)
> >> VGPRS: 243108 -> 239916 (-1.31 %)
> >> Spilled SGPRs: 1464 -> 1509 (3.07 %)
> >> Code Size: 8028188 -> 7842040 (-2.32 %) bytes
> >> Max Waves: 69580 -> 69655 (0.11 %)
> >>
> >> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> >> ---
> >>   src/amd/common/ac_nir_to_llvm.c   | 23 +++++++--
> >>   src/amd/common/ac_shader_abi.h    |  4 ++
> >>   src/amd/vulkan/radv_cmd_buffer.c  | 78 ++++++++++++++++++++++---------
> >>   src/amd/vulkan/radv_nir_to_llvm.c | 54 +++++++++++++++++++++
> >>   src/amd/vulkan/radv_shader.h      | 10 ++--
> >>   src/amd/vulkan/radv_shader_info.c |  4 ++
> >>   6 files changed, 145 insertions(+), 28 deletions(-)
> >>
> >> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> >> index f509fc31dff..db1574b5b35 100644
> >> --- a/src/amd/common/ac_nir_to_llvm.c
> >> +++ b/src/amd/common/ac_nir_to_llvm.c
> >> @@ -1392,10 +1392,27 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
> >>                                                nir_intrinsic_instr *instr)
> >>   {
> >>          LLVMValueRef ptr, addr;
> >> +       LLVMValueRef src0 = get_src(ctx, instr->src[0]);
> >> +       unsigned index = nir_intrinsic_base(instr);
> >>
> >> -       addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
> >> -       addr = LLVMBuildAdd(ctx->ac.builder, addr,
> >> -                           get_src(ctx, instr->src[0]), "");
> >> +       addr = LLVMConstInt(ctx->ac.i32, index, 0);
> >> +       addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
> >> +
> >> +       /* Load constant values from user SGPRS when possible, otherwise
> >> +        * fallback to the default path that loads directly from memory.
> >> +        */
> >> +       if (LLVMIsConstant(src0) &&
> >> +           index == 0 &&
> >> +           instr->dest.ssa.bit_size == 32) {
> >> +               unsigned offset = LLVMConstIntGetZExtValue(src0) / 4;
> >> +               unsigned count = instr->dest.ssa.num_components;
> >> +
> >> +               if (offset + count <= ctx->abi->num_inline_push_consts) {
> >> +                       return ac_build_gather_values(&ctx->ac,
> >> +                                                     ctx->abi->inline_push_consts + offset,
> >> +                                                     count);
> >> +               }
> >> +       }
> >>
> >>          ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
> >>
> >> diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
> >> index ee18e6c1923..704c3d107c2 100644
> >> --- a/src/amd/common/ac_shader_abi.h
> >> +++ b/src/amd/common/ac_shader_abi.h
> >> @@ -32,6 +32,8 @@ struct nir_variable;
> >>
> >>   #define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
> >>
> >> +#define AC_MAX_INLINE_PUSH_CONSTS 8
> >> +
> >>   enum ac_descriptor_type {
> >>          AC_DESC_IMAGE,
> >>          AC_DESC_FMASK,
> >> @@ -66,6 +68,8 @@ struct ac_shader_abi {
> >>
> >>          /* Vulkan only */
> >>          LLVMValueRef push_constants;
> >> +       LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
> >> +       unsigned num_inline_push_consts;
> >>          LLVMValueRef view_index;
> >>
> >>          LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
> >> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> >> index aae90290841..f80e2078da0 100644
> >> --- a/src/amd/vulkan/radv_cmd_buffer.c
> >> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> >> @@ -627,6 +627,23 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
> >>          }
> >>   }
> >>
> >> +static void
> >> +radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
> >> +                            struct radv_pipeline *pipeline,
> >> +                            gl_shader_stage stage,
> >> +                            int idx, int count, uint32_t *values)
> >> +{
> >> +       struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
> >> +       uint32_t base_reg = pipeline->user_data_0[stage];
> >> +       if (loc->sgpr_idx == -1)
> >> +               return;
> >> +
> >> +       assert(loc->num_sgprs == count);
> >> +
> >> +       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
> >> +       radeon_emit_array(cmd_buffer->cs, values, count);
> >> +}
> >> +
> >>   static void
> >>   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> >>                                struct radv_pipeline *pipeline)
> >> @@ -1900,6 +1917,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
> >>                  radv_get_descriptors_state(cmd_buffer, bind_point);
> >>          struct radv_pipeline_layout *layout = pipeline->layout;
> >>          struct radv_shader_variant *shader, *prev_shader;
> >> +       bool need_push_constants = false;
> >>          unsigned offset;
> >>          void *ptr;
> >>          uint64_t va;
> >> @@ -1909,37 +1927,55 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
> >>              (!layout->push_constant_size && !layout->dynamic_offset_count))
> >>                  return;
> >>
> >> -       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> >> -                                         16 * layout->dynamic_offset_count,
> >> -                                         256, &offset, &ptr))
> >> -               return;
> >> +       radv_foreach_stage(stage, stages) {
> >> +               if (!pipeline->shaders[stage])
> >> +                       continue;
> >>
> >> -       memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> >> -       memcpy((char*)ptr + layout->push_constant_size,
> >> -              descriptors_state->dynamic_buffers,
> >> -              16 * layout->dynamic_offset_count);
> >> +               need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
> >> +               need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
> >>
> >> -       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> >> -       va += offset;
> >> +               uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;
> >>
> >> -       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> >> -                                                          cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> >> +               radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
> >> +                                            AC_UD_INLINE_PUSH_CONSTANTS,
> >> +                                            count,
> >> +                                            (uint32_t *)&cmd_buffer->push_constants[0]);
> >> +       }
> >>
> >> -       prev_shader = NULL;
> >> -       radv_foreach_stage(stage, stages) {
> >> -               shader = radv_get_shader(pipeline, stage);
> >> +       if (need_push_constants) {
> >> +               if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> >> +                                                 16 * layout->dynamic_offset_count,
> >> +                                                 256, &offset, &ptr))
> >> +                       return;
> >> +
> >> +               memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> >> +               memcpy((char*)ptr + layout->push_constant_size,
> >> +                      descriptors_state->dynamic_buffers,
> >> +                      16 * layout->dynamic_offset_count);
> >> +
> >> +               va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> >> +               va += offset;
> >> +
> >> +               MAYBE_UNUSED unsigned cdw_max =
> >> +                       radeon_check_space(cmd_buffer->device->ws,
> >> +                                          cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> >> +
> >> +               prev_shader = NULL;
> >> +               radv_foreach_stage(stage, stages) {
> >> +                       shader = radv_get_shader(pipeline, stage);
> >>
> >> -               /* Avoid redundantly emitting the address for merged stages. */
> >> -               if (shader && shader != prev_shader) {
> >> -                       radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> >> -                                                  AC_UD_PUSH_CONSTANTS, va);
> >> +                       /* Avoid redundantly emitting the address for merged stages. */
> >> +                       if (shader && shader != prev_shader) {
> >> +                               radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> >> +                                                          AC_UD_PUSH_CONSTANTS, va);
> >>
> >> -                       prev_shader = shader;
> >> +                               prev_shader = shader;
> >> +                       }
> >>                  }
> >> +               assert(cmd_buffer->cs->cdw <= cdw_max);
> >>          }
> >>
> >>          cmd_buffer->push_constant_stages &= ~stages;
> >> -       assert(cmd_buffer->cs->cdw <= cdw_max);
> >>   }
> >>
> >>   static void
> >> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
> >> index 11417c5991b..1cffa748d15 100644
> >> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> >> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> >> @@ -627,6 +627,47 @@ count_vs_user_sgprs(struct radv_shader_context *ctx)
> >>          return count;
> >>   }
> >>
> >> +static void allocate_inline_push_consts(struct radv_shader_context *ctx,
> >> +                                       struct user_sgpr_info *user_sgpr_info)
> >> +{
> >> +       uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
> >> +
> >> +       /* Only supported if shaders don't have indirect push constants. */
> >> +       if (ctx->shader_info->info.has_indirect_push_constants)
> >> +               return;
> >> +
> >> +       /* Only supported for 32-bit push constants. */
> >> +       if (!ctx->shader_info->info.has_32bit_push_constants)
> >> +               return;
> >> +
> >> +       /* Only supported if base starts from 0. */
> >> +       if (ctx->shader_info->info.min_push_constant_used != 0)
> >> +               return;
> >> +
> >> +       uint8_t num_push_consts =
> >> +               (ctx->shader_info->info.max_push_constant_used -
> >> +                ctx->shader_info->info.min_push_constant_used) / 4;
> > I think you need +1? if min_push_constant_used ==
> > max_push_constant_used you still use 1 push constant. (The code in the
> > info pass adds range, but that does not include the size of the loaded
> > thing?)
>
> No, as min is 'base' and max is 'base + range'.
>
> >
> > Now that I think about it, the shader info pass does not take into
> > account the size of the variable at all right? What happens with a
> > vec4? The range does not include the object itself right?
>
> If it's a vec4 starting from base=0, range should be 16, that works for
> vec4.

hmm, so it includes the size of type. If I read the spir-v parser
correctly we set range very large though (looks like range is always
the total size of all push constants?)

>
> >
> >> +
> >> +       /* Check if the number of user SGPRs is large enough. */
> >> +       if (num_push_consts < remaining_sgprs) {
> >> +               ctx->shader_info->info.num_inline_push_consts = num_push_consts;
> >> +       } else {
> >> +               ctx->shader_info->info.num_inline_push_consts = remaining_sgprs;
> >> +       }
> >> +
> >> +       /* Clamp to the maximum number of allowed inlined push constants. */
> >> +       if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
> >> +               ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
> >> +
> >> +       if (ctx->shader_info->info.num_inline_push_consts == num_push_consts &&
> >> +           !ctx->shader_info->info.loads_dynamic_offsets) {
> > This won't work, as we can reach here even if there are some accesses
> > which don't use inline push constants, e.g. some non 32-bit accesses.
>
> Good catch! One possible fix is to change 'has_32bit_push_constants' to
> 'has_only_32bit_push_constants' and bail out if it's false.
>
> What do you think?

That works.

>
> >
> >> +               /* Disable the default push constants path if all constants are
> >> +                * inlined and if shaders don't use dynamic descriptors.
> >> +                */
> >> +               ctx->shader_info->info.loads_push_constants = false;
> >> +       }
> >> +}
> >> +
> >>   static void allocate_user_sgprs(struct radv_shader_context *ctx,
> >>                                  gl_shader_stage stage,
> >>                                  bool has_previous_stage,
> >> @@ -706,6 +747,8 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
> >>          } else {
> >>                  user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
> >>          }
> >> +
> >> +       allocate_inline_push_consts(ctx, user_sgpr_info);
> >>   }
> >>
> >>   static void
> >> @@ -735,6 +778,12 @@ declare_global_input_sgprs(struct radv_shader_context *ctx,
> >>                  add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
> >>          }
> >>
> >> +       for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) {
> >> +               add_arg(args, ARG_SGPR, ctx->ac.i32,
> >> +                       &ctx->abi.inline_push_consts[i]);
> >> +       }
> >> +       ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts;
> >> +
> >>          if (ctx->shader_info->info.so.num_outputs) {
> >>                  add_arg(args, ARG_SGPR,
> >>                          ac_array_in_const32_addr_space(ctx->ac.v4i32),
> >> @@ -853,6 +902,11 @@ set_global_input_locs(struct radv_shader_context *ctx,
> >>                  set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
> >>          }
> >>
> >> +       if (ctx->shader_info->info.num_inline_push_consts) {
> >> +               set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
> >> +                              ctx->shader_info->info.num_inline_push_consts);
> >> +       }
> >> +
> >>          if (ctx->streamout_buffers) {
> >>                  set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
> >>                                 user_sgpr_idx);
> >> diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
> >> index 09bc5c2d4a9..d1bc467c5d9 100644
> >> --- a/src/amd/vulkan/radv_shader.h
> >> +++ b/src/amd/vulkan/radv_shader.h
> >> @@ -129,10 +129,11 @@ struct radv_nir_compiler_options {
> >>   enum radv_ud_index {
> >>          AC_UD_SCRATCH_RING_OFFSETS = 0,
> >>          AC_UD_PUSH_CONSTANTS = 1,
> >> -       AC_UD_INDIRECT_DESCRIPTOR_SETS = 2,
> >> -       AC_UD_VIEW_INDEX = 3,
> >> -       AC_UD_STREAMOUT_BUFFERS = 4,
> >> -       AC_UD_SHADER_START = 5,
> >> +       AC_UD_INLINE_PUSH_CONSTANTS = 2,
> >> +       AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
> >> +       AC_UD_VIEW_INDEX = 4,
> >> +       AC_UD_STREAMOUT_BUFFERS = 5,
> >> +       AC_UD_SHADER_START = 6,
> >>          AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
> >>          AC_UD_VS_BASE_VERTEX_START_INSTANCE,
> >>          AC_UD_VS_MAX_UD,
> >> @@ -167,6 +168,7 @@ struct radv_shader_info {
> >>          uint8_t max_push_constant_used;
> >>          bool has_32bit_push_constants;
> >>          bool has_indirect_push_constants;
> >> +       uint8_t num_inline_push_consts;
> >>          uint32_t desc_set_used_mask;
> >>          bool needs_multiview_view_index;
> >>          bool uses_invocation_id;
> >> diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
> >> index cabe2a470a0..900bca1d097 100644
> >> --- a/src/amd/vulkan/radv_shader_info.c
> >> +++ b/src/amd/vulkan/radv_shader_info.c
> >> @@ -211,6 +211,10 @@ gather_push_constant_info(const nir_shader *nir,
> >>          if (base < info->min_push_constant_used)
> >>                  info->min_push_constant_used = base;
> >>
> >> +       /* TODO: handle base not 0. */
> >> +       if (base != 0)
> >> +               info->min_push_constant_used = -1;
> >> +
> >>          info->loads_push_constants = true;
> >>   }
> >>
> >> --
> >> 2.20.1
> >>
> >> _______________________________________________
> >> mesa-dev mailing list
> >> mesa-dev at lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list