[Mesa-dev] [PATCH 4/4] radeonsi/nir: gather buffers declared more accurately and use const fast path

Fri Mar 30 05:50:57 UTC 2018

On 30/03/18 13:52, Marek Olšák wrote:
> On Tue, Mar 27, 2018 at 12:19 AM, Timothy Arceri <tarceri at itsqueeze.com 
> <mailto:tarceri at itsqueeze.com>> wrote:
> 
>     For now we skip SI && HAVE_LLVM < 0x0600 for simplicity. We also skip
>     setting the more accurate masks for some builtin uniforms for now as
>     it causes some piglit regressions.
>     ---
>       src/gallium/drivers/radeonsi/si_shader.c     |  8 +++
>       src/gallium/drivers/radeonsi/si_shader_nir.c | 82
>     ++++++++++++++++++++++++++--
>       2 files changed, 84 insertions(+), 6 deletions(-)
> 
>     diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>     b/src/gallium/drivers/radeonsi/si_shader.c
>     index 62cb7ea7eb5..9a12f9ee8f2 100644
>     --- a/src/gallium/drivers/radeonsi/si_shader.c
>     +++ b/src/gallium/drivers/radeonsi/si_shader.c
>     @@ -2377,8 +2377,16 @@ static LLVMValueRef
>     load_const_buffer_desc(struct si_shader_context *ctx, int i)
>       static LLVMValueRef load_ubo(struct ac_shader_abi *abi,
>     LLVMValueRef index)
>       {
>              struct si_shader_context *ctx =
>     si_shader_context_from_abi(abi);
>     +       struct si_shader_selector *sel = ctx->shader->selector;
>     +
>              LLVMValueRef ptr = LLVMGetParam(ctx->main_fn,
>     ctx->param_const_and_shader_buffers);
> 
>     +       if (sel->info.const_buffers_declared == 1 &&
>     +           sel->info.shader_buffers_declared == 0 &&
>     +           !(ctx->screen->info.chip_class == SI && HAVE_LLVM <
>     0x0600)) {
> 
> 
> You don't have to check SI and LLVM here. (because 
> const_buffers_declared > 1)

Right. Will fix.

> 
>     +               return load_const_buffer_desc_fast_path(ctx);
>     +       }
>     +
>              index = si_llvm_bound_index(ctx, index,
>     ctx->num_const_buffers);
>              index = LLVMBuildAdd(ctx->ac.builder, index,
>                                   LLVMConstInt(ctx->i32,
>     SI_NUM_SHADER_BUFFERS, 0), "");
>     diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c
>     b/src/gallium/drivers/radeonsi/si_shader_nir.c
>     index 52950668714..595f376f6a2 100644
>     --- a/src/gallium/drivers/radeonsi/si_shader_nir.c
>     +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
>     @@ -611,23 +611,91 @@ void si_nir_scan_shader(const struct
>     nir_shader *nir,
> 
>              info->num_outputs = num_outputs;
> 
>     +       struct set *ubo_set = _mesa_set_create(NULL, _mesa_hash_pointer,
>     +                                              _mesa_key_pointer_equal);
>     +
>     +       unsigned ubo_idx = 1;
>              nir_foreach_variable(variable, &nir->uniforms) {
>                      const struct glsl_type *type = variable->type;
>                      enum glsl_base_type base_type =
>                              glsl_get_base_type(glsl_without_array(type));
>                      unsigned aoa_size = MAX2(1, glsl_get_aoa_size(type));
> 
>     +               /* Gather buffers declared bitmasks. Note: radeonsi
>     doesn't
>     +                * really use the mask (other than ubo_idx == 1 for
>     regular
>     +                * uniforms) its really only used for getting the
>     buffer count
>     +                * so we don't need to worry about the ordering.
>     +                */
>     +               if (variable->interface_type != NULL) {
>     +                       if (variable->data.mode == nir_var_uniform) {
>     +
>     +                               unsigned block_size;
>     +                               if (base_type != GLSL_TYPE_INTERFACE) {
>     +                                       struct set_entry *entry =
>     +                                             
>       _mesa_set_search(ubo_set, variable->interface_type);
>     +
>     +                                       /* Check if we have already
>     processed
>     +                                        * a member from this ubo.
>     +                                        */
>     +                                       if (entry)
>     +                                               continue;
>     +
>     +                                       block_size = 1;
>     +                               } else {
>     +                                       block_size = aoa_size;
>     +                               }
>     +
>     +                               info->const_buffers_declared |=
>     u_bit_consecutive(ubo_idx, block_size);
>     +                               ubo_idx += block_size;
> 
> 
> Can you explain what this does?

Sets the mask for an array of blocks e.g.

uniform block {

vec4 a;
vec4 b;

} name[4];

Since each block array element is considered a separate buffer. I should 
probably rename block_size -> block_count

> 
>     +
>     +                               _mesa_set_add(ubo_set,
>     variable->interface_type);
>     +                       }
>     +
>     +                       if (variable->data.mode ==
>     nir_var_shader_storage) {
>     +                               /* TODO: make this more accurate */
>     +                               info->shader_buffers_declared =
>     +                                       u_bit_consecutive(0,
>     SI_NUM_SHADER_BUFFERS);
> 
>     +                       }
>     +
>     +                       continue;
>     +               }
>     +
>                      /* We rely on the fact that
>     nir_lower_samplers_as_deref has
>                       * eliminated struct dereferences.
>                       */
>     -               if (base_type == GLSL_TYPE_SAMPLER)
>     +               if (base_type == GLSL_TYPE_SAMPLER) {
>                              info->samplers_declared |=
>                                     
>     u_bit_consecutive(variable->data.binding, aoa_size);
>     -               else if (base_type == GLSL_TYPE_IMAGE)
>     +
>     +                       if (variable->data.bindless)
>     +                               info->const_buffers_declared |= 1;
>     +               } else if (base_type == GLSL_TYPE_IMAGE) {
>                              info->images_declared |=
>                                     
>     u_bit_consecutive(variable->data.binding, aoa_size);
>     +
>     +                       if (variable->data.bindless)
>     +                               info->const_buffers_declared |= 1;
>     +               } else if (base_type != GLSL_TYPE_ATOMIC_UINT) {
>     +                       if (strncmp(variable->name, "state.", 6) == 0) {
>     +                               /* FIXME: figure out why piglit
>     tests with builtin
>     +                                * uniforms are failing without this.
>     +                                */
>     +                               info->const_buffers_declared =
>     +                                       u_bit_consecutive(0,
>     SI_NUM_CONST_BUFFERS);
> 
>     +                       } else {
>     +                               info->const_buffers_declared |= 1;
>     +                               info->const_file_max[0] +=
>     +                                     
>       glsl_count_attribute_slots(type, false);
>     +                       }
>     +               }
>              }
> 
>     +       _mesa_set_destroy(ubo_set, NULL);
>     +
>     +       /* This is the max index not max count so we adjust it here */
>     +       if (info->const_file_max[0] != 0)
>     +               info->const_file_max[0] -= 1;
>     +
>              info->num_written_clipdistance =
>     nir->info.clip_distance_array_size;
>              info->num_written_culldistance =
>     nir->info.cull_distance_array_size;
>              info->clipdist_writemask = u_bit_consecutive(0,
>     info->num_written_clipdistance);
>     @@ -636,10 +704,6 @@ void si_nir_scan_shader(const struct nir_shader
>     *nir,
>              if (info->processor == PIPE_SHADER_FRAGMENT)
>                      info->uses_kill = nir->info.fs.uses_discard;
> 
>     -       /* TODO make this more accurate */
>     -       info->const_buffers_declared = u_bit_consecutive(0,
>     SI_NUM_CONST_BUFFERS);
>     -       info->shader_buffers_declared = u_bit_consecutive(0,
>     SI_NUM_SHADER_BUFFERS);
>     -
>              func = (struct nir_function
>     *)exec_list_get_head_const(&nir->functions);
>              nir_foreach_block(block, func->impl) {
>                      nir_foreach_instr(instr, block)
>     @@ -654,6 +718,12 @@ void si_nir_scan_shader(const struct nir_shader
>     *nir,
>       void
>       si_lower_nir(struct si_shader_selector* sel)
>       {
>     +       /* Disable const buffer fast path for old LLVM versions */
>     +       if (sel->screen->info.chip_class == SI && HAVE_LLVM < 0x0600) {
>     +               sel->info.const_buffers_declared =
>     u_bit_consecutive(0, SI_NUM_CONST_BUFFERS);
>     +               sel->info.shader_buffers_declared =
>     u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
>     +       }
> 
> 
> You can ignore what I said in patch 3.
> 
> You can do this instead:
> if (.... && const_buffers_declared == 1 && shader_buffers_declared = 0)
>     const_buffers_declared |= 0x2;
> 
> This disables the optimization while still significantly decreases 
> overhead in the descriptor upload path.

Makes sense will change. Thanks!.

> 
> Marek