[Mesa-dev] [PATCH 16/16] radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer

Tue Oct 17 19:33:13 UTC 2017

On 17.10.2017 17:43, Marek Olšák wrote:
> On Tue, Oct 17, 2017 at 2:25 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 13.10.2017 14:04, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.olsak at amd.com>
>>>
>>> SI_SGPR_CONST_AND_SHADER_BUFFERS now contains the pointer to const buffer
>>> 0
>>> if there is no other buffer there.
>>>
>>> Benefits:
>>> - there is no constbuf descriptor upload and shader load
>>>
>>> It's assumed that all constant addresses are within bounds. Non-constant
>>> addresses are clamped against the last declared CONST variable.
>>> This only works if the state tracker ensures the bound constant buffer
>>> matches what the shader needs.
>>>
>>> Once we get 32-bit pointers, we can only do this for user constant buffers
>>> where the driver is in charge of the upload so that it can guarantee a
>>> 32-bit
>>> address.
>>>
>>> The real performance benefit might not be measurable.
>>>
>>> These apps get 100% theoretical benefit in all shaders (except where
>>> noted):
>>> - antichamber
>>> - barman arkham origins
>>> - borderlands 2
>>> - borderlands pre-sequel
>>> - brutal legend
>>> - civilization BE
>>> - CS:GO
>>> - deadcore
>>> - dota 2 -- most shaders
>>> - europa universalis
>>> - grid autosport -- most shaders
>>> - left 4 dead 2
>>> - legend of grimrock
>>> - life is strange
>>> - payday 2
>>> - portal
>>> - rocket league
>>> - serious sam 3 bfe
>>> - talos principle
>>> - team fortress 2
>>> - thea
>>> - unigine heaven
>>> - unigine valley -- also sanctuary and tropics
>>> - wasteland 2
>>> - xcom: enemy unknown & enemy within
>>> - tesseract
>>> - unity (engine)
>>>
>>> Changed stats only:
>>>       SGPRS: 2059998 -> 2086238 (1.27 %)
>>>       VGPRS: 1626888 -> 1626904 (0.00 %)
>>>       Spilled SGPRs: 7902 -> 7865 (-0.47 %)
>>>       Code Size: 60924520 -> 60982660 (0.10 %) bytes
>>>       Max Waves: 374539 -> 374526 (-0.00 %)
>>> ---
>>>    src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++++++--
>>>    src/gallium/drivers/radeonsi/si_shader.c      | 72
>>> +++++++++++++++++++++++----
>>>    src/gallium/drivers/radeonsi/si_shader.h      |  2 +-
>>>    src/gallium/drivers/radeonsi/si_state.h       |  3 ++
>>>    4 files changed, 87 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 0c1fca8..da6efa8 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -119,20 +119,21 @@ static void si_init_descriptor_list(uint32_t
>>> *desc_list,
>>>      static void si_init_descriptors(struct si_descriptors *desc,
>>>                                  unsigned shader_userdata_index,
>>>                                  unsigned element_dw_size,
>>>                                  unsigned num_elements)
>>>    {
>>>          desc->list = CALLOC(num_elements, element_dw_size * 4);
>>>          desc->element_dw_size = element_dw_size;
>>>          desc->num_elements = num_elements;
>>>          desc->shader_userdata_offset = shader_userdata_index * 4;
>>> +       desc->slot_index_to_bind_directly = -1;
>>>    }
>>>      static void si_release_descriptors(struct si_descriptors *desc)
>>>    {
>>>          r600_resource_reference(&desc->buffer, NULL);
>>>          FREE(desc->list);
>>>    }
>>>      static bool si_upload_descriptors(struct si_context *sctx,
>>>                                    struct si_descriptors *desc)
>>> @@ -141,20 +142,34 @@ static bool si_upload_descriptors(struct si_context
>>> *sctx,
>>>          unsigned first_slot_offset = desc->first_active_slot * slot_size;
>>>          unsigned upload_size = desc->num_active_slots * slot_size;
>>>          /* Skip the upload if no shader is using the descriptors.
>>> dirty_mask
>>>           * will stay dirty and the descriptors will be uploaded when there
>>> is
>>>           * a shader using them.
>>>           */
>>>          if (!upload_size)
>>>                  return true;
>>>    +     /* If there is just one active descriptor, bind it directly. */
>>> +       if ((int)desc->first_active_slot ==
>>> desc->slot_index_to_bind_directly &&
>>> +           desc->num_active_slots == 1) {
>>> +               uint32_t *descriptor =
>>> &desc->list[desc->slot_index_to_bind_directly *
>>> +                                                  desc->element_dw_size];
>>> +
>>> +               /* The buffer is already in the buffer list. */
>>> +               r600_resource_reference(&desc->buffer, NULL);
>>> +               desc->gpu_list = NULL;
>>> +               desc->gpu_address =
>>> si_desc_extract_buffer_address(descriptor);
>>> +               si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
>>> +               return true;
>>> +       }
>>> +
>>>          uint32_t *ptr;
>>>          int buffer_offset;
>>>          u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
>>>                         si_optimal_tcc_alignment(sctx, upload_size),
>>>                         (unsigned*)&buffer_offset,
>>>                         (struct pipe_resource**)&desc->buffer,
>>>                         (void**)&ptr);
>>>          if (!desc->buffer) {
>>>                  desc->gpu_address = 0;
>>>                  return false; /* skip the draw call */
>>> @@ -2524,38 +2539,40 @@ void si_init_all_descriptors(struct si_context
>>> *sctx)
>>>          int i;
>>>          STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>>>          STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>>>          for (i = 0; i < SI_NUM_SHADERS; i++) {
>>>                  bool gfx9_tcs = false;
>>>                  bool gfx9_gs = false;
>>>                  unsigned num_sampler_slots = SI_NUM_IMAGES / 2 +
>>> SI_NUM_SAMPLERS;
>>>                  unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS +
>>> SI_NUM_CONST_BUFFERS;
>>> +               struct si_descriptors *desc;
>>>                  if (sctx->b.chip_class >= GFX9) {
>>>                          gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
>>>                          gfx9_gs = i == PIPE_SHADER_GEOMETRY;
>>>                  }
>>>    -
>>> si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
>>> -
>>> si_const_and_shader_buffer_descriptors(sctx, i),
>>> +               desc = si_const_and_shader_buffer_descriptors(sctx, i);
>>> +
>>> si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
>>>                                           num_buffer_slots,
>>>                                           gfx9_tcs ?
>>> GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
>>>                                           gfx9_gs ?
>>> GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
>>>
>>> SI_SGPR_CONST_AND_SHADER_BUFFERS,
>>>                                           RADEON_USAGE_READWRITE,
>>>                                           RADEON_USAGE_READ,
>>>                                           RADEON_PRIO_SHADER_RW_BUFFER,
>>>                                           RADEON_PRIO_CONST_BUFFER);
>>> +               desc->slot_index_to_bind_directly =
>>> si_get_constbuf_slot(0);
>>>    -             struct si_descriptors *desc =
>>> si_sampler_and_image_descriptors(sctx, i);
>>> +               desc = si_sampler_and_image_descriptors(sctx, i);
>>>                  si_init_descriptors(desc,
>>>                                      gfx9_tcs ?
>>> GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
>>>                                      gfx9_gs ?
>>> GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
>>>                                                SI_SGPR_SAMPLERS_AND_IMAGES,
>>>                                      16, num_sampler_slots);
>>>                  int j;
>>>                  for (j = 0; j < SI_NUM_IMAGES; j++)
>>>                          memcpy(desc->list + j * 8, null_image_descriptor,
>>> 8 * 4);
>>>                  for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>> index f83f45c..ed60f35 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> @@ -1966,20 +1966,21 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef
>>> index, bool write)
>>>          return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
>>>    }
>>>      static LLVMValueRef fetch_constant(
>>>          struct lp_build_tgsi_context *bld_base,
>>>          const struct tgsi_full_src_register *reg,
>>>          enum tgsi_opcode_type type,
>>>          unsigned swizzle)
>>>    {
>>>          struct si_shader_context *ctx = si_shader_context(bld_base);
>>> +       struct si_shader_selector *sel = ctx->shader->selector;
>>>          const struct tgsi_ind_register *ireg = &reg->Indirect;
>>>          unsigned buf, idx;
>>>          LLVMValueRef addr, bufp;
>>>          if (swizzle == LP_CHAN_ALL) {
>>>                  unsigned chan;
>>>                  LLVMValueRef values[4];
>>>                  for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
>>>                          values[chan] = fetch_constant(bld_base, reg, type,
>>> chan);
>>> @@ -1989,42 +1990,87 @@ static LLVMValueRef fetch_constant(
>>>          /* Split 64-bit loads. */
>>>          if (tgsi_type_is_64bit(type)) {
>>>                  LLVMValueRef lo, hi;
>>>                  lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED,
>>> swizzle);
>>>                  hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED,
>>> swizzle + 1);
>>>                  return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
>>>          }
>>>    +     idx = reg->Register.Index * 4 + swizzle;
>>> +       if (reg->Register.Indirect) {
>>> +               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
>>> +       } else {
>>> +               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
>>> +       }
>>> +
>>> +       /* Fast path when user data SGPRs point to constant buffer 0
>>> directly. */
>>> +       if (sel->info.const_buffers_declared == 1 &&
>>> +           sel->info.shader_buffers_declared == 0) {
>>> +               LLVMValueRef ptr =
>>> +                       LLVMGetParam(ctx->main_fn,
>>> ctx->param_const_and_shader_buffers);
>>> +
>>> +               /* This enables use of s_load_dword and flat_load_dword
>>> for const buffer 0
>>> +                * loads, and up to x4 load opcode merging. However, it
>>> leads to horrible
>>> +                * code reducing SIMD wave occupancy from 8 to 2 in many
>>> cases.
>>> +                *
>>> +                * Using s_buffer_load_dword (x1) seems to be the best
>>> option right now.
>>> +                */
>>> +#if 0 /* keep this codepath disabled */
>>> +               if (!reg->Register.Indirect) {
>>> +                       addr = LLVMBuildLShr(ctx->ac.builder, addr,
>>> LLVMConstInt(ctx->i32, 2, 0), "");
>>> +                       LLVMValueRef result =
>>> ac_build_load_invariant(&ctx->ac, ptr, addr);
>>> +                       return bitcast(bld_base, type, result);
>>> +               }
>>> +#endif
>>> +
>>> +               /* Do the bounds checking with a descriptor, because
>>> +                * doing computation and manual bounds checking of 64-bit
>>> +                * addresses generates horrible VALU code with very high
>>> +                * VGPR usage and very low SIMD occupancy.
>>> +                */
>>> +               ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64,
>>> "");
>>> +               ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32,
>>> "");
>>> +
>>> +               LLVMValueRef desc_elems[] = {
>>> +                       LLVMBuildExtractElement(ctx->ac.builder, ptr,
>>> ctx->i32_0, ""),
>>> +                       LLVMBuildExtractElement(ctx->ac.builder, ptr,
>>> ctx->i32_1, ""),
>>> +                       LLVMConstInt(ctx->i32,
>>> (sel->info.const_file_max[0] + 1) * 16, 0),
>>> +                       LLVMConstInt(ctx->i32,
>>> +                               S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
>>> +                               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
>>> +                               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
>>> +                               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
>>> +
>>> S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
>>> +
>>> S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
>>> +               };
>>> +               LLVMValueRef desc = ac_build_gather_values(&ctx->ac,
>>> desc_elems, 4);
>>> +               LLVMValueRef result = buffer_load_const(ctx, desc, addr);
>>> +               return bitcast(bld_base, type, result);
>>> +       }
>>> +
>>>          assert(reg->Register.Dimension);
>>>          buf = reg->Dimension.Index;
>>> -       idx = reg->Register.Index * 4 + swizzle;
>>>          if (reg->Dimension.Indirect) {
>>>                  LLVMValueRef ptr = LLVMGetParam(ctx->main_fn,
>>> ctx->param_const_and_shader_buffers);
>>>                  LLVMValueRef index;
>>>                  index = si_get_bounded_indirect_index(ctx,
>>> &reg->DimIndirect,
>>>
>>> reg->Dimension.Index,
>>>
>>> ctx->num_const_buffers);
>>>                  index = LLVMBuildAdd(ctx->ac.builder, index,
>>>                                       LLVMConstInt(ctx->i32,
>>> SI_NUM_SHADER_BUFFERS, 0), "");
>>>                  bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
>>>          } else
>>>                  bufp = load_const_buffer_desc(ctx, buf);
>>>    -     if (reg->Register.Indirect) {
>>> -               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
>>> -       } else {
>>> -               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
>>> -       }
>>> -
>>>          return bitcast(bld_base, type, buffer_load_const(ctx, bufp,
>>> addr));
>>>    }
>>>      /* Upper 16 bits must be zero. */
>>>    static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context
>>> *ctx,
>>>                                             LLVMValueRef val[2])
>>>    {
>>>          return LLVMBuildOr(ctx->ac.builder, val[0],
>>>                             LLVMBuildShl(ctx->ac.builder, val[1],
>>>                                          LLVMConstInt(ctx->i32, 16, 0),
>>> @@ -4250,24 +4296,32 @@ static unsigned si_get_max_workgroup_size(const
>>> struct si_shader *shader)
>>>                   */
>>>                  max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
>>>          }
>>>          return max_work_group_size;
>>>    }
>>>      static void declare_per_stage_desc_pointers(struct si_shader_context
>>> *ctx,
>>>                                              struct si_function_info
>>> *fninfo,
>>>                                              bool assign_params)
>>>    {
>>> +       LLVMTypeRef const_shader_buf_type;
>>> +
>>> +       if (ctx->shader->selector->info.const_buffers_declared == 1 &&
>>> +           ctx->shader->selector->info.shader_buffers_declared == 0)
>>> +               const_shader_buf_type = ctx->f32;
>>> +       else
>>> +               const_shader_buf_type = ctx->v4i32;
>>> +
>>>          unsigned const_and_shader_buffers =
>>>                  add_arg(fninfo, ARG_SGPR,
>>> -                       si_const_array(ctx->v4i32,
>>> -                                      SI_NUM_SHADER_BUFFERS +
>>> SI_NUM_CONST_BUFFERS));
>>> +                       si_const_array(const_shader_buf_type, 0));
>>
>>
>> Is this really necessary? The pointer is only cast to an int anyway.
> 
> Not with the current patch, but it allows using normal loads if we
> ever wanted to switch to them. Not sure if PointerCast would work for
> those.

Ah yes, for the #if'd out code. Makes sense, R-b for this patch as well.

Cheers,
Nicolai

> 
> Marek
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.