[Mesa-dev] [PATCH 16/16] radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer

Tue Oct 17 15:43:15 UTC 2017

On Tue, Oct 17, 2017 at 2:25 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 13.10.2017 14:04, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> SI_SGPR_CONST_AND_SHADER_BUFFERS now contains the pointer to const buffer
>> 0
>> if there is no other buffer there.
>>
>> Benefits:
>> - there is no constbuf descriptor upload and shader load
>>
>> It's assumed that all constant addresses are within bounds. Non-constant
>> addresses are clamped against the last declared CONST variable.
>> This only works if the state tracker ensures the bound constant buffer
>> matches what the shader needs.
>>
>> Once we get 32-bit pointers, we can only do this for user constant buffers
>> where the driver is in charge of the upload so that it can guarantee a
>> 32-bit
>> address.
>>
>> The real performance benefit might not be measurable.
>>
>> These apps get 100% theoretical benefit in all shaders (except where
>> noted):
>> - antichamber
>> - barman arkham origins
>> - borderlands 2
>> - borderlands pre-sequel
>> - brutal legend
>> - civilization BE
>> - CS:GO
>> - deadcore
>> - dota 2 -- most shaders
>> - europa universalis
>> - grid autosport -- most shaders
>> - left 4 dead 2
>> - legend of grimrock
>> - life is strange
>> - payday 2
>> - portal
>> - rocket league
>> - serious sam 3 bfe
>> - talos principle
>> - team fortress 2
>> - thea
>> - unigine heaven
>> - unigine valley -- also sanctuary and tropics
>> - wasteland 2
>> - xcom: enemy unknown & enemy within
>> - tesseract
>> - unity (engine)
>>
>> Changed stats only:
>>      SGPRS: 2059998 -> 2086238 (1.27 %)
>>      VGPRS: 1626888 -> 1626904 (0.00 %)
>>      Spilled SGPRs: 7902 -> 7865 (-0.47 %)
>>      Code Size: 60924520 -> 60982660 (0.10 %) bytes
>>      Max Waves: 374539 -> 374526 (-0.00 %)
>> ---
>>   src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++++++--
>>   src/gallium/drivers/radeonsi/si_shader.c      | 72
>> +++++++++++++++++++++++----
>>   src/gallium/drivers/radeonsi/si_shader.h      |  2 +-
>>   src/gallium/drivers/radeonsi/si_state.h       |  3 ++
>>   4 files changed, 87 insertions(+), 13 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 0c1fca8..da6efa8 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -119,20 +119,21 @@ static void si_init_descriptor_list(uint32_t
>> *desc_list,
>>     static void si_init_descriptors(struct si_descriptors *desc,
>>                                 unsigned shader_userdata_index,
>>                                 unsigned element_dw_size,
>>                                 unsigned num_elements)
>>   {
>>         desc->list = CALLOC(num_elements, element_dw_size * 4);
>>         desc->element_dw_size = element_dw_size;
>>         desc->num_elements = num_elements;
>>         desc->shader_userdata_offset = shader_userdata_index * 4;
>> +       desc->slot_index_to_bind_directly = -1;
>>   }
>>     static void si_release_descriptors(struct si_descriptors *desc)
>>   {
>>         r600_resource_reference(&desc->buffer, NULL);
>>         FREE(desc->list);
>>   }
>>     static bool si_upload_descriptors(struct si_context *sctx,
>>                                   struct si_descriptors *desc)
>> @@ -141,20 +142,34 @@ static bool si_upload_descriptors(struct si_context
>> *sctx,
>>         unsigned first_slot_offset = desc->first_active_slot * slot_size;
>>         unsigned upload_size = desc->num_active_slots * slot_size;
>>         /* Skip the upload if no shader is using the descriptors.
>> dirty_mask
>>          * will stay dirty and the descriptors will be uploaded when there
>> is
>>          * a shader using them.
>>          */
>>         if (!upload_size)
>>                 return true;
>>   +     /* If there is just one active descriptor, bind it directly. */
>> +       if ((int)desc->first_active_slot ==
>> desc->slot_index_to_bind_directly &&
>> +           desc->num_active_slots == 1) {
>> +               uint32_t *descriptor =
>> &desc->list[desc->slot_index_to_bind_directly *
>> +                                                  desc->element_dw_size];
>> +
>> +               /* The buffer is already in the buffer list. */
>> +               r600_resource_reference(&desc->buffer, NULL);
>> +               desc->gpu_list = NULL;
>> +               desc->gpu_address =
>> si_desc_extract_buffer_address(descriptor);
>> +               si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
>> +               return true;
>> +       }
>> +
>>         uint32_t *ptr;
>>         int buffer_offset;
>>         u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
>>                        si_optimal_tcc_alignment(sctx, upload_size),
>>                        (unsigned*)&buffer_offset,
>>                        (struct pipe_resource**)&desc->buffer,
>>                        (void**)&ptr);
>>         if (!desc->buffer) {
>>                 desc->gpu_address = 0;
>>                 return false; /* skip the draw call */
>> @@ -2524,38 +2539,40 @@ void si_init_all_descriptors(struct si_context
>> *sctx)
>>         int i;
>>         STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>>         STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>>         for (i = 0; i < SI_NUM_SHADERS; i++) {
>>                 bool gfx9_tcs = false;
>>                 bool gfx9_gs = false;
>>                 unsigned num_sampler_slots = SI_NUM_IMAGES / 2 +
>> SI_NUM_SAMPLERS;
>>                 unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS +
>> SI_NUM_CONST_BUFFERS;
>> +               struct si_descriptors *desc;
>>                 if (sctx->b.chip_class >= GFX9) {
>>                         gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
>>                         gfx9_gs = i == PIPE_SHADER_GEOMETRY;
>>                 }
>>   -
>> si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
>> -
>> si_const_and_shader_buffer_descriptors(sctx, i),
>> +               desc = si_const_and_shader_buffer_descriptors(sctx, i);
>> +
>> si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
>>                                          num_buffer_slots,
>>                                          gfx9_tcs ?
>> GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
>>                                          gfx9_gs ?
>> GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
>>
>> SI_SGPR_CONST_AND_SHADER_BUFFERS,
>>                                          RADEON_USAGE_READWRITE,
>>                                          RADEON_USAGE_READ,
>>                                          RADEON_PRIO_SHADER_RW_BUFFER,
>>                                          RADEON_PRIO_CONST_BUFFER);
>> +               desc->slot_index_to_bind_directly =
>> si_get_constbuf_slot(0);
>>   -             struct si_descriptors *desc =
>> si_sampler_and_image_descriptors(sctx, i);
>> +               desc = si_sampler_and_image_descriptors(sctx, i);
>>                 si_init_descriptors(desc,
>>                                     gfx9_tcs ?
>> GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
>>                                     gfx9_gs ?
>> GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
>>                                               SI_SGPR_SAMPLERS_AND_IMAGES,
>>                                     16, num_sampler_slots);
>>                 int j;
>>                 for (j = 0; j < SI_NUM_IMAGES; j++)
>>                         memcpy(desc->list + j * 8, null_image_descriptor,
>> 8 * 4);
>>                 for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>> b/src/gallium/drivers/radeonsi/si_shader.c
>> index f83f45c..ed60f35 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -1966,20 +1966,21 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef
>> index, bool write)
>>         return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
>>   }
>>     static LLVMValueRef fetch_constant(
>>         struct lp_build_tgsi_context *bld_base,
>>         const struct tgsi_full_src_register *reg,
>>         enum tgsi_opcode_type type,
>>         unsigned swizzle)
>>   {
>>         struct si_shader_context *ctx = si_shader_context(bld_base);
>> +       struct si_shader_selector *sel = ctx->shader->selector;
>>         const struct tgsi_ind_register *ireg = &reg->Indirect;
>>         unsigned buf, idx;
>>         LLVMValueRef addr, bufp;
>>         if (swizzle == LP_CHAN_ALL) {
>>                 unsigned chan;
>>                 LLVMValueRef values[4];
>>                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
>>                         values[chan] = fetch_constant(bld_base, reg, type,
>> chan);
>> @@ -1989,42 +1990,87 @@ static LLVMValueRef fetch_constant(
>>         /* Split 64-bit loads. */
>>         if (tgsi_type_is_64bit(type)) {
>>                 LLVMValueRef lo, hi;
>>                 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED,
>> swizzle);
>>                 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED,
>> swizzle + 1);
>>                 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
>>         }
>>   +     idx = reg->Register.Index * 4 + swizzle;
>> +       if (reg->Register.Indirect) {
>> +               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
>> +       } else {
>> +               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
>> +       }
>> +
>> +       /* Fast path when user data SGPRs point to constant buffer 0
>> directly. */
>> +       if (sel->info.const_buffers_declared == 1 &&
>> +           sel->info.shader_buffers_declared == 0) {
>> +               LLVMValueRef ptr =
>> +                       LLVMGetParam(ctx->main_fn,
>> ctx->param_const_and_shader_buffers);
>> +
>> +               /* This enables use of s_load_dword and flat_load_dword
>> for const buffer 0
>> +                * loads, and up to x4 load opcode merging. However, it
>> leads to horrible
>> +                * code reducing SIMD wave occupancy from 8 to 2 in many
>> cases.
>> +                *
>> +                * Using s_buffer_load_dword (x1) seems to be the best
>> option right now.
>> +                */
>> +#if 0 /* keep this codepath disabled */
>> +               if (!reg->Register.Indirect) {
>> +                       addr = LLVMBuildLShr(ctx->ac.builder, addr,
>> LLVMConstInt(ctx->i32, 2, 0), "");
>> +                       LLVMValueRef result =
>> ac_build_load_invariant(&ctx->ac, ptr, addr);
>> +                       return bitcast(bld_base, type, result);
>> +               }
>> +#endif
>> +
>> +               /* Do the bounds checking with a descriptor, because
>> +                * doing computation and manual bounds checking of 64-bit
>> +                * addresses generates horrible VALU code with very high
>> +                * VGPR usage and very low SIMD occupancy.
>> +                */
>> +               ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64,
>> "");
>> +               ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32,
>> "");
>> +
>> +               LLVMValueRef desc_elems[] = {
>> +                       LLVMBuildExtractElement(ctx->ac.builder, ptr,
>> ctx->i32_0, ""),
>> +                       LLVMBuildExtractElement(ctx->ac.builder, ptr,
>> ctx->i32_1, ""),
>> +                       LLVMConstInt(ctx->i32,
>> (sel->info.const_file_max[0] + 1) * 16, 0),
>> +                       LLVMConstInt(ctx->i32,
>> +                               S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
>> +                               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
>> +                               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
>> +                               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
>> +
>> S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
>> +
>> S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
>> +               };
>> +               LLVMValueRef desc = ac_build_gather_values(&ctx->ac,
>> desc_elems, 4);
>> +               LLVMValueRef result = buffer_load_const(ctx, desc, addr);
>> +               return bitcast(bld_base, type, result);
>> +       }
>> +
>>         assert(reg->Register.Dimension);
>>         buf = reg->Dimension.Index;
>> -       idx = reg->Register.Index * 4 + swizzle;
>>         if (reg->Dimension.Indirect) {
>>                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn,
>> ctx->param_const_and_shader_buffers);
>>                 LLVMValueRef index;
>>                 index = si_get_bounded_indirect_index(ctx,
>> &reg->DimIndirect,
>>
>> reg->Dimension.Index,
>>
>> ctx->num_const_buffers);
>>                 index = LLVMBuildAdd(ctx->ac.builder, index,
>>                                      LLVMConstInt(ctx->i32,
>> SI_NUM_SHADER_BUFFERS, 0), "");
>>                 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
>>         } else
>>                 bufp = load_const_buffer_desc(ctx, buf);
>>   -     if (reg->Register.Indirect) {
>> -               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
>> -       } else {
>> -               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
>> -       }
>> -
>>         return bitcast(bld_base, type, buffer_load_const(ctx, bufp,
>> addr));
>>   }
>>     /* Upper 16 bits must be zero. */
>>   static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context
>> *ctx,
>>                                            LLVMValueRef val[2])
>>   {
>>         return LLVMBuildOr(ctx->ac.builder, val[0],
>>                            LLVMBuildShl(ctx->ac.builder, val[1],
>>                                         LLVMConstInt(ctx->i32, 16, 0),
>> @@ -4250,24 +4296,32 @@ static unsigned si_get_max_workgroup_size(const
>> struct si_shader *shader)
>>                  */
>>                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
>>         }
>>         return max_work_group_size;
>>   }
>>     static void declare_per_stage_desc_pointers(struct si_shader_context
>> *ctx,
>>                                             struct si_function_info
>> *fninfo,
>>                                             bool assign_params)
>>   {
>> +       LLVMTypeRef const_shader_buf_type;
>> +
>> +       if (ctx->shader->selector->info.const_buffers_declared == 1 &&
>> +           ctx->shader->selector->info.shader_buffers_declared == 0)
>> +               const_shader_buf_type = ctx->f32;
>> +       else
>> +               const_shader_buf_type = ctx->v4i32;
>> +
>>         unsigned const_and_shader_buffers =
>>                 add_arg(fninfo, ARG_SGPR,
>> -                       si_const_array(ctx->v4i32,
>> -                                      SI_NUM_SHADER_BUFFERS +
>> SI_NUM_CONST_BUFFERS));
>> +                       si_const_array(const_shader_buf_type, 0));
>
>
> Is this really necessary? The pointer is only cast to an int anyway.

Not with the current patch, but it allows using normal loads if we
ever wanted to switch to them. Not sure if PointerCast would work for
those.

Marek