[Mesa-dev] [PATCH 16/16] radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer
Nicolai Hähnle
nhaehnle at gmail.com
Tue Oct 17 19:33:13 UTC 2017
On 17.10.2017 17:43, Marek Olšák wrote:
> On Tue, Oct 17, 2017 at 2:25 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 13.10.2017 14:04, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.olsak at amd.com>
>>>
>>> SI_SGPR_CONST_AND_SHADER_BUFFERS now contains the pointer to const buffer 0
>>> if there is no other buffer there.
>>>
>>> Benefits:
>>> - there is no constbuf descriptor upload and shader load
>>>
>>> It's assumed that all constant addresses are within bounds. Non-constant
>>> addresses are clamped against the last declared CONST variable.
>>> This only works if the state tracker ensures the bound constant buffer
>>> matches what the shader needs.
>>>
>>> Once we get 32-bit pointers, we can only do this for user constant buffers
>>> where the driver is in charge of the upload so that it can guarantee
>>> a 32-bit address.
>>>
>>> The real performance benefit might not be measurable.
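For illustration, the whole optimization hinges on one shader-info check. Below is a
minimal sketch of the condition this patch tests in si_shader.c (simplified struct,
hypothetical helper name, not code from the patch):

    /* Sketch only: the fast path applies when the shader declares constant
     * buffer 0 and nothing else in the combined CONST/SSBO set, so the
     * user-data SGPR pair can hold the address of const buffer 0 itself
     * instead of the address of an uploaded descriptor list. */
    struct shader_info_sketch {
            unsigned const_buffers_declared;  /* bitmask of declared const buffers */
            unsigned shader_buffers_declared; /* bitmask of declared shader buffers */
    };

    static bool only_const_buffer_0_is_used(const struct shader_info_sketch *info)
    {
            return info->const_buffers_declared == 1 && /* only slot 0 */
                   info->shader_buffers_declared == 0;  /* no SSBOs */
    }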
>>>
>>> These apps get 100% theoretical benefit in all shaders (except where noted):
>>> - antichamber
>>> - batman arkham origins
>>> - borderlands 2
>>> - borderlands pre-sequel
>>> - brutal legend
>>> - civilization BE
>>> - CS:GO
>>> - deadcore
>>> - dota 2 -- most shaders
>>> - europa universalis
>>> - grid autosport -- most shaders
>>> - left 4 dead 2
>>> - legend of grimrock
>>> - life is strange
>>> - payday 2
>>> - portal
>>> - rocket league
>>> - serious sam 3 bfe
>>> - talos principle
>>> - team fortress 2
>>> - thea
>>> - unigine heaven
>>> - unigine valley -- also sanctuary and tropics
>>> - wasteland 2
>>> - xcom: enemy unknown & enemy within
>>> - tesseract
>>> - unity (engine)
>>>
>>> Changed stats only:
>>> SGPRS: 2059998 -> 2086238 (1.27 %)
>>> VGPRS: 1626888 -> 1626904 (0.00 %)
>>> Spilled SGPRs: 7902 -> 7865 (-0.47 %)
>>> Code Size: 60924520 -> 60982660 (0.10 %) bytes
>>> Max Waves: 374539 -> 374526 (-0.00 %)
>>> ---
>>>   src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++++++--
>>>   src/gallium/drivers/radeonsi/si_shader.c      | 72 +++++++++++++++++++++++----
>>>   src/gallium/drivers/radeonsi/si_shader.h      |  2 +-
>>>   src/gallium/drivers/radeonsi/si_state.h       |  3 ++
>>>   4 files changed, 87 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 0c1fca8..da6efa8 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -119,20 +119,21 @@ static void si_init_descriptor_list(uint32_t *desc_list,
>>> static void si_init_descriptors(struct si_descriptors *desc,
>>> unsigned shader_userdata_index,
>>> unsigned element_dw_size,
>>> unsigned num_elements)
>>> {
>>> desc->list = CALLOC(num_elements, element_dw_size * 4);
>>> desc->element_dw_size = element_dw_size;
>>> desc->num_elements = num_elements;
>>> desc->shader_userdata_offset = shader_userdata_index * 4;
>>> + desc->slot_index_to_bind_directly = -1;
>>> }
>>> static void si_release_descriptors(struct si_descriptors *desc)
>>> {
>>> r600_resource_reference(&desc->buffer, NULL);
>>> FREE(desc->list);
>>> }
>>> static bool si_upload_descriptors(struct si_context *sctx,
>>> struct si_descriptors *desc)
>>> @@ -141,20 +142,34 @@ static bool si_upload_descriptors(struct si_context *sctx,
>>> unsigned first_slot_offset = desc->first_active_slot * slot_size;
>>> unsigned upload_size = desc->num_active_slots * slot_size;
>>>         /* Skip the upload if no shader is using the descriptors. dirty_mask
>>>          * will stay dirty and the descriptors will be uploaded when there is
>>>          * a shader using them.
>>>          */
>>> if (!upload_size)
>>> return true;
>>> +       /* If there is just one active descriptor, bind it directly. */
>>> +       if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
>>> +           desc->num_active_slots == 1) {
>>> +               uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
>>> +                                                  desc->element_dw_size];
>>> +
>>> +               /* The buffer is already in the buffer list. */
>>> +               r600_resource_reference(&desc->buffer, NULL);
>>> +               desc->gpu_list = NULL;
>>> +               desc->gpu_address = si_desc_extract_buffer_address(descriptor);
>>> +               si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
>>> +               return true;
>>> +       }
>>> +
>>> uint32_t *ptr;
>>> int buffer_offset;
>>> u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
>>> si_optimal_tcc_alignment(sctx, upload_size),
>>> (unsigned*)&buffer_offset,
>>> (struct pipe_resource**)&desc->buffer,
>>> (void**)&ptr);
>>> if (!desc->buffer) {
>>> desc->gpu_address = 0;
>>> return false; /* skip the draw call */
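Side note on si_desc_extract_buffer_address above: presumably it just reassembles the
buffer's 64-bit GPU address from the first two dwords of the descriptor, which is what
makes it possible to point the user-data SGPRs at the buffer without any upload. A
sketch under that assumption (the exact BASE_ADDRESS_HI mask is my guess, not copied
from the patch):

    #include <stdint.h>

    /* Sketch: dword0 = BASE_ADDRESS[31:0], dword1 bits [15:0] = BASE_ADDRESS[47:32]
     * in the usual SI buffer resource descriptor layout. */
    static uint64_t desc_buffer_address_sketch(const uint32_t *desc)
    {
            return (uint64_t)desc[0] | ((uint64_t)(desc[1] & 0xffff) << 32);
    }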
>>> @@ -2524,38 +2539,40 @@ void si_init_all_descriptors(struct si_context *sctx)
>>> int i;
>>> STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>>> STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>>> for (i = 0; i < SI_NUM_SHADERS; i++) {
>>> bool gfx9_tcs = false;
>>> bool gfx9_gs = false;
>>>                 unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
>>>                 unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
>>> + struct si_descriptors *desc;
>>> if (sctx->b.chip_class >= GFX9) {
>>> gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
>>> gfx9_gs = i == PIPE_SHADER_GEOMETRY;
>>> }
>>> -               si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
>>> -                                        si_const_and_shader_buffer_descriptors(sctx, i),
>>> +               desc = si_const_and_shader_buffer_descriptors(sctx, i);
>>> +               si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
>>>                                          num_buffer_slots,
>>>                                          gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
>>>                                          gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
>>>                                                    SI_SGPR_CONST_AND_SHADER_BUFFERS,
>>>                                          RADEON_USAGE_READWRITE,
>>>                                          RADEON_USAGE_READ,
>>>                                          RADEON_PRIO_SHADER_RW_BUFFER,
>>>                                          RADEON_PRIO_CONST_BUFFER);
>>> +               desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
>>> -               struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
>>> +               desc = si_sampler_and_image_descriptors(sctx, i);
>>>                 si_init_descriptors(desc,
>>>                                     gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
>>>                                     gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
>>>                                               SI_SGPR_SAMPLERS_AND_IMAGES,
>>>                                     16, num_sampler_slots);
>>>                 int j;
>>>                 for (j = 0; j < SI_NUM_IMAGES; j++)
>>>                         memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
>>>                 for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
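For reference, si_get_constbuf_slot(0) picks const buffer 0's slot in the combined
CONST/SSBO descriptor list. Judging by the fetch_constant hunk below, which adds
SI_NUM_SHADER_BUFFERS to constant buffer indices, the mapping is presumably:

    /* Sketch of the assumed slot layout of the combined descriptor list:
     * shader buffers occupy slots [0, SI_NUM_SHADER_BUFFERS), constant
     * buffers follow. */
    static unsigned get_constbuf_slot_sketch(unsigned index)
    {
            return SI_NUM_SHADER_BUFFERS + index; /* const buffer 0 -> first slot after SSBOs */
    }

so slot_index_to_bind_directly only matches when const buffer 0 is the single active slot.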
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
>>> index f83f45c..ed60f35 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> @@ -1966,20 +1966,21 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
>>> return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
>>> }
>>> static LLVMValueRef fetch_constant(
>>> struct lp_build_tgsi_context *bld_base,
>>> const struct tgsi_full_src_register *reg,
>>> enum tgsi_opcode_type type,
>>> unsigned swizzle)
>>> {
>>> struct si_shader_context *ctx = si_shader_context(bld_base);
>>> + struct si_shader_selector *sel = ctx->shader->selector;
>>>         const struct tgsi_ind_register *ireg = &reg->Indirect;
>>> unsigned buf, idx;
>>> LLVMValueRef addr, bufp;
>>> if (swizzle == LP_CHAN_ALL) {
>>> unsigned chan;
>>> LLVMValueRef values[4];
>>> for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
>>>                         values[chan] = fetch_constant(bld_base, reg, type, chan);
>>> @@ -1989,42 +1990,87 @@ static LLVMValueRef fetch_constant(
>>> /* Split 64-bit loads. */
>>> if (tgsi_type_is_64bit(type)) {
>>> LLVMValueRef lo, hi;
>>>                 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
>>>                 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
>>> return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
>>> }
>>> +       idx = reg->Register.Index * 4 + swizzle;
>>> +       if (reg->Register.Indirect) {
>>> +               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
>>> +       } else {
>>> +               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
>>> +       }
>>> +
>>> +       /* Fast path when user data SGPRs point to constant buffer 0 directly. */
>>> +       if (sel->info.const_buffers_declared == 1 &&
>>> +           sel->info.shader_buffers_declared == 0) {
>>> +               LLVMValueRef ptr =
>>> +                       LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
>>> +
>>> +               /* This enables use of s_load_dword and flat_load_dword for const buffer 0
>>> +                * loads, and up to x4 load opcode merging. However, it leads to horrible
>>> +                * code reducing SIMD wave occupancy from 8 to 2 in many cases.
>>> +                *
>>> +                * Using s_buffer_load_dword (x1) seems to be the best option right now.
>>> +                */
>>> +#if 0 /* keep this codepath disabled */
>>> +               if (!reg->Register.Indirect) {
>>> +                       addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
>>> +                       LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
>>> +                       return bitcast(bld_base, type, result);
>>> +               }
>>> +#endif
>>> +
>>> +               /* Do the bounds checking with a descriptor, because
>>> +                * doing computation and manual bounds checking of 64-bit
>>> +                * addresses generates horrible VALU code with very high
>>> +                * VGPR usage and very low SIMD occupancy.
>>> +                */
>>> +               ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
>>> +               ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
>>> +
>>> +               LLVMValueRef desc_elems[] = {
>>> +                       LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
>>> +                       LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
>>> +                       LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
>>> +                       LLVMConstInt(ctx->i32,
>>> +                                    S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
>>> +                                    S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
>>> +                                    S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
>>> +                                    S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
>>> +                                    S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
>>> +                                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
>>> +               };
>>> +               LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
>>> +               LLVMValueRef result = buffer_load_const(ctx, desc, addr);
>>> +               return bitcast(bld_base, type, result);
>>> +       }
>>> +
>>> assert(reg->Register.Dimension);
>>> buf = reg->Dimension.Index;
>>> - idx = reg->Register.Index * 4 + swizzle;
>>>         if (reg->Dimension.Indirect) {
>>>                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn,
>>>                                                 ctx->param_const_and_shader_buffers);
>>>                 LLVMValueRef index;
>>>                 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
>>>                                                       reg->Dimension.Index,
>>>                                                       ctx->num_const_buffers);
>>>                 index = LLVMBuildAdd(ctx->ac.builder, index,
>>>                                      LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
>>>                 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
>>>         } else
>>>                 bufp = load_const_buffer_desc(ctx, buf);
>>> -       if (reg->Register.Indirect) {
>>> -               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
>>> -       } else {
>>> -               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
>>> -       }
>>> -
>>>         return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
>>> }
>>> /* Upper 16 bits must be zero. */
>>>  static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
>>>                                             LLVMValueRef val[2])
>>>  {
>>>         return LLVMBuildOr(ctx->ac.builder, val[0],
>>>                            LLVMBuildShl(ctx->ac.builder, val[1],
>>>                                         LLVMConstInt(ctx->i32, 16, 0),
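To make the clamping from the commit message concrete: the num_records field of the
descriptor built above is (const_file_max[0] + 1) * 16 bytes, and the hardware
range-checks buffer loads against it (out-of-range buffer reads return zero), which is
the clamping against the last declared CONST variable the commit message relies on. A
rough worked example with a hypothetical shader:

    /* Hypothetical shader declaring CONST[0..31], so const_file_max[0] == 31. */
    unsigned const_file_max = 31;                     /* last declared constant */
    unsigned num_records = (const_file_max + 1) * 16; /* 512 bytes in the descriptor */
    unsigned byte_offset = 40 * 16;                   /* dynamically indexed CONST[40].x */
    /* byte_offset (640) >= num_records (512): the load is out of range and
     * is caught by the descriptor's range check instead of reading stray memory. */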
>>> @@ -4250,24 +4296,32 @@ static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
>>> */
>>> max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
>>> }
>>> return max_work_group_size;
>>> }
>>>  static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
>>>                                              struct si_function_info *fninfo,
>>>                                              bool assign_params)
>>>  {
>>> +       LLVMTypeRef const_shader_buf_type;
>>> +
>>> +       if (ctx->shader->selector->info.const_buffers_declared == 1 &&
>>> +           ctx->shader->selector->info.shader_buffers_declared == 0)
>>> +               const_shader_buf_type = ctx->f32;
>>> +       else
>>> +               const_shader_buf_type = ctx->v4i32;
>>> +
>>>         unsigned const_and_shader_buffers =
>>>                 add_arg(fninfo, ARG_SGPR,
>>> -                       si_const_array(ctx->v4i32,
>>> -                                      SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
>>> +                       si_const_array(const_shader_buf_type, 0));
>>
>>
>> Is this really necessary? The pointer is only cast to an int anyway.
>
> Not with the current patch, but it allows using normal loads if we
> ever wanted to switch to them. Not sure if PointerCast would work for
> those.
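For anyone following along, the distinction is just the element type of the
constant-address-space pointer argument. si_const_array presumably builds something
like the sketch below (the address-space id is an assumption, matching what AMDGPU
used for constant memory at the time):

    #include <llvm-c/Core.h>

    #define CONST_ADDR_SPACE_SKETCH 2 /* assumed AMDGPU constant address space */

    /* Sketch of what si_const_array(elem_type, num_elements) presumably returns:
     * a constant-address-space pointer to an array of elem_type. With ctx->f32
     * and num_elements == 0 the SGPR pair is typed as a pointer to floats, so the
     * #if 0 path could index it with ordinary invariant loads; with ctx->v4i32 it
     * is a pointer to v4i32 descriptors, as before this patch. */
    static LLVMTypeRef si_const_array_sketch(LLVMTypeRef elem_type, unsigned num_elements)
    {
            return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
                                   CONST_ADDR_SPACE_SKETCH);
    }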
Ah yes, for the #if'd out code. Makes sense, R-b for this patch as well.
Cheers,
Nicolai
>
> Marek
>
--
Learn how the world really is,
but never forget how it ought to be.