[Mesa-dev] [PATCH 16/16] radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer
Nicolai Hähnle
nhaehnle at gmail.com
Tue Oct 17 12:25:55 UTC 2017
On 13.10.2017 14:04, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> SI_SGPR_CONST_AND_SHADER_BUFFERS now contains the pointer to const buffer 0
> if there is no other buffer there.
>
> Benefits:
> - there is no constbuf descriptor upload and shader load
>
> It's assumed that all constant addresses are within bounds. Non-constant
> addresses are clamped against the last declared CONST variable.
> This only works if the state tracker ensures the bound constant buffer
> matches what the shader needs.
>
> Once we get 32-bit pointers, we can only do this for user constant buffers
> where the driver is in charge of the upload so that it can guarantee a 32-bit
> address.
>
> The real performance benefit might not be measurable.
>
> These apps get 100% theoretical benefit in all shaders (except where noted):
> - antichamber
> - barman arkham origins
> - borderlands 2
> - borderlands pre-sequel
> - brutal legend
> - civilization BE
> - CS:GO
> - deadcore
> - dota 2 -- most shaders
> - europa universalis
> - grid autosport -- most shaders
> - left 4 dead 2
> - legend of grimrock
> - life is strange
> - payday 2
> - portal
> - rocket league
> - serious sam 3 bfe
> - talos principle
> - team fortress 2
> - thea
> - unigine heaven
> - unigine valley -- also sanctuary and tropics
> - wasteland 2
> - xcom: enemy unknown & enemy within
> - tesseract
> - unity (engine)
>
> Changed stats only:
> SGPRS: 2059998 -> 2086238 (1.27 %)
> VGPRS: 1626888 -> 1626904 (0.00 %)
> Spilled SGPRs: 7902 -> 7865 (-0.47 %)
> Code Size: 60924520 -> 60982660 (0.10 %) bytes
> Max Waves: 374539 -> 374526 (-0.00 %)
> ---
> src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++++++--
> src/gallium/drivers/radeonsi/si_shader.c | 72 +++++++++++++++++++++++----
> src/gallium/drivers/radeonsi/si_shader.h | 2 +-
> src/gallium/drivers/radeonsi/si_state.h | 3 ++
> 4 files changed, 87 insertions(+), 13 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 0c1fca8..da6efa8 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -119,20 +119,21 @@ static void si_init_descriptor_list(uint32_t *desc_list,
>
> static void si_init_descriptors(struct si_descriptors *desc,
> unsigned shader_userdata_index,
> unsigned element_dw_size,
> unsigned num_elements)
> {
> desc->list = CALLOC(num_elements, element_dw_size * 4);
> desc->element_dw_size = element_dw_size;
> desc->num_elements = num_elements;
> desc->shader_userdata_offset = shader_userdata_index * 4;
> + desc->slot_index_to_bind_directly = -1;
> }
>
> static void si_release_descriptors(struct si_descriptors *desc)
> {
> r600_resource_reference(&desc->buffer, NULL);
> FREE(desc->list);
> }
>
> static bool si_upload_descriptors(struct si_context *sctx,
> struct si_descriptors *desc)
> @@ -141,20 +142,34 @@ static bool si_upload_descriptors(struct si_context *sctx,
> unsigned first_slot_offset = desc->first_active_slot * slot_size;
> unsigned upload_size = desc->num_active_slots * slot_size;
>
> /* Skip the upload if no shader is using the descriptors. dirty_mask
> * will stay dirty and the descriptors will be uploaded when there is
> * a shader using them.
> */
> if (!upload_size)
> return true;
>
> + /* If there is just one active descriptor, bind it directly. */
> + if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
> + desc->num_active_slots == 1) {
> + uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
> + desc->element_dw_size];
> +
> + /* The buffer is already in the buffer list. */
> + r600_resource_reference(&desc->buffer, NULL);
> + desc->gpu_list = NULL;
> + desc->gpu_address = si_desc_extract_buffer_address(descriptor);
> + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
> + return true;
> + }
> +
> uint32_t *ptr;
> int buffer_offset;
> u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
> si_optimal_tcc_alignment(sctx, upload_size),
> (unsigned*)&buffer_offset,
> (struct pipe_resource**)&desc->buffer,
> (void**)&ptr);
> if (!desc->buffer) {
> desc->gpu_address = 0;
> return false; /* skip the draw call */
> @@ -2524,38 +2539,40 @@ void si_init_all_descriptors(struct si_context *sctx)
> int i;
>
> STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
> STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>
> for (i = 0; i < SI_NUM_SHADERS; i++) {
> bool gfx9_tcs = false;
> bool gfx9_gs = false;
> unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
> unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
> + struct si_descriptors *desc;
>
> if (sctx->b.chip_class >= GFX9) {
> gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
> gfx9_gs = i == PIPE_SHADER_GEOMETRY;
> }
>
> - si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
> - si_const_and_shader_buffer_descriptors(sctx, i),
> + desc = si_const_and_shader_buffer_descriptors(sctx, i);
> + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
> num_buffer_slots,
> gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
> gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
> SI_SGPR_CONST_AND_SHADER_BUFFERS,
> RADEON_USAGE_READWRITE,
> RADEON_USAGE_READ,
> RADEON_PRIO_SHADER_RW_BUFFER,
> RADEON_PRIO_CONST_BUFFER);
> + desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
>
> - struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
> + desc = si_sampler_and_image_descriptors(sctx, i);
> si_init_descriptors(desc,
> gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
> gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
> SI_SGPR_SAMPLERS_AND_IMAGES,
> 16, num_sampler_slots);
>
> int j;
> for (j = 0; j < SI_NUM_IMAGES; j++)
> memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
> for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index f83f45c..ed60f35 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -1966,20 +1966,21 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
> return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
> }
>
> static LLVMValueRef fetch_constant(
> struct lp_build_tgsi_context *bld_base,
> const struct tgsi_full_src_register *reg,
> enum tgsi_opcode_type type,
> unsigned swizzle)
> {
> struct si_shader_context *ctx = si_shader_context(bld_base);
> + struct si_shader_selector *sel = ctx->shader->selector;
> const struct tgsi_ind_register *ireg = ®->Indirect;
> unsigned buf, idx;
>
> LLVMValueRef addr, bufp;
>
> if (swizzle == LP_CHAN_ALL) {
> unsigned chan;
> LLVMValueRef values[4];
> for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
> values[chan] = fetch_constant(bld_base, reg, type, chan);
> @@ -1989,42 +1990,87 @@ static LLVMValueRef fetch_constant(
>
> /* Split 64-bit loads. */
> if (tgsi_type_is_64bit(type)) {
> LLVMValueRef lo, hi;
>
> lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
> hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
> return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
> }
>
> + idx = reg->Register.Index * 4 + swizzle;
> + if (reg->Register.Indirect) {
> + addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
> + } else {
> + addr = LLVMConstInt(ctx->i32, idx * 4, 0);
> + }
> +
> + /* Fast path when user data SGPRs point to constant buffer 0 directly. */
> + if (sel->info.const_buffers_declared == 1 &&
> + sel->info.shader_buffers_declared == 0) {
> + LLVMValueRef ptr =
> + LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
> +
> + /* This enables use of s_load_dword and flat_load_dword for const buffer 0
> + * loads, and up to x4 load opcode merging. However, it leads to horrible
> + * code reducing SIMD wave occupancy from 8 to 2 in many cases.
> + *
> + * Using s_buffer_load_dword (x1) seems to be the best option right now.
> + */
> +#if 0 /* keep this codepath disabled */
> + if (!reg->Register.Indirect) {
> + addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
> + LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
> + return bitcast(bld_base, type, result);
> + }
> +#endif
> +
> + /* Do the bounds checking with a descriptor, because
> + * doing computation and manual bounds checking of 64-bit
> + * addresses generates horrible VALU code with very high
> + * VGPR usage and very low SIMD occupancy.
> + */
> + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
> + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
> +
> + LLVMValueRef desc_elems[] = {
> + LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
> + LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
> + LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
> + LLVMConstInt(ctx->i32,
> + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
> + };
> + LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
> + LLVMValueRef result = buffer_load_const(ctx, desc, addr);
> + return bitcast(bld_base, type, result);
> + }
> +
> assert(reg->Register.Dimension);
> buf = reg->Dimension.Index;
> - idx = reg->Register.Index * 4 + swizzle;
>
> if (reg->Dimension.Indirect) {
> LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
> LLVMValueRef index;
> index = si_get_bounded_indirect_index(ctx, ®->DimIndirect,
> reg->Dimension.Index,
> ctx->num_const_buffers);
> index = LLVMBuildAdd(ctx->ac.builder, index,
> LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
> bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
> } else
> bufp = load_const_buffer_desc(ctx, buf);
>
> - if (reg->Register.Indirect) {
> - addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
> - } else {
> - addr = LLVMConstInt(ctx->i32, idx * 4, 0);
> - }
> -
> return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
> }
>
> /* Upper 16 bits must be zero. */
> static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
> LLVMValueRef val[2])
> {
> return LLVMBuildOr(ctx->ac.builder, val[0],
> LLVMBuildShl(ctx->ac.builder, val[1],
> LLVMConstInt(ctx->i32, 16, 0),
> @@ -4250,24 +4296,32 @@ static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
> */
> max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
> }
> return max_work_group_size;
> }
>
> static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
> struct si_function_info *fninfo,
> bool assign_params)
> {
> + LLVMTypeRef const_shader_buf_type;
> +
> + if (ctx->shader->selector->info.const_buffers_declared == 1 &&
> + ctx->shader->selector->info.shader_buffers_declared == 0)
> + const_shader_buf_type = ctx->f32;
> + else
> + const_shader_buf_type = ctx->v4i32;
> +
> unsigned const_and_shader_buffers =
> add_arg(fninfo, ARG_SGPR,
> - si_const_array(ctx->v4i32,
> - SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
> + si_const_array(const_shader_buf_type, 0));
Is this really necessary? The pointer is only cast to an int anyway.
I like this change on the whole, but I'm thinking it may be time to come
up with a better way to manage userdata registers, since we're starting
to pile up special cases now.
Cheers,
Nicolai
> +
> unsigned samplers_and_images =
> add_arg(fninfo, ARG_SGPR,
> si_const_array(ctx->v8i32,
> SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
>
> if (assign_params) {
> ctx->param_const_and_shader_buffers = const_and_shader_buffers;
> ctx->param_samplers_and_images = samplers_and_images;
> }
> }
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index ba80f55..ebe956e 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -154,21 +154,21 @@ struct nir_shader;
>
> /* SGPR user data indices */
> enum {
> /* GFX9 merged shaders have RW_BUFFERS among the first 8 system SGPRs,
> * and these two are used for other purposes.
> */
> SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */
> SI_SGPR_RW_BUFFERS_HI,
> SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
> SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI,
> - SI_SGPR_CONST_AND_SHADER_BUFFERS,
> + SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
> SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
> SI_SGPR_SAMPLERS_AND_IMAGES,
> SI_SGPR_SAMPLERS_AND_IMAGES_HI,
> SI_NUM_RESOURCE_SGPRS,
>
> /* all VS variants */
> SI_SGPR_VERTEX_BUFFERS = SI_NUM_RESOURCE_SGPRS,
> SI_SGPR_VERTEX_BUFFERS_HI,
> SI_SGPR_BASE_VERTEX,
> SI_SGPR_START_INSTANCE,
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index eb1901b..7eb0aa3c 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -272,20 +272,23 @@ struct si_descriptors {
> * It determines which slots are uploaded.
> */
> uint32_t first_active_slot;
> uint32_t num_active_slots;
>
> /* The SGPR index where the 64-bit pointer to the descriptor array will
> * be stored. */
> ubyte shader_userdata_offset;
> /* The size of one descriptor. */
> ubyte element_dw_size;
> + /* If there is only one slot enabled, bind it directly instead of
> + * uploading descriptors. -1 if disabled. */
> + signed char slot_index_to_bind_directly;
> };
>
> struct si_buffer_resources {
> struct pipe_resource **buffers; /* this has num_buffers elements */
>
> enum radeon_bo_usage shader_usage:4; /* READ, WRITE, or READWRITE */
> enum radeon_bo_usage shader_usage_constbuf:4;
> enum radeon_bo_priority priority:6;
> enum radeon_bo_priority priority_constbuf:6;
>
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list