[Mesa-dev] [PATCH 16/16] radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer

Tue Oct 17 12:25:55 UTC 2017

On 13.10.2017 14:04, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> SI_SGPR_CONST_AND_SHADER_BUFFERS now contains the pointer to const buffer 0
> if there is no other buffer there.
> 
> Benefits:
> - there is no constbuf descriptor upload and shader load
> 
> It's assumed that all constant addresses are within bounds. Non-constant
> addresses are clamped against the last declared CONST variable.
> This only works if the state tracker ensures the bound constant buffer
> matches what the shader needs.
> 
> Once we get 32-bit pointers, we can only do this for user constant buffers
> where the driver is in charge of the upload so that it can guarantee a 32-bit
> address.
> 
> The real performance benefit might not be measurable.
> 
> These apps get 100% theoretical benefit in all shaders (except where noted):
> - antichamber
> - barman arkham origins
> - borderlands 2
> - borderlands pre-sequel
> - brutal legend
> - civilization BE
> - CS:GO
> - deadcore
> - dota 2 -- most shaders
> - europa universalis
> - grid autosport -- most shaders
> - left 4 dead 2
> - legend of grimrock
> - life is strange
> - payday 2
> - portal
> - rocket league
> - serious sam 3 bfe
> - talos principle
> - team fortress 2
> - thea
> - unigine heaven
> - unigine valley -- also sanctuary and tropics
> - wasteland 2
> - xcom: enemy unknown & enemy within
> - tesseract
> - unity (engine)
> 
> Changed stats only:
>      SGPRS: 2059998 -> 2086238 (1.27 %)
>      VGPRS: 1626888 -> 1626904 (0.00 %)
>      Spilled SGPRs: 7902 -> 7865 (-0.47 %)
>      Code Size: 60924520 -> 60982660 (0.10 %) bytes
>      Max Waves: 374539 -> 374526 (-0.00 %)
> ---
>   src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++++++--
>   src/gallium/drivers/radeonsi/si_shader.c      | 72 +++++++++++++++++++++++----
>   src/gallium/drivers/radeonsi/si_shader.h      |  2 +-
>   src/gallium/drivers/radeonsi/si_state.h       |  3 ++
>   4 files changed, 87 insertions(+), 13 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 0c1fca8..da6efa8 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -119,20 +119,21 @@ static void si_init_descriptor_list(uint32_t *desc_list,
>   
>   static void si_init_descriptors(struct si_descriptors *desc,
>   				unsigned shader_userdata_index,
>   				unsigned element_dw_size,
>   				unsigned num_elements)
>   {
>   	desc->list = CALLOC(num_elements, element_dw_size * 4);
>   	desc->element_dw_size = element_dw_size;
>   	desc->num_elements = num_elements;
>   	desc->shader_userdata_offset = shader_userdata_index * 4;
> +	desc->slot_index_to_bind_directly = -1;
>   }
>   
>   static void si_release_descriptors(struct si_descriptors *desc)
>   {
>   	r600_resource_reference(&desc->buffer, NULL);
>   	FREE(desc->list);
>   }
>   
>   static bool si_upload_descriptors(struct si_context *sctx,
>   				  struct si_descriptors *desc)
> @@ -141,20 +142,34 @@ static bool si_upload_descriptors(struct si_context *sctx,
>   	unsigned first_slot_offset = desc->first_active_slot * slot_size;
>   	unsigned upload_size = desc->num_active_slots * slot_size;
>   
>   	/* Skip the upload if no shader is using the descriptors. dirty_mask
>   	 * will stay dirty and the descriptors will be uploaded when there is
>   	 * a shader using them.
>   	 */
>   	if (!upload_size)
>   		return true;
>   
> +	/* If there is just one active descriptor, bind it directly. */
> +	if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
> +	    desc->num_active_slots == 1) {
> +		uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
> +						   desc->element_dw_size];
> +
> +		/* The buffer is already in the buffer list. */
> +		r600_resource_reference(&desc->buffer, NULL);
> +		desc->gpu_list = NULL;
> +		desc->gpu_address = si_desc_extract_buffer_address(descriptor);
> +		si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
> +		return true;
> +	}
> +
>   	uint32_t *ptr;
>   	int buffer_offset;
>   	u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
>   		       si_optimal_tcc_alignment(sctx, upload_size),
>   		       (unsigned*)&buffer_offset,
>   		       (struct pipe_resource**)&desc->buffer,
>   		       (void**)&ptr);
>   	if (!desc->buffer) {
>   		desc->gpu_address = 0;
>   		return false; /* skip the draw call */
> @@ -2524,38 +2539,40 @@ void si_init_all_descriptors(struct si_context *sctx)
>   	int i;
>   
>   	STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>   	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>   
>   	for (i = 0; i < SI_NUM_SHADERS; i++) {
>   		bool gfx9_tcs = false;
>   		bool gfx9_gs = false;
>   		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
>   		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
> +		struct si_descriptors *desc;
>   
>   		if (sctx->b.chip_class >= GFX9) {
>   			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
>   			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
>   		}
>   
> -		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
> -					 si_const_and_shader_buffer_descriptors(sctx, i),
> +		desc = si_const_and_shader_buffer_descriptors(sctx, i);
> +		si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
>   					 num_buffer_slots,
>   					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
>   					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
>   						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
>   					 RADEON_USAGE_READWRITE,
>   					 RADEON_USAGE_READ,
>   					 RADEON_PRIO_SHADER_RW_BUFFER,
>   					 RADEON_PRIO_CONST_BUFFER);
> +		desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
>   
> -		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
> +		desc = si_sampler_and_image_descriptors(sctx, i);
>   		si_init_descriptors(desc,
>   				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
>   				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
>   					      SI_SGPR_SAMPLERS_AND_IMAGES,
>   				    16, num_sampler_slots);
>   
>   		int j;
>   		for (j = 0; j < SI_NUM_IMAGES; j++)
>   			memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
>   		for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index f83f45c..ed60f35 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -1966,20 +1966,21 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
>   	return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
>   }
>   
>   static LLVMValueRef fetch_constant(
>   	struct lp_build_tgsi_context *bld_base,
>   	const struct tgsi_full_src_register *reg,
>   	enum tgsi_opcode_type type,
>   	unsigned swizzle)
>   {
>   	struct si_shader_context *ctx = si_shader_context(bld_base);
> +	struct si_shader_selector *sel = ctx->shader->selector;
>   	const struct tgsi_ind_register *ireg = &reg->Indirect;
>   	unsigned buf, idx;
>   
>   	LLVMValueRef addr, bufp;
>   
>   	if (swizzle == LP_CHAN_ALL) {
>   		unsigned chan;
>   		LLVMValueRef values[4];
>   		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
>   			values[chan] = fetch_constant(bld_base, reg, type, chan);
> @@ -1989,42 +1990,87 @@ static LLVMValueRef fetch_constant(
>   
>   	/* Split 64-bit loads. */
>   	if (tgsi_type_is_64bit(type)) {
>   		LLVMValueRef lo, hi;
>   
>   		lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
>   		hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
>   		return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
>   	}
>   
> +	idx = reg->Register.Index * 4 + swizzle;
> +	if (reg->Register.Indirect) {
> +		addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
> +	} else {
> +		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
> +	}
> +
> +	/* Fast path when user data SGPRs point to constant buffer 0 directly. */
> +	if (sel->info.const_buffers_declared == 1 &&
> +	    sel->info.shader_buffers_declared == 0) {
> +		LLVMValueRef ptr =
> +			LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
> +
> +		/* This enables use of s_load_dword and flat_load_dword for const buffer 0
> +		 * loads, and up to x4 load opcode merging. However, it leads to horrible
> +		 * code reducing SIMD wave occupancy from 8 to 2 in many cases.
> +		 *
> +		 * Using s_buffer_load_dword (x1) seems to be the best option right now.
> +		 */
> +#if 0 /* keep this codepath disabled */
> +		if (!reg->Register.Indirect) {
> +			addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
> +			LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
> +			return bitcast(bld_base, type, result);
> +		}
> +#endif
> +
> +		/* Do the bounds checking with a descriptor, because
> +		 * doing computation and manual bounds checking of 64-bit
> +		 * addresses generates horrible VALU code with very high
> +		 * VGPR usage and very low SIMD occupancy.
> +		 */
> +		ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
> +		ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
> +
> +		LLVMValueRef desc_elems[] = {
> +			LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
> +			LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
> +			LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
> +			LLVMConstInt(ctx->i32,
> +				S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
> +		};
> +		LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
> +		LLVMValueRef result = buffer_load_const(ctx, desc, addr);
> +		return bitcast(bld_base, type, result);
> +	}
> +
>   	assert(reg->Register.Dimension);
>   	buf = reg->Dimension.Index;
> -	idx = reg->Register.Index * 4 + swizzle;
>   
>   	if (reg->Dimension.Indirect) {
>   		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
>   		LLVMValueRef index;
>   		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
>   						      reg->Dimension.Index,
>   						      ctx->num_const_buffers);
>   		index = LLVMBuildAdd(ctx->ac.builder, index,
>   				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
>   		bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
>   	} else
>   		bufp = load_const_buffer_desc(ctx, buf);
>   
> -	if (reg->Register.Indirect) {
> -		addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
> -	} else {
> -		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
> -	}
> -
>   	return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
>   }
>   
>   /* Upper 16 bits must be zero. */
>   static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
>   					   LLVMValueRef val[2])
>   {
>   	return LLVMBuildOr(ctx->ac.builder, val[0],
>   			   LLVMBuildShl(ctx->ac.builder, val[1],
>   					LLVMConstInt(ctx->i32, 16, 0),
> @@ -4250,24 +4296,32 @@ static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
>   		 */
>   		max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
>   	}
>   	return max_work_group_size;
>   }
>   
>   static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
>   					    struct si_function_info *fninfo,
>   					    bool assign_params)
>   {
> +	LLVMTypeRef const_shader_buf_type;
> +
> +	if (ctx->shader->selector->info.const_buffers_declared == 1 &&
> +	    ctx->shader->selector->info.shader_buffers_declared == 0)
> +		const_shader_buf_type = ctx->f32;
> +	else
> +		const_shader_buf_type = ctx->v4i32;
> +
>   	unsigned const_and_shader_buffers =
>   		add_arg(fninfo, ARG_SGPR,
> -			si_const_array(ctx->v4i32,
> -				       SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
> +			si_const_array(const_shader_buf_type, 0));

Is this really necessary? The pointer is only cast to an int anyway.

I like this change on the whole, but I'm thinking it may be time to come 
up with a better way to manage userdata registers, since we're starting 
to pile up special cases now.

Cheers,
Nicolai

> +
>   	unsigned samplers_and_images =
>   		add_arg(fninfo, ARG_SGPR,
>   			si_const_array(ctx->v8i32,
>   				       SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
>   
>   	if (assign_params) {
>   		ctx->param_const_and_shader_buffers = const_and_shader_buffers;
>   		ctx->param_samplers_and_images = samplers_and_images;
>   	}
>   }
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index ba80f55..ebe956e 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -154,21 +154,21 @@ struct nir_shader;
>   
>   /* SGPR user data indices */
>   enum {
>   	/* GFX9 merged shaders have RW_BUFFERS among the first 8 system SGPRs,
>   	 * and these two are used for other purposes.
>   	 */
>   	SI_SGPR_RW_BUFFERS,  /* rings (& stream-out, VS only) */
>   	SI_SGPR_RW_BUFFERS_HI,
>   	SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
>   	SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI,
> -	SI_SGPR_CONST_AND_SHADER_BUFFERS,
> +	SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
>   	SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
>   	SI_SGPR_SAMPLERS_AND_IMAGES,
>   	SI_SGPR_SAMPLERS_AND_IMAGES_HI,
>   	SI_NUM_RESOURCE_SGPRS,
>   
>   	/* all VS variants */
>   	SI_SGPR_VERTEX_BUFFERS	= SI_NUM_RESOURCE_SGPRS,
>   	SI_SGPR_VERTEX_BUFFERS_HI,
>   	SI_SGPR_BASE_VERTEX,
>   	SI_SGPR_START_INSTANCE,
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index eb1901b..7eb0aa3c 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -272,20 +272,23 @@ struct si_descriptors {
>   	 * It determines which slots are uploaded.
>   	 */
>   	uint32_t first_active_slot;
>   	uint32_t num_active_slots;
>   
>   	/* The SGPR index where the 64-bit pointer to the descriptor array will
>   	 * be stored. */
>   	ubyte shader_userdata_offset;
>   	/* The size of one descriptor. */
>   	ubyte element_dw_size;
> +	/* If there is only one slot enabled, bind it directly instead of
> +	 * uploading descriptors. -1 if disabled. */
> +	signed char slot_index_to_bind_directly;
>   };
>   
>   struct si_buffer_resources {
>   	struct pipe_resource		**buffers; /* this has num_buffers elements */
>   
>   	enum radeon_bo_usage		shader_usage:4; /* READ, WRITE, or READWRITE */
>   	enum radeon_bo_usage		shader_usage_constbuf:4;
>   	enum radeon_bo_priority		priority:6;
>   	enum radeon_bo_priority		priority_constbuf:6;
>   
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.