[Mesa-dev] [PATCH 10/10] radeonsi/gfx9: use CE RAM optimally

Thu May 18 09:38:44 UTC 2017

On 17.05.2017 21:38, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> On GFX9 with only 4K CE RAM, define the range of slots that will be
> allocated in CE RAM. All other slots will be uploaded directly. This will
> switch dynamically according to which slots are used by current shaders.
>
> GFX9 CE usage should now be similar to VI instead of being often disabled.
>
> Tested on VI by taking the GFX9 CE allocation codepath and setting
> num_ce_slots = 2 everywhere to get frequent switches between both modes.
> CE is still disabled on GFX9.
> ---
>  src/gallium/drivers/radeonsi/si_descriptors.c | 163 ++++++++++++++++++++------
>  src/gallium/drivers/radeonsi/si_state.h       |   7 ++
>  2 files changed, 134 insertions(+), 36 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index af174d5..5753ae0 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -88,40 +88,45 @@ static uint32_t null_texture_descriptor[8] = {
>
>  static uint32_t null_image_descriptor[8] = {
>  	0,
>  	0,
>  	0,
>  	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
>  	/* the rest must contain zeros, which is also used by the buffer
>  	 * descriptor */
>  };
>
> -static void si_init_descriptors(struct si_descriptors *desc,
> +static void si_init_descriptors(struct si_context *sctx,
> +				struct si_descriptors *desc,
>  				unsigned shader_userdata_index,
>  				unsigned element_dw_size,
>  				unsigned num_elements,
> +				unsigned first_ce_slot,
> +				unsigned num_ce_slots,
>  				unsigned *ce_offset)
>  {
>  	assert(num_elements <= sizeof(desc->dirty_mask)*8);
>
>  	desc->list = CALLOC(num_elements, element_dw_size * 4);
>  	desc->element_dw_size = element_dw_size;
>  	desc->num_elements = num_elements;
> +	desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
> +	desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
>  	desc->dirty_mask = u_bit_consecutive64(0, num_elements);
>  	desc->shader_userdata_offset = shader_userdata_index * 4;
>
> -	if (ce_offset) {
> +	if (desc->num_ce_slots) {
>  		desc->uses_ce = true;
>  		desc->ce_offset = *ce_offset;
>
>  		/* make sure that ce_offset stays 32 byte aligned */
> -		*ce_offset += element_dw_size * num_elements * 4;
> +		*ce_offset += element_dw_size * desc->num_ce_slots * 4;
>  	}
>  }
>
>  static void si_release_descriptors(struct si_descriptors *desc)
>  {
>  	r600_resource_reference(&desc->buffer, NULL);
>  	FREE(desc->list);
>  }
>
>  static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
> @@ -195,38 +200,43 @@ static bool si_upload_descriptors(struct si_context *sctx,
>  				  struct si_descriptors *desc,
>  				  struct r600_atom * atom)
>  {
>  	unsigned slot_size = desc->element_dw_size * 4;
>  	unsigned first_slot_offset = desc->first_active_slot * slot_size;
>  	unsigned upload_size = desc->num_active_slots * slot_size;
>
>  	if (!upload_size)
>  		return true;
>
> -	if (sctx->ce_ib && desc->uses_ce) {
> -		uint32_t const* list = (uint32_t const*)desc->list;
> +	if (desc->uses_ce) {
> +		const uint32_t *list = desc->list +
> +				       desc->first_ce_slot * desc->element_dw_size;
> +		uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
> +				u_bit_consecutive64(0, desc->num_ce_slots);
>
> -		while(desc->dirty_mask) {
> +
> +		while (mask) {
>  			int begin, count;
> -			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
> -						       &count);
> +			u_bit_scan_consecutive_range64(&mask, &begin, &count);
>
>  			begin *= desc->element_dw_size;
>  			count *= desc->element_dw_size;
>
>  			radeon_emit(sctx->ce_ib,
>  			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
>  			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
>  			radeon_emit_array(sctx->ce_ib, list + begin, count);
>  		}
>
> -		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
> +		if (!si_ce_upload(sctx,
> +				  desc->ce_offset +
> +				  (first_slot_offset - desc->first_ce_slot * slot_size),
>  				  upload_size, (unsigned*)&desc->buffer_offset,
>  				  &desc->buffer))
>  			return false;
>  	} else {
>  		uint32_t *ptr;
>
>  		u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
>  			       si_optimal_tcc_alignment(sctx, upload_size),
>  			       (unsigned*)&desc->buffer_offset,
>  			       (struct pipe_resource**)&desc->buffer,
> @@ -910,38 +920,41 @@ static void si_bind_sampler_states(struct pipe_context *ctx,
>  			continue;
>
>  		memcpy(desc->list + desc_slot * 16 + 12, sstates[i]->val, 4*4);
>  		desc->dirty_mask |= 1ull << desc_slot;
>  		sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
>  	}
>  }
>
>  /* BUFFER RESOURCES */
>
> -static void si_init_buffer_resources(struct si_buffer_resources *buffers,
> +static void si_init_buffer_resources(struct si_context *sctx,
> +				     struct si_buffer_resources *buffers,
>  				     struct si_descriptors *descs,
>  				     unsigned num_buffers,
> +				     unsigned first_ce_slot,
> +				     unsigned num_ce_slots,
>  				     unsigned shader_userdata_index,
>  				     enum radeon_bo_usage shader_usage,
>  				     enum radeon_bo_usage shader_usage_constbuf,
>  				     enum radeon_bo_priority priority,
>  				     enum radeon_bo_priority priority_constbuf,
>  				     unsigned *ce_offset)
>  {
>  	buffers->shader_usage = shader_usage;
>  	buffers->shader_usage_constbuf = shader_usage_constbuf;
>  	buffers->priority = priority;
>  	buffers->priority_constbuf = priority_constbuf;
>  	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
>
> -	si_init_descriptors(descs, shader_userdata_index, 4,
> -			    num_buffers, ce_offset);
> +	si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
> +			    first_ce_slot, num_ce_slots, ce_offset);
>  }
>
>  static void si_release_buffer_resources(struct si_buffer_resources *buffers,
>  					struct si_descriptors *descs)
>  {
>  	int i;
>
>  	for (i = 0; i < descs->num_elements; i++) {
>  		pipe_resource_reference(&buffers->buffers[i], NULL);
>  	}
> @@ -1983,82 +1996,140 @@ void si_emit_compute_shader_userdata(struct si_context *sctx)
>  	while (mask) {
>  		unsigned i = u_bit_scan(&mask);
>
>  		si_emit_shader_pointer(sctx, descs + i, base);
>  	}
>  	sctx->shader_pointers_dirty &= ~compute_mask;
>  }
>
>  /* INIT/DEINIT/UPLOAD */
>
> +/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
> + * to make CE RAM as useful as possible, this defines limits
> + * for the number of slots that can be in CE RAM on GFX9. If a shader
> + * is using more, descriptors will be uploaded to memory directly and
> + * CE won't be used.
> + *
> + * These numbers are based on shader-db.
> + */
> +static unsigned gfx9_max_samplers[SI_NUM_SHADERS] = {
> +	[PIPE_SHADER_VERTEX] = 0,
> +	[PIPE_SHADER_TESS_CTRL] = 0,
> +	[PIPE_SHADER_TESS_EVAL] = 1,
> +	[PIPE_SHADER_GEOMETRY] = 0,
> +	[PIPE_SHADER_FRAGMENT] = 24,
> +	[PIPE_SHADER_COMPUTE] = 16,
> +};
> +static unsigned gfx9_max_images[SI_NUM_SHADERS] = {
> +	/* these must be even due to slot alignment */
> +	[PIPE_SHADER_VERTEX] = 0,
> +	[PIPE_SHADER_TESS_CTRL] = 0,
> +	[PIPE_SHADER_TESS_EVAL] = 0,
> +	[PIPE_SHADER_GEOMETRY] = 0,
> +	[PIPE_SHADER_FRAGMENT] = 2,
> +	[PIPE_SHADER_COMPUTE] = 8,
> +};
> +static unsigned gfx9_max_const_buffers[SI_NUM_SHADERS] = {
> +	[PIPE_SHADER_VERTEX] = 9,
> +	[PIPE_SHADER_TESS_CTRL] = 3,
> +	[PIPE_SHADER_TESS_EVAL] = 5,
> +	[PIPE_SHADER_GEOMETRY] = 0,
> +	[PIPE_SHADER_FRAGMENT] = 8,
> +	[PIPE_SHADER_COMPUTE] = 6,
> +};
> +static unsigned gfx9_max_shader_buffers[SI_NUM_SHADERS] = {
> +	[PIPE_SHADER_VERTEX] = 0,
> +	[PIPE_SHADER_TESS_CTRL] = 0,
> +	[PIPE_SHADER_TESS_EVAL] = 0,
> +	[PIPE_SHADER_GEOMETRY] = 0,
> +	[PIPE_SHADER_FRAGMENT] = 12,
> +	[PIPE_SHADER_COMPUTE] = 13,
> +};

Please rename these to gfx9_max_ce_*

Apart from that:

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> +
>  void si_init_all_descriptors(struct si_context *sctx)
>  {
>  	int i;
>  	unsigned ce_offset = 0;
>
>  	STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>  	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>
>  	for (i = 0; i < SI_NUM_SHADERS; i++) {
> -		bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
> -				i == PIPE_SHADER_TESS_CTRL;
> -		bool gfx9_gs = sctx->b.chip_class == GFX9 &&
> -			       i == PIPE_SHADER_GEOMETRY;
> -		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
> -		 * Rarely used descriptors don't use CE RAM.
> -		 */
> -		bool big_ce = sctx->b.chip_class <= VI;
> -		bool const_and_shaderbufs_use_ce = big_ce ||
> -						   i == PIPE_SHADER_VERTEX ||
> -						   i == PIPE_SHADER_FRAGMENT;
> -		bool samplers_and_images_use_ce = big_ce ||
> -						  i == PIPE_SHADER_FRAGMENT;
> -
> -		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
> +		bool gfx9_tcs = false;
> +		bool gfx9_gs = false;
> +		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
> +		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
> +
> +		unsigned first_sampler_ce_slot = 0;
> +		unsigned num_sampler_ce_slots = num_sampler_slots;
> +
> +		unsigned first_buffer_ce_slot = 0;
> +		unsigned num_buffer_ce_slots = num_buffer_slots;
> +
> +		/* Adjust CE slot ranges based on GFX9 CE RAM limits. */
> +		if (sctx->b.chip_class >= GFX9) {
> +			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
> +			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
> +
> +			first_sampler_ce_slot =
> +				si_get_image_slot(gfx9_max_images[i] - 1) / 2;
> +			num_sampler_ce_slots = gfx9_max_images[i] / 2 +
> +					       gfx9_max_samplers[i];
> +
> +			first_buffer_ce_slot =
> +				si_get_shaderbuf_slot(gfx9_max_shader_buffers[i] - 1);
> +			num_buffer_ce_slots = gfx9_max_shader_buffers[i] +
> +					      gfx9_max_const_buffers[i];
> +		}
> +
> +		si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
>  					 si_const_and_shader_buffer_descriptors(sctx, i),
> -					 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
> +					 num_buffer_slots,
> +					 first_buffer_ce_slot, num_buffer_ce_slots,
>  					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
>  					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
>  						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
>  					 RADEON_USAGE_READWRITE,
>  					 RADEON_USAGE_READ,
>  					 RADEON_PRIO_SHADER_RW_BUFFER,
>  					 RADEON_PRIO_CONST_BUFFER,
> -					 const_and_shaderbufs_use_ce ? &ce_offset : NULL);
> +					 &ce_offset);
>
>  		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
> -		si_init_descriptors(desc,
> +		si_init_descriptors(sctx, desc,
>  				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
>  				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
>  					      SI_SGPR_SAMPLERS_AND_IMAGES,
> -				    16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS,
> -				    samplers_and_images_use_ce ? &ce_offset : NULL);
> +				    16, num_sampler_slots,
> +				    first_sampler_ce_slot, num_sampler_ce_slots,
> +				    &ce_offset);
>
>  		int j;
>  		for (j = 0; j < SI_NUM_IMAGES; j++)
>  			memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
>  		for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
>  			memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
>  	}
>
> -	si_init_buffer_resources(&sctx->rw_buffers,
> +	si_init_buffer_resources(sctx, &sctx->rw_buffers,
>  				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
> -				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
> +				 SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
> +				 SI_SGPR_RW_BUFFERS,
>  				 /* The second set of usage/priority is used by
>  				  * const buffers in RW buffer slots. */
>  				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
>  				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
>  				 &ce_offset);
>  	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
>
> -	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
> -			    4, SI_NUM_VERTEX_BUFFERS, NULL);
> +	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
> +			    4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
>
>  	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
>  	sctx->total_ce_ram_allocated = ce_offset;
>
>  	if (sctx->b.chip_class >= GFX9)
>  		assert(ce_offset <= 4096);
>  	else
>  		assert(ce_offset <= 32768);
>
>  	/* Set pipe_context functions. */
> @@ -2186,20 +2257,40 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
>
>  	int first, count;
>  	u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
>  	assert(new_active_mask == 0);
>
>  	/* Upload/dump descriptors if slots are being enabled. */
>  	if (first < desc->first_active_slot ||
>  	    first + count > desc->first_active_slot + desc->num_active_slots)
>  		sctx->descriptors_dirty |= 1u << desc_idx;
>
> +	/* Enable or disable CE for this descriptor array. */
> +	bool used_ce = desc->uses_ce;
> +	desc->uses_ce = desc->first_ce_slot <= first &&
> +			desc->first_ce_slot + desc->num_ce_slots >= first + count;
> +
> +	if (desc->uses_ce != used_ce) {
> +		/* Upload or dump descriptors if we're disabling or enabling CE,
> +		 * respectively. */
> +		sctx->descriptors_dirty |= 1u << desc_idx;
> +
> +		/* If we're enabling CE, re-upload all descriptors to CE RAM.
> +		 * When CE was disabled, uploads to CE RAM stopped.
> +		 */
> +		if (desc->uses_ce) {
> +			desc->dirty_mask |=
> +				u_bit_consecutive64(desc->first_ce_slot,
> +						    desc->num_ce_slots);
> +		}
> +	}
> +
>  	desc->first_active_slot = first;
>  	desc->num_active_slots = count;
>  }
>
>  void si_set_active_descriptors_for_shader(struct si_context *sctx,
>  					  struct si_shader_selector *sel)
>  {
>  	if (!sel)
>  		return;
>
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index dfabaa3..275f830 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -221,20 +221,27 @@ struct si_descriptors {
>  	/* The maximum number of descriptors. */
>  	unsigned num_elements;
>
>  	/* The buffer where the descriptors have been uploaded. */
>  	struct r600_resource *buffer;
>  	int buffer_offset; /* can be negative if not using lower slots */
>
>  	/* Offset in CE RAM */
>  	unsigned ce_offset;
>
> +	/* Slots allocated in CE RAM. If we get active slots outside of this
> +	 * range, direct uploads to memory will be used instead. This basically
> +	 * governs switching between onchip (CE) and offchip (upload) modes.
> +	 */
> +	unsigned first_ce_slot;
> +	unsigned num_ce_slots;
> +
>  	/* Slots that are used by currently-bound shaders.
>  	 * With CE: It determines which slots are dumped to L2.
>  	 *          It doesn't skip uploads to CE RAM.
>  	 * Without CE: It determines which slots are uploaded.
>  	 */
>  	unsigned first_active_slot;
>  	unsigned num_active_slots;
>
>  	/* Slots that have been changed and need to be uploaded. */
>  	uint64_t dirty_mask;
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.