[Mesa-dev] [PATCH 10/10] radeonsi/gfx9: use CE RAM optimally

Marek Olšák maraeo at gmail.com
Wed May 17 19:38:52 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

On GFX9, which has only 4K of CE RAM, define the range of slots that will
be allocated in CE RAM. All other slots will be uploaded directly. The
driver switches between the two modes dynamically, based on which slots
the currently-bound shaders use.

GFX9 CE usage should now be similar to VI instead of often being disabled.

Tested on VI by taking the GFX9 CE allocation codepath and setting
num_ce_slots = 2 everywhere to force frequent switches between the two
modes. CE is still disabled on GFX9.
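
The switch itself is the check added to si_set_active_descriptors: a
descriptor array keeps using CE only while the slot range used by the
bound shaders fits entirely inside its preallocated CE window, and falls
back to direct uploads otherwise. A minimal sketch of that check
(illustration only, not part of the patch; the helper name is made up,
the parameter names mirror struct si_descriptors):

	/* True if the active slot range [first_active, first_active +
	 * num_active) fits inside the CE window [first_ce_slot,
	 * first_ce_slot + num_ce_slots), i.e. the descriptors can stay
	 * in CE RAM. */
	static bool active_slots_fit_in_ce(unsigned first_ce_slot,
					   unsigned num_ce_slots,
					   unsigned first_active,
					   unsigned num_active)
	{
		return first_ce_slot <= first_active &&
		       first_active + num_active <=
		       first_ce_slot + num_ce_slots;
	}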
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 163 ++++++++++++++++++++------
 src/gallium/drivers/radeonsi/si_state.h       |   7 ++
 2 files changed, 134 insertions(+), 36 deletions(-)
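
Note (below the --- cut line, so not part of the commit message): with
the per-stage limits added in si_descriptors.c, the GFX9 CE RAM budget
works out as follows. Buffer slots are 4 dwords (16 bytes) and
sampler/image slots 16 dwords (64 bytes), so the tables sum to
56 * 16 = 896 bytes of const/shader buffer descriptors plus
46 * 64 = 2944 bytes of sampler/image descriptors, i.e. 3840 bytes,
leaving 256 bytes for the RW buffer slots under the 4096-byte limit
asserted in si_init_all_descriptors.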

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index af174d5..5753ae0 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -88,40 +88,45 @@ static uint32_t null_texture_descriptor[8] = {
 
 static uint32_t null_image_descriptor[8] = {
 	0,
 	0,
 	0,
 	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
 	/* the rest must contain zeros, which is also used by the buffer
 	 * descriptor */
 };
 
-static void si_init_descriptors(struct si_descriptors *desc,
+static void si_init_descriptors(struct si_context *sctx,
+				struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
 				unsigned num_elements,
+				unsigned first_ce_slot,
+				unsigned num_ce_slots,
 				unsigned *ce_offset)
 {
 	assert(num_elements <= sizeof(desc->dirty_mask)*8);
 
 	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
+	desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
+	desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
 	desc->dirty_mask = u_bit_consecutive64(0, num_elements);
 	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	if (ce_offset) {
+	if (desc->num_ce_slots) {
 		desc->uses_ce = true;
 		desc->ce_offset = *ce_offset;
 
 		/* make sure that ce_offset stays 32 byte aligned */
-		*ce_offset += element_dw_size * num_elements * 4;
+		*ce_offset += element_dw_size * desc->num_ce_slots * 4;
 	}
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
 	r600_resource_reference(&desc->buffer, NULL);
 	FREE(desc->list);
 }
 
 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
@@ -195,38 +200,43 @@ static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc,
 				  struct r600_atom * atom)
 {
 	unsigned slot_size = desc->element_dw_size * 4;
 	unsigned first_slot_offset = desc->first_active_slot * slot_size;
 	unsigned upload_size = desc->num_active_slots * slot_size;
 
 	if (!upload_size)
 		return true;
 
-	if (sctx->ce_ib && desc->uses_ce) {
-		uint32_t const* list = (uint32_t const*)desc->list;
+	if (desc->uses_ce) {
+		const uint32_t *list = desc->list +
+				       desc->first_ce_slot * desc->element_dw_size;
+		uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
+				u_bit_consecutive64(0, desc->num_ce_slots);
 
-		while(desc->dirty_mask) {
+
+		while (mask) {
 			int begin, count;
-			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
-						       &count);
+			u_bit_scan_consecutive_range64(&mask, &begin, &count);
 
 			begin *= desc->element_dw_size;
 			count *= desc->element_dw_size;
 
 			radeon_emit(sctx->ce_ib,
 			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
 			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
 			radeon_emit_array(sctx->ce_ib, list + begin, count);
 		}
 
-		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+		if (!si_ce_upload(sctx,
+				  desc->ce_offset +
+				  (first_slot_offset - desc->first_ce_slot * slot_size),
 				  upload_size, (unsigned*)&desc->buffer_offset,
 				  &desc->buffer))
 			return false;
 	} else {
 		uint32_t *ptr;
 
 		u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
 			       si_optimal_tcc_alignment(sctx, upload_size),
 			       (unsigned*)&desc->buffer_offset,
 			       (struct pipe_resource**)&desc->buffer,
@@ -910,38 +920,41 @@ static void si_bind_sampler_states(struct pipe_context *ctx,
 			continue;
 
 		memcpy(desc->list + desc_slot * 16 + 12, sstates[i]->val, 4*4);
 		desc->dirty_mask |= 1ull << desc_slot;
 		sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
 	}
 }
 
 /* BUFFER RESOURCES */
 
-static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_context *sctx,
+				     struct si_buffer_resources *buffers,
 				     struct si_descriptors *descs,
 				     unsigned num_buffers,
+				     unsigned first_ce_slot,
+				     unsigned num_ce_slots,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_usage shader_usage_constbuf,
 				     enum radeon_bo_priority priority,
 				     enum radeon_bo_priority priority_constbuf,
 				     unsigned *ce_offset)
 {
 	buffers->shader_usage = shader_usage;
 	buffers->shader_usage_constbuf = shader_usage_constbuf;
 	buffers->priority = priority;
 	buffers->priority_constbuf = priority_constbuf;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
-	si_init_descriptors(descs, shader_userdata_index, 4,
-			    num_buffers, ce_offset);
+	si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
+			    first_ce_slot, num_ce_slots, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
 					struct si_descriptors *descs)
 {
 	int i;
 
 	for (i = 0; i < descs->num_elements; i++) {
 		pipe_resource_reference(&buffers->buffers[i], NULL);
 	}
@@ -1983,82 +1996,140 @@ void si_emit_compute_shader_userdata(struct si_context *sctx)
 	while (mask) {
 		unsigned i = u_bit_scan(&mask);
 
 		si_emit_shader_pointer(sctx, descs + i, base);
 	}
 	sctx->shader_pointers_dirty &= ~compute_mask;
 }
 
 /* INIT/DEINIT/UPLOAD */
 
+/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
+ * to make CE RAM as useful as possible, this defines limits
+ * for the number of slots that can be in CE RAM on GFX9. If a shader
+ * is using more, descriptors will be uploaded to memory directly and
+ * CE won't be used.
+ *
+ * These numbers are based on shader-db.
+ */
+static unsigned gfx9_max_samplers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 1,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 24,
+	[PIPE_SHADER_COMPUTE] = 16,
+};
+static unsigned gfx9_max_images[SI_NUM_SHADERS] = {
+	/* these must be even due to slot alignment */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 2,
+	[PIPE_SHADER_COMPUTE] = 8,
+};
+static unsigned gfx9_max_const_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 9,
+	[PIPE_SHADER_TESS_CTRL] = 3,
+	[PIPE_SHADER_TESS_EVAL] = 5,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 8,
+	[PIPE_SHADER_COMPUTE] = 6,
+};
+static unsigned gfx9_max_shader_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 12,
+	[PIPE_SHADER_COMPUTE] = 13,
+};
+
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
 	unsigned ce_offset = 0;
 
 	STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
 	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
-				i == PIPE_SHADER_TESS_CTRL;
-		bool gfx9_gs = sctx->b.chip_class == GFX9 &&
-			       i == PIPE_SHADER_GEOMETRY;
-		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
-		 * Rarely used descriptors don't use CE RAM.
-		 */
-		bool big_ce = sctx->b.chip_class <= VI;
-		bool const_and_shaderbufs_use_ce = big_ce ||
-						   i == PIPE_SHADER_VERTEX ||
-						   i == PIPE_SHADER_FRAGMENT;
-		bool samplers_and_images_use_ce = big_ce ||
-						  i == PIPE_SHADER_FRAGMENT;
-
-		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
+		bool gfx9_tcs = false;
+		bool gfx9_gs = false;
+		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
+		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+
+		unsigned first_sampler_ce_slot = 0;
+		unsigned num_sampler_ce_slots = num_sampler_slots;
+
+		unsigned first_buffer_ce_slot = 0;
+		unsigned num_buffer_ce_slots = num_buffer_slots;
+
+		/* Adjust CE slot ranges based on GFX9 CE RAM limits. */
+		if (sctx->b.chip_class >= GFX9) {
+			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
+			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
+
+			first_sampler_ce_slot =
+				si_get_image_slot(gfx9_max_images[i] - 1) / 2;
+			num_sampler_ce_slots = gfx9_max_images[i] / 2 +
+					       gfx9_max_samplers[i];
+
+			first_buffer_ce_slot =
+				si_get_shaderbuf_slot(gfx9_max_shader_buffers[i] - 1);
+			num_buffer_ce_slots = gfx9_max_shader_buffers[i] +
+					      gfx9_max_const_buffers[i];
+		}
+
+		si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
 					 si_const_and_shader_buffer_descriptors(sctx, i),
-					 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
+					 num_buffer_slots,
+					 first_buffer_ce_slot, num_buffer_ce_slots,
 					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
 					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
 						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
 					 RADEON_USAGE_READWRITE,
 					 RADEON_USAGE_READ,
 					 RADEON_PRIO_SHADER_RW_BUFFER,
 					 RADEON_PRIO_CONST_BUFFER,
-					 const_and_shaderbufs_use_ce ? &ce_offset : NULL);
+					 &ce_offset);
 
 		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
-		si_init_descriptors(desc,
+		si_init_descriptors(sctx, desc,
 				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
 				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
 					      SI_SGPR_SAMPLERS_AND_IMAGES,
-				    16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS,
-				    samplers_and_images_use_ce ? &ce_offset : NULL);
+				    16, num_sampler_slots,
+				    first_sampler_ce_slot, num_sampler_ce_slots,
+				    &ce_offset);
 
 		int j;
 		for (j = 0; j < SI_NUM_IMAGES; j++)
 			memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
 		for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
 			memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
 	}
 
-	si_init_buffer_resources(&sctx->rw_buffers,
+	si_init_buffer_resources(sctx, &sctx->rw_buffers,
 				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
-				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
+				 SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
+				 SI_SGPR_RW_BUFFERS,
 				 /* The second set of usage/priority is used by
 				  * const buffers in RW buffer slots. */
 				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
 				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
 				 &ce_offset);
 	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
 
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL);
+	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
+			    4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
 
 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
 	sctx->total_ce_ram_allocated = ce_offset;
 
 	if (sctx->b.chip_class >= GFX9)
 		assert(ce_offset <= 4096);
 	else
 		assert(ce_offset <= 32768);
 
 	/* Set pipe_context functions. */
@@ -2186,20 +2257,40 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
 
 	int first, count;
 	u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
 	assert(new_active_mask == 0);
 
 	/* Upload/dump descriptors if slots are being enabled. */
 	if (first < desc->first_active_slot ||
 	    first + count > desc->first_active_slot + desc->num_active_slots)
 		sctx->descriptors_dirty |= 1u << desc_idx;
 
+	/* Enable or disable CE for this descriptor array. */
+	bool used_ce = desc->uses_ce;
+	desc->uses_ce = desc->first_ce_slot <= first &&
+			desc->first_ce_slot + desc->num_ce_slots >= first + count;
+
+	if (desc->uses_ce != used_ce) {
+		/* Upload or dump descriptors if we're disabling or enabling CE,
+		 * respectively. */
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+		/* If we're enabling CE, re-upload all descriptors to CE RAM.
+		 * When CE was disabled, uploads to CE RAM stopped.
+		 */
+		if (desc->uses_ce) {
+			desc->dirty_mask |=
+				u_bit_consecutive64(desc->first_ce_slot,
+						    desc->num_ce_slots);
+		}
+	}
+
 	desc->first_active_slot = first;
 	desc->num_active_slots = count;
 }
 
 void si_set_active_descriptors_for_shader(struct si_context *sctx,
 					  struct si_shader_selector *sel)
 {
 	if (!sel)
 		return;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index dfabaa3..275f830 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -221,20 +221,27 @@ struct si_descriptors {
 	/* The maximum number of descriptors. */
 	unsigned num_elements;
 
 	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
 	int buffer_offset; /* can be negative if not using lower slots */
 
 	/* Offset in CE RAM */
 	unsigned ce_offset;
 
+	/* Slots allocated in CE RAM. If we get active slots outside of this
+	 * range, direct uploads to memory will be used instead. This basically
+	 * governs switching between onchip (CE) and offchip (upload) modes.
+	 */
+	unsigned first_ce_slot;
+	unsigned num_ce_slots;
+
 	/* Slots that are used by currently-bound shaders.
 	 * With CE: It determines which slots are dumped to L2.
 	 *          It doesn't skip uploads to CE RAM.
 	 * Without CE: It determines which slots are uploaded.
 	 */
 	unsigned first_active_slot;
 	unsigned num_active_slots;
 
 	/* Slots that have been changed and need to be uploaded. */
 	uint64_t dirty_mask;
-- 
2.7.4


