Mesa (master): radeonsi/gfx9: use CE RAM optimally

Thu May 18 20:15:17 UTC 2017

Module: Mesa
Branch: master
Commit: 807e1d257707a37850a150d12a1d6878c3b13d5d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=807e1d257707a37850a150d12a1d6878c3b13d5d

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed May 17 02:49:40 2017 +0200

radeonsi/gfx9: use CE RAM optimally

On GFX9 with only 4KB of CE RAM, define the range of slots that will be
allocated in CE RAM. All other slots will be uploaded directly. Each
descriptor array switches dynamically between CE and direct uploads
according to which slots are used by the current shaders.

GFX9 CE usage should now be similar to VI instead of being often disabled.

Tested on VI by taking the GFX9 CE allocation codepath and setting
num_ce_slots = 2 everywhere to get frequent switches between both modes.
CE is still disabled on GFX9.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

---

 src/gallium/drivers/radeonsi/si_descriptors.c | 163 ++++++++++++++++++++------
 src/gallium/drivers/radeonsi/si_state.h       |   7 ++
 2 files changed, 134 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5086a33969..61eb2f10be 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -95,10 +95,13 @@ static uint32_t null_image_descriptor[8] = {
 	 * descriptor */
 };
 
-static void si_init_descriptors(struct si_descriptors *desc,
+static void si_init_descriptors(struct si_context *sctx,
+				struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
 				unsigned num_elements,
+				unsigned first_ce_slot,
+				unsigned num_ce_slots,
 				unsigned *ce_offset)
 {
 	assert(num_elements <= sizeof(desc->dirty_mask)*8);
@@ -106,14 +109,16 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
+	desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
+	desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
 	desc->dirty_mask = u_bit_consecutive64(0, num_elements);
 	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	if (ce_offset) {
+	if (desc->num_ce_slots) {
 		desc->uses_ce = true;
 		desc->ce_offset = *ce_offset;
 
-		*ce_offset += element_dw_size * num_elements * 4;
+		*ce_offset += element_dw_size * desc->num_ce_slots * 4;
 	}
 }
 
@@ -205,13 +210,16 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	if (!upload_size)
 		return true;
 
-	if (sctx->ce_ib && desc->uses_ce) {
-		uint32_t const* list = (uint32_t const*)desc->list;
+	if (desc->uses_ce) {
+		const uint32_t *list = desc->list +
+				       desc->first_ce_slot * desc->element_dw_size;
+		uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
+				u_bit_consecutive64(0, desc->num_ce_slots);
 
-		while(desc->dirty_mask) {
+
+		while (mask) {
 			int begin, count;
-			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
-						       &count);
+			u_bit_scan_consecutive_range64(&mask, &begin, &count);
 
 			begin *= desc->element_dw_size;
 			count *= desc->element_dw_size;
@@ -222,7 +230,9 @@ static bool si_upload_descriptors(struct si_context *sctx,
 			radeon_emit_array(sctx->ce_ib, list + begin, count);
 		}
 
-		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+		if (!si_ce_upload(sctx,
+				  desc->ce_offset +
+				  (first_slot_offset - desc->first_ce_slot * slot_size),
 				  upload_size, (unsigned*)&desc->buffer_offset,
 				  &desc->buffer))
 			return false;
@@ -920,9 +930,12 @@ static void si_bind_sampler_states(struct pipe_context *ctx,
 
 /* BUFFER RESOURCES */
 
-static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_context *sctx,
+				     struct si_buffer_resources *buffers,
 				     struct si_descriptors *descs,
 				     unsigned num_buffers,
+				     unsigned first_ce_slot,
+				     unsigned num_ce_slots,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_usage shader_usage_constbuf,
@@ -936,8 +949,8 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 	buffers->priority_constbuf = priority_constbuf;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
-	si_init_descriptors(descs, shader_userdata_index, 4,
-			    num_buffers, ce_offset);
+	si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
+			    first_ce_slot, num_ce_slots, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
@@ -1994,6 +2007,48 @@ void si_emit_compute_shader_userdata(struct si_context *sctx)
 
 /* INIT/DEINIT/UPLOAD */
 
+/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
+ * to make CE RAM as useful as possible, this defines limits
+ * for the number of slots that can be in CE RAM on GFX9. If a shader
+ * is using more, descriptors will be uploaded to memory directly and
+ * CE won't be used.
+ *
+ * These numbers are based on shader-db.
+ */
+static unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 1,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 24,
+	[PIPE_SHADER_COMPUTE] = 16,
+};
+static unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = {
+	/* these must be even due to slot alignment */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 2,
+	[PIPE_SHADER_COMPUTE] = 8,
+};
+static unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 9,
+	[PIPE_SHADER_TESS_CTRL] = 3,
+	[PIPE_SHADER_TESS_EVAL] = 5,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 8,
+	[PIPE_SHADER_COMPUTE] = 6,
+};
+static unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 12,
+	[PIPE_SHADER_COMPUTE] = 13,
+};
+
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
@@ -2003,23 +2058,37 @@ void si_init_all_descriptors(struct si_context *sctx)
 	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
-				i == PIPE_SHADER_TESS_CTRL;
-		bool gfx9_gs = sctx->b.chip_class == GFX9 &&
-			       i == PIPE_SHADER_GEOMETRY;
-		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
-		 * Rarely used descriptors don't use CE RAM.
-		 */
-		bool big_ce = sctx->b.chip_class <= VI;
-		bool const_and_shaderbufs_use_ce = big_ce ||
-						   i == PIPE_SHADER_VERTEX ||
-						   i == PIPE_SHADER_FRAGMENT;
-		bool samplers_and_images_use_ce = big_ce ||
-						  i == PIPE_SHADER_FRAGMENT;
-
-		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
+		bool gfx9_tcs = false;
+		bool gfx9_gs = false;
+		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
+		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+
+		unsigned first_sampler_ce_slot = 0;
+		unsigned num_sampler_ce_slots = num_sampler_slots;
+
+		unsigned first_buffer_ce_slot = 0;
+		unsigned num_buffer_ce_slots = num_buffer_slots;
+
+		/* Adjust CE slot ranges based on GFX9 CE RAM limits. */
+		if (sctx->b.chip_class >= GFX9) {
+			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
+			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
+
+			first_sampler_ce_slot =
+				si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2;
+			num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 +
+					       gfx9_max_ce_samplers[i];
+
+			first_buffer_ce_slot =
+				si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1);
+			num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] +
+					      gfx9_max_ce_const_buffers[i];
+		}
+
+		si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
 					 si_const_and_shader_buffer_descriptors(sctx, i),
-					 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
+					 num_buffer_slots,
+					 first_buffer_ce_slot, num_buffer_ce_slots,
 					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
 					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
 						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
@@ -2027,15 +2096,16 @@ void si_init_all_descriptors(struct si_context *sctx)
 					 RADEON_USAGE_READ,
 					 RADEON_PRIO_SHADER_RW_BUFFER,
 					 RADEON_PRIO_CONST_BUFFER,
-					 const_and_shaderbufs_use_ce ? &ce_offset : NULL);
+					 &ce_offset);
 
 		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
-		si_init_descriptors(desc,
+		si_init_descriptors(sctx, desc,
 				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
 				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
 					      SI_SGPR_SAMPLERS_AND_IMAGES,
-				    16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS,
-				    samplers_and_images_use_ce ? &ce_offset : NULL);
+				    16, num_sampler_slots,
+				    first_sampler_ce_slot, num_sampler_ce_slots,
+				    &ce_offset);
 
 		int j;
 		for (j = 0; j < SI_NUM_IMAGES; j++)
@@ -2044,9 +2114,10 @@ void si_init_all_descriptors(struct si_context *sctx)
 			memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
 	}
 
-	si_init_buffer_resources(&sctx->rw_buffers,
+	si_init_buffer_resources(sctx, &sctx->rw_buffers,
 				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
-				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
+				 SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
+				 SI_SGPR_RW_BUFFERS,
 				 /* The second set of usage/priority is used by
 				  * const buffers in RW buffer slots. */
 				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
@@ -2054,8 +2125,8 @@ void si_init_all_descriptors(struct si_context *sctx)
 				 &ce_offset);
 	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
 
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL);
+	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
+			    4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
 
 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
 	sctx->total_ce_ram_allocated = ce_offset;
@@ -2197,6 +2268,26 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
 	    first + count > desc->first_active_slot + desc->num_active_slots)
 		sctx->descriptors_dirty |= 1u << desc_idx;
 
+	/* Enable or disable CE for this descriptor array. */
+	bool used_ce = desc->uses_ce;
+	desc->uses_ce = desc->first_ce_slot <= first &&
+			desc->first_ce_slot + desc->num_ce_slots >= first + count;
+
+	if (desc->uses_ce != used_ce) {
+		/* Upload or dump descriptors if we're disabling or enabling CE,
+		 * respectively. */
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+		/* If we're enabling CE, re-upload all descriptors to CE RAM.
+		 * When CE was disabled, uploads to CE RAM stopped.
+		 */
+		if (desc->uses_ce) {
+			desc->dirty_mask |=
+				u_bit_consecutive64(desc->first_ce_slot,
+						    desc->num_ce_slots);
+		}
+	}
+
 	desc->first_active_slot = first;
 	desc->num_active_slots = count;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index dfabaa3556..275f830613 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,13 @@ struct si_descriptors {
 	/* Offset in CE RAM */
 	unsigned ce_offset;
 
+	/* Slots allocated in CE RAM. If we get active slots outside of this
+	 * range, direct uploads to memory will be used instead. This basically
+	 * governs switching between onchip (CE) and offchip (upload) modes.
+	 */
+	unsigned first_ce_slot;
+	unsigned num_ce_slots;
+
 	/* Slots that are used by currently-bound shaders.
 	 * With CE: It determines which slots are dumped to L2.
 	 *          It doesn't skip uploads to CE RAM.