Mesa (master): radeonsi: only upload (dump to L2) those descriptors that are used by shaders

Thu May 18 20:15:17 UTC 2017

Module: Mesa
Branch: master
Commit: a7f098fb769bdfdac692a04eab6bdd84e061e5cd
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=a7f098fb769bdfdac692a04eab6bdd84e061e5cd

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Mon May 15 23:03:01 2017 +0200

radeonsi: only upload (dump to L2) those descriptors that are used by shaders

This decreases the size of CE RAM dumps to L2 when the constant engine (CE)
is used, or the size of descriptor uploads when CE is not used.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

---

 src/gallium/drivers/radeonsi/si_compute.c       | 28 ++++++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 89 ++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_state.h         | 18 ++++-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 117 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef1116af..4c980668d3 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -208,7 +208,24 @@ static void *si_create_compute_state(
 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	sctx->cs_shader_state.program = (struct si_compute*)state;
+	struct si_compute *program = (struct si_compute*)state;
+
+	sctx->cs_shader_state.program = program;
+	if (!program)
+		return;
+
+	/* Wait because we need active slot usage masks. */
+	if (program->ir_type == PIPE_SHADER_IR_TGSI)
+		util_queue_fence_wait(&program->ready);
+
+	si_set_active_descriptors(sctx,
+				  SI_DESCS_FIRST_COMPUTE +
+				  SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+				  program->active_const_and_shader_buffers);
+	si_set_active_descriptors(sctx,
+				  SI_DESCS_FIRST_COMPUTE +
+				  SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+				  program->active_samplers_and_images);
 }
 
 static void si_set_global_binding(
@@ -756,12 +773,9 @@ static void si_launch_grid(
 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 				 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
-	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-		util_queue_fence_wait(&program->ready);
-
-		if (program->shader.compilation_failed)
-			return;
-	}
+	if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+	    program->shader.compilation_failed)
+		return;
 
 	si_decompress_compute_textures(sctx);
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index b38b6b5fa9..b514961925 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -125,12 +125,14 @@ static void si_release_descriptors(struct si_descriptors *desc)
 }
 
 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
-			 unsigned *out_offset, struct r600_resource **out_buf) {
+			 unsigned *out_offset, struct r600_resource **out_buf)
+{
 	uint64_t va;
 
 	u_suballocator_alloc(sctx->ce_suballocator, size,
-			     sctx->screen->b.info.tcc_cache_line_size,
-			     out_offset, (struct pipe_resource**)out_buf);
+			     si_optimal_tcc_alignment(sctx, size),
+			     out_offset,
+			     (struct pipe_resource**)out_buf);
 	if (!out_buf)
 			return false;
 
@@ -193,7 +195,16 @@ static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc,
 				  struct r600_atom * atom)
 {
-	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	unsigned slot_size = desc->element_dw_size * 4;
+	unsigned first_slot_offset = desc->first_active_slot * slot_size;
+	unsigned upload_size = desc->num_active_slots * slot_size;
+
+	/* Skip the upload if no shader is using the descriptors. dirty_mask
+	 * will stay dirty and the descriptors will be uploaded when there is
+	 * a shader using them.
+	 */
+	if (!upload_size)
+		return true;
 
 	if (sctx->ce_ib && desc->uses_ce) {
 		uint32_t const* list = (uint32_t const*)desc->list;
@@ -212,25 +223,32 @@ static bool si_upload_descriptors(struct si_context *sctx,
 			radeon_emit_array(sctx->ce_ib, list + begin, count);
 		}
 
-		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
-		                           &desc->buffer_offset, &desc->buffer))
+		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+				  upload_size, (unsigned*)&desc->buffer_offset,
+				  &desc->buffer))
 			return false;
 	} else {
-		void *ptr;
+		uint32_t *ptr;
 
-		u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
-			       sctx->screen->b.info.tcc_cache_line_size,
-			       &desc->buffer_offset,
-			       (struct pipe_resource**)&desc->buffer, &ptr);
+		u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
+			       si_optimal_tcc_alignment(sctx, upload_size),
+			       (unsigned*)&desc->buffer_offset,
+			       (struct pipe_resource**)&desc->buffer,
+			       (void**)&ptr);
 		if (!desc->buffer)
 			return false; /* skip the draw call */
 
-		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
-		desc->gpu_list = ptr;
+		util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
+					upload_size);
+		desc->gpu_list = ptr - first_slot_offset / 4;
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
 	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 	}
+
+	/* The shader pointer should point to slot 0. */
+	desc->buffer_offset -= first_slot_offset;
+
 	desc->dirty_mask = 0;
 
 	if (atom)
@@ -1030,7 +1048,7 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	u_upload_alloc(sctx->b.b.const_uploader, 0,
 		       desc_list_byte_size,
 		       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
-		       &desc->buffer_offset,
+		       (unsigned*)&desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
 	if (!desc->buffer)
 		return false;
@@ -1891,7 +1909,8 @@ static void si_emit_shader_pointer(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
-	assert(desc->buffer);
+	if (!desc->buffer)
+		return; /* the pointer is not used by current shaders */
 
 	va = desc->buffer->gpu_address +
 	     desc->buffer_offset;
@@ -2034,6 +2053,8 @@ void si_init_all_descriptors(struct si_context *sctx)
 				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
 				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
 				 &ce_offset);
+	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
 			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
@@ -2156,3 +2177,41 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 
 	si_shader_userdata_begin_new_cs(sctx);
 }
+
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+			       uint64_t new_active_mask)
+{
+	struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+
+	/* Ignore no-op updates and updates that disable all slots. */
+	if (!new_active_mask ||
+	    new_active_mask == u_bit_consecutive64(desc->first_active_slot,
+						   desc->num_active_slots))
+		return;
+
+	int first, count;
+	u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+	assert(new_active_mask == 0);
+
+	/* Upload/dump descriptors if slots are being enabled. */
+	if (first < desc->first_active_slot ||
+	    first + count > desc->first_active_slot + desc->num_active_slots)
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+	desc->first_active_slot = first;
+	desc->num_active_slots = count;
+}
+
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+					  struct si_shader_selector *sel)
+{
+	if (!sel)
+		return;
+
+	si_set_active_descriptors(sctx,
+		si_const_and_shader_buffer_descriptors_idx(sel->type),
+		sel->active_const_and_shader_buffers);
+	si_set_active_descriptors(sctx,
+		si_sampler_and_image_descriptors_idx(sel->type),
+		sel->active_samplers_and_images);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f2003a5072..dfabaa3556 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -42,6 +42,7 @@
 
 struct si_screen;
 struct si_shader;
+struct si_shader_selector;
 
 struct si_state_blend {
 	struct si_pm4_state	pm4;
@@ -222,12 +223,20 @@ struct si_descriptors {
 
 	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
-	unsigned buffer_offset;
+	int buffer_offset; /* can be negative if not using lower slots */
 
 	/* Offset in CE RAM */
 	unsigned ce_offset;
 
-	/* elements of the list that are changed and need to be uploaded */
+	/* Slots that are used by currently-bound shaders.
+	 * With CE: It determines which slots are dumped to L2.
+	 *          It doesn't skip uploads to CE RAM.
+	 * Without CE: It determines which slots are uploaded.
+	 */
+	unsigned first_active_slot;
+	unsigned num_active_slots;
+
+	/* Slots that have been changed and need to be uploaded. */
 	uint64_t dirty_mask;
 
 	/* Whether CE is used to upload this descriptor array. */
@@ -315,6 +324,11 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx,
 void si_emit_compute_shader_userdata(struct si_context *sctx);
 void si_set_rw_buffer(struct si_context *sctx,
 		      uint slot, const struct pipe_constant_buffer *input);
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+			       uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+					  struct si_shader_selector *sel);
+
 /* si_state.c */
 struct si_shader_selector;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 45d996b6b6..8ac430975d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2151,6 +2151,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 	sctx->do_update_shaders = true;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_update_tess_uses_prim_id(struct si_context *sctx)
@@ -2188,6 +2189,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 			si_update_tess_uses_prim_id(sctx);
 	}
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
@@ -2206,6 +2208,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 
 	if (enable_changed)
 		sctx->last_tcs = NULL; /* invalidate derived tess state */
+
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
@@ -2230,6 +2234,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
 	}
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
@@ -2247,6 +2252,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
 		si_update_tess_uses_prim_id(sctx);
 	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)