[Mesa-dev] [PATCH 05/10] radeonsi: do only 1 big CE dump at end of IBs and one reload in the preamble

Marek Olšák maraeo at gmail.com
Wed May 17 19:38:47 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

A later commit will only upload descriptors used by shaders, so we won't do
full dumps anymore. The only way to keep a complete mirror of CE RAM in
memory will then be to do a separate dump after the last draw call.
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 56 ++++++++++++---------------
 src/gallium/drivers/radeonsi/si_hw_context.c  |  8 +++-
 src/gallium/drivers/radeonsi/si_pipe.c        |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  3 ++
 src/gallium/drivers/radeonsi/si_state.h       |  6 +--
 5 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 91cc9a6..38e4ae1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -142,74 +142,69 @@ static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned s
 	radeon_emit(sctx->ce_ib, va);
 	radeon_emit(sctx->ce_ib, va >> 32);
 
 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
 	                       RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 
 	sctx->ce_need_synchronization = true;
 	return true;
 }
 
-static void si_ce_reinitialize_descriptors(struct si_context *sctx,
-                                           struct si_descriptors *desc)
+void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx)
 {
-	if (desc->buffer) {
-		struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
-		unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-		uint64_t va = buffer->gpu_address + desc->buffer_offset;
-		struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
-
-		if (!ib)
-			ib = sctx->ce_ib;
+	bool success = si_ce_upload(sctx, 0, sctx->total_ce_ram_allocated,
+				    &sctx->ce_ram_saved_offset,
+				    &sctx->ce_ram_saved_buffer);
+	(void)success;
+	assert(success);
+}
 
-		list_size = align(list_size, 32);
+void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx)
+{
+	if (!sctx->ce_ram_saved_buffer)
+		return;
 
-		radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
-		radeon_emit(ib, va);
-		radeon_emit(ib, va >> 32);
-		radeon_emit(ib, list_size / 4);
-		radeon_emit(ib, desc->ce_offset);
+	struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
+	if (!ib)
+		ib = sctx->ce_ib;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
-		                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
-	}
-	desc->ce_ram_dirty = false;
-}
+	uint64_t va = sctx->ce_ram_saved_buffer->gpu_address +
+		      sctx->ce_ram_saved_offset;
 
-void si_ce_reinitialize_all_descriptors(struct si_context *sctx)
-{
-	int i;
+	radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
+	radeon_emit(ib, va);
+	radeon_emit(ib, va >> 32);
+	radeon_emit(ib, sctx->total_ce_ram_allocated / 4);
+	radeon_emit(ib, 0);
 
-	for (i = 0; i < SI_NUM_DESCS; ++i)
-		si_ce_reinitialize_descriptors(sctx, &sctx->descriptors[i]);
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				  sctx->ce_ram_saved_buffer,
+				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 }
 
 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
 {
 	radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 	radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
 	                CONTEXT_CONTROL_LOAD_CE_RAM(1));
 	radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
 }
 
 static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc,
 				  struct r600_atom * atom)
 {
 	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
 
 	if (sctx->ce_ib && desc->uses_ce) {
 		uint32_t const* list = (uint32_t const*)desc->list;
 
-		if (desc->ce_ram_dirty)
-			si_ce_reinitialize_descriptors(sctx, desc);
-
 		while(desc->dirty_mask) {
 			int begin, count;
 			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
 						       &count);
 
 			begin *= desc->element_dw_size;
 			count *= desc->element_dw_size;
 
 			radeon_emit(sctx->ce_ib,
 			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
@@ -240,22 +235,20 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
 	if (atom)
 		si_mark_atom_dirty(sctx, atom);
 
 	return true;
 }
 
 static void
 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
 {
-	desc->ce_ram_dirty = true;
-
 	if (!desc->buffer)
 		return;
 
 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
 				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 }
 
 /* SAMPLER VIEWS */
 
 static unsigned
@@ -2037,20 +2030,21 @@ void si_init_all_descriptors(struct si_context *sctx)
 				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 				 /* The second set of usage/priority is used by
 				  * const buffers in RW buffer slots. */
 				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
 				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
 				 &ce_offset);
 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
 			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+	sctx->total_ce_ram_allocated = ce_offset;
 
 	if (sctx->b.chip_class >= GFX9)
 		assert(ce_offset <= 4096);
 	else
 		assert(ce_offset <= 32768);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
 	sctx->b.b.set_shader_images = si_set_shader_images;
 	sctx->b.b.set_constant_buffer = si_pipe_set_constant_buffer;
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index e15f6a9..5e97d56 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -116,20 +116,24 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	 * This code is only needed when the driver flushes the GFX IB
 	 * internally, and it never asks for a fence handle.
 	 */
 	if (radeon_emitted(ctx->b.dma.cs, 0)) {
 		assert(fence == NULL); /* internal flushes only */
 		ctx->b.dma.flush(ctx, flags, NULL);
 	}
 
 	ctx->gfx_flush_in_progress = true;
 
+	/* This CE dump should be done in parallel with the last draw. */
+	if (ctx->ce_ib)
+		si_ce_save_all_descriptors_at_ib_end(ctx);
+
 	r600_preflush_suspend_features(&ctx->b);
 
 	ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
 			SI_CONTEXT_PS_PARTIAL_FLUSH;
 
 	/* DRM 3.1.0 doesn't flush TC for VI correctly. */
 	if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1)
 		ctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2 |
 				SI_CONTEXT_INV_VMEM_L1;
 
@@ -200,22 +204,22 @@ void si_begin_new_cs(struct si_context *ctx)
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
 	if (ctx->init_config_gs_rings)
 		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	if (ctx->ce_preamble_ib)
 		si_ce_enable_loads(ctx->ce_preamble_ib);
 	else if (ctx->ce_ib)
 		si_ce_enable_loads(ctx->ce_ib);
 
-	if (ctx->ce_preamble_ib)
-		si_ce_reinitialize_all_descriptors(ctx);
+	if (ctx->ce_ib)
+		si_ce_restore_all_descriptors_at_ib_start(ctx);
 
 	if (ctx->b.chip_class >= CIK)
 		si_mark_atom_dirty(ctx, &ctx->prefetch_L2);
 
 	ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
 	ctx->framebuffer.dirty_zsbuf = true;
 	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
 
 	si_mark_atom_dirty(ctx, &ctx->clip_regs);
 	si_mark_atom_dirty(ctx, &ctx->clip_state.atom);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index dd962e0..eaa3348 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -48,20 +48,21 @@ static void si_destroy_context(struct pipe_context *context)
 	 * properly.
 	 */
 	struct pipe_framebuffer_state fb = {};
 	context->set_framebuffer_state(context, &fb);
 
 	si_release_all_descriptors(sctx);
 
 	if (sctx->ce_suballocator)
 		u_suballocator_destroy(sctx->ce_suballocator);
 
+	r600_resource_reference(&sctx->ce_ram_saved_buffer, NULL);
 	pipe_resource_reference(&sctx->esgs_ring, NULL);
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
 	pipe_resource_reference(&sctx->tf_ring, NULL);
 	pipe_resource_reference(&sctx->tess_offchip_ring, NULL);
 	pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
 	r600_resource_reference(&sctx->border_color_buffer, NULL);
 	free(sctx->border_color_table);
 	r600_resource_reference(&sctx->scratch_buffer, NULL);
 	r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 449a802..13ec072 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -229,20 +229,23 @@ struct si_context {
 	struct blitter_context		*blitter;
 	void				*custom_dsa_flush;
 	void				*custom_blend_resolve;
 	void				*custom_blend_decompress;
 	void				*custom_blend_fastclear;
 	void				*custom_blend_dcc_decompress;
 	struct si_screen		*screen;
 
 	struct radeon_winsys_cs		*ce_ib;
 	struct radeon_winsys_cs		*ce_preamble_ib;
+	struct r600_resource		*ce_ram_saved_buffer;
+	unsigned			ce_ram_saved_offset;
+	unsigned			total_ce_ram_allocated;
 	bool				ce_need_synchronization;
 	struct u_suballocator		*ce_suballocator;
 
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm; /* only non-threaded compilation */
 	bool				gfx_flush_in_progress;
 	bool				compute_is_busy;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index c4ef903..9b506a8 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -225,23 +225,20 @@ struct si_descriptors {
 	unsigned buffer_offset;
 
 	/* Offset in CE RAM */
 	unsigned ce_offset;
 
 	/* elements of the list that are changed and need to be uploaded */
 	uint64_t dirty_mask;
 
 	/* Whether CE is used to upload this descriptor array. */
 	bool uses_ce;
-	/* Whether the CE ram is dirty and needs to be reinitialized entirely
-	 * before we can do partial updates. */
-	bool ce_ram_dirty;
 
 	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
 	unsigned shader_userdata_offset;
 };
 
 struct si_sampler_views {
 	struct pipe_sampler_view	*views[SI_NUM_SAMPLERS];
 	struct si_sampler_state		*sampler_states[SI_NUM_SAMPLERS];
 
@@ -275,21 +272,22 @@ struct si_buffer_resources {
 #define si_pm4_delete_state(sctx, member, value) \
 	do { \
 		if ((sctx)->queued.named.member == (value)) { \
 			(sctx)->queued.named.member = NULL; \
 		} \
 		si_pm4_free_state(sctx, (struct si_pm4_state *)(value), \
 				  si_pm4_block_idx(member)); \
 	} while(0)
 
 /* si_descriptors.c */
-void si_ce_reinitialize_all_descriptors(struct si_context *sctx);
+void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx);
+void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx);
 void si_ce_enable_loads(struct radeon_winsys_cs *ib);
 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
 				    struct r600_texture *tex,
 				    const struct legacy_surf_level *base_level_info,
 				    unsigned base_level, unsigned first_level,
 				    unsigned block_width, bool is_stencil,
 				    uint32_t *state);
 void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
 				 uint slot, struct pipe_constant_buffer *cbuf);
 void si_get_shader_buffers(struct si_context *sctx,
-- 
2.7.4



More information about the mesa-dev mailing list