[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

Marek Olšák maraeo at gmail.com
Wed May 17 19:38:50 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.
---
 src/gallium/drivers/radeonsi/si_compute.c       | 28 ++++++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 85 ++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_state.h         | 18 +++++-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 113 insertions(+), 24 deletions(-)
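
How this works, in short: each shader selector carries bitmasks of the
const/shader-buffer and sampler/image slots it actually uses.
si_set_active_descriptors() converts such a mask into a contiguous
first_active_slot/num_active_slots range (the patch asserts the mask is a
single consecutive run of bits), and si_upload_descriptors() then dumps or
uploads only that byte range instead of the full descriptor list. Below is
a minimal standalone sketch of the range computation, using compiler
builtins instead of the driver's u_bit_scan_consecutive_range64() helper;
the function name and form are made up for illustration and are not part
of the patch.

  /* Sketch only: derive the contiguous byte range to dump/upload from a
   * mask of the slots a shader actually uses. */
  #include <stdint.h>

  void active_slot_range(uint64_t active_mask, unsigned element_dw_size,
                         unsigned *first_slot_offset, unsigned *upload_size)
  {
          unsigned slot_size = element_dw_size * 4;  /* one descriptor in bytes */
          unsigned first = 0, count = 0;

          if (active_mask) {
                  /* lowest used slot */
                  first = __builtin_ctzll(active_mask);
                  /* span up to and including the highest used slot */
                  count = 64 - __builtin_clzll(active_mask) - first;
          }

          *first_slot_offset = first * slot_size;  /* where the dump/upload starts */
          *upload_size = count * slot_size;        /* bytes actually written to L2 */
  }

For a shader that only touches a few low slots, this shrinks each CE dump
(or CE-less upload) from the size of the whole list down to a handful of
descriptors.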

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
 			return NULL;
 		}
 	}
 
 	return program;
 }
 
 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	sctx->cs_shader_state.program = (struct si_compute*)state;
+	struct si_compute *program = (struct si_compute*)state;
+
+	sctx->cs_shader_state.program = program;
+	if (!program)
+		return;
+
+	/* Wait because we need active slot usage masks. */
+	if (program->ir_type == PIPE_SHADER_IR_TGSI)
+		util_queue_fence_wait(&program->ready);
+
+	si_set_active_descriptors(sctx,
+				  SI_DESCS_FIRST_COMPUTE +
+				  SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+				  program->active_const_and_shader_buffers);
+	si_set_active_descriptors(sctx,
+				  SI_DESCS_FIRST_COMPUTE +
+				  SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+				  program->active_samplers_and_images);
 }
 
 static void si_set_global_binding(
 	struct pipe_context *ctx, unsigned first, unsigned n,
 	struct pipe_resource **resources,
 	uint32_t **handles)
 {
 	unsigned i;
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
 	bool cs_regalloc_hang =
 		(sctx->b.chip_class == SI ||
 		 sctx->b.family == CHIP_BONAIRE ||
 		 sctx->b.family == CHIP_KABINI) &&
 		info->block[0] * info->block[1] * info->block[2] > 256;
 
 	if (cs_regalloc_hang)
 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 				 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
-	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-		util_queue_fence_wait(&program->ready);
-
-		if (program->shader.compilation_failed)
-			return;
-	}
+	if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+	    program->shader.compilation_failed)
+		return;
 
 	si_decompress_compute_textures(sctx);
 
 	/* Add buffer sizes for memory checking in need_cs_space. */
 	r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
 	/* TODO: add the scratch buffer */
 
 	if (info->indirect) {
 		r600_context_add_resource_size(ctx, info->indirect);
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	}
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
 	r600_resource_reference(&desc->buffer, NULL);
 	FREE(desc->list);
 }
 
 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
-			 unsigned *out_offset, struct r600_resource **out_buf) {
+			 unsigned *out_offset, struct r600_resource **out_buf)
+{
 	uint64_t va;
 
 	u_suballocator_alloc(sctx->ce_suballocator, size,
-			     sctx->screen->b.info.tcc_cache_line_size,
-			     out_offset, (struct pipe_resource**)out_buf);
+			     si_optimal_tcc_alignment(sctx, size),
+			     (unsigned*)out_offset,
+			     (struct pipe_resource**)out_buf);
 	if (!out_buf)
 			return false;
 
 	va = (*out_buf)->gpu_address + *out_offset;
 
 	radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
 	radeon_emit(sctx->ce_ib, ce_offset);
 	radeon_emit(sctx->ce_ib, size / 4);
 	radeon_emit(sctx->ce_ib, va);
 	radeon_emit(sctx->ce_ib, va >> 32);
@@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
 	radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 	radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
 	                CONTEXT_CONTROL_LOAD_CE_RAM(1));
 	radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
 }
 
 static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc,
 				  struct r600_atom * atom)
 {
-	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	unsigned slot_size = desc->element_dw_size * 4;
+	unsigned first_slot_offset = desc->first_active_slot * slot_size;
+	unsigned upload_size = desc->num_active_slots * slot_size;
+
+	if (!upload_size)
+		return true;
 
 	if (sctx->ce_ib && desc->uses_ce) {
 		uint32_t const* list = (uint32_t const*)desc->list;
 
 		while(desc->dirty_mask) {
 			int begin, count;
 			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
 						       &count);
 
 			begin *= desc->element_dw_size;
 			count *= desc->element_dw_size;
 
 			radeon_emit(sctx->ce_ib,
 			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
 			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
 			radeon_emit_array(sctx->ce_ib, list + begin, count);
 		}
 
-		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
-		                           &desc->buffer_offset, &desc->buffer))
+		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+				  upload_size, (unsigned*)&desc->buffer_offset,
+				  &desc->buffer))
 			return false;
 	} else {
-		void *ptr;
+		uint32_t *ptr;
 
-		u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
-			       sctx->screen->b.info.tcc_cache_line_size,
-			       &desc->buffer_offset,
-			       (struct pipe_resource**)&desc->buffer, &ptr);
+		u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
+			       si_optimal_tcc_alignment(sctx, upload_size),
+			       (unsigned*)&desc->buffer_offset,
+			       (struct pipe_resource**)&desc->buffer,
+			       (void**)&ptr);
 		if (!desc->buffer)
 			return false; /* skip the draw call */
 
-		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
-		desc->gpu_list = ptr;
+		util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
+					upload_size);
+		desc->gpu_list = ptr - first_slot_offset / 4;
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
 	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 	}
+
+	/* The shader pointer should point to slot 0. */
+	desc->buffer_offset -= first_slot_offset;
+
 	desc->dirty_mask = 0;
 
 	if (atom)
 		si_mark_atom_dirty(sctx, atom);
 
 	return true;
 }
 
 static void
 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
@@ -1023,21 +1037,21 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	desc_list_byte_size = velems->desc_list_byte_size;
 	first_vb_use_mask = velems->first_vb_use_mask;
 
 	/* Vertex buffer descriptors are the only ones which are uploaded
 	 * directly through a staging buffer and don't go through
 	 * the fine-grained upload path.
 	 */
 	u_upload_alloc(sctx->b.b.const_uploader, 0,
 		       desc_list_byte_size,
 		       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
-		       &desc->buffer_offset,
+		       (unsigned*)&desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
 	if (!desc->buffer)
 		return false;
 
 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_DESCRIPTORS);
 
 	assert(count <= SI_MAX_ATTRIBS);
 
@@ -1883,21 +1897,22 @@ void si_shader_change_notify(struct si_context *sctx)
 	}
 }
 
 static void si_emit_shader_pointer(struct si_context *sctx,
 				   struct si_descriptors *desc,
 				   unsigned sh_base)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
-	assert(desc->buffer);
+	if (!desc->buffer)
+		return; /* the pointer is not used by current shaders */
 
 	va = desc->buffer->gpu_address +
 	     desc->buffer_offset;
 
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
 	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
 	radeon_emit(cs, va);
 	radeon_emit(cs, va >> 32);
 }
 
@@ -2026,20 +2041,22 @@ void si_init_all_descriptors(struct si_context *sctx)
 	}
 
 	si_init_buffer_resources(&sctx->rw_buffers,
 				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
 				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 				 /* The second set of usage/priority is used by
 				  * const buffers in RW buffer slots. */
 				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
 				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
 				 &ce_offset);
+	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
 			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
 	sctx->total_ce_ram_allocated = ce_offset;
 
 	if (sctx->b.chip_class >= GFX9)
 		assert(ce_offset <= 4096);
 	else
 		assert(ce_offset <= 32768);
@@ -2148,10 +2165,48 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
 	}
 	si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
 	si_vertex_buffers_begin_new_cs(sctx);
 
 	for (i = 0; i < SI_NUM_DESCS; ++i)
 		si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
 
 	si_shader_userdata_begin_new_cs(sctx);
 }
+
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+			       uint64_t new_active_mask)
+{
+	struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+
+	/* Ignore no-op updates and updates that disable all slots. */
+	if (!new_active_mask ||
+	    new_active_mask == u_bit_consecutive64(desc->first_active_slot,
+						   desc->num_active_slots))
+		return;
+
+	int first, count;
+	u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+	assert(new_active_mask == 0);
+
+	/* Upload/dump descriptors if slots are being enabled. */
+	if (first < desc->first_active_slot ||
+	    first + count > desc->first_active_slot + desc->num_active_slots)
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+	desc->first_active_slot = first;
+	desc->num_active_slots = count;
+}
+
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+					  struct si_shader_selector *sel)
+{
+	if (!sel)
+		return;
+
+	si_set_active_descriptors(sctx,
+		si_const_and_shader_buffer_descriptors_idx(sel->type),
+		sel->active_const_and_shader_buffers);
+	si_set_active_descriptors(sctx,
+		si_sampler_and_image_descriptors_idx(sel->type),
+		sel->active_samplers_and_images);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f2003a5..dfabaa3 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -35,20 +35,21 @@
 
 #define SI_MAX_ATTRIBS			16
 #define SI_NUM_VERTEX_BUFFERS		SI_MAX_ATTRIBS
 #define SI_NUM_SAMPLERS			32 /* OpenGL textures units per shader */
 #define SI_NUM_CONST_BUFFERS		16
 #define SI_NUM_IMAGES			16
 #define SI_NUM_SHADER_BUFFERS		16
 
 struct si_screen;
 struct si_shader;
+struct si_shader_selector;
 
 struct si_state_blend {
 	struct si_pm4_state	pm4;
 	uint32_t		cb_target_mask;
 	bool			alpha_to_coverage;
 	bool			alpha_to_one;
 	bool			dual_src_blend;
 	/* Set 0xf or 0x0 (4 bits) per render target if the following is
 	 * true. ANDed with spi_shader_col_format.
 	 */
@@ -215,26 +216,34 @@ struct si_descriptors {
 	uint32_t *list;
 	/* The list in mapped GPU memory. */
 	uint32_t *gpu_list;
 	/* The size of one descriptor. */
 	unsigned element_dw_size;
 	/* The maximum number of descriptors. */
 	unsigned num_elements;
 
 	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
-	unsigned buffer_offset;
+	int buffer_offset; /* can be negative if not using lower slots */
 
 	/* Offset in CE RAM */
 	unsigned ce_offset;
 
-	/* elements of the list that are changed and need to be uploaded */
+	/* Slots that are used by currently-bound shaders.
+	 * With CE: It determines which slots are dumped to L2.
+	 *          It doesn't skip uploads to CE RAM.
+	 * Without CE: It determines which slots are uploaded.
+	 */
+	unsigned first_active_slot;
+	unsigned num_active_slots;
+
+	/* Slots that have been changed and need to be uploaded. */
 	uint64_t dirty_mask;
 
 	/* Whether CE is used to upload this descriptor array. */
 	bool uses_ce;
 
 	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
 	unsigned shader_userdata_offset;
 };
 
@@ -308,20 +317,25 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
 void si_update_all_texture_descriptors(struct si_context *sctx);
 void si_shader_change_notify(struct si_context *sctx);
 void si_update_compressed_colortex_masks(struct si_context *sctx);
 void si_emit_graphics_shader_userdata(struct si_context *sctx,
                                       struct r600_atom *atom);
 void si_emit_compute_shader_userdata(struct si_context *sctx);
 void si_set_rw_buffer(struct si_context *sctx,
 		      uint slot, const struct pipe_constant_buffer *input);
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+			       uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+					  struct si_shader_selector *sel);
+
 /* si_state.c */
 struct si_shader_selector;
 
 void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
 		  struct r600_atom **list_elem,
 		  void (*emit_func)(struct si_context *ctx, struct r600_atom *state));
 void si_init_state_functions(struct si_context *sctx);
 void si_init_screen_state_functions(struct si_screen *sscreen);
 void
 si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 45d996b..8ac4309 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2144,20 +2144,21 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 	struct si_shader_selector *sel = state;
 
 	if (sctx->vs_shader.cso == sel)
 		return;
 
 	sctx->vs_shader.cso = sel;
 	sctx->vs_shader.current = sel ? sel->first_variant : NULL;
 	sctx->do_update_shaders = true;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_update_tess_uses_prim_id(struct si_context *sctx)
 {
 	sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
 		(sctx->tes_shader.cso &&
 		 sctx->tes_shader.cso->info.uses_primid) ||
 		(sctx->tcs_shader.cso &&
 		 sctx->tcs_shader.cso->info.uses_primid) ||
 		(sctx->gs_shader.cso &&
@@ -2181,38 +2182,41 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 	sctx->do_update_shaders = true;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
 	if (enable_changed) {
 		si_shader_change_notify(sctx);
 		if (sctx->ia_multi_vgt_param_key.u.uses_tess)
 			si_update_tess_uses_prim_id(sctx);
 	}
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
 	bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
 
 	if (sctx->tcs_shader.cso == sel)
 		return;
 
 	sctx->tcs_shader.cso = sel;
 	sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
 	si_update_tess_uses_prim_id(sctx);
 	sctx->do_update_shaders = true;
 
 	if (enable_changed)
 		sctx->last_tcs = NULL; /* invalidate derived tess state */
+
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
 	bool enable_changed = !!sctx->tes_shader.cso != !!sel;
 
 	if (sctx->tes_shader.cso == sel)
 		return;
@@ -2223,37 +2227,39 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 	si_update_tess_uses_prim_id(sctx);
 	sctx->do_update_shaders = true;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
 	if (enable_changed) {
 		si_shader_change_notify(sctx);
 		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
 	}
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
 
 	/* skip if supplied shader is one already in use */
 	if (sctx->ps_shader.cso == sel)
 		return;
 
 	sctx->ps_shader.cso = sel;
 	sctx->ps_shader.current = sel ? sel->first_variant : NULL;
 	sctx->do_update_shaders = true;
 	if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
 		si_update_tess_uses_prim_id(sctx);
 	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
 	if (shader->is_optimized) {
 		util_queue_fence_wait(&shader->optimized_ready);
 		util_queue_fence_destroy(&shader->optimized_ready);
 	}
 
 	if (shader->pm4) {
-- 
2.7.4


