[Mesa-dev] [PATCH 07/10] radeonsi: record which descriptor slots are used by shaders

Wed May 17 19:38:49 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_compute.c       |  3 +++
 src/gallium/drivers/radeonsi/si_compute.h       |  4 ++++
 src/gallium/drivers/radeonsi/si_shader.h        |  4 ++++
 src/gallium/drivers/radeonsi/si_state.h         |  3 +++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 27 +++++++++++++++++++++++++
 5 files changed, 41 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 3a519a72..22ef111 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -93,20 +93,23 @@ static void si_create_compute_state_async(void *job, int thread_index)
 		tm = program->compiler_ctx_state.tm;
 	}
 
 	memset(&sel, 0, sizeof(sel));
 
 	sel.screen = program->screen;
 	tgsi_scan_shader(program->tokens, &sel.info);
 	sel.tokens = program->tokens;
 	sel.type = PIPE_SHADER_COMPUTE;
 	sel.local_size = program->local_size;
+	si_get_active_slot_masks(&sel.info,
+				 &program->active_const_and_shader_buffers,
+				 &program->active_samplers_and_images);
 
 	program->shader.selector = &sel;
 	program->shader.is_monolithic = true;
 	program->uses_grid_size = sel.info.uses_grid_size;
 	program->uses_block_size = sel.info.uses_block_size;
 
 	if (si_shader_create(program->screen, tm, &program->shader, debug)) {
 		program->shader.compilation_failed = true;
 	} else {
 		bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
diff --git a/src/gallium/drivers/radeonsi/si_compute.h b/src/gallium/drivers/radeonsi/si_compute.h
index ed33104..764d708 100644
--- a/src/gallium/drivers/radeonsi/si_compute.h
+++ b/src/gallium/drivers/radeonsi/si_compute.h
@@ -27,20 +27,24 @@
 #include "si_shader.h"
 
 #define MAX_GLOBAL_BUFFERS 22
 
 struct si_compute {
 	struct si_screen *screen;
 	struct tgsi_token *tokens;
 	struct util_queue_fence ready;
 	struct si_compiler_ctx_state compiler_ctx_state;
 
+	/* bitmasks of used descriptor slots */
+	uint32_t active_const_and_shader_buffers;
+	uint64_t active_samplers_and_images;
+
 	unsigned ir_type;
 	unsigned local_size;
 	unsigned private_size;
 	unsigned input_size;
 	struct si_shader shader;
 
 	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
 	unsigned use_code_object_v2 : 1;
 	unsigned variable_group_size : 1;
 	unsigned uses_grid_size:1;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ffb7dc3..aab902b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -345,20 +345,24 @@ struct si_shader_selector {
 	 */
 	unsigned	colors_written_4bit;
 
 	/* CS parameters */
 	unsigned local_size;
 
 	uint64_t	outputs_written;	/* "get_unique_index" bits */
 	uint32_t	patch_outputs_written;	/* "get_unique_index_patch" bits */
 
 	uint64_t	inputs_read;		/* "get_unique_index" bits */
+
+	/* bitmasks of used descriptor slots */
+	uint32_t	active_const_and_shader_buffers;
+	uint64_t	active_samplers_and_images;
 };
 
 /* Valid shader configurations:
  *
  * API shaders       VS | TCS | TES | GS |pass| PS
  * are compiled as:     |     |     |    |thru|
  *                      |     |     |    |    |
  * Only VS & PS:     VS |     |     |    |    | PS
  * GFX6 - with GS:   ES |     |     | GS | VS | PS
  *      - with tess: LS | HS  | VS  |    |    | PS
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 9b506a8..f2003a5 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -346,20 +346,23 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 			      const struct pipe_sampler_view *state,
 			      unsigned width0, unsigned height0,
 			      unsigned force_level);
 
 /* si_state_shader.c */
 bool si_update_shaders(struct si_context *sctx);
 void si_init_shader_functions(struct si_context *sctx);
 bool si_init_shader_cache(struct si_screen *sscreen);
 void si_destroy_shader_cache(struct si_screen *sscreen);
 void si_init_shader_selector_async(void *job, int thread_index);
+void si_get_active_slot_masks(const struct tgsi_shader_info *info,
+			      uint32_t *const_and_shader_buffers,
+			      uint64_t *samplers_and_images);
 
 /* si_state_draw.c */
 void si_init_ia_multi_vgt_param_table(struct si_context *sctx);
 void si_emit_cache_flush(struct si_context *sctx);
 void si_ce_pre_draw_synchronization(struct si_context *sctx);
 void si_ce_post_draw_synchronization(struct si_context *sctx);
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo);
 void si_trace_emit(struct si_context *sctx);
 
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 313af85..45d996b 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1896,20 +1896,44 @@ void si_init_shader_selector_async(void *job, int thread_index)
 		sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, tm, sel, debug);
 		if (!sel->gs_copy_shader) {
 			fprintf(stderr, "radeonsi: can't create GS copy shader\n");
 			return;
 		}
 
 		si_shader_vs(sscreen, sel->gs_copy_shader, sel);
 	}
 }
 
+/* Return descriptor slot usage masks from the given shader info. */
+void si_get_active_slot_masks(const struct tgsi_shader_info *info,
+			      uint32_t *const_and_shader_buffers,
+			      uint64_t *samplers_and_images)
+{
+	unsigned start, num_shaderbufs, num_constbufs, num_images, num_samplers;
+
+	num_shaderbufs = util_last_bit(info->shader_buffers_declared);
+	num_constbufs = util_last_bit(info->const_buffers_declared);
+	/* two 8-byte images share one 16-byte slot */
+	num_images = align(util_last_bit(info->images_declared), 2);
+	num_samplers = util_last_bit(info->samplers_declared);
+
+	/* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
+	start = si_get_shaderbuf_slot(num_shaderbufs - 1);
+	*const_and_shader_buffers =
+		u_bit_consecutive(start, num_shaderbufs + num_constbufs);
+
+	/* The layout is: image[last] ... image[0], sampler[0] ... sampler[last] */
+	start = si_get_image_slot(num_images - 1) / 2;
+	*samplers_and_images =
+		u_bit_consecutive64(start, num_images / 2 + num_samplers);
+}
+
 static void *si_create_shader_selector(struct pipe_context *ctx,
 				       const struct pipe_shader_state *state)
 {
 	struct si_screen *sscreen = (struct si_screen *)ctx->screen;
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
 	int i;
 
 	if (!sel)
 		return NULL;
@@ -1922,20 +1946,23 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	sel->tokens = tgsi_dup_tokens(state->tokens);
 	if (!sel->tokens) {
 		FREE(sel);
 		return NULL;
 	}
 
 	sel->so = state->stream_output;
 	tgsi_scan_shader(state->tokens, &sel->info);
 	sel->type = sel->info.processor;
 	p_atomic_inc(&sscreen->b.num_shaders_created);
+	si_get_active_slot_masks(&sel->info,
+				 &sel->active_const_and_shader_buffers,
+				 &sel->active_samplers_and_images);
 
 	/* The prolog is a no-op if there are no inputs. */
 	sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&
 			       sel->info.num_inputs;
 
 	/* Set which opcode uses which (i,j) pair. */
 	if (sel->info.uses_persp_opcode_interp_centroid)
 		sel->info.uses_persp_centroid = true;
 
 	if (sel->info.uses_linear_opcode_interp_centroid)
-- 
2.7.4