[Mesa-dev] [PATCH 15/20] radeonsi: add infrastr. for compiling optimized shader variants asynchronously
Marek Olšák
maraeo at gmail.com
Wed Nov 16 18:38:38 UTC 2016
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_shader.h | 7 ++
src/gallium/drivers/radeonsi/si_state_shaders.c | 136 ++++++++++++++++++------
2 files changed, 109 insertions(+), 34 deletions(-)
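For readers skimming the diff below: a variant whose key has non-zero "opt" bits is compiled on the screen's shader_compiler_queue instead of inside the draw call, and the draw call falls back to the plain key until the fence signals. The following is a minimal, self-contained C sketch of that fallback loop, not the driver's real code; every name in it (struct key, struct variant, select_variant, queue_optimized_build) is invented purely for illustration.

/* Simplified stand-in for the selection fallback added to
 * si_shader_select_with_key(); illustration only. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct key {
	unsigned mono;	/* stand-in for si_shader_key::mono */
	unsigned opt;	/* stand-in for si_shader_key::opt */
};

struct variant {
	struct key key;
	bool optimized_ready;	/* stand-in for the util_queue fence */
	struct variant *next;
};

/* Pretend to hand the build to a compiler thread: the "fence" stays unsignalled. */
static void queue_optimized_build(struct variant *v)
{
	v->optimized_ready = false;
}

static struct variant *select_variant(struct variant *list, struct key *key)
{
	struct variant *v;
again:
	for (v = list; v; v = v->next) {
		if (memcmp(&v->key, key, sizeof(*key)) != 0)
			continue;
		/* An optimized variant that is still compiling must not stall
		 * the draw call: drop the optimization bits and search again,
		 * which picks up the plain (unoptimized) variant instead. */
		if (key->opt && !v->optimized_ready) {
			key->opt = 0;
			goto again;
		}
		return v;
	}
	/* Not found: the real code builds a new variant here and, for an
	 * optimized key, queues it asynchronously before retrying with opt == 0. */
	return NULL;
}

int main(void)
{
	struct variant unopt = { .key = { 0, 0 }, .optimized_ready = true };
	struct variant opt = { .key = { 0, 1 }, .next = &unopt };
	struct key wanted = { 0, 1 };

	queue_optimized_build(&opt);	/* optimized build still in flight */
	struct variant *v = select_variant(&opt, &wanted);
	printf("selected the %s variant\n", v == &unopt ? "unoptimized" : "optimized");
	return 0;
}

In the patch itself, the "ready" check is util_queue_fence_is_signalled() on shader->optimized_ready and the background build goes through util_queue_add_job(), so a draw call never blocks on the compiler queue; only si_delete_shader() waits on the fence before freeing the variant.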
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 2ed0cb7..38aa361 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -428,20 +428,24 @@ struct si_shader_key {
/* Flags for monolithic compilation only. */
union {
struct {
/* One pair of bits for every input: SI_FIX_FETCH_* enums. */
uint32_t fix_fetch;
} vs;
struct {
uint64_t inputs_to_copy; /* for fixed-func TCS */
} tcs;
} mono;
+
+ /* Optimization flags for asynchronous compilation only. */
+ union {
+ } opt;
};
struct si_shader_config {
unsigned num_sgprs;
unsigned num_vgprs;
unsigned spilled_sgprs;
unsigned spilled_vgprs;
unsigned lds_size;
unsigned spi_ps_input_ena;
unsigned spi_ps_input_addr;
@@ -477,21 +481,24 @@ struct si_shader {
struct si_shader_selector *selector;
struct si_shader *next_variant;
struct si_shader_part *prolog;
struct si_shader_part *epilog;
struct si_pm4_state *pm4;
struct r600_resource *bo;
struct r600_resource *scratch_bo;
struct si_shader_key key;
+ struct util_queue_fence optimized_ready;
+ bool compilation_failed;
bool is_monolithic;
+ bool is_optimized;
bool is_binary_shared;
bool is_gs_copy_shader;
/* The following data is all that's needed for binary shaders. */
struct radeon_shader_binary binary;
struct si_shader_config config;
struct si_shader_info info;
/* Shader key + LLVM IR + disassembly + statistics.
* Generated for debug contexts only.
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 72d0518..00ccbbd 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1008,118 +1008,182 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
}
key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
break;
}
default:
assert(0);
}
}
+static void si_build_shader_variant(void *job, int thread_index)
+{
+ struct si_shader *shader = (struct si_shader *)job;
+ struct si_shader_selector *sel = shader->selector;
+ struct si_screen *sscreen = sel->screen;
+ LLVMTargetMachineRef tm;
+ struct pipe_debug_callback *debug = &sel->debug;
+ int r;
+
+ if (thread_index >= 0) {
+ assert(thread_index < ARRAY_SIZE(sscreen->tm));
+ tm = sscreen->tm[thread_index];
+ if (!debug->async)
+ debug = NULL;
+ } else {
+ tm = sel->tm;
+ }
+
+ r = si_shader_create(sscreen, tm, shader, debug);
+ if (unlikely(r)) {
+ R600_ERR("Failed to build shader variant (type=%u) %d\n",
+ sel->type, r);
+ shader->compilation_failed = true;
+ return;
+ }
+
+ if (sel->is_debug_context) {
+ FILE *f = open_memstream(&shader->shader_log,
+ &shader->shader_log_size);
+ if (f) {
+ si_shader_dump(sscreen, shader, NULL, sel->type, f);
+ fclose(f);
+ }
+ }
+
+ si_shader_init_pm4_state(sscreen, shader);
+}
+
/* Select the hw shader variant depending on the current state. */
static int si_shader_select_with_key(struct si_screen *sscreen,
struct si_shader_ctx_state *state,
struct si_shader_key *key,
- LLVMTargetMachineRef tm,
- struct pipe_debug_callback *debug,
- bool wait,
- bool is_debug_context)
+ int thread_index)
{
static const struct si_shader_key zeroed;
struct si_shader_selector *sel = state->cso;
struct si_shader *current = state->current;
struct si_shader *iter, *shader = NULL;
- int r;
-
+again:
/* Check if we don't need to change anything.
* This path is also used for most shaders that don't need multiple
* variants, it will cost just a computation of the key and this
* test. */
- if (likely(current && memcmp(&current->key, key, sizeof(*key)) == 0))
+ if (likely(current &&
+ memcmp(&current->key, key, sizeof(*key)) == 0 &&
+ (!current->is_optimized ||
+ util_queue_fence_is_signalled(&current->optimized_ready))))
return 0;
/* This must be done before the mutex is locked, because async GS
* compilation calls this function too, and therefore must enter
* the mutex first.
+ *
+ * Only wait if we are in a draw call. Don't wait if we are
+ * in a compiler thread.
*/
- if (wait)
+ if (thread_index < 0)
util_queue_job_wait(&sel->ready);
pipe_mutex_lock(sel->mutex);
/* Find the shader variant. */
for (iter = sel->first_variant; iter; iter = iter->next_variant) {
/* Don't check the "current" shader. We checked it above. */
if (current != iter &&
memcmp(&iter->key, key, sizeof(*key)) == 0) {
+ /* If it's an optimized shader and its compilation has
+ * been started but isn't done, use the unoptimized
+ * shader so as not to cause a stall due to compilation.
+ */
+ if (iter->is_optimized &&
+ !util_queue_fence_is_signalled(&iter->optimized_ready)) {
+ memset(&key->opt, 0, sizeof(key->opt));
+ pipe_mutex_unlock(sel->mutex);
+ goto again;
+ }
+
+ if (iter->compilation_failed) {
+ pipe_mutex_unlock(sel->mutex);
+ return -1; /* skip the draw call */
+ }
+
state->current = iter;
pipe_mutex_unlock(sel->mutex);
return 0;
}
}
/* Build a new shader. */
shader = CALLOC_STRUCT(si_shader);
if (!shader) {
pipe_mutex_unlock(sel->mutex);
return -ENOMEM;
}
shader->selector = sel;
shader->key = *key;
+
+ /* Monolithic-only shaders don't make a distinction between optimized
+ * and unoptimized. */
shader->is_monolithic =
!sel->main_shader_part ||
sel->main_shader_part->key.as_ls != key->as_ls ||
sel->main_shader_part->key.as_es != key->as_es ||
+ memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0 ||
memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
- r = si_shader_create(sscreen, tm, shader, debug);
- if (unlikely(r)) {
- R600_ERR("Failed to build shader variant (type=%u) %d\n",
- sel->type, r);
- FREE(shader);
- pipe_mutex_unlock(sel->mutex);
- return r;
- }
-
- if (is_debug_context) {
- FILE *f = open_memstream(&shader->shader_log,
- &shader->shader_log_size);
- if (f) {
- si_shader_dump(sscreen, shader, NULL, sel->type, f);
- fclose(f);
- }
- }
-
- si_shader_init_pm4_state(sscreen, shader);
+ shader->is_optimized =
+ !sscreen->use_monolithic_shaders &&
+ memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+ if (shader->is_optimized)
+ util_queue_fence_init(&shader->optimized_ready);
if (!sel->last_variant) {
sel->first_variant = shader;
sel->last_variant = shader;
} else {
sel->last_variant->next_variant = shader;
sel->last_variant = shader;
}
- state->current = shader;
+
+ /* If it's an optimized shader, compile it asynchronously. */
+ if (shader->is_optimized &&
+ thread_index < 0) {
+ /* Compile it asynchronously. */
+ util_queue_add_job(&sscreen->shader_compiler_queue,
+ shader, &shader->optimized_ready,
+ si_build_shader_variant, NULL);
+
+ /* Use the default (unoptimized) shader for now. */
+ memset(&key->opt, 0, sizeof(key->opt));
+ pipe_mutex_unlock(sel->mutex);
+ goto again;
+ }
+
+ assert(!shader->is_optimized);
+ si_build_shader_variant(shader, thread_index);
+
+ if (!shader->compilation_failed)
+ state->current = shader;
+
pipe_mutex_unlock(sel->mutex);
- return 0;
+ return shader->compilation_failed ? -1 : 0;
}
static int si_shader_select(struct pipe_context *ctx,
struct si_shader_ctx_state *state)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_shader_key key;
si_shader_selector_key(ctx, state->cso, &key);
- return si_shader_select_with_key(sctx->screen, state, &key,
- sctx->tm, &sctx->b.debug, true,
- sctx->is_debug);
+ return si_shader_select_with_key(sctx->screen, state, &key, -1);
}
static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
struct si_shader_key *key)
{
unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
switch (info->processor) {
case PIPE_SHADER_VERTEX:
switch (next_shader) {
@@ -1240,22 +1304,21 @@ void si_init_shader_selector_async(void *job, int thread_index)
sel->info.uses_linear_center &&
sel->info.uses_linear_centroid;
key.part.ps.epilog.alpha_func = PIPE_FUNC_ALWAYS;
for (i = 0; i < 8; i++)
if (sel->info.colors_written & (1 << i))
key.part.ps.epilog.spi_shader_col_format |=
V_028710_SPI_SHADER_FP16_ABGR << (i * 4);
break;
}
- if (si_shader_select_with_key(sscreen, &state, &key, tm, debug,
- false, sel->is_debug_context))
+ if (si_shader_select_with_key(sscreen, &state, &key, thread_index))
fprintf(stderr, "radeonsi: can't create a monolithic shader\n");
}
/* The GS copy shader is always pre-compiled. */
if (sel->type == PIPE_SHADER_GEOMETRY) {
sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, tm, sel, debug);
if (!sel->gs_copy_shader) {
fprintf(stderr, "radeonsi: can't create GS copy shader\n");
return;
}
@@ -1517,20 +1580,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
return;
sctx->ps_shader.cso = sel;
sctx->ps_shader.current = sel ? sel->first_variant : NULL;
sctx->do_update_shaders = true;
si_mark_atom_dirty(sctx, &sctx->cb_render_state);
}
static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
{
+ if (shader->is_optimized) {
+ util_queue_job_wait(&shader->optimized_ready);
+ util_queue_fence_destroy(&shader->optimized_ready);
+ }
+
if (shader->pm4) {
switch (shader->selector->type) {
case PIPE_SHADER_VERTEX:
if (shader->key.as_ls)
si_pm4_delete_state(sctx, ls, shader->pm4);
else if (shader->key.as_es)
si_pm4_delete_state(sctx, es, shader->pm4);
else
si_pm4_delete_state(sctx, vs, shader->pm4);
break;
--
2.7.4