[Mesa-dev] [PATCH v2 7/8] radeonsi: use ready fences on all shaders, not just optimized ones

Nicolai Hähnle nhaehnle at gmail.com
Fri Nov 3 20:09:16 UTC 2017


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

There's a race condition between si_shader_select_with_key and
si_bind_XX_shader:

  Thread 1                         Thread 2
  --------                         --------
  si_shader_select_with_key
    begin compiling the first
    variant
    (guarded by sel->mutex)
                                   si_bind_XX_shader
                                     select first_variant by default
                                     as state->current
                                   si_shader_select_with_key
                                     match state->current and early-out

Since thread 2 never takes sel->mutex, it may, for example, go on
rendering before a PM4 state has been built for that shader.

The solution taken by this patch is to generalize shader->optimized_ready
into a fence shader->ready that applies to all shaders. This does not
hurt the fast path (if anything, it makes it faster, because we no longer
explicitly check is_optimized).

It will also allow reducing the scope of sel->mutex locks, but this is
deferred to a later commit for better bisectability.
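
For readers unfamiliar with util_queue_fence, here is a minimal
condition-variable sketch of the fence lifecycle this patch relies on.
This is illustrative only: the real implementation lives in
src/util/u_queue.h, and all names below (ready_fence, fence_init, etc.)
are invented for the example.

  #include <pthread.h>
  #include <stdbool.h>

  /* Hypothetical stand-in for util_queue_fence: fences start out
   * signaled, get reset before the shader is published, and are
   * signaled again once compilation finishes. */
  struct ready_fence {
          pthread_mutex_t lock;
          pthread_cond_t cond;
          bool signalled;
  };

  static void fence_init(struct ready_fence *f)
  {
          pthread_mutex_init(&f->lock, NULL);
          pthread_cond_init(&f->cond, NULL);
          f->signalled = true; /* created in the signaled state */
  }

  /* Called while the shader is still invisible to other threads,
   * i.e. before it is linked into the selector's variant list, so
   * no locking is needed here. */
  static void fence_reset(struct ready_fence *f)
  {
          f->signalled = false;
  }

  /* Called by the compiling thread once the variant (including its
   * PM4 state) is fully built. */
  static void fence_signal(struct ready_fence *f)
  {
          pthread_mutex_lock(&f->lock);
          f->signalled = true;
          pthread_cond_broadcast(&f->cond);
          pthread_mutex_unlock(&f->lock);
  }

  /* What any other thread does when it finds a variant whose fence
   * is not signaled yet (the new fast path in
   * si_shader_select_with_key). */
  static void fence_wait(struct ready_fence *f)
  {
          pthread_mutex_lock(&f->lock);
          while (!f->signalled)
                  pthread_cond_wait(&f->cond, &f->lock);
          pthread_mutex_unlock(&f->lock);
  }

The important ordering is: reset the fence, then publish the variant on
the selector's list, then compile, then signal. Any thread that picks up
the variant in the meantime blocks in fence_wait instead of rendering
with a half-built shader.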

Fixes dEQP-EGL.functional.sharing.gles2.multithread.simple.buffers.bufferdata_render

Reviewed-by: Marek Olšák <marek.olsak at amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c        |  3 +
 src/gallium/drivers/radeonsi/si_shader.h        |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 88 ++++++++++++++++++-------
 3 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 6bc08dd3890..195c1cd7d94 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5388,20 +5388,23 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	if (!outputs)
 		return NULL;
 
 	shader = CALLOC_STRUCT(si_shader);
 	if (!shader) {
 		FREE(outputs);
 		return NULL;
 	}
 
+	/* We can leave the fence as permanently signaled because the GS copy
+	 * shader only becomes visible globally after it has been compiled. */
+	util_queue_fence_init(&shader->ready);
 
 	shader->selector = gs_selector;
 	shader->is_gs_copy_shader = true;
 
 	si_init_shader_ctx(&ctx, sscreen, tm);
 	ctx.shader = shader;
 	ctx.type = PIPE_SHADER_VERTEX;
 
 	builder = ctx.ac.builder;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b8f9d18f5c8..41851627a87 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -586,21 +586,21 @@ struct si_shader {
 
 	struct si_shader_part		*prolog;
 	struct si_shader		*previous_stage; /* for GFX9 */
 	struct si_shader_part		*prolog2;
 	struct si_shader_part		*epilog;
 
 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
 	struct r600_resource		*scratch_bo;
 	struct si_shader_key		key;
-	struct util_queue_fence		optimized_ready;
+	struct util_queue_fence		ready;
 	bool				compilation_failed;
 	bool				is_monolithic;
 	bool				is_optimized;
 	bool				is_binary_shared;
 	bool				is_gs_copy_shader;
 
 	/* The following data is all that's needed for binary shaders. */
 	struct ac_shader_binary	binary;
 	struct si_shader_config		config;
 	struct si_shader_info		info;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b72f6fc74a4..4c0292404e6 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1545,20 +1545,25 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
 				       struct si_shader_key *key)
 {
 	struct si_shader **mainp = si_get_main_shader_part(sel, key);
 
 	if (!*mainp) {
 		struct si_shader *main_part = CALLOC_STRUCT(si_shader);
 
 		if (!main_part)
 			return false;
 
+		/* We can leave the fence as permanently signaled because the
+		 * main part becomes visible globally only after it has been
+		 * compiled. */
+		util_queue_fence_init(&main_part->ready);
+
 		main_part->selector = sel;
 		main_part->key.as_es = key->as_es;
 		main_part->key.as_ls = key->as_ls;
 
 		if (si_compile_tgsi_shader(sscreen, compiler_state->tm,
 					   main_part, false,
 					   &compiler_state->debug) != 0) {
 			FREE(main_part);
 			return false;
 		}
@@ -1578,70 +1583,85 @@ static int si_shader_select_with_key(struct si_screen *sscreen,
 	struct si_shader_selector *previous_stage_sel = NULL;
 	struct si_shader *current = state->current;
 	struct si_shader *iter, *shader = NULL;
 
 again:
 	/* Check if we don't need to change anything.
 	 * This path is also used for most shaders that don't need multiple
 	 * variants, it will cost just a computation of the key and this
 	 * test. */
 	if (likely(current &&
-		   memcmp(&current->key, key, sizeof(*key)) == 0 &&
-		   (!current->is_optimized ||
-		    util_queue_fence_is_signalled(&current->optimized_ready))))
+		   memcmp(&current->key, key, sizeof(*key)) == 0)) {
+		if (unlikely(!util_queue_fence_is_signalled(&current->ready))) {
+			if (current->is_optimized) {
+				memset(&key->opt, 0, sizeof(key->opt));
+				goto current_not_ready;
+			}
+
+			util_queue_fence_wait(&current->ready);
+		}
+
 		return current->compilation_failed ? -1 : 0;
+	}
+current_not_ready:
 
 	/* This must be done before the mutex is locked, because async GS
 	 * compilation calls this function too, and therefore must enter
 	 * the mutex first.
 	 *
 	 * Only wait if we are in a draw call. Don't wait if we are
 	 * in a compiler thread.
 	 */
 	if (thread_index < 0)
 		util_queue_fence_wait(&sel->ready);
 
 	mtx_lock(&sel->mutex);
 
 	/* Find the shader variant. */
 	for (iter = sel->first_variant; iter; iter = iter->next_variant) {
 		/* Don't check the "current" shader. We checked it above. */
 		if (current != iter &&
 		    memcmp(&iter->key, key, sizeof(*key)) == 0) {
-			/* If it's an optimized shader and its compilation has
-			 * been started but isn't done, use the unoptimized
-			 * shader so as not to cause a stall due to compilation.
-			 */
-			if (iter->is_optimized &&
-			    !util_queue_fence_is_signalled(&iter->optimized_ready)) {
-				memset(&key->opt, 0, sizeof(key->opt));
-				mtx_unlock(&sel->mutex);
-				goto again;
+			if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
+				/* If it's an optimized shader and its compilation has
+				 * been started but isn't done, use the unoptimized
+				 * shader so as not to cause a stall due to compilation.
+				 */
+				if (iter->is_optimized) {
+					memset(&key->opt, 0, sizeof(key->opt));
+					mtx_unlock(&sel->mutex);
+					goto again;
+				}
+
+				util_queue_fence_wait(&iter->ready);
 			}
 
 			if (iter->compilation_failed) {
 				mtx_unlock(&sel->mutex);
 				return -1; /* skip the draw call */
 			}
 
 			state->current = iter;
 			mtx_unlock(&sel->mutex);
 			return 0;
 		}
 	}
 
 	/* Build a new shader. */
 	shader = CALLOC_STRUCT(si_shader);
 	if (!shader) {
 		mtx_unlock(&sel->mutex);
 		return -ENOMEM;
 	}
+
+	util_queue_fence_init(&shader->ready);
+
 	shader->selector = sel;
 	shader->key = *key;
 	shader->compiler_ctx_state = *compiler_state;
 
 	/* If this is a merged shader, get the first shader's selector. */
 	if (sscreen->b.chip_class >= GFX9) {
 		if (sel->type == PIPE_SHADER_TESS_CTRL)
 			previous_stage_sel = key->part.tcs.ls;
 		else if (sel->type == PIPE_SHADER_GEOMETRY)
 			previous_stage_sel = key->part.gs.es;
@@ -1704,49 +1724,62 @@ again:
 
 	/* Monolithic-only shaders don't make a distinction between optimized
 	 * and unoptimized. */
 	shader->is_monolithic =
 		is_pure_monolithic ||
 		memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
 
 	shader->is_optimized =
 		!is_pure_monolithic &&
 		memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
-	if (shader->is_optimized)
-		util_queue_fence_init(&shader->optimized_ready);
-
-	if (!sel->last_variant) {
-		sel->first_variant = shader;
-		sel->last_variant = shader;
-	} else {
-		sel->last_variant->next_variant = shader;
-		sel->last_variant = shader;
-	}
 
 	/* If it's an optimized shader, compile it asynchronously. */
 	if (shader->is_optimized &&
 	    !is_pure_monolithic &&
 	    thread_index < 0) {
 		/* Compile it asynchronously. */
 		util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
-				   shader, &shader->optimized_ready,
+				   shader, &shader->ready,
 				   si_build_shader_variant_low_priority, NULL);
 
+		/* Add only after the ready fence was reset, to guard against a
+		 * race with si_bind_XX_shader. */
+		if (!sel->last_variant) {
+			sel->first_variant = shader;
+			sel->last_variant = shader;
+		} else {
+			sel->last_variant->next_variant = shader;
+			sel->last_variant = shader;
+		}
+
 		/* Use the default (unoptimized) shader for now. */
 		memset(&key->opt, 0, sizeof(key->opt));
 		mtx_unlock(&sel->mutex);
 		goto again;
 	}
 
+	/* Reset the fence before adding to the variant list. */
+	util_queue_fence_reset(&shader->ready);
+
+	if (!sel->last_variant) {
+		sel->first_variant = shader;
+		sel->last_variant = shader;
+	} else {
+		sel->last_variant->next_variant = shader;
+		sel->last_variant = shader;
+	}
+
 	assert(!shader->is_optimized);
 	si_build_shader_variant(shader, thread_index, false);
 
+	util_queue_fence_signal(&shader->ready);
+
 	if (!shader->compilation_failed)
 		state->current = shader;
 
 	mtx_unlock(&sel->mutex);
 	return shader->compilation_failed ? -1 : 0;
 }
 
 static int si_shader_select(struct pipe_context *ctx,
 			    struct si_shader_ctx_state *state,
 			    struct si_compiler_ctx_state *compiler_state)
@@ -1822,20 +1855,24 @@ static void si_init_shader_selector_async(void *job, int thread_index)
 	 */
 	if (!sscreen->use_monolithic_shaders) {
 		struct si_shader *shader = CALLOC_STRUCT(si_shader);
 		void *tgsi_binary = NULL;
 
 		if (!shader) {
 			fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
 			return;
 		}
 
+		/* We can leave the fence signaled because use of the default
+		 * main part is guarded by the selector's ready fence. */
+		util_queue_fence_init(&shader->ready);
+
 		shader->selector = sel;
 		si_parse_next_shader_property(&sel->info,
 					      sel->so.num_outputs != 0,
 					      &shader->key);
 
 		if (sel->tokens)
 			tgsi_binary = si_get_tgsi_binary(sel);
 
 		/* Try to load the shader from the shader cache. */
 		mtx_lock(&sscreen->shader_cache_mutex);
@@ -2441,24 +2478,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 		     sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
 			si_mark_atom_dirty(sctx, &sctx->msaa_config);
 	}
 	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
 	if (shader->is_optimized) {
 		util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
-				    &shader->optimized_ready);
-		util_queue_fence_destroy(&shader->optimized_ready);
+				    &shader->ready);
 	}
 
+	util_queue_fence_destroy(&shader->ready);
+
 	if (shader->pm4) {
 		switch (shader->selector->type) {
 		case PIPE_SHADER_VERTEX:
 			if (shader->key.as_ls) {
 				assert(sctx->b.chip_class <= VI);
 				si_pm4_delete_state(sctx, ls, shader->pm4);
 			} else if (shader->key.as_es) {
 				assert(sctx->b.chip_class <= VI);
 				si_pm4_delete_state(sctx, es, shader->pm4);
 			} else {
-- 
2.11.0


