[Mesa-dev] [PATCH 4/4] radeonsi: do compilation from si_create_shader_selector asynchronously

Sat Jul 2 08:56:31 UTC 2016

Apart from the comment on patch 3 (which may just be me being confused 
about the e-mailed diffs), this series is

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

On 29.06.2016 18:32, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> Main shader parts and geometry shaders are compiled asynchronously
> by util_queue. si_create_shader_selector returns immediately without waiting.
> si_draw_vbo(si_shader_select) waits for completion.
>
> This has the best effect when shaders are compiled at app-loading time.
> It doesn't help much for shaders compiled on demand, even though
> VS+PS compilation should take as much time as the bigger one of the two.
>
> If an app creates more shaders, at most 4 threads will be used to compile
> them.
>
> Debug output disables this so that shader stats are printed in the correct
> order.
>
> (We could go even further and build variants asynchronously too, then emit
> draw calls without waiting and emit incomplete shader states, then force IB
> chaining to give the compiler more time, then sync the compilation at the IB
> flush and patch the IB with correct shader states. This is great for
> compilation before draw calls, but there are some difficulties such as
> scratch and tess states requiring the compiler output, and an on-disk shader
> cache will likely be a much better and simpler solution.)
> ---
>   src/gallium/drivers/radeonsi/si_pipe.c          | 18 ++++++++++++
>   src/gallium/drivers/radeonsi/si_pipe.h          |  7 ++++-
>   src/gallium/drivers/radeonsi/si_shader.h        |  1 +
>   src/gallium/drivers/radeonsi/si_state_shaders.c | 39 +++++++++++++++++++++----
>   4 files changed, 58 insertions(+), 7 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 2e8d846..0abf01b 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -639,6 +639,13 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
>   	if (!sscreen->b.ws->unref(sscreen->b.ws))
>   		return;
>
> +	if (util_queue_is_initialized(&sscreen->shader_compiler_queue))
> +		util_queue_destroy(&sscreen->shader_compiler_queue);
> +
> +	for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++)
> +		if (sscreen->tm[i])
> +			LLVMDisposeTargetMachine(sscreen->tm[i]);
> +
>   	/* Free shader parts. */
>   	for (i = 0; i < ARRAY_SIZE(parts); i++) {
>   		while (parts[i]) {
> @@ -686,6 +693,7 @@ static bool si_init_gs_info(struct si_screen *sscreen)
>   struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
>   {
>   	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
> +	unsigned num_cpus, num_compiler_threads, i;
>
>   	if (!sscreen) {
>   		return NULL;
> @@ -730,6 +738,16 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
>   	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
>   		sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS;
>
> +	/* Only enable as many threads as we have target machines and CPUs. */
> +	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
> +	num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm));
> +
> +	for (i = 0; i < num_compiler_threads; i++)
> +		sscreen->tm[i] = si_create_llvm_target_machine(sscreen);
> +
> +	util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
> +                        32, num_compiler_threads);
> +
>   	/* Create the auxiliary context. This must be done last. */
>   	sscreen->b.aux_context = sscreen->b.b.context_create(&sscreen->b.b, NULL, 0);
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 3aff0ac..542d6a8 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -27,6 +27,7 @@
>   #define SI_PIPE_H
>
>   #include "si_state.h"
> +#include "util/u_queue.h"
>
>   #include <llvm-c/TargetMachine.h>
>
> @@ -109,6 +110,10 @@ struct si_screen {
>   	 */
>   	pipe_mutex			shader_cache_mutex;
>   	struct hash_table		*shader_cache;
> +
> +	/* Shader compiler queue for multithreaded compilation. */
> +	struct util_queue		shader_compiler_queue;
> +	LLVMTargetMachineRef		tm[4]; /* used by the queue only */
>   };
>
>   struct si_blend_color {
> @@ -206,7 +211,7 @@ struct si_context {
>
>   	struct pipe_fence_handle	*last_gfx_fence;
>   	struct si_shader_ctx_state	fixed_func_tcs_shader;
> -	LLVMTargetMachineRef		tm;
> +	LLVMTargetMachineRef		tm; /* only non-threaded compilation */
>   	bool				gfx_flush_in_progress;
>
>   	/* Atoms (direct states). */
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 41c6091..0570907 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -234,6 +234,7 @@ struct si_shader;
>    */
>   struct si_shader_selector {
>   	struct si_screen	*screen;
> +	struct util_queue_fence ready;
>
>   	/* Should only be used by si_init_shader_selector_async
>   	 * if thread_index == -1 (non-threaded). */
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index e433055..2de5707 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -965,7 +965,8 @@ static int si_shader_select_with_key(struct si_screen *sscreen,
>   				     struct si_shader_ctx_state *state,
>   				     union si_shader_key *key,
>   				     LLVMTargetMachineRef tm,
> -				     struct pipe_debug_callback *debug)
> +				     struct pipe_debug_callback *debug,
> +				     bool wait)
>   {
>   	struct si_shader_selector *sel = state->cso;
>   	struct si_shader *current = state->current;
> @@ -979,6 +980,13 @@ static int si_shader_select_with_key(struct si_screen *sscreen,
>   	if (likely(current && memcmp(&current->key, key, sizeof(*key)) == 0))
>   		return 0;
>
> +	/* This must be done before the mutex is locked, because async GS
> +	 * compilation calls this function too, and therefore must enter
> +	 * the mutex first.
> +	 */
> +	if (wait)
> +		util_queue_job_wait(&sel->ready);
> +
>   	pipe_mutex_lock(sel->mutex);
>
>   	/* Find the shader variant. */
> @@ -1031,7 +1039,7 @@ static int si_shader_select(struct pipe_context *ctx,
>
>   	si_shader_selector_key(ctx, state->cso, &key);
>   	return si_shader_select_with_key(sctx->screen, state, &key,
> -					 sctx->tm, &sctx->b.debug);
> +					 sctx->tm, &sctx->b.debug, true);
>   }
>
>   static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
> @@ -1068,10 +1076,19 @@ void si_init_shader_selector_async(void *job, int thread_index)
>   {
>   	struct si_shader_selector *sel = (struct si_shader_selector *)job;
>   	struct si_screen *sscreen = sel->screen;
> -	LLVMTargetMachineRef tm = sel->tm;
> -	struct pipe_debug_callback *debug = &sel->debug;
> +	LLVMTargetMachineRef tm;
> +	struct pipe_debug_callback *debug;
>   	unsigned i;
>
> +	if (thread_index >= 0) {
> +		assert(thread_index < ARRAY_SIZE(sscreen->tm));
> +		tm = sscreen->tm[thread_index];
> +		debug = NULL;
> +	} else {
> +		tm = sel->tm;
> +		debug = &sel->debug;
> +	}
> +
>   	/* Compile the main shader part for use with a prolog and/or epilog.
>   	 * If this fails, the driver will try to compile a monolithic shader
>   	 * on demand.
> @@ -1147,7 +1164,8 @@ void si_init_shader_selector_async(void *job, int thread_index)
>   			break;
>   		}
>
> -		if (si_shader_select_with_key(sscreen, &state, &key, tm, debug))
> +		if (si_shader_select_with_key(sscreen, &state, &key, tm, debug,
> +					      false))
>   			fprintf(stderr, "radeonsi: can't create a monolithic shader\n");
>   	}
>   }
> @@ -1279,8 +1297,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
>   		sel->db_shader_control |= S_02880C_EXEC_ON_HIER_FAIL(1) |
>   					  S_02880C_EXEC_ON_NOOP(1);
>   	pipe_mutex_init(sel->mutex);
> +	util_queue_fence_init(&sel->ready);
>
> -	si_init_shader_selector_async(sel, -1);
> +	if (sctx->b.debug.debug_message ||
> +	    !util_queue_is_initialized(&sscreen->shader_compiler_queue))
> +		si_init_shader_selector_async(sel, -1);
> +	else
> +		util_queue_add_job(&sscreen->shader_compiler_queue, sel,
> +                                   &sel->ready, si_init_shader_selector_async);
>
>   	return sel;
>   }
> @@ -1417,6 +1441,8 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
>   		[PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
>   	};
>
> +	util_queue_job_wait(&sel->ready);
> +
>   	if (current_shader[sel->type]->cso == sel) {
>   		current_shader[sel->type]->cso = NULL;
>   		current_shader[sel->type]->current = NULL;
> @@ -1431,6 +1457,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
>   	if (sel->main_shader_part)
>   		si_delete_shader(sctx, sel->main_shader_part);
>
> +	util_queue_fence_destroy(&sel->ready);
>   	pipe_mutex_destroy(sel->mutex);
>   	free(sel->tokens);
>   	free(sel);
>