Mesa (master): radeonsi: use a compiler queue with a low priority for optimized shaders

Wed Jun 7 16:43:55 UTC 2017

Module: Mesa
Branch: master
Commit: 86cc8097266c2bd9d8a6ccc3d7f61391f13119be
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=86cc8097266c2bd9d8a6ccc3d7f61391f13119be

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed May 31 13:18:53 2017 +0200

radeonsi: use a compiler queue with a low priority for optimized shaders

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

---

 src/gallium/drivers/radeonsi/si_pipe.c          | 31 +++++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_pipe.h          |  3 +++
 src/gallium/drivers/radeonsi/si_state_shaders.c |  8 +++----
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 47426b41da..805392d713 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -742,11 +742,16 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
 		return;
 
 	util_queue_destroy(&sscreen->shader_compiler_queue);
+	util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
 
 	for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++)
 		if (sscreen->tm[i])
 			LLVMDisposeTargetMachine(sscreen->tm[i]);
 
+	for (i = 0; i < ARRAY_SIZE(sscreen->tm_low_priority); i++)
+		if (sscreen->tm_low_priority[i])
+			LLVMDisposeTargetMachine(sscreen->tm_low_priority[i]);
+
 	/* Free shader parts. */
 	for (i = 0; i < ARRAY_SIZE(parts); i++) {
 		while (parts[i]) {
@@ -860,7 +865,7 @@ static void si_test_vmfault(struct si_screen *sscreen)
 struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 {
 	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
-	unsigned num_cpus, num_compiler_threads, i;
+	unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
 
 	if (!sscreen) {
 		return NULL;
@@ -885,9 +890,11 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 	/* Only enable as many threads as we have target machines, but at most
 	 * the number of CPUs - 1 if there is more than one.
 	 */
-	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	num_cpus = MAX2(1, num_cpus - 1);
-	num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm));
+	num_threads = sysconf(_SC_NPROCESSORS_ONLN);
+	num_threads = MAX2(1, num_threads - 1);
+	num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->tm));
+	num_compiler_threads_lowprio =
+		MIN2(num_threads, ARRAY_SIZE(sscreen->tm_low_priority));
 
 	if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
 			     32, num_compiler_threads, 0)) {
@@ -896,6 +903,20 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 		return NULL;
 	}
 
+	/* The queue must be large enough that it rarely fills up, because
+	 * adding optimized shaders to a full queue would stall draw calls.
+	 * Varying packing in particular generates a very high volume of
+	 * optimized shader compilation jobs.
+	 */
+	if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
+			     "si_shader_low",
+			     1024, num_compiler_threads,
+			     UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+	       si_destroy_shader_cache(sscreen);
+	       FREE(sscreen);
+	       return NULL;
+	}
+
 	si_handle_env_var_force_family(sscreen);
 
 	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
@@ -959,6 +980,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 
 	for (i = 0; i < num_compiler_threads; i++)
 		sscreen->tm[i] = si_create_llvm_target_machine(sscreen);
+	for (i = 0; i < num_compiler_threads_lowprio; i++)
+		sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen);
 
 	/* Create the auxiliary context. This must be done last. */
 	sscreen->b.aux_context = si_create_context(&sscreen->b.b, 0);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 13ec0729b1..e917cb1b78 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -110,6 +110,9 @@ struct si_screen {
 	/* Shader compiler queue for multithreaded compilation. */
 	struct util_queue		shader_compiler_queue;
 	LLVMTargetMachineRef		tm[4]; /* used by the queue only */
+
+	struct util_queue		shader_compiler_queue_low_priority;
+	LLVMTargetMachineRef		tm_low_priority[4];
 };
 
 struct si_blend_color {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 62bb221211..5a22add0ab 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1450,8 +1450,8 @@ static void si_build_shader_variant(void *job, int thread_index)
 	int r;
 
 	if (thread_index >= 0) {
-		assert(thread_index < ARRAY_SIZE(sscreen->tm));
-		tm = sscreen->tm[thread_index];
+		assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority));
+		tm = sscreen->tm_low_priority[thread_index];
 		if (!debug->async)
 			debug = NULL;
 	} else {
@@ -1679,7 +1679,7 @@ again:
 	    !is_pure_monolithic &&
 	    thread_index < 0) {
 		/* Compile it asynchronously. */
-		util_queue_add_job(&sscreen->shader_compiler_queue,
+		util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
 				   shader, &shader->optimized_ready,
 				   si_build_shader_variant, NULL);
 
@@ -2258,7 +2258,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
 	if (shader->is_optimized) {
-		util_queue_drop_job(&sctx->screen->shader_compiler_queue,
+		util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
 				    &shader->optimized_ready);
 		util_queue_fence_destroy(&shader->optimized_ready);
 	}