[Mesa-dev] [PATCH 05/11] radeonsi: use a compiler queue with a low priority for optimized shaders
Marek Olšák
maraeo at gmail.com
Thu Jun 1 18:18:23 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_pipe.c | 31 +++++++++++++++++++++----
src/gallium/drivers/radeonsi/si_pipe.h | 3 +++
src/gallium/drivers/radeonsi/si_state_shaders.c | 8 +++----
3 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 8bf6fd9..082ba99 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -734,25 +734,30 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
sscreen->gs_prologs,
sscreen->ps_prologs,
sscreen->ps_epilogs
};
unsigned i;
if (!sscreen->b.ws->unref(sscreen->b.ws))
return;
util_queue_destroy(&sscreen->shader_compiler_queue);
+ util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++)
if (sscreen->tm[i])
LLVMDisposeTargetMachine(sscreen->tm[i]);
+ for (i = 0; i < ARRAY_SIZE(sscreen->tm_low_priority); i++)
+ if (sscreen->tm_low_priority[i])
+ LLVMDisposeTargetMachine(sscreen->tm_low_priority[i]);
+
/* Free shader parts. */
for (i = 0; i < ARRAY_SIZE(parts); i++) {
while (parts[i]) {
struct si_shader_part *part = parts[i];
parts[i] = part->next;
radeon_shader_binary_clean(&part->binary);
FREE(part);
}
}
@@ -852,21 +857,21 @@ static void si_test_vmfault(struct si_screen *sscreen)
if (sscreen->b.debug_flags & DBG_TEST_VMFAULT_SHADER) {
util_test_constant_buffer(ctx, buf);
puts("VM fault test: Shader - done.");
}
exit(0);
}
struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
{
struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
- unsigned num_cpus, num_compiler_threads, i;
+ unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
if (!sscreen) {
return NULL;
}
/* Set functions first. */
sscreen->b.b.context_create = si_pipe_create_context;
sscreen->b.b.destroy = si_destroy_screen;
sscreen->b.b.get_param = si_get_param;
sscreen->b.b.get_shader_param = si_get_shader_param;
@@ -877,31 +882,47 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
if (!r600_common_screen_init(&sscreen->b, ws) ||
!si_init_gs_info(sscreen) ||
!si_init_shader_cache(sscreen)) {
FREE(sscreen);
return NULL;
}
/* Only enable as many threads as we have target machines, but at most
* the number of CPUs - 1 if there is more than one.
*/
- num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
- num_cpus = MAX2(1, num_cpus - 1);
- num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm));
+ num_threads = sysconf(_SC_NPROCESSORS_ONLN);
+ num_threads = MAX2(1, num_threads - 1);
+ num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->tm));
+ num_compiler_threads_lowprio =
+ MIN2(num_threads, ARRAY_SIZE(sscreen->tm_low_priority));
if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
32, num_compiler_threads, 0)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen);
return NULL;
}
+ /* The queue must be large enough so that adding optimized shaders
+ * doesn't stall draw calls when the queue is full. Especially varying
+ * packing generates a very high volume of optimized shader compilation
+ * jobs.
+ */
+ if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
+ "si_shader_low",
+ 1024, num_compiler_threads,
+ UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+ si_destroy_shader_cache(sscreen);
+ FREE(sscreen);
+ return NULL;
+ }
+
si_handle_env_var_force_family(sscreen);
if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
si_init_perfcounters(sscreen);
/* Hawaii has a bug with offchip buffers > 256 that can be worked
* around by setting 4K granularity.
*/
sscreen->tess_offchip_block_dw_size =
sscreen->b.family == CHIP_HAWAII ? 4096 : 8192;
@@ -951,20 +972,22 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
sscreen->b.barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
SI_CONTEXT_INV_VMEM_L1 |
SI_CONTEXT_INV_GLOBAL_L2;
sscreen->b.barrier_flags.compute_to_L2 = SI_CONTEXT_CS_PARTIAL_FLUSH;
if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS;
for (i = 0; i < num_compiler_threads; i++)
sscreen->tm[i] = si_create_llvm_target_machine(sscreen);
+ for (i = 0; i < num_compiler_threads_lowprio; i++)
+ sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen);
/* Create the auxiliary context. This must be done last. */
sscreen->b.aux_context = si_create_context(&sscreen->b.b, 0);
if (sscreen->b.debug_flags & DBG_TEST_DMA)
r600_test_dma(&sscreen->b);
if (sscreen->b.debug_flags & (DBG_TEST_VMFAULT_CP |
DBG_TEST_VMFAULT_SDMA |
DBG_TEST_VMFAULT_SHADER))
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 13ec072..e917cb1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -103,20 +103,23 @@ struct si_screen {
* variants of VS and TES are cached, so LS and ES aren't.
* - GS and CS aren't cached, but it's certainly possible to cache
* those as well.
*/
mtx_t shader_cache_mutex;
struct hash_table *shader_cache;
/* Shader compiler queue for multithreaded compilation. */
struct util_queue shader_compiler_queue;
LLVMTargetMachineRef tm[4]; /* used by the queue only */
+
+ struct util_queue shader_compiler_queue_low_priority;
+ LLVMTargetMachineRef tm_low_priority[4];
};
struct si_blend_color {
struct r600_atom atom;
struct pipe_blend_color state;
};
struct si_sampler_view {
struct pipe_sampler_view base;
/* [0..7] = image descriptor
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 62bb221..5a22add 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1443,22 +1443,22 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
static void si_build_shader_variant(void *job, int thread_index)
{
struct si_shader *shader = (struct si_shader *)job;
struct si_shader_selector *sel = shader->selector;
struct si_screen *sscreen = sel->screen;
LLVMTargetMachineRef tm;
struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
int r;
if (thread_index >= 0) {
- assert(thread_index < ARRAY_SIZE(sscreen->tm));
- tm = sscreen->tm[thread_index];
+ assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority));
+ tm = sscreen->tm_low_priority[thread_index];
if (!debug->async)
debug = NULL;
} else {
tm = shader->compiler_ctx_state.tm;
}
r = si_shader_create(sscreen, tm, shader, debug);
if (unlikely(r)) {
R600_ERR("Failed to build shader variant (type=%u) %d\n",
sel->type, r);
@@ -1672,21 +1672,21 @@ again:
} else {
sel->last_variant->next_variant = shader;
sel->last_variant = shader;
}
/* If it's an optimized shader, compile it asynchronously. */
if (shader->is_optimized &&
!is_pure_monolithic &&
thread_index < 0) {
/* Compile it asynchronously. */
- util_queue_add_job(&sscreen->shader_compiler_queue,
+ util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
shader, &shader->optimized_ready,
si_build_shader_variant, NULL);
/* Use the default (unoptimized) shader for now. */
memset(&key->opt, 0, sizeof(key->opt));
mtx_unlock(&sel->mutex);
goto again;
}
assert(!shader->is_optimized);
@@ -2251,21 +2251,21 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
sctx->do_update_shaders = true;
if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
si_update_tess_uses_prim_id(sctx);
si_mark_atom_dirty(sctx, &sctx->cb_render_state);
si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
{
if (shader->is_optimized) {
- util_queue_drop_job(&sctx->screen->shader_compiler_queue,
+ util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
&shader->optimized_ready);
util_queue_fence_destroy(&shader->optimized_ready);
}
if (shader->pm4) {
switch (shader->selector->type) {
case PIPE_SHADER_VERTEX:
if (shader->key.as_ls) {
assert(sctx->b.chip_class <= VI);
si_pm4_delete_state(sctx, ls, shader->pm4);
--
2.7.4
More information about the mesa-dev mailing list