From 86cc8097266c2bd9d8a6ccc3d7f61391f13119be Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 31 May 2017 13:18:53 +0200 Subject: [PATCH] radeonsi: use a compiler queue with a low priority for optimized shaders MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_pipe.c | 31 +++++++++++++++++++++---- src/gallium/drivers/radeonsi/si_pipe.h | 3 +++ src/gallium/drivers/radeonsi/si_state_shaders.c | 8 +++---- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 47426b4..805392d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -742,11 +742,16 @@ static void si_destroy_screen(struct pipe_screen* pscreen) return; util_queue_destroy(&sscreen->shader_compiler_queue); + util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++) if (sscreen->tm[i]) LLVMDisposeTargetMachine(sscreen->tm[i]); + for (i = 0; i < ARRAY_SIZE(sscreen->tm_low_priority); i++) + if (sscreen->tm_low_priority[i]) + LLVMDisposeTargetMachine(sscreen->tm_low_priority[i]); + /* Free shader parts. */ for (i = 0; i < ARRAY_SIZE(parts); i++) { while (parts[i]) { @@ -860,7 +865,7 @@ static void si_test_vmfault(struct si_screen *sscreen) struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) { struct si_screen *sscreen = CALLOC_STRUCT(si_screen); - unsigned num_cpus, num_compiler_threads, i; + unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i; if (!sscreen) { return NULL; @@ -885,9 +890,11 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) /* Only enable as many threads as we have target machines, but at most * the number of CPUs - 1 if there is more than one. */ - num_cpus = sysconf(_SC_NPROCESSORS_ONLN); - num_cpus = MAX2(1, num_cpus - 1); - num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm)); + num_threads = sysconf(_SC_NPROCESSORS_ONLN); + num_threads = MAX2(1, num_threads - 1); + num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->tm)); + num_compiler_threads_lowprio = + MIN2(num_threads, ARRAY_SIZE(sscreen->tm_low_priority)); if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader", 32, num_compiler_threads, 0)) { @@ -896,6 +903,20 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) return NULL; } + /* The queue must be large enough so that adding optimized shaders + * doesn't stall draw calls when the queue is full. Especially varying + * packing generates a very high volume of optimized shader compilation + * jobs. + */ + if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, + "si_shader_low", + 1024, num_compiler_threads, + UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { + si_destroy_shader_cache(sscreen); + FREE(sscreen); + return NULL; + } + si_handle_env_var_force_family(sscreen); if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) @@ -959,6 +980,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) for (i = 0; i < num_compiler_threads; i++) sscreen->tm[i] = si_create_llvm_target_machine(sscreen); + for (i = 0; i < num_compiler_threads_lowprio; i++) + sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen); /* Create the auxiliary context. This must be done last. */ sscreen->b.aux_context = si_create_context(&sscreen->b.b, 0); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 13ec072..e917cb1 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -110,6 +110,9 @@ struct si_screen { /* Shader compiler queue for multithreaded compilation. */ struct util_queue shader_compiler_queue; LLVMTargetMachineRef tm[4]; /* used by the queue only */ + + struct util_queue shader_compiler_queue_low_priority; + LLVMTargetMachineRef tm_low_priority[4]; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 62bb221..5a22add 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1450,8 +1450,8 @@ static void si_build_shader_variant(void *job, int thread_index) int r; if (thread_index >= 0) { - assert(thread_index < ARRAY_SIZE(sscreen->tm)); - tm = sscreen->tm[thread_index]; + assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority)); + tm = sscreen->tm_low_priority[thread_index]; if (!debug->async) debug = NULL; } else { @@ -1679,7 +1679,7 @@ again: !is_pure_monolithic && thread_index < 0) { /* Compile it asynchronously. */ - util_queue_add_job(&sscreen->shader_compiler_queue, + util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->optimized_ready, si_build_shader_variant, NULL); @@ -2258,7 +2258,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) { if (shader->is_optimized) { - util_queue_drop_job(&sctx->screen->shader_compiler_queue, + util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->optimized_ready); util_queue_fence_destroy(&shader->optimized_ready); } -- 2.7.4