[Mesa-dev] [PATCH 9/9] radeonsi: increase the number of compiler threads depending on the CPU

Tue Apr 17 00:52:20 UTC 2018

From: Marek Olšák <marek.olsak at amd.com>

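The shader compiler queue was limited to 3 threads, so a shader-db run
on a 16-thread CPU was bottlenecked by that 3-thread queue. Scale the
number of compiler threads with the number of online CPU threads instead,
capped at 24 normal-priority and 10 low-priority threads.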
---
 src/gallium/drivers/radeonsi/si_pipe.c | 39 +++++++++++++++++---------
 src/gallium/drivers/radeonsi/si_pipe.h |  6 ++--
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f1f1e3ad890..d044b191b71 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -848,21 +848,21 @@ static void si_disk_cache_create(struct si_screen *sscreen)
 						  shader_debug_flags);
 			free(timestamp_str);
 		}
 	}
 }
 
 struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 					   const struct pipe_screen_config *config)
 {
 	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
-	unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
+	unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads, i;
 
 	if (!sscreen) {
 		return NULL;
 	}
 
 	sscreen->ws = ws;
 	ws->query_info(ws, &sscreen->info);
 
 	sscreen->debug_flags = debug_get_flags_option("R600_DEBUG",
 							debug_options, 0);
@@ -905,40 +905,53 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 	(void) mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
 
 	if (!si_init_gs_info(sscreen) ||
 	    !si_init_shader_cache(sscreen)) {
 		FREE(sscreen);
 		return NULL;
 	}
 
 	si_disk_cache_create(sscreen);
 
-	/* Only enable as many threads as we have target machines, but at most
-	 * the number of CPUs - 1 if there is more than one.
-	 */
-	num_threads = sysconf(_SC_NPROCESSORS_ONLN);
-	num_threads = MAX2(1, num_threads - 1);
-	num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->compiler));
-	num_compiler_threads_lowprio =
-		MIN2(num_threads, ARRAY_SIZE(sscreen->compiler_lowp));
+	/* Determine the number of shader compiler threads. */
+	hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
+
+	if (hw_threads >= 12) {
+		num_comp_hi_threads = hw_threads * 3 / 4;
+		num_comp_lo_threads = hw_threads / 3;
+	} else if (hw_threads >= 6) {
+		num_comp_hi_threads = hw_threads - 2;
+		num_comp_lo_threads = hw_threads / 2;
+	} else if (hw_threads >= 2) {
+		num_comp_hi_threads = hw_threads - 1;
+		num_comp_lo_threads = hw_threads / 2;
+	} else {
+		num_comp_hi_threads = 1;
+		num_comp_lo_threads = 1;
+	}
+
+	num_comp_hi_threads = MIN2(num_comp_hi_threads,
+				   ARRAY_SIZE(sscreen->compiler));
+	num_comp_lo_threads = MIN2(num_comp_lo_threads,
+				   ARRAY_SIZE(sscreen->compiler_lowp));
 
 	if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
-			     32, num_compiler_threads,
+			     64, num_comp_hi_threads,
 			     UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
 		si_destroy_shader_cache(sscreen);
 		FREE(sscreen);
 		return NULL;
 	}
 
 	if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
 			     "si_shader_low",
-			     32, num_compiler_threads_lowprio,
+			     64, num_comp_lo_threads,
 			     UTIL_QUEUE_INIT_RESIZE_IF_FULL |
 			     UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
 	       si_destroy_shader_cache(sscreen);
 	       FREE(sscreen);
 	       return NULL;
 	}
 
 	si_handle_env_var_force_family(sscreen);
 
 	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
@@ -1075,23 +1088,23 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 	sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
 					    SI_CONTEXT_INV_VMEM_L1;
 	if (sscreen->info.chip_class <= VI) {
 		sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
 		sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 	}
 
 	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
 		sscreen->debug_flags |= DBG_ALL_SHADERS;
 
-	for (i = 0; i < num_compiler_threads; i++)
+	for (i = 0; i < num_comp_hi_threads; i++)
 		si_init_compiler(sscreen, &sscreen->compiler[i]);
-	for (i = 0; i < num_compiler_threads_lowprio; i++)
+	for (i = 0; i < num_comp_lo_threads; i++)
 		si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);
 
 	/* Create the auxiliary context. This must be done last. */
 	sscreen->aux_context = si_create_context(&sscreen->b, 0);
 
 	if (sscreen->debug_flags & DBG(TEST_DMA))
 		si_test_dma(sscreen);
 
 	if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) |
 				      DBG(TEST_VMFAULT_SDMA) |
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a67786c84d9..27efc5099f0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -522,27 +522,27 @@ struct si_screen {
 	 * - GS and CS aren't cached, but it's certainly possible to cache
 	 *   those as well.
 	 */
 	mtx_t			shader_cache_mutex;
 	struct hash_table		*shader_cache;
 
 	/* Shader compiler queue for multithreaded compilation. */
 	struct util_queue		shader_compiler_queue;
 	/* Use at most 3 normal compiler threads on quadcore and better.
 	 * Hyperthreaded CPUs report the number of threads, but we want
-	 * the number of cores. */
-	struct si_compiler		compiler[3]; /* used by the queue only */
+	 * the number of cores. We only need this many threads for shader-db. */
+	struct si_compiler		compiler[24]; /* used by the queue only */
 
 	struct util_queue		shader_compiler_queue_low_priority;
 	/* Use at most 2 low priority threads on quadcore and better.
 	 * We want to minimize the impact on multithreaded Mesa. */
-	struct si_compiler		compiler_lowp[2]; /* at most 2 threads */
+	struct si_compiler		compiler_lowp[10];
 };
 
 struct si_blend_color {
 	struct pipe_blend_color		state;
 	bool				any_nonzeros;
 };
 
 struct si_sampler_view {
 	struct pipe_sampler_view	base;
         /* [0..7] = image descriptor
-- 
2.17.0