Mesa (master): ac,radeonsi: reduce optimizations for complex compute shaders on older APUs (v2)

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Aug 1 19:25:33 UTC 2018


Module: Mesa
Branch: master
Commit: cb6b241c301d5352a5bcaab52bbfaf89e700b2b2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=cb6b241c301d5352a5bcaab52bbfaf89e700b2b2

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Thu Jul 19 22:55:49 2018 -0400

ac,radeonsi: reduce optimizations for complex compute shaders on older APUs (v2)

This makes dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.23
finish sooner on older CPUs. (Otherwise it gets killed and we fail
the test.)

Acked-by: Dave Airlie <airlied at gmail.com>

---

 src/amd/common/ac_llvm_util.c                      | 18 +++++++++++---
 src/amd/common/ac_llvm_util.h                      | 11 +++++++-
 src/gallium/drivers/radeonsi/si_pipe.c             | 12 ++++++++-
 src/gallium/drivers/radeonsi/si_shader.c           | 29 ++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_shader_internal.h  |  3 ++-
 .../drivers/radeonsi/si_shader_tgsi_setup.c        |  8 ++++--
 6 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 678bc34e6f..10e1ca99d4 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -142,6 +142,7 @@ const char *ac_get_llvm_processor_name(enum radeon_family family)
 
 static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
 						     enum ac_target_machine_options tm_options,
+						     LLVMCodeGenOptLevel level,
 						     const char **out_triple)
 {
 	assert(family >= CHIP_TAHITI);
@@ -163,7 +164,7 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
 	                             triple,
 	                             ac_get_llvm_processor_name(family),
 				     features,
-	                             LLVMCodeGenLevelDefault,
+	                             level,
 	                             LLVMRelocDefault,
 	                             LLVMCodeModelDefault);
 
@@ -308,11 +309,20 @@ ac_init_llvm_compiler(struct ac_llvm_compiler *compiler,
 	const char *triple;
 	memset(compiler, 0, sizeof(*compiler));
 
-	compiler->tm = ac_create_target_machine(family,
-					    tm_options, &triple);
+	compiler->tm = ac_create_target_machine(family, tm_options,
+						LLVMCodeGenLevelDefault,
+						&triple);
 	if (!compiler->tm)
 		return false;
 
+	if (tm_options & AC_TM_CREATE_LOW_OPT) {
+		compiler->low_opt_tm =
+			ac_create_target_machine(family, tm_options,
+						 LLVMCodeGenLevelLess, NULL);
+		if (!compiler->low_opt_tm)
+			goto fail;
+	}
+
 	if (okay_to_leak_target_library_info || (HAVE_LLVM >= 0x0700)) {
 		compiler->target_library_info =
 			ac_create_target_library_info(triple);
@@ -341,6 +351,8 @@ ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler)
 	if (compiler->target_library_info)
 		ac_dispose_target_library_info(compiler->target_library_info);
 #endif
+	if (compiler->low_opt_tm)
+		LLVMDisposeTargetMachine(compiler->low_opt_tm);
 	if (compiler->tm)
 		LLVMDisposeTargetMachine(compiler->tm);
 }
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index d4dea4dfde..eaf5f21876 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -64,6 +64,7 @@ enum ac_target_machine_options {
 	AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4),
 	AC_TM_CHECK_IR = (1 << 5),
 	AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6),
+	AC_TM_CREATE_LOW_OPT = (1 << 7),
 };
 
 enum ac_float_mode {
@@ -74,10 +75,18 @@ enum ac_float_mode {
 
 /* Per-thread persistent LLVM objects. */
 struct ac_llvm_compiler {
-	LLVMTargetMachineRef		tm;
 	LLVMTargetLibraryInfoRef	target_library_info;
 	LLVMPassManagerRef		passmgr;
+
+	/* Default compiler. */
+	LLVMTargetMachineRef		tm;
 	struct ac_compiler_passes	*passes;
+
+	/* Optional compiler for faster compilation with fewer optimizations.
+	 * LLVM modules can be created with "tm" too. There is no difference.
+	 */
+	LLVMTargetMachineRef		low_opt_tm; /* uses -O1 instead of -O2 */
+	struct ac_compiler_passes	*low_opt_passes;
 };
 
 const char *ac_get_llvm_processor_name(enum radeon_family family);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 9e3a579d74..cc05d2f8de 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -108,22 +108,32 @@ static const struct debug_named_value debug_options[] = {
 static void si_init_compiler(struct si_screen *sscreen,
 			     struct ac_llvm_compiler *compiler)
 {
+	/* Only create the less-optimizing version of the compiler on APUs
+	 * predating Ryzen (Raven). */
+	bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
+				       sscreen->info.chip_class <= VI;
+
 	enum ac_target_machine_options tm_options =
 		(sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) |
 		(sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
 		(sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
 		(sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
 		(!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
-		(sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0);
+		(sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
+		(create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
 
 	ac_init_llvm_once();
 	ac_init_llvm_compiler(compiler, true, sscreen->info.family, tm_options);
 	compiler->passes = ac_create_llvm_passes(compiler->tm);
+
+	if (compiler->low_opt_tm)
+		compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
 }
 
 static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
 {
 	ac_destroy_llvm_passes(compiler->passes);
+	ac_destroy_llvm_passes(compiler->low_opt_passes);
 	ac_destroy_llvm_compiler(compiler);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 43ba23ff49..405833d3ba 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5645,7 +5645,8 @@ static int si_compile_llvm(struct si_screen *sscreen,
 			   LLVMModuleRef mod,
 			   struct pipe_debug_callback *debug,
 			   unsigned processor,
-			   const char *name)
+			   const char *name,
+			   bool less_optimized)
 {
 	int r = 0;
 	unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
@@ -5667,7 +5668,8 @@ static int si_compile_llvm(struct si_screen *sscreen,
 	}
 
 	if (!si_replace_shader(count, binary)) {
-		r = si_llvm_compile(mod, binary, compiler, debug);
+		r = si_llvm_compile(mod, binary, compiler, debug,
+				    less_optimized);
 		if (r)
 			return r;
 	}
@@ -5884,7 +5886,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 			    &ctx.shader->config, ctx.compiler,
 			    ctx.ac.module,
 			    debug, PIPE_SHADER_GEOMETRY,
-			    "GS Copy Shader");
+			    "GS Copy Shader", false);
 	if (!r) {
 		if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
 			fprintf(stderr, "GS Copy Shader:\n");
@@ -6790,6 +6792,22 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 	LLVMBuildRetVoid(builder);
 }
 
+static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
+				    struct si_shader_selector *sel)
+{
+	if (!compiler->low_opt_passes)
+		return false;
+
+	/* Assume a slow CPU. */
+	assert(!sel->screen->info.has_dedicated_vram &&
+	       sel->screen->info.chip_class <= VI);
+
+	/* For a crazy dEQP test containing 2597 memory opcodes, mostly
+	 * buffer stores. */
+	return sel->type == PIPE_SHADER_COMPUTE &&
+	       sel->info.num_memory_instructions > 1000;
+}
+
 int si_compile_tgsi_shader(struct si_screen *sscreen,
 			   struct ac_llvm_compiler *compiler,
 			   struct si_shader *shader,
@@ -7022,7 +7040,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 
 	/* Compile to bytecode. */
 	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
-			    ctx.ac.module, debug, ctx.type, "TGSI shader");
+			    ctx.ac.module, debug, ctx.type, "TGSI shader",
+			    si_should_optimize_less(compiler, shader->selector));
 	si_llvm_dispose(&ctx);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
@@ -7189,7 +7208,7 @@ si_get_shader_part(struct si_screen *sscreen,
 	si_llvm_optimize_module(&ctx);
 
 	if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
-			    ctx.ac.module, debug, ctx.type, name)) {
+			    ctx.ac.module, debug, ctx.type, name, false)) {
 		FREE(result);
 		result = NULL;
 		goto out;
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 21e325c2d8..36351391d9 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -217,7 +217,8 @@ si_shader_context_from_abi(struct ac_shader_abi *abi)
 
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
 			 struct ac_llvm_compiler *compiler,
-			 struct pipe_debug_callback *debug);
+			 struct pipe_debug_callback *debug,
+			 bool less_optimized);
 
 LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base,
 			  enum tgsi_opcode_type type);
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
index b486be2574..b9ed0fc3ab 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
@@ -82,8 +82,12 @@ static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
  */
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
 			 struct ac_llvm_compiler *compiler,
-			 struct pipe_debug_callback *debug)
+			 struct pipe_debug_callback *debug,
+			 bool less_optimized)
 {
+	struct ac_compiler_passes *passes =
+		less_optimized && compiler->low_opt_passes ?
+			compiler->low_opt_passes : compiler->passes;
 	struct si_llvm_diagnostics diag;
 	LLVMContextRef llvm_ctx;
 
@@ -96,7 +100,7 @@ unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
 	LLVMContextSetDiagnosticHandler(llvm_ctx, si_diagnostic_handler, &diag);
 
 	/* Compile IR. */
-	if (!ac_compile_module_to_binary(compiler->passes, M, binary))
+	if (!ac_compile_module_to_binary(passes, M, binary))
 		diag.retval = 1;
 
 	if (diag.retval != 0)




More information about the mesa-commit mailing list