[Mesa-dev] [PATCH 61/61] radeonsi: tell LLVM not to remove s_barrier instructions

Marek Olšák maraeo at gmail.com
Mon Apr 24 08:45:58 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

LLVM 5.0 removes s_barrier instructions if the max-work-group-size
attribute is not set. What a surprise.
---
 src/gallium/drivers/radeonsi/si_shader.c | 45 +++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 3b00bea..086b279 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5674,21 +5674,21 @@ static const struct lp_build_tgsi_action tex_action = {
 
 static const struct lp_build_tgsi_action interp_action = {
 	.fetch_args = interp_fetch_args,
 	.emit = build_interp_intrinsic,
 };
 
 static void si_create_function(struct si_shader_context *ctx,
 			       const char *name,
 			       LLVMTypeRef *returns, unsigned num_returns,
 			       LLVMTypeRef *params, unsigned num_params,
-			       int last_sgpr)
+			       int last_sgpr, unsigned max_workgroup_size)
 {
 	int i;
 
 	si_llvm_create_func(ctx, name, returns, num_returns,
 			    params, num_params);
 	si_llvm_shader_type(ctx->main_fn, ctx->type);
 	ctx->return_value = LLVMGetUndef(ctx->return_type);
 
 	for (i = 0; i <= last_sgpr; ++i) {
 		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
@@ -5701,20 +5701,24 @@ static void si_create_function(struct si_shader_context *ctx,
 		 * SGPR spilling significantly.
 		 */
 		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
 			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
 			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
 			ac_add_attr_dereferenceable(P, UINT64_MAX);
 		} else
 			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
 	}
 
+	if (max_workgroup_size) {
+		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
+				      max_workgroup_size);
+	}
 	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 					   "no-signed-zeros-fp-math",
 					   "true");
 
 	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
 		/* These were copied from some LLVM test. */
 		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 						   "less-precise-fpmad",
 						   "true");
 		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
@@ -5782,20 +5786,36 @@ static void declare_lds_as_pointer(struct si_shader_context *ctx)
 	struct gallivm_state *gallivm = &ctx->gallivm;
 
 	unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
 	ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
 		LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
 		"lds");
 }
 
 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
 {
+	switch (shader->selector->type) {
+	case PIPE_SHADER_TESS_CTRL:
+		/* Return this so that LLVM doesn't remove s_barrier
+		 * instructions on chips where we use s_barrier. */
+		return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
+
+	case PIPE_SHADER_GEOMETRY:
+		return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
+
+	case PIPE_SHADER_COMPUTE:
+		break; /* see below */
+
+	default:
+		return 0;
+	}
+
 	const unsigned *properties = shader->selector->info.properties;
 	unsigned max_work_group_size =
 	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
 	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
 	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
 
 	if (!max_work_group_size) {
 		/* This is a variable group size compute shader,
 		 * compile it for the maximum possible group size.
 		 */
@@ -6172,39 +6192,36 @@ static void create_function(struct si_shader_context *ctx)
 		num_params = SI_PARAM_THREAD_ID + 1;
 		break;
 	default:
 		assert(0 && "unimplemented shader");
 		return;
 	}
 
 	assert(num_params <= ARRAY_SIZE(params));
 
 	si_create_function(ctx, "main", returns, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr,
+			   si_get_max_workgroup_size(shader));
 
 	/* Reserve register locations for VGPR inputs the PS prolog may need. */
 	if (ctx->type == PIPE_SHADER_FRAGMENT &&
 	    ctx->separate_prolog) {
 		si_llvm_add_attribute(ctx->main_fn,
 				      "InitialPSInputAddr",
 				      S_0286D0_PERSP_SAMPLE_ENA(1) |
 				      S_0286D0_PERSP_CENTER_ENA(1) |
 				      S_0286D0_PERSP_CENTROID_ENA(1) |
 				      S_0286D0_LINEAR_SAMPLE_ENA(1) |
 				      S_0286D0_LINEAR_CENTER_ENA(1) |
 				      S_0286D0_LINEAR_CENTROID_ENA(1) |
 				      S_0286D0_FRONT_FACE_ENA(1) |
 				      S_0286D0_POS_FIXED_PT_ENA(1));
-	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
-		si_llvm_add_attribute(ctx->main_fn,
-				      "amdgpu-max-work-group-size",
-				      si_get_max_workgroup_size(shader));
 	}
 
 	shader->info.num_input_sgprs = 0;
 	shader->info.num_input_vgprs = 0;
 
 	for (i = 0; i <= last_sgpr; ++i)
 		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
 
 	for (; i < num_params; ++i)
 		shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
@@ -7701,21 +7718,21 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 		returns[i] = ctx->i32;
 	}
 
 	for (unsigned i = 0; i < num_vgprs; ++i) {
 		params[num_sgprs + i] = ctx->i32;
 		returns[num_sgprs + i] = ctx->f32;
 	}
 
 	/* Create the function. */
 	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
-			   params, num_sgprs + num_vgprs, num_sgprs - 1);
+			   params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
 	func = ctx->main_fn;
 
 	/* Set the full EXEC mask for the prolog, because we are only fiddling
 	 * with registers here. The main shader part will set the correct EXEC
 	 * mask.
 	 */
 	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
 		si_init_exec_full_mask(ctx);
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
@@ -7861,21 +7878,23 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 		size = llvm_get_type_size(param_types[num_params]) / 4;
 		num_params++;
 
 		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
 		assert(gprs + size <= num_sgprs + num_vgprs &&
 		       (gprs >= num_sgprs || gprs + size <= num_sgprs));
 
 		gprs += size;
 	}
 
-	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
+			   last_sgpr_param,
+			   si_get_max_workgroup_size(ctx->shader));
 
 	if (is_merged_shader(ctx->shader))
 		si_init_exec_full_mask(ctx);
 
 	/* Record the arguments of the function as if they were an output of
 	 * a previous part.
 	 */
 	num_out = 0;
 	num_out_sgpr = 0;
 
@@ -8499,21 +8518,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 		params[num_params++] = ctx->i32;
 		returns[num_returns++] = ctx->f32;
 	}
 
 	/* Vertex load indices. */
 	for (i = 0; i <= key->vs_prolog.last_input; i++)
 		returns[num_returns++] = ctx->f32;
 
 	/* Create the function. */
 	si_create_function(ctx, "vs_prolog", returns, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr, 0);
 	func = ctx->main_fn;
 
 	if (key->vs_prolog.num_merged_next_stage_vgprs &&
 	    !key->vs_prolog.is_monolithic)
 		si_init_exec_from_input(ctx, 3, 0);
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
 	ret = ctx->return_value;
@@ -8643,21 +8662,22 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
 		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
 		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
 	}
 	last_sgpr = num_params - 1;
 
 	params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
 	params[num_params++] = ctx->i32; /* invocation ID within the patch */
 	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
 
 	/* Create the function. */
-	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
+	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
+			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
 	declare_lds_as_pointer(ctx);
 	func = ctx->main_fn;
 
 	si_write_tess_factors(bld_base,
 			      LLVMGetParam(func, last_sgpr + 1),
 			      LLVMGetParam(func, last_sgpr + 2),
 			      LLVMGetParam(func, last_sgpr + 3));
 
 	LLVMBuildRetVoid(gallivm->builder);
 }
@@ -8764,21 +8784,21 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
 		params[num_params++] = ctx->f32;
 
 	/* Declare outputs (same as inputs + add colors if needed) */
 	num_returns = num_params;
 	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
 	for (i = 0; i < num_color_channels; i++)
 		params[num_returns++] = ctx->f32;
 
 	/* Create the function. */
 	si_create_function(ctx, "ps_prolog", params, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr, 0);
 	func = ctx->main_fn;
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
 	ret = ctx->return_value;
 	for (i = 0; i < num_params; i++) {
 		LLVMValueRef p = LLVMGetParam(func, i);
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
@@ -9006,21 +9026,22 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
 
 	num_params = MAX2(num_params,
 			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
 
 	assert(num_params <= ARRAY_SIZE(params));
 
 	for (i = last_sgpr + 1; i < num_params; i++)
 		params[i] = ctx->f32;
 
 	/* Create the function. */
-	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
+	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
+			   last_sgpr, 0);
 	/* Disable elimination of unused inputs. */
 	si_llvm_add_attribute(ctx->main_fn,
 				  "InitialPSInputAddr", 0xffffff);
 
 	/* Process colors. */
 	unsigned vgpr = last_sgpr + 1;
 	unsigned colors_written = key->ps_epilog.colors_written;
 	int last_color_export = -1;
 
 	/* Find the last color export. */
-- 
2.7.4



More information about the mesa-dev mailing list