[Mesa-dev] [PATCH 61/61] radeonsi: tell LLVM not to remove s_barrier instructions

Fri Apr 28 12:06:51 UTC 2017

On 24.04.2017 10:45, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> LLVM 5.0 removes s_barrier instructions if the max-work-group-size
> attribute is not set. What a surprise.

One minor comment on patch 56, apart from that patches 54-61:

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> ---
>  src/gallium/drivers/radeonsi/si_shader.c | 45 +++++++++++++++++++++++---------
>  1 file changed, 33 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 3b00bea..086b279 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5674,21 +5674,21 @@ static const struct lp_build_tgsi_action tex_action = {
>
>  static const struct lp_build_tgsi_action interp_action = {
>  	.fetch_args = interp_fetch_args,
>  	.emit = build_interp_intrinsic,
>  };
>
>  static void si_create_function(struct si_shader_context *ctx,
>  			       const char *name,
>  			       LLVMTypeRef *returns, unsigned num_returns,
>  			       LLVMTypeRef *params, unsigned num_params,
> -			       int last_sgpr)
> +			       int last_sgpr, unsigned max_workgroup_size)
>  {
>  	int i;
>
>  	si_llvm_create_func(ctx, name, returns, num_returns,
>  			    params, num_params);
>  	si_llvm_shader_type(ctx->main_fn, ctx->type);
>  	ctx->return_value = LLVMGetUndef(ctx->return_type);
>
>  	for (i = 0; i <= last_sgpr; ++i) {
>  		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
> @@ -5701,20 +5701,24 @@ static void si_create_function(struct si_shader_context *ctx,
>  		 * SGPR spilling significantly.
>  		 */
>  		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
>  			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
>  			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
>  			ac_add_attr_dereferenceable(P, UINT64_MAX);
>  		} else
>  			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
>  	}
>
> +	if (max_workgroup_size) {
> +		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
> +				      max_workgroup_size);
> +	}
>  	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
>  					   "no-signed-zeros-fp-math",
>  					   "true");
>
>  	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
>  		/* These were copied from some LLVM test. */
>  		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
>  						   "less-precise-fpmad",
>  						   "true");
>  		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
> @@ -5782,20 +5786,36 @@ static void declare_lds_as_pointer(struct si_shader_context *ctx)
>  	struct gallivm_state *gallivm = &ctx->gallivm;
>
>  	unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
>  	ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
>  		LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
>  		"lds");
>  }
>
>  static unsigned si_get_max_workgroup_size(struct si_shader *shader)
>  {
> +	switch (shader->selector->type) {
> +	case PIPE_SHADER_TESS_CTRL:
> +		/* Return this so that LLVM doesn't remove s_barrier
> +		 * instructions on chips where we use s_barrier. */
> +		return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
> +
> +	case PIPE_SHADER_GEOMETRY:
> +		return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
> +
> +	case PIPE_SHADER_COMPUTE:
> +		break; /* see below */
> +
> +	default:
> +		return 0;
> +	}
> +
>  	const unsigned *properties = shader->selector->info.properties;
>  	unsigned max_work_group_size =
>  	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>  	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>  	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>
>  	if (!max_work_group_size) {
>  		/* This is a variable group size compute shader,
>  		 * compile it for the maximum possible group size.
>  		 */
> @@ -6172,39 +6192,36 @@ static void create_function(struct si_shader_context *ctx)
>  		num_params = SI_PARAM_THREAD_ID + 1;
>  		break;
>  	default:
>  		assert(0 && "unimplemented shader");
>  		return;
>  	}
>
>  	assert(num_params <= ARRAY_SIZE(params));
>
>  	si_create_function(ctx, "main", returns, num_returns, params,
> -			   num_params, last_sgpr);
> +			   num_params, last_sgpr,
> +			   si_get_max_workgroup_size(shader));
>
>  	/* Reserve register locations for VGPR inputs the PS prolog may need. */
>  	if (ctx->type == PIPE_SHADER_FRAGMENT &&
>  	    ctx->separate_prolog) {
>  		si_llvm_add_attribute(ctx->main_fn,
>  				      "InitialPSInputAddr",
>  				      S_0286D0_PERSP_SAMPLE_ENA(1) |
>  				      S_0286D0_PERSP_CENTER_ENA(1) |
>  				      S_0286D0_PERSP_CENTROID_ENA(1) |
>  				      S_0286D0_LINEAR_SAMPLE_ENA(1) |
>  				      S_0286D0_LINEAR_CENTER_ENA(1) |
>  				      S_0286D0_LINEAR_CENTROID_ENA(1) |
>  				      S_0286D0_FRONT_FACE_ENA(1) |
>  				      S_0286D0_POS_FIXED_PT_ENA(1));
> -	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
> -		si_llvm_add_attribute(ctx->main_fn,
> -				      "amdgpu-max-work-group-size",
> -				      si_get_max_workgroup_size(shader));
>  	}
>
>  	shader->info.num_input_sgprs = 0;
>  	shader->info.num_input_vgprs = 0;
>
>  	for (i = 0; i <= last_sgpr; ++i)
>  		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
>
>  	for (; i < num_params; ++i)
>  		shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
> @@ -7701,21 +7718,21 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
>  		returns[i] = ctx->i32;
>  	}
>
>  	for (unsigned i = 0; i < num_vgprs; ++i) {
>  		params[num_sgprs + i] = ctx->i32;
>  		returns[num_sgprs + i] = ctx->f32;
>  	}
>
>  	/* Create the function. */
>  	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
> -			   params, num_sgprs + num_vgprs, num_sgprs - 1);
> +			   params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
>  	func = ctx->main_fn;
>
>  	/* Set the full EXEC mask for the prolog, because we are only fiddling
>  	 * with registers here. The main shader part will set the correct EXEC
>  	 * mask.
>  	 */
>  	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
>  		si_init_exec_full_mask(ctx);
>
>  	/* Copy inputs to outputs. This should be no-op, as the registers match,
> @@ -7861,21 +7878,23 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
>  		size = llvm_get_type_size(param_types[num_params]) / 4;
>  		num_params++;
>
>  		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
>  		assert(gprs + size <= num_sgprs + num_vgprs &&
>  		       (gprs >= num_sgprs || gprs + size <= num_sgprs));
>
>  		gprs += size;
>  	}
>
> -	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
> +	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
> +			   last_sgpr_param,
> +			   si_get_max_workgroup_size(ctx->shader));
>
>  	if (is_merged_shader(ctx->shader))
>  		si_init_exec_full_mask(ctx);
>
>  	/* Record the arguments of the function as if they were an output of
>  	 * a previous part.
>  	 */
>  	num_out = 0;
>  	num_out_sgpr = 0;
>
> @@ -8499,21 +8518,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
>  		params[num_params++] = ctx->i32;
>  		returns[num_returns++] = ctx->f32;
>  	}
>
>  	/* Vertex load indices. */
>  	for (i = 0; i <= key->vs_prolog.last_input; i++)
>  		returns[num_returns++] = ctx->f32;
>
>  	/* Create the function. */
>  	si_create_function(ctx, "vs_prolog", returns, num_returns, params,
> -			   num_params, last_sgpr);
> +			   num_params, last_sgpr, 0);
>  	func = ctx->main_fn;
>
>  	if (key->vs_prolog.num_merged_next_stage_vgprs &&
>  	    !key->vs_prolog.is_monolithic)
>  		si_init_exec_from_input(ctx, 3, 0);
>
>  	/* Copy inputs to outputs. This should be no-op, as the registers match,
>  	 * but it will prevent the compiler from overwriting them unintentionally.
>  	 */
>  	ret = ctx->return_value;
> @@ -8643,21 +8662,22 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
>  		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
>  		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
>  	}
>  	last_sgpr = num_params - 1;
>
>  	params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
>  	params[num_params++] = ctx->i32; /* invocation ID within the patch */
>  	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
>
>  	/* Create the function. */
> -	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
> +	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
> +			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
>  	declare_lds_as_pointer(ctx);
>  	func = ctx->main_fn;
>
>  	si_write_tess_factors(bld_base,
>  			      LLVMGetParam(func, last_sgpr + 1),
>  			      LLVMGetParam(func, last_sgpr + 2),
>  			      LLVMGetParam(func, last_sgpr + 3));
>
>  	LLVMBuildRetVoid(gallivm->builder);
>  }
> @@ -8764,21 +8784,21 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
>  		params[num_params++] = ctx->f32;
>
>  	/* Declare outputs (same as inputs + add colors if needed) */
>  	num_returns = num_params;
>  	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
>  	for (i = 0; i < num_color_channels; i++)
>  		params[num_returns++] = ctx->f32;
>
>  	/* Create the function. */
>  	si_create_function(ctx, "ps_prolog", params, num_returns, params,
> -			   num_params, last_sgpr);
> +			   num_params, last_sgpr, 0);
>  	func = ctx->main_fn;
>
>  	/* Copy inputs to outputs. This should be no-op, as the registers match,
>  	 * but it will prevent the compiler from overwriting them unintentionally.
>  	 */
>  	ret = ctx->return_value;
>  	for (i = 0; i < num_params; i++) {
>  		LLVMValueRef p = LLVMGetParam(func, i);
>  		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
>  	}
> @@ -9006,21 +9026,22 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
>
>  	num_params = MAX2(num_params,
>  			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
>
>  	assert(num_params <= ARRAY_SIZE(params));
>
>  	for (i = last_sgpr + 1; i < num_params; i++)
>  		params[i] = ctx->f32;
>
>  	/* Create the function. */
> -	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
> +	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
> +			   last_sgpr, 0);
>  	/* Disable elimination of unused inputs. */
>  	si_llvm_add_attribute(ctx->main_fn,
>  				  "InitialPSInputAddr", 0xffffff);
>
>  	/* Process colors. */
>  	unsigned vgpr = last_sgpr + 1;
>  	unsigned colors_written = key->ps_epilog.colors_written;
>  	int last_color_export = -1;
>
>  	/* Find the last color export. */
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.