[Mesa-dev] [PATCH 61/61] radeonsi: tell LLVM not to remove s_barrier instructions
Nicolai Hähnle
nhaehnle at gmail.com
Fri Apr 28 12:06:51 UTC 2017
On 24.04.2017 10:45, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> LLVM 5.0 removes s_barrier instructions if the max-work-group-size
> attribute is not set. What a surprise.
One minor comment on patch 56, apart from that patches 54-61:
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> ---
> src/gallium/drivers/radeonsi/si_shader.c | 45 +++++++++++++++++++++++---------
> 1 file changed, 33 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 3b00bea..086b279 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5674,21 +5674,21 @@ static const struct lp_build_tgsi_action tex_action = {
>
> static const struct lp_build_tgsi_action interp_action = {
> .fetch_args = interp_fetch_args,
> .emit = build_interp_intrinsic,
> };
>
> static void si_create_function(struct si_shader_context *ctx,
> const char *name,
> LLVMTypeRef *returns, unsigned num_returns,
> LLVMTypeRef *params, unsigned num_params,
> - int last_sgpr)
> + int last_sgpr, unsigned max_workgroup_size)
> {
> int i;
>
> si_llvm_create_func(ctx, name, returns, num_returns,
> params, num_params);
> si_llvm_shader_type(ctx->main_fn, ctx->type);
> ctx->return_value = LLVMGetUndef(ctx->return_type);
>
> for (i = 0; i <= last_sgpr; ++i) {
> LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
> @@ -5701,20 +5701,24 @@ static void si_create_function(struct si_shader_context *ctx,
> * SGPR spilling significantly.
> */
> if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
> lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
> lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
> ac_add_attr_dereferenceable(P, UINT64_MAX);
> } else
> lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
> }
>
> + if (max_workgroup_size) {
> + si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
> + max_workgroup_size);
> + }
> LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
> "no-signed-zeros-fp-math",
> "true");
>
> if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
> /* These were copied from some LLVM test. */
> LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
> "less-precise-fpmad",
> "true");
> LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
> @@ -5782,20 +5786,36 @@ static void declare_lds_as_pointer(struct si_shader_context *ctx)
> struct gallivm_state *gallivm = &ctx->gallivm;
>
> unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
> ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
> LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
> "lds");
> }
>
> static unsigned si_get_max_workgroup_size(struct si_shader *shader)
> {
> + switch (shader->selector->type) {
> + case PIPE_SHADER_TESS_CTRL:
> + /* Return this so that LLVM doesn't remove s_barrier
> + * instructions on chips where we use s_barrier. */
> + return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
> +
> + case PIPE_SHADER_GEOMETRY:
> + return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
> +
> + case PIPE_SHADER_COMPUTE:
> + break; /* see below */
> +
> + default:
> + return 0;
> + }
> +
> const unsigned *properties = shader->selector->info.properties;
> unsigned max_work_group_size =
> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>
> if (!max_work_group_size) {
> /* This is a variable group size compute shader,
> * compile it for the maximum possible group size.
> */
> @@ -6172,39 +6192,36 @@ static void create_function(struct si_shader_context *ctx)
> num_params = SI_PARAM_THREAD_ID + 1;
> break;
> default:
> assert(0 && "unimplemented shader");
> return;
> }
>
> assert(num_params <= ARRAY_SIZE(params));
>
> si_create_function(ctx, "main", returns, num_returns, params,
> - num_params, last_sgpr);
> + num_params, last_sgpr,
> + si_get_max_workgroup_size(shader));
>
> /* Reserve register locations for VGPR inputs the PS prolog may need. */
> if (ctx->type == PIPE_SHADER_FRAGMENT &&
> ctx->separate_prolog) {
> si_llvm_add_attribute(ctx->main_fn,
> "InitialPSInputAddr",
> S_0286D0_PERSP_SAMPLE_ENA(1) |
> S_0286D0_PERSP_CENTER_ENA(1) |
> S_0286D0_PERSP_CENTROID_ENA(1) |
> S_0286D0_LINEAR_SAMPLE_ENA(1) |
> S_0286D0_LINEAR_CENTER_ENA(1) |
> S_0286D0_LINEAR_CENTROID_ENA(1) |
> S_0286D0_FRONT_FACE_ENA(1) |
> S_0286D0_POS_FIXED_PT_ENA(1));
> - } else if (ctx->type == PIPE_SHADER_COMPUTE) {
> - si_llvm_add_attribute(ctx->main_fn,
> - "amdgpu-max-work-group-size",
> - si_get_max_workgroup_size(shader));
> }
>
> shader->info.num_input_sgprs = 0;
> shader->info.num_input_vgprs = 0;
>
> for (i = 0; i <= last_sgpr; ++i)
> shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
>
> for (; i < num_params; ++i)
> shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
> @@ -7701,21 +7718,21 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
> returns[i] = ctx->i32;
> }
>
> for (unsigned i = 0; i < num_vgprs; ++i) {
> params[num_sgprs + i] = ctx->i32;
> returns[num_sgprs + i] = ctx->f32;
> }
>
> /* Create the function. */
> si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
> - params, num_sgprs + num_vgprs, num_sgprs - 1);
> + params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
> func = ctx->main_fn;
>
> /* Set the full EXEC mask for the prolog, because we are only fiddling
> * with registers here. The main shader part will set the correct EXEC
> * mask.
> */
> if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
> si_init_exec_full_mask(ctx);
>
> /* Copy inputs to outputs. This should be no-op, as the registers match,
> @@ -7861,21 +7878,23 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
> size = llvm_get_type_size(param_types[num_params]) / 4;
> num_params++;
>
> assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
> assert(gprs + size <= num_sgprs + num_vgprs &&
> (gprs >= num_sgprs || gprs + size <= num_sgprs));
>
> gprs += size;
> }
>
> - si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
> + si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
> + last_sgpr_param,
> + si_get_max_workgroup_size(ctx->shader));
>
> if (is_merged_shader(ctx->shader))
> si_init_exec_full_mask(ctx);
>
> /* Record the arguments of the function as if they were an output of
> * a previous part.
> */
> num_out = 0;
> num_out_sgpr = 0;
>
> @@ -8499,21 +8518,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
> params[num_params++] = ctx->i32;
> returns[num_returns++] = ctx->f32;
> }
>
> /* Vertex load indices. */
> for (i = 0; i <= key->vs_prolog.last_input; i++)
> returns[num_returns++] = ctx->f32;
>
> /* Create the function. */
> si_create_function(ctx, "vs_prolog", returns, num_returns, params,
> - num_params, last_sgpr);
> + num_params, last_sgpr, 0);
> func = ctx->main_fn;
>
> if (key->vs_prolog.num_merged_next_stage_vgprs &&
> !key->vs_prolog.is_monolithic)
> si_init_exec_from_input(ctx, 3, 0);
>
> /* Copy inputs to outputs. This should be no-op, as the registers match,
> * but it will prevent the compiler from overwriting them unintentionally.
> */
> ret = ctx->return_value;
> @@ -8643,21 +8662,22 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
> params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
> params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
> }
> last_sgpr = num_params - 1;
>
> params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
> params[num_params++] = ctx->i32; /* invocation ID within the patch */
> params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
>
> /* Create the function. */
> - si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
> + si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
> + ctx->screen->b.chip_class >= CIK ? 128 : 64);
> declare_lds_as_pointer(ctx);
> func = ctx->main_fn;
>
> si_write_tess_factors(bld_base,
> LLVMGetParam(func, last_sgpr + 1),
> LLVMGetParam(func, last_sgpr + 2),
> LLVMGetParam(func, last_sgpr + 3));
>
> LLVMBuildRetVoid(gallivm->builder);
> }
> @@ -8764,21 +8784,21 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
> params[num_params++] = ctx->f32;
>
> /* Declare outputs (same as inputs + add colors if needed) */
> num_returns = num_params;
> num_color_channels = util_bitcount(key->ps_prolog.colors_read);
> for (i = 0; i < num_color_channels; i++)
> params[num_returns++] = ctx->f32;
>
> /* Create the function. */
> si_create_function(ctx, "ps_prolog", params, num_returns, params,
> - num_params, last_sgpr);
> + num_params, last_sgpr, 0);
> func = ctx->main_fn;
>
> /* Copy inputs to outputs. This should be no-op, as the registers match,
> * but it will prevent the compiler from overwriting them unintentionally.
> */
> ret = ctx->return_value;
> for (i = 0; i < num_params; i++) {
> LLVMValueRef p = LLVMGetParam(func, i);
> ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
> }
> @@ -9006,21 +9026,22 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
>
> num_params = MAX2(num_params,
> last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
>
> assert(num_params <= ARRAY_SIZE(params));
>
> for (i = last_sgpr + 1; i < num_params; i++)
> params[i] = ctx->f32;
>
> /* Create the function. */
> - si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
> + si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
> + last_sgpr, 0);
> /* Disable elimination of unused inputs. */
> si_llvm_add_attribute(ctx->main_fn,
> "InitialPSInputAddr", 0xffffff);
>
> /* Process colors. */
> unsigned vgpr = last_sgpr + 1;
> unsigned colors_written = key->ps_epilog.colors_written;
> int last_color_export = -1;
>
> /* Find the last color export. */
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list