[Mesa-dev] [PATCH 61/61] radeonsi: tell LLVM not to remove s_barrier instructions
Marek Olšák
maraeo at gmail.com
Mon Apr 24 08:45:58 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
LLVM 5.0 removes s_barrier instructions if the max-work-group-size
attribute is not set. What a surprise.
---
src/gallium/drivers/radeonsi/si_shader.c | 45 +++++++++++++++++++++++---------
1 file changed, 33 insertions(+), 12 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 3b00bea..086b279 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5674,21 +5674,21 @@ static const struct lp_build_tgsi_action tex_action = {
static const struct lp_build_tgsi_action interp_action = {
.fetch_args = interp_fetch_args,
.emit = build_interp_intrinsic,
};
static void si_create_function(struct si_shader_context *ctx,
const char *name,
LLVMTypeRef *returns, unsigned num_returns,
LLVMTypeRef *params, unsigned num_params,
- int last_sgpr)
+ int last_sgpr, unsigned max_workgroup_size)
{
int i;
si_llvm_create_func(ctx, name, returns, num_returns,
params, num_params);
si_llvm_shader_type(ctx->main_fn, ctx->type);
ctx->return_value = LLVMGetUndef(ctx->return_type);
for (i = 0; i <= last_sgpr; ++i) {
LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
@@ -5701,20 +5701,24 @@ static void si_create_function(struct si_shader_context *ctx,
* SGPR spilling significantly.
*/
if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
ac_add_attr_dereferenceable(P, UINT64_MAX);
} else
lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
}
+ if (max_workgroup_size) {
+ si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
+ max_workgroup_size);
+ }
LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
"no-signed-zeros-fp-math",
"true");
if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
/* These were copied from some LLVM test. */
LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
"less-precise-fpmad",
"true");
LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
@@ -5782,20 +5786,36 @@ static void declare_lds_as_pointer(struct si_shader_context *ctx)
struct gallivm_state *gallivm = &ctx->gallivm;
unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
"lds");
}
static unsigned si_get_max_workgroup_size(struct si_shader *shader)
{
+ switch (shader->selector->type) {
+ case PIPE_SHADER_TESS_CTRL:
+ /* Return this so that LLVM doesn't remove s_barrier
+ * instructions on chips where we use s_barrier. */
+ return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
+
+ case PIPE_SHADER_GEOMETRY:
+ return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
+
+ case PIPE_SHADER_COMPUTE:
+ break; /* see below */
+
+ default:
+ return 0;
+ }
+
const unsigned *properties = shader->selector->info.properties;
unsigned max_work_group_size =
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
if (!max_work_group_size) {
/* This is a variable group size compute shader,
* compile it for the maximum possible group size.
*/
@@ -6172,39 +6192,36 @@ static void create_function(struct si_shader_context *ctx)
num_params = SI_PARAM_THREAD_ID + 1;
break;
default:
assert(0 && "unimplemented shader");
return;
}
assert(num_params <= ARRAY_SIZE(params));
si_create_function(ctx, "main", returns, num_returns, params,
- num_params, last_sgpr);
+ num_params, last_sgpr,
+ si_get_max_workgroup_size(shader));
/* Reserve register locations for VGPR inputs the PS prolog may need. */
if (ctx->type == PIPE_SHADER_FRAGMENT &&
ctx->separate_prolog) {
si_llvm_add_attribute(ctx->main_fn,
"InitialPSInputAddr",
S_0286D0_PERSP_SAMPLE_ENA(1) |
S_0286D0_PERSP_CENTER_ENA(1) |
S_0286D0_PERSP_CENTROID_ENA(1) |
S_0286D0_LINEAR_SAMPLE_ENA(1) |
S_0286D0_LINEAR_CENTER_ENA(1) |
S_0286D0_LINEAR_CENTROID_ENA(1) |
S_0286D0_FRONT_FACE_ENA(1) |
S_0286D0_POS_FIXED_PT_ENA(1));
- } else if (ctx->type == PIPE_SHADER_COMPUTE) {
- si_llvm_add_attribute(ctx->main_fn,
- "amdgpu-max-work-group-size",
- si_get_max_workgroup_size(shader));
}
shader->info.num_input_sgprs = 0;
shader->info.num_input_vgprs = 0;
for (i = 0; i <= last_sgpr; ++i)
shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
for (; i < num_params; ++i)
shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
@@ -7701,21 +7718,21 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
returns[i] = ctx->i32;
}
for (unsigned i = 0; i < num_vgprs; ++i) {
params[num_sgprs + i] = ctx->i32;
returns[num_sgprs + i] = ctx->f32;
}
/* Create the function. */
si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
- params, num_sgprs + num_vgprs, num_sgprs - 1);
+ params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
func = ctx->main_fn;
/* Set the full EXEC mask for the prolog, because we are only fiddling
* with registers here. The main shader part will set the correct EXEC
* mask.
*/
if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
si_init_exec_full_mask(ctx);
/* Copy inputs to outputs. This should be no-op, as the registers match,
@@ -7861,21 +7878,23 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
size = llvm_get_type_size(param_types[num_params]) / 4;
num_params++;
assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
assert(gprs + size <= num_sgprs + num_vgprs &&
(gprs >= num_sgprs || gprs + size <= num_sgprs));
gprs += size;
}
- si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+ si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
+ last_sgpr_param,
+ si_get_max_workgroup_size(ctx->shader));
if (is_merged_shader(ctx->shader))
si_init_exec_full_mask(ctx);
/* Record the arguments of the function as if they were an output of
* a previous part.
*/
num_out = 0;
num_out_sgpr = 0;
@@ -8499,21 +8518,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
params[num_params++] = ctx->i32;
returns[num_returns++] = ctx->f32;
}
/* Vertex load indices. */
for (i = 0; i <= key->vs_prolog.last_input; i++)
returns[num_returns++] = ctx->f32;
/* Create the function. */
si_create_function(ctx, "vs_prolog", returns, num_returns, params,
- num_params, last_sgpr);
+ num_params, last_sgpr, 0);
func = ctx->main_fn;
if (key->vs_prolog.num_merged_next_stage_vgprs &&
!key->vs_prolog.is_monolithic)
si_init_exec_from_input(ctx, 3, 0);
/* Copy inputs to outputs. This should be no-op, as the registers match,
* but it will prevent the compiler from overwriting them unintentionally.
*/
ret = ctx->return_value;
@@ -8643,21 +8662,22 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
}
last_sgpr = num_params - 1;
params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
params[num_params++] = ctx->i32; /* invocation ID within the patch */
params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
/* Create the function. */
- si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
+ si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
+ ctx->screen->b.chip_class >= CIK ? 128 : 64);
declare_lds_as_pointer(ctx);
func = ctx->main_fn;
si_write_tess_factors(bld_base,
LLVMGetParam(func, last_sgpr + 1),
LLVMGetParam(func, last_sgpr + 2),
LLVMGetParam(func, last_sgpr + 3));
LLVMBuildRetVoid(gallivm->builder);
}
@@ -8764,21 +8784,21 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
params[num_params++] = ctx->f32;
/* Declare outputs (same as inputs + add colors if needed) */
num_returns = num_params;
num_color_channels = util_bitcount(key->ps_prolog.colors_read);
for (i = 0; i < num_color_channels; i++)
params[num_returns++] = ctx->f32;
/* Create the function. */
si_create_function(ctx, "ps_prolog", params, num_returns, params,
- num_params, last_sgpr);
+ num_params, last_sgpr, 0);
func = ctx->main_fn;
/* Copy inputs to outputs. This should be no-op, as the registers match,
* but it will prevent the compiler from overwriting them unintentionally.
*/
ret = ctx->return_value;
for (i = 0; i < num_params; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
}
@@ -9006,21 +9026,22 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
num_params = MAX2(num_params,
last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
assert(num_params <= ARRAY_SIZE(params));
for (i = last_sgpr + 1; i < num_params; i++)
params[i] = ctx->f32;
/* Create the function. */
- si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
+ si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
+ last_sgpr, 0);
/* Disable elimination of unused inputs. */
si_llvm_add_attribute(ctx->main_fn,
"InitialPSInputAddr", 0xffffff);
/* Process colors. */
unsigned vgpr = last_sgpr + 1;
unsigned colors_written = key->ps_epilog.colors_written;
int last_color_export = -1;
/* Find the last color export. */
--
2.7.4
More information about the mesa-dev
mailing list