[Mesa-dev] [PATCH 29/61] radeonsi/gfx9: add support for monolithic merged LS-HS
Marek Olšák
maraeo at gmail.com
Mon Apr 24 08:45:26 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_shader.c | 143 +++++++++++++++++++++++++++----
src/gallium/drivers/radeonsi/si_shader.h | 2 +
2 files changed, 128 insertions(+), 17 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 823ffff..9c5dd5e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7592,34 +7592,37 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
LLVMBuildRet(builder, ret);
}
/**
* Given a list of shader part functions, build a wrapper function that
* runs them in sequence to form a monolithic shader.
*/
static void si_build_wrapper_function(struct si_shader_context *ctx,
LLVMValueRef *parts,
unsigned num_parts,
- unsigned main_part)
+ unsigned main_part,
+ unsigned next_shader_first_part)
{
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = ctx->gallivm.builder;
/* PS epilog has one arg per color component */
LLVMTypeRef param_types[48];
- LLVMValueRef out[48];
+ LLVMValueRef initial[48], out[48];
LLVMTypeRef function_type;
unsigned num_params;
- unsigned num_out;
+ unsigned num_out, initial_num_out;
MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
+ MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
unsigned num_sgprs, num_vgprs;
unsigned last_sgpr_param;
unsigned gprs;
+ struct lp_build_if_state if_state;
for (unsigned i = 0; i < num_parts; ++i) {
lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
}
/* The parameters of the wrapper function correspond to those of the
* first part in terms of SGPRs and VGPRs, but we use the types of the
* main part to get the right types. This is relevant for the
* dereferenceable attribute on descriptor table pointers.
@@ -7657,20 +7660,27 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
assert(gprs + size <= num_sgprs + num_vgprs &&
(gprs >= num_sgprs || gprs + size <= num_sgprs));
gprs += size;
}
si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+ if (is_merged_shader(ctx->shader)) {
+ LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+ lp_build_intrinsic(ctx->gallivm.builder,
+ "llvm.amdgcn.init.exec", ctx->voidt,
+ &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+ }
+
/* Record the arguments of the function as if they were an output of
* a previous part.
*/
num_out = 0;
num_out_sgpr = 0;
for (unsigned i = 0; i < num_params; ++i) {
LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
LLVMTypeRef param_type = LLVMTypeOf(param);
LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
@@ -7693,30 +7703,52 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
for (unsigned j = 0; j < size; ++j)
out[num_out++] = LLVMBuildExtractElement(
builder, param, LLVMConstInt(ctx->i32, j, 0), "");
}
if (i <= last_sgpr_param)
num_out_sgpr = num_out;
}
+ memcpy(initial, out, sizeof(out));
+ initial_num_out = num_out;
+ initial_num_out_sgpr = num_out_sgpr;
+
/* Now chain the parts. */
for (unsigned part = 0; part < num_parts; ++part) {
LLVMValueRef in[48];
LLVMValueRef ret;
LLVMTypeRef ret_type;
unsigned out_idx = 0;
num_params = LLVMCountParams(parts[part]);
assert(num_params <= ARRAY_SIZE(param_types));
+ /* Merged shaders are executed conditionally depending
+ * on the number of enabled threads passed in the input SGPRs. */
+ if (is_merged_shader(ctx->shader) &&
+ (part == 0 || part == next_shader_first_part)) {
+ LLVMValueRef ena, count = initial[3];
+
+ /* The thread count for the 2nd shader is at bit-offset 8. */
+ if (part == next_shader_first_part) {
+ count = LLVMBuildLShr(builder, count,
+ LLVMConstInt(ctx->i32, 8, 0), "");
+ }
+ count = LLVMBuildAnd(builder, count,
+ LLVMConstInt(ctx->i32, 0x7f, 0), "");
+ ena = LLVMBuildICmp(builder, LLVMIntULT,
+ ac_get_thread_id(&ctx->ac), count, "");
+ lp_build_if(&if_state, &ctx->gallivm, ena);
+ }
+
/* Derive arguments for the next part from outputs of the
* previous one.
*/
for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
LLVMValueRef param;
LLVMTypeRef param_type;
bool is_sgpr;
unsigned param_size;
LLVMValueRef arg = NULL;
@@ -7750,23 +7782,47 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
} else {
arg = LLVMBuildBitCast(builder, arg, param_type, "");
}
}
in[param_idx] = arg;
out_idx += param_size;
}
ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
- ret_type = LLVMTypeOf(ret);
+
+ if (is_merged_shader(ctx->shader) &&
+ (part + 1 == next_shader_first_part ||
+ part + 1 == num_parts)) {
+ lp_build_endif(&if_state);
+
+ if (part + 1 == next_shader_first_part) {
+ /* A barrier is required between 2 merged shaders. */
+ si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
+
+ /* The second half of the merged shader should use
+ * the inputs from the toplevel (wrapper) function,
+ * not the return value from the last call.
+ *
+ * That's because the last call was executed
+ * conditionally, so its return value can't be
+ * consumed in the main block.
+ */
+ memcpy(out, initial, sizeof(initial));
+ num_out = initial_num_out;
+ num_out_sgpr = initial_num_out_sgpr;
+ }
+ continue;
+ }
/* Extract the returned GPRs. */
+ ret_type = LLVMTypeOf(ret);
num_out = 0;
num_out_sgpr = 0;
if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
unsigned ret_size = LLVMCountStructElementTypes(ret_type);
for (unsigned i = 0; i < ret_size; ++i) {
LLVMValueRef val =
@@ -7840,78 +7896,130 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
}
if (need_epilog) {
union si_shader_part_key epilog_key;
si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
si_build_vs_epilog_function(&ctx, &epilog_key);
parts[need_prolog ? 2 : 1] = ctx.main_fn;
}
si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
- need_prolog ? 1 : 0);
+ need_prolog ? 1 : 0, 0);
} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
- LLVMValueRef parts[2];
- union si_shader_part_key epilog_key;
+ if (sscreen->b.chip_class >= GFX9) {
+ struct si_shader_selector *ls = shader->key.part.tcs.ls;
+ LLVMValueRef parts[4];
+
+ /* TCS main part */
+ parts[2] = ctx.main_fn;
+
+ /* TCS epilog */
+ union si_shader_part_key tcs_epilog_key;
+ memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+ tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+ si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
+ parts[3] = ctx.main_fn;
+
+ /* VS prolog */
+ if (ls->vs_needs_prolog) {
+ union si_shader_part_key vs_prolog_key;
+ si_get_vs_prolog_key(&ls->info,
+ shader->info.num_input_sgprs,
+ &shader->key.part.tcs.ls_prolog,
+ shader, &vs_prolog_key);
+ vs_prolog_key.vs_prolog.is_monolithic = true;
+ si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+ parts[0] = ctx.main_fn;
+ }
- parts[0] = ctx.main_fn;
+ /* VS as LS main part */
+ struct si_shader shader_ls = {};
+ shader_ls.selector = ls;
+ shader_ls.key.as_ls = 1;
+ shader_ls.key.mono = shader->key.mono;
+ shader_ls.key.opt = shader->key.opt;
+ si_llvm_context_set_tgsi(&ctx, &shader_ls);
- memset(&epilog_key, 0, sizeof(epilog_key));
- epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
- si_build_tcs_epilog_function(&ctx, &epilog_key);
- parts[1] = ctx.main_fn;
+ if (!si_compile_tgsi_main(&ctx, true)) {
+ si_llvm_dispose(&ctx);
+ return -1;
+ }
+ shader->info.uses_instanceid |= ls->info.uses_instanceid;
+ parts[1] = ctx.main_fn;
+
+ /* Reset the shader context. */
+ ctx.shader = shader;
+ ctx.type = PIPE_SHADER_TESS_CTRL;
- si_build_wrapper_function(&ctx, parts, 2, 0);
+ si_build_wrapper_function(&ctx,
+ parts + !ls->vs_needs_prolog,
+ 4 - !ls->vs_needs_prolog, 0,
+ ls->vs_needs_prolog ? 2 : 1);
+ } else {
+ LLVMValueRef parts[2];
+ union si_shader_part_key epilog_key;
+
+ parts[0] = ctx.main_fn;
+
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+ si_build_tcs_epilog_function(&ctx, &epilog_key);
+ parts[1] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+ }
} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
!shader->key.as_es) {
LLVMValueRef parts[2];
union si_shader_part_key epilog_key;
parts[0] = ctx.main_fn;
si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
si_build_vs_epilog_function(&ctx, &epilog_key);
parts[1] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 0);
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
LLVMValueRef parts[2];
union si_shader_part_key prolog_key;
parts[1] = ctx.main_fn;
memset(&prolog_key, 0, sizeof(prolog_key));
prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
si_build_gs_prolog_function(&ctx, &prolog_key);
parts[0] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 1);
+ si_build_wrapper_function(&ctx, parts, 2, 1, 0);
} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
LLVMValueRef parts[3];
union si_shader_part_key prolog_key;
union si_shader_part_key epilog_key;
bool need_prolog;
si_get_ps_prolog_key(shader, &prolog_key, false);
need_prolog = si_need_ps_prolog(&prolog_key);
parts[need_prolog ? 1 : 0] = ctx.main_fn;
if (need_prolog) {
si_build_ps_prolog_function(&ctx, &prolog_key);
parts[0] = ctx.main_fn;
}
si_get_ps_epilog_key(shader, &epilog_key);
si_build_ps_epilog_function(&ctx, &epilog_key);
parts[need_prolog ? 2 : 1] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
+ si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
+ need_prolog ? 1 : 0, 0);
}
/* Dump LLVM IR before any optimization passes */
if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
r600_can_dump_shader(&sscreen->b, ctx.type))
LLVMDumpModule(ctx.gallivm.module);
si_llvm_finalize_module(&ctx,
r600_extra_shader_checks(&sscreen->b, ctx.type));
@@ -8157,21 +8265,22 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
/* Vertex load indices. */
for (i = 0; i <= key->vs_prolog.last_input; i++)
returns[num_returns++] = ctx->f32;
/* Create the function. */
si_create_function(ctx, "vs_prolog", returns, num_returns, params,
num_params, last_sgpr);
func = ctx->main_fn;
- if (key->vs_prolog.num_merged_next_stage_vgprs)
+ if (key->vs_prolog.num_merged_next_stage_vgprs &&
+ !key->vs_prolog.is_monolithic)
si_init_exec_from_input(ctx, 3, 0);
/* Copy inputs to outputs. This should be no-op, as the registers match,
* but it will prevent the compiler from overwriting them unintentionally.
*/
ret = ctx->return_value;
for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
}
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index afbe547..e24b8b8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -348,20 +348,22 @@ struct si_ps_epilog_bits {
unsigned clamp_color:1;
};
union si_shader_part_key {
struct {
struct si_vs_prolog_bits states;
unsigned num_input_sgprs:6;
/* For merged stages such as LS-HS, HS input VGPRs are first. */
unsigned num_merged_next_stage_vgprs:3;
unsigned last_input:4;
+ /* Prologs for monolithic shaders shouldn't set EXEC. */
+ unsigned is_monolithic:1;
} vs_prolog;
struct {
struct si_vs_epilog_bits states;
unsigned prim_id_param_offset:5;
} vs_epilog;
struct {
struct si_tcs_epilog_bits states;
} tcs_epilog;
struct {
struct si_gs_prolog_bits states;
--
2.7.4
More information about the mesa-dev mailing list