[Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
Nicolai Hähnle
nhaehnle at gmail.com
Fri Apr 28 11:25:33 UTC 2017
On 24.04.2017 10:45, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
> src/gallium/drivers/radeonsi/si_shader.c | 87 +++++++++++++++++++++++++-------
> 1 file changed, 70 insertions(+), 17 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index a4c2ac0..392f85d 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx)
> LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
> /* No idea why LLVM aligns allocas to 4 elements. */
> unsigned alignment = LLVMGetAlignment(inst);
> unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
> ctx->shader->config.private_mem_vgprs += dw_size;
> }
> bb = LLVMGetNextBasicBlock(bb);
> }
> }
>
> +static void si_init_exec_full_mask(struct si_shader_context *ctx)
> +{
> + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
> + lp_build_intrinsic(ctx->gallivm.builder,
> + "llvm.amdgcn.init.exec", ctx->voidt,
> + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
> +}
> +
> static void si_init_exec_from_input(struct si_shader_context *ctx,
> unsigned param, unsigned bitoffset)
> {
> LLVMValueRef args[] = {
> LLVMGetParam(ctx->main_fn, param),
> LLVMConstInt(ctx->i32, bitoffset, 0),
> };
> lp_build_intrinsic(ctx->gallivm.builder,
> "llvm.amdgcn.init.exec.from.input",
> ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader *shader,
> key->ps_epilog.states = shader->key.part.ps.epilog;
> }
>
> /**
> * Build the GS prolog function. Rotate the input vertices for triangle strips
> * with adjacency.
> */
> static void si_build_gs_prolog_function(struct si_shader_context *ctx,
> union si_shader_part_key *key)
> {
> - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
> - const unsigned num_vgprs = 8;
> + unsigned num_sgprs, num_vgprs;
> struct gallivm_state *gallivm = &ctx->gallivm;
> LLVMBuilderRef builder = gallivm->builder;
> - LLVMTypeRef params[32];
> - LLVMTypeRef returns[32];
> + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
> + LLVMTypeRef returns[48];
> LLVMValueRef func, ret;
>
> + if (ctx->screen->b.chip_class >= GFX9) {
> + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
> + num_vgprs = 5; /* ES inputs are not needed by GS */
> + } else {
> + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
> + num_vgprs = 8;
> + }
> +
> for (unsigned i = 0; i < num_sgprs; ++i) {
> params[i] = ctx->i32;
> returns[i] = ctx->i32;
> }
>
> for (unsigned i = 0; i < num_vgprs; ++i) {
> params[num_sgprs + i] = ctx->i32;
> returns[num_sgprs + i] = ctx->f32;
> }
>
> /* Create the function. */
> si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
> params, num_sgprs + num_vgprs, num_sgprs - 1);
> func = ctx->main_fn;
>
> + /* Set the full EXEC mask for the prolog, because we are only fiddling
> + * with registers here. The main shader part will set the correct EXEC
> + * mask.
> + */
> + if (ctx->screen->b.chip_class >= GFX9)
> + si_init_exec_full_mask(ctx);
> +
> /* Copy inputs to outputs. This should be no-op, as the registers match,
> * but it will prevent the compiler from overwriting them unintentionally.
> */
> ret = ctx->return_value;
> for (unsigned i = 0; i < num_sgprs; i++) {
> LLVMValueRef p = LLVMGetParam(func, i);
> ret = LLVMBuildInsertValue(builder, ret, p, i, "");
> }
> for (unsigned i = 0; i < num_vgprs; i++) {
> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
> p = LLVMBuildBitCast(builder, p, ctx->f32, "");
> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
> }
>
> if (key->gs_prolog.states.tri_strip_adj_fix) {
> /* Remap the input vertices for every other primitive. */
> - const unsigned vtx_params[6] = {
> + const unsigned gfx6_vtx_params[6] = {
> num_sgprs,
> num_sgprs + 1,
> num_sgprs + 3,
> num_sgprs + 4,
> num_sgprs + 5,
> num_sgprs + 6
> };
> + const unsigned gfx9_vtx_params[3] = {
> + num_sgprs,
> + num_sgprs + 1,
> + num_sgprs + 4,
> + };
> + LLVMValueRef vtx_in[6], vtx_out[6];
> LLVMValueRef prim_id, rotate;
>
> + if (ctx->screen->b.chip_class >= GFX9) {
> + for (unsigned i = 0; i < 3; i++) {
> + vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
> + vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
> + }
> + } else {
> + for (unsigned i = 0; i < 6; i++)
> + vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
> + }
> +
> prim_id = LLVMGetParam(func, num_sgprs + 2);
> rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
>
> for (unsigned i = 0; i < 6; ++i) {
> - LLVMValueRef base, rotated, actual;
> - base = LLVMGetParam(func, vtx_params[i]);
> - rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
> - actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
> - actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
> - ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
> + LLVMValueRef base, rotated;
> + base = vtx_in[i];
> + rotated = vtx_in[(i + 4) % 6];
> + vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
> + }
> +
> + if (ctx->screen->b.chip_class >= GFX9) {
> + for (unsigned i = 0; i < 3; i++) {
> + LLVMValueRef hi, out;
> +
> + hi = LLVMBuildShl(builder, vtx_out[i*2+1],
> + LLVMConstInt(ctx->i32, 16, 0), "");
> + out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
> + out = LLVMBuildBitCast(builder, out, ctx->f32, "");
> + ret = LLVMBuildInsertValue(builder, ret, out,
> + gfx9_vtx_params[i], "");
> + }
> + } else {
> + for (unsigned i = 0; i < 6; i++) {
> + LLVMValueRef out;
> +
> + out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
> + ret = LLVMBuildInsertValue(builder, ret, out,
> + gfx6_vtx_params[i], "");
> + }
> }
I believe this could be simplified quite a bit, since the vertex indices
are rotated by a multiple of 2. So there's no need to unpack the bits and
pack them again; instead, just rotate the 3 input registers by 2 rather
than rotating 6 input registers by 4.
I'm fine with it if you want to do that in a follow-up patch.
Cheers,
Nicolai
> }
>
> LLVMBuildRet(builder, ret);
> }
>
> /**
> * Given a list of shader part functions, build a wrapper function that
> * runs them in sequence to form a monolithic shader.
> */
> @@ -7821,26 +7878,22 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
>
> assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
> assert(gprs + size <= num_sgprs + num_vgprs &&
> (gprs >= num_sgprs || gprs + size <= num_sgprs));
>
> gprs += size;
> }
>
> si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
>
> - if (is_merged_shader(ctx->shader)) {
> - LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
> - lp_build_intrinsic(ctx->gallivm.builder,
> - "llvm.amdgcn.init.exec", ctx->voidt,
> - &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
> - }
> + if (is_merged_shader(ctx->shader))
> + si_init_exec_full_mask(ctx);
>
> /* Record the arguments of the function as if they were an output of
> * a previous part.
> */
> num_out = 0;
> num_out_sgpr = 0;
>
> for (unsigned i = 0; i < num_params; ++i) {
> LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
> LLVMTypeRef param_type = LLVMTypeOf(param);
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list