[Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS

Fri Apr 28 15:59:53 UTC 2017

On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 24.04.2017 10:45, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> ---
>>  src/gallium/drivers/radeonsi/si_shader.c | 87
>> +++++++++++++++++++++++++-------
>>  1 file changed, 70 insertions(+), 17 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>> b/src/gallium/drivers/radeonsi/si_shader.c
>> index a4c2ac0..392f85d 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct
>> si_shader_context *ctx)
>>                         LLVMTypeRef type =
>> LLVMGetElementType(LLVMTypeOf(inst));
>>                         /* No idea why LLVM aligns allocas to 4 elements.
>> */
>>                         unsigned alignment = LLVMGetAlignment(inst);
>>                         unsigned dw_size = align(llvm_get_type_size(type)
>> / 4, alignment);
>>                         ctx->shader->config.private_mem_vgprs += dw_size;
>>                 }
>>                 bb = LLVMGetNextBasicBlock(bb);
>>         }
>>  }
>>
>> +static void si_init_exec_full_mask(struct si_shader_context *ctx)
>> +{
>> +       LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
>> +       lp_build_intrinsic(ctx->gallivm.builder,
>> +                          "llvm.amdgcn.init.exec", ctx->voidt,
>> +                          &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
>> +}
>> +
>>  static void si_init_exec_from_input(struct si_shader_context *ctx,
>>                                     unsigned param, unsigned bitoffset)
>>  {
>>         LLVMValueRef args[] = {
>>                 LLVMGetParam(ctx->main_fn, param),
>>                 LLVMConstInt(ctx->i32, bitoffset, 0),
>>         };
>>         lp_build_intrinsic(ctx->gallivm.builder,
>>                            "llvm.amdgcn.init.exec.from.input",
>>                            ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
>> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader
>> *shader,
>>         key->ps_epilog.states = shader->key.part.ps.epilog;
>>  }
>>
>>  /**
>>   * Build the GS prolog function. Rotate the input vertices for triangle
>> strips
>>   * with adjacency.
>>   */
>>  static void si_build_gs_prolog_function(struct si_shader_context *ctx,
>>                                         union si_shader_part_key *key)
>>  {
>> -       const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>> -       const unsigned num_vgprs = 8;
>> +       unsigned num_sgprs, num_vgprs;
>>         struct gallivm_state *gallivm = &ctx->gallivm;
>>         LLVMBuilderRef builder = gallivm->builder;
>> -       LLVMTypeRef params[32];
>> -       LLVMTypeRef returns[32];
>> +       LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
>> +       LLVMTypeRef returns[48];
>>         LLVMValueRef func, ret;
>>
>> +       if (ctx->screen->b.chip_class >= GFX9) {
>> +               num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
>> +               num_vgprs = 5; /* ES inputs are not needed by GS */
>> +       } else {
>> +               num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>> +               num_vgprs = 8;
>> +       }
>> +
>>         for (unsigned i = 0; i < num_sgprs; ++i) {
>>                 params[i] = ctx->i32;
>>                 returns[i] = ctx->i32;
>>         }
>>
>>         for (unsigned i = 0; i < num_vgprs; ++i) {
>>                 params[num_sgprs + i] = ctx->i32;
>>                 returns[num_sgprs + i] = ctx->f32;
>>         }
>>
>>         /* Create the function. */
>>         si_create_function(ctx, "gs_prolog", returns, num_sgprs +
>> num_vgprs,
>>                            params, num_sgprs + num_vgprs, num_sgprs - 1);
>>         func = ctx->main_fn;
>>
>> +       /* Set the full EXEC mask for the prolog, because we are only
>> fiddling
>> +        * with registers here. The main shader part will set the correct
>> EXEC
>> +        * mask.
>> +        */
>> +       if (ctx->screen->b.chip_class >= GFX9)
>> +               si_init_exec_full_mask(ctx);
>> +
>>         /* Copy inputs to outputs. This should be no-op, as the registers
>> match,
>>          * but it will prevent the compiler from overwriting them
>> unintentionally.
>>          */
>>         ret = ctx->return_value;
>>         for (unsigned i = 0; i < num_sgprs; i++) {
>>                 LLVMValueRef p = LLVMGetParam(func, i);
>>                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
>>         }
>>         for (unsigned i = 0; i < num_vgprs; i++) {
>>                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
>>                 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
>>                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i,
>> "");
>>         }
>>
>>         if (key->gs_prolog.states.tri_strip_adj_fix) {
>>                 /* Remap the input vertices for every other primitive. */
>> -               const unsigned vtx_params[6] = {
>> +               const unsigned gfx6_vtx_params[6] = {
>>                         num_sgprs,
>>                         num_sgprs + 1,
>>                         num_sgprs + 3,
>>                         num_sgprs + 4,
>>                         num_sgprs + 5,
>>                         num_sgprs + 6
>>                 };
>> +               const unsigned gfx9_vtx_params[3] = {
>> +                       num_sgprs,
>> +                       num_sgprs + 1,
>> +                       num_sgprs + 4,
>> +               };
>> +               LLVMValueRef vtx_in[6], vtx_out[6];
>>                 LLVMValueRef prim_id, rotate;
>>
>> +               if (ctx->screen->b.chip_class >= GFX9) {
>> +                       for (unsigned i = 0; i < 3; i++) {
>> +                               vtx_in[i*2] = unpack_param(ctx,
>> gfx9_vtx_params[i], 0, 16);
>> +                               vtx_in[i*2+1] = unpack_param(ctx,
>> gfx9_vtx_params[i], 16, 16);
>> +                       }
>> +               } else {
>> +                       for (unsigned i = 0; i < 6; i++)
>> +                               vtx_in[i] = LLVMGetParam(func,
>> gfx6_vtx_params[i]);
>> +               }
>> +
>>                 prim_id = LLVMGetParam(func, num_sgprs + 2);
>>                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
>>
>>                 for (unsigned i = 0; i < 6; ++i) {
>> -                       LLVMValueRef base, rotated, actual;
>> -                       base = LLVMGetParam(func, vtx_params[i]);
>> -                       rotated = LLVMGetParam(func, vtx_params[(i + 4) %
>> 6]);
>> -                       actual = LLVMBuildSelect(builder, rotate, rotated,
>> base, "");
>> -                       actual = LLVMBuildBitCast(builder, actual,
>> ctx->f32, "");
>> -                       ret = LLVMBuildInsertValue(builder, ret, actual,
>> vtx_params[i], "");
>> +                       LLVMValueRef base, rotated;
>> +                       base = vtx_in[i];
>> +                       rotated = vtx_in[(i + 4) % 6];
>> +                       vtx_out[i] = LLVMBuildSelect(builder, rotate,
>> rotated, base, "");
>> +               }
>> +
>> +               if (ctx->screen->b.chip_class >= GFX9) {
>> +                       for (unsigned i = 0; i < 3; i++) {
>> +                               LLVMValueRef hi, out;
>> +
>> +                               hi = LLVMBuildShl(builder, vtx_out[i*2+1],
>> +                                                 LLVMConstInt(ctx->i32,
>> 16, 0), "");
>> +                               out = LLVMBuildOr(builder, vtx_out[i*2],
>> hi, "");
>> +                               out = LLVMBuildBitCast(builder, out,
>> ctx->f32, "");
>> +                               ret = LLVMBuildInsertValue(builder, ret,
>> out,
>> +
>> gfx9_vtx_params[i], "");
>> +                       }
>> +               } else {
>> +                       for (unsigned i = 0; i < 6; i++) {
>> +                               LLVMValueRef out;
>> +
>> +                               out = LLVMBuildBitCast(builder,
>> vtx_out[i], ctx->f32, "");
>> +                               ret = LLVMBuildInsertValue(builder, ret,
>> out,
>> +
>> gfx6_vtx_params[i], "");
>> +                       }
>>                 }
>
>
> I believe this could be simplified quite a bit, since the vertex indices are
> rotated by a multiple of 2. There's no need to unpack the bits and pack
> them again; just rotate the 3 input registers by 2 instead of
> rotating 6 input registers by 4.
>
> I'm fine with it if you want to do that in a follow-up patch.

To be honest with you, I'm really not into optimizing for a GS
workaround while hardly any app uses GS.

Marek