[Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
Nicolai Hähnle
nhaehnle at gmail.com
Fri Apr 28 19:33:18 UTC 2017
On 28.04.2017 17:59, Marek Olšák wrote:
> On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 24.04.2017 10:45, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.olsak at amd.com>
>>>
>>> ---
>>> src/gallium/drivers/radeonsi/si_shader.c | 87
>>> +++++++++++++++++++++++++-------
>>> 1 file changed, 70 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>> index a4c2ac0..392f85d 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct
>>> si_shader_context *ctx)
>>> LLVMTypeRef type =
>>> LLVMGetElementType(LLVMTypeOf(inst));
>>> /* No idea why LLVM aligns allocas to 4 elements.
>>> */
>>> unsigned alignment = LLVMGetAlignment(inst);
>>> unsigned dw_size = align(llvm_get_type_size(type)
>>> / 4, alignment);
>>> ctx->shader->config.private_mem_vgprs += dw_size;
>>> }
>>> bb = LLVMGetNextBasicBlock(bb);
>>> }
>>> }
>>>
>>> +static void si_init_exec_full_mask(struct si_shader_context *ctx)
>>> +{
>>> + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
>>> + lp_build_intrinsic(ctx->gallivm.builder,
>>> + "llvm.amdgcn.init.exec", ctx->voidt,
>>> + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
>>> +}
>>> +
>>> static void si_init_exec_from_input(struct si_shader_context *ctx,
>>> unsigned param, unsigned bitoffset)
>>> {
>>> LLVMValueRef args[] = {
>>> LLVMGetParam(ctx->main_fn, param),
>>> LLVMConstInt(ctx->i32, bitoffset, 0),
>>> };
>>> lp_build_intrinsic(ctx->gallivm.builder,
>>> "llvm.amdgcn.init.exec.from.input",
>>> ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
>>> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader
>>> *shader,
>>> key->ps_epilog.states = shader->key.part.ps.epilog;
>>> }
>>>
>>> /**
>>> * Build the GS prolog function. Rotate the input vertices for triangle
>>> strips
>>> * with adjacency.
>>> */
>>> static void si_build_gs_prolog_function(struct si_shader_context *ctx,
>>> union si_shader_part_key *key)
>>> {
>>> - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>>> - const unsigned num_vgprs = 8;
>>> + unsigned num_sgprs, num_vgprs;
>>> struct gallivm_state *gallivm = &ctx->gallivm;
>>> LLVMBuilderRef builder = gallivm->builder;
>>> - LLVMTypeRef params[32];
>>> - LLVMTypeRef returns[32];
>>> + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
>>> + LLVMTypeRef returns[48];
>>> LLVMValueRef func, ret;
>>>
>>> + if (ctx->screen->b.chip_class >= GFX9) {
>>> + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
>>> + num_vgprs = 5; /* ES inputs are not needed by GS */
>>> + } else {
>>> + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>>> + num_vgprs = 8;
>>> + }
>>> +
>>> for (unsigned i = 0; i < num_sgprs; ++i) {
>>> params[i] = ctx->i32;
>>> returns[i] = ctx->i32;
>>> }
>>>
>>> for (unsigned i = 0; i < num_vgprs; ++i) {
>>> params[num_sgprs + i] = ctx->i32;
>>> returns[num_sgprs + i] = ctx->f32;
>>> }
>>>
>>> /* Create the function. */
>>> si_create_function(ctx, "gs_prolog", returns, num_sgprs +
>>> num_vgprs,
>>> params, num_sgprs + num_vgprs, num_sgprs - 1);
>>> func = ctx->main_fn;
>>>
>>> + /* Set the full EXEC mask for the prolog, because we are only
>>> fiddling
>>> + * with registers here. The main shader part will set the correct
>>> EXEC
>>> + * mask.
>>> + */
>>> + if (ctx->screen->b.chip_class >= GFX9)
>>> + si_init_exec_full_mask(ctx);
>>> +
>>> /* Copy inputs to outputs. This should be no-op, as the registers
>>> match,
>>> * but it will prevent the compiler from overwriting them
>>> unintentionally.
>>> */
>>> ret = ctx->return_value;
>>> for (unsigned i = 0; i < num_sgprs; i++) {
>>> LLVMValueRef p = LLVMGetParam(func, i);
>>> ret = LLVMBuildInsertValue(builder, ret, p, i, "");
>>> }
>>> for (unsigned i = 0; i < num_vgprs; i++) {
>>> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
>>> p = LLVMBuildBitCast(builder, p, ctx->f32, "");
>>> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i,
>>> "");
>>> }
>>>
>>> if (key->gs_prolog.states.tri_strip_adj_fix) {
>>> /* Remap the input vertices for every other primitive. */
>>> - const unsigned vtx_params[6] = {
>>> + const unsigned gfx6_vtx_params[6] = {
>>> num_sgprs,
>>> num_sgprs + 1,
>>> num_sgprs + 3,
>>> num_sgprs + 4,
>>> num_sgprs + 5,
>>> num_sgprs + 6
>>> };
>>> + const unsigned gfx9_vtx_params[3] = {
>>> + num_sgprs,
>>> + num_sgprs + 1,
>>> + num_sgprs + 4,
>>> + };
>>> + LLVMValueRef vtx_in[6], vtx_out[6];
>>> LLVMValueRef prim_id, rotate;
>>>
>>> + if (ctx->screen->b.chip_class >= GFX9) {
>>> + for (unsigned i = 0; i < 3; i++) {
>>> + vtx_in[i*2] = unpack_param(ctx,
>>> gfx9_vtx_params[i], 0, 16);
>>> + vtx_in[i*2+1] = unpack_param(ctx,
>>> gfx9_vtx_params[i], 16, 16);
>>> + }
>>> + } else {
>>> + for (unsigned i = 0; i < 6; i++)
>>> + vtx_in[i] = LLVMGetParam(func,
>>> gfx6_vtx_params[i]);
>>> + }
>>> +
>>> prim_id = LLVMGetParam(func, num_sgprs + 2);
>>> rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
>>>
>>> for (unsigned i = 0; i < 6; ++i) {
>>> - LLVMValueRef base, rotated, actual;
>>> - base = LLVMGetParam(func, vtx_params[i]);
>>> - rotated = LLVMGetParam(func, vtx_params[(i + 4) %
>>> 6]);
>>> - actual = LLVMBuildSelect(builder, rotate, rotated,
>>> base, "");
>>> - actual = LLVMBuildBitCast(builder, actual,
>>> ctx->f32, "");
>>> - ret = LLVMBuildInsertValue(builder, ret, actual,
>>> vtx_params[i], "");
>>> + LLVMValueRef base, rotated;
>>> + base = vtx_in[i];
>>> + rotated = vtx_in[(i + 4) % 6];
>>> + vtx_out[i] = LLVMBuildSelect(builder, rotate,
>>> rotated, base, "");
>>> + }
>>> +
>>> + if (ctx->screen->b.chip_class >= GFX9) {
>>> + for (unsigned i = 0; i < 3; i++) {
>>> + LLVMValueRef hi, out;
>>> +
>>> + hi = LLVMBuildShl(builder, vtx_out[i*2+1],
>>> + LLVMConstInt(ctx->i32,
>>> 16, 0), "");
>>> + out = LLVMBuildOr(builder, vtx_out[i*2],
>>> hi, "");
>>> + out = LLVMBuildBitCast(builder, out,
>>> ctx->f32, "");
>>> + ret = LLVMBuildInsertValue(builder, ret,
>>> out,
>>> +
>>> gfx9_vtx_params[i], "");
>>> + }
>>> + } else {
>>> + for (unsigned i = 0; i < 6; i++) {
>>> + LLVMValueRef out;
>>> +
>>> + out = LLVMBuildBitCast(builder,
>>> vtx_out[i], ctx->f32, "");
>>> + ret = LLVMBuildInsertValue(builder, ret,
>>> out,
>>> +
>>> gfx6_vtx_params[i], "");
>>> + }
>>> }
>>
>>
>> I believe this could be simplified quite a bit, since the vertex indices are
>> rotated by a multiple of 2. So there's no need to unpack the bits and pack
>> them again; just rotate the 3 input registers by 2 rather than rotating
>> 6 input registers by 4.
>>
>> I'm fine with it if you want to do that in a follow-up patch.
>
> To be honest with you, I'm really not into optimizing for a GS
> workaround while hardly any app uses GS.
I'm not really worrying about performance here, either. It's about the
size and complexity of the code.
Cheers,
Nicolai
>
> Marek
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev mailing list