[Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
Marek Olšák
maraeo at gmail.com
Fri Apr 28 19:34:58 UTC 2017
On Fri, Apr 28, 2017 at 9:33 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 28.04.2017 17:59, Marek Olšák wrote:
>>
>> On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle <nhaehnle at gmail.com>
>> wrote:
>>>
>>> On 24.04.2017 10:45, Marek Olšák wrote:
>>>>
>>>>
>>>> From: Marek Olšák <marek.olsak at amd.com>
>>>>
>>>> ---
>>>> src/gallium/drivers/radeonsi/si_shader.c | 87
>>>> +++++++++++++++++++++++++-------
>>>> 1 file changed, 70 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>>> index a4c2ac0..392f85d 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>>> @@ -7368,20 +7368,28 @@ static void
>>>> si_count_scratch_private_memory(struct
>>>> si_shader_context *ctx)
>>>> LLVMTypeRef type =
>>>> LLVMGetElementType(LLVMTypeOf(inst));
>>>> /* No idea why LLVM aligns allocas to 4
>>>> elements.
>>>> */
>>>> unsigned alignment = LLVMGetAlignment(inst);
>>>> unsigned dw_size =
>>>> align(llvm_get_type_size(type)
>>>> / 4, alignment);
>>>> ctx->shader->config.private_mem_vgprs +=
>>>> dw_size;
>>>> }
>>>> bb = LLVMGetNextBasicBlock(bb);
>>>> }
>>>> }
>>>>
>>>> +static void si_init_exec_full_mask(struct si_shader_context *ctx)
>>>> +{
>>>> + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
>>>> + lp_build_intrinsic(ctx->gallivm.builder,
>>>> + "llvm.amdgcn.init.exec", ctx->voidt,
>>>> + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
>>>> +}
>>>> +
>>>> static void si_init_exec_from_input(struct si_shader_context *ctx,
>>>> unsigned param, unsigned bitoffset)
>>>> {
>>>> LLVMValueRef args[] = {
>>>> LLVMGetParam(ctx->main_fn, param),
>>>> LLVMConstInt(ctx->i32, bitoffset, 0),
>>>> };
>>>> lp_build_intrinsic(ctx->gallivm.builder,
>>>> "llvm.amdgcn.init.exec.from.input",
>>>> ctx->voidt, args, 2,
>>>> LP_FUNC_ATTR_CONVERGENT);
>>>> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct
>>>> si_shader
>>>> *shader,
>>>> key->ps_epilog.states = shader->key.part.ps.epilog;
>>>> }
>>>>
>>>> /**
>>>> * Build the GS prolog function. Rotate the input vertices for triangle
>>>> strips
>>>> * with adjacency.
>>>> */
>>>> static void si_build_gs_prolog_function(struct si_shader_context *ctx,
>>>> union si_shader_part_key *key)
>>>> {
>>>> - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>>>> - const unsigned num_vgprs = 8;
>>>> + unsigned num_sgprs, num_vgprs;
>>>> struct gallivm_state *gallivm = &ctx->gallivm;
>>>> LLVMBuilderRef builder = gallivm->builder;
>>>> - LLVMTypeRef params[32];
>>>> - LLVMTypeRef returns[32];
>>>> + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
>>>> + LLVMTypeRef returns[48];
>>>> LLVMValueRef func, ret;
>>>>
>>>> + if (ctx->screen->b.chip_class >= GFX9) {
>>>> + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
>>>> + num_vgprs = 5; /* ES inputs are not needed by GS */
>>>> + } else {
>>>> + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>>>> + num_vgprs = 8;
>>>> + }
>>>> +
>>>> for (unsigned i = 0; i < num_sgprs; ++i) {
>>>> params[i] = ctx->i32;
>>>> returns[i] = ctx->i32;
>>>> }
>>>>
>>>> for (unsigned i = 0; i < num_vgprs; ++i) {
>>>> params[num_sgprs + i] = ctx->i32;
>>>> returns[num_sgprs + i] = ctx->f32;
>>>> }
>>>>
>>>> /* Create the function. */
>>>> si_create_function(ctx, "gs_prolog", returns, num_sgprs +
>>>> num_vgprs,
>>>> params, num_sgprs + num_vgprs, num_sgprs -
>>>> 1);
>>>> func = ctx->main_fn;
>>>>
>>>> + /* Set the full EXEC mask for the prolog, because we are only
>>>> fiddling
>>>> + * with registers here. The main shader part will set the
>>>> correct
>>>> EXEC
>>>> + * mask.
>>>> + */
>>>> + if (ctx->screen->b.chip_class >= GFX9)
>>>> + si_init_exec_full_mask(ctx);
>>>> +
>>>> /* Copy inputs to outputs. This should be no-op, as the
>>>> registers
>>>> match,
>>>> * but it will prevent the compiler from overwriting them
>>>> unintentionally.
>>>> */
>>>> ret = ctx->return_value;
>>>> for (unsigned i = 0; i < num_sgprs; i++) {
>>>> LLVMValueRef p = LLVMGetParam(func, i);
>>>> ret = LLVMBuildInsertValue(builder, ret, p, i, "");
>>>> }
>>>> for (unsigned i = 0; i < num_vgprs; i++) {
>>>> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
>>>> p = LLVMBuildBitCast(builder, p, ctx->f32, "");
>>>> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs +
>>>> i,
>>>> "");
>>>> }
>>>>
>>>> if (key->gs_prolog.states.tri_strip_adj_fix) {
>>>> /* Remap the input vertices for every other primitive.
>>>> */
>>>> - const unsigned vtx_params[6] = {
>>>> + const unsigned gfx6_vtx_params[6] = {
>>>> num_sgprs,
>>>> num_sgprs + 1,
>>>> num_sgprs + 3,
>>>> num_sgprs + 4,
>>>> num_sgprs + 5,
>>>> num_sgprs + 6
>>>> };
>>>> + const unsigned gfx9_vtx_params[3] = {
>>>> + num_sgprs,
>>>> + num_sgprs + 1,
>>>> + num_sgprs + 4,
>>>> + };
>>>> + LLVMValueRef vtx_in[6], vtx_out[6];
>>>> LLVMValueRef prim_id, rotate;
>>>>
>>>> + if (ctx->screen->b.chip_class >= GFX9) {
>>>> + for (unsigned i = 0; i < 3; i++) {
>>>> + vtx_in[i*2] = unpack_param(ctx,
>>>> gfx9_vtx_params[i], 0, 16);
>>>> + vtx_in[i*2+1] = unpack_param(ctx,
>>>> gfx9_vtx_params[i], 16, 16);
>>>> + }
>>>> + } else {
>>>> + for (unsigned i = 0; i < 6; i++)
>>>> + vtx_in[i] = LLVMGetParam(func,
>>>> gfx6_vtx_params[i]);
>>>> + }
>>>> +
>>>> prim_id = LLVMGetParam(func, num_sgprs + 2);
>>>> rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
>>>>
>>>> for (unsigned i = 0; i < 6; ++i) {
>>>> - LLVMValueRef base, rotated, actual;
>>>> - base = LLVMGetParam(func, vtx_params[i]);
>>>> - rotated = LLVMGetParam(func, vtx_params[(i + 4)
>>>> %
>>>> 6]);
>>>> - actual = LLVMBuildSelect(builder, rotate,
>>>> rotated,
>>>> base, "");
>>>> - actual = LLVMBuildBitCast(builder, actual,
>>>> ctx->f32, "");
>>>> - ret = LLVMBuildInsertValue(builder, ret, actual,
>>>> vtx_params[i], "");
>>>> + LLVMValueRef base, rotated;
>>>> + base = vtx_in[i];
>>>> + rotated = vtx_in[(i + 4) % 6];
>>>> + vtx_out[i] = LLVMBuildSelect(builder, rotate,
>>>> rotated, base, "");
>>>> + }
>>>> +
>>>> + if (ctx->screen->b.chip_class >= GFX9) {
>>>> + for (unsigned i = 0; i < 3; i++) {
>>>> + LLVMValueRef hi, out;
>>>> +
>>>> + hi = LLVMBuildShl(builder,
>>>> vtx_out[i*2+1],
>>>> + LLVMConstInt(ctx->i32,
>>>> 16, 0), "");
>>>> + out = LLVMBuildOr(builder, vtx_out[i*2],
>>>> hi, "");
>>>> + out = LLVMBuildBitCast(builder, out,
>>>> ctx->f32, "");
>>>> + ret = LLVMBuildInsertValue(builder, ret,
>>>> out,
>>>> +
>>>> gfx9_vtx_params[i], "");
>>>> + }
>>>> + } else {
>>>> + for (unsigned i = 0; i < 6; i++) {
>>>> + LLVMValueRef out;
>>>> +
>>>> + out = LLVMBuildBitCast(builder,
>>>> vtx_out[i], ctx->f32, "");
>>>> + ret = LLVMBuildInsertValue(builder, ret,
>>>> out,
>>>> +
>>>> gfx6_vtx_params[i], "");
>>>> + }
>>>> }
>>>
>>>
>>>
>>> I believe this could be simplified quite a bit, since the vertex indices
>>> are rotated by a multiple of 2. So there's no need to unpack the bits and
>>> pack them again; instead, just rotate the 3 input registers by 2 rather
>>> than rotating 6 input registers by 4.
>>>
>>> I'm fine with it if you want to do that in a follow-up patch.
>>
>>
>> To be honest with you, I'm really not into optimizing for a GS
>> workaround while hardly any app uses GS.
>
>
> I'm not really worrying about performance here, either. It's about the size
> and complexity of the code.
OK, gotcha.
Marek
More information about the mesa-dev
mailing list