[Mesa-dev] [PATCH 56/61] radeonsi: get InstanceID from VGPR1 (or VGPR2 for tess) instead of VGPR3

Fri Apr 28 19:33:53 UTC 2017

On 28.04.2017 18:08, Marek Olšák wrote:
> On Fri, Apr 28, 2017 at 1:54 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 24.04.2017 10:45, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.olsak at amd.com>
>>>
>>> VGPR1 = InstanceID / StepRate0; // StepRate0 can be set to 1
>>> ---
>>>  src/gallium/drivers/radeonsi/si_shader.c        | 20 ++++++++++++++------
>>>  src/gallium/drivers/radeonsi/si_shader.h        |  1 +
>>>  src/gallium/drivers/radeonsi/si_state.c         |  1 +
>>>  src/gallium/drivers/radeonsi/si_state_shaders.c | 24
>>> +++++++++++++++++-------
>>>  4 files changed, 33 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>> index edb50a3..ce509af 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> @@ -5838,23 +5838,28 @@ static void declare_vs_specific_input_sgprs(struct
>>> si_shader_context *ctx,
>>>         params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
>>>  }
>>>
>>>  static void declare_vs_input_vgprs(struct si_shader_context *ctx,
>>>                                    LLVMTypeRef *params, unsigned
>>> *num_params,
>>>                                    unsigned *num_prolog_vgprs)
>>>  {
>>>         struct si_shader *shader = ctx->shader;
>>>
>>>         params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
>>> -       params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
>>> -       params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
>>> -       params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
>>> +       if (shader->key.as_ls) {
>>> +               params[ctx->param_rel_auto_id = (*num_params)++] =
>>> ctx->i32;
>>> +               params[ctx->param_instance_id = (*num_params)++] =
>>> ctx->i32;
>>> +       } else {
>>> +               params[ctx->param_instance_id = (*num_params)++] =
>>> ctx->i32;
>>> +               params[ctx->param_vs_prim_id = (*num_params)++] =
>>> ctx->i32;
>>> +       }
>>> +       params[(*num_params)++] = ctx->i32; /* unused */
>>>
>>>         if (!shader->is_gs_copy_shader) {
>>>                 /* Vertex load indices. */
>>>                 ctx->param_vertex_index0 = (*num_params);
>>>                 for (unsigned i = 0; i <
>>> shader->selector->info.num_inputs; i++)
>>>                         params[(*num_params)++] = ctx->i32;
>>>                 *num_prolog_vgprs += shader->selector->info.num_inputs;
>>>         }
>>>  }
>>>
>>> @@ -7497,25 +7502,28 @@ static bool si_compile_tgsi_main(struct
>>> si_shader_context *ctx,
>>>  static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
>>>                                  unsigned num_input_sgprs,
>>>                                  const struct si_vs_prolog_bits
>>> *prolog_key,
>>>                                  struct si_shader *shader_out,
>>>                                  union si_shader_part_key *key)
>>>  {
>>>         memset(key, 0, sizeof(*key));
>>>         key->vs_prolog.states = *prolog_key;
>>>         key->vs_prolog.num_input_sgprs = num_input_sgprs;
>>>         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
>>> +       key->vs_prolog.as_ls = shader_out->key.as_ls;
>>>
>>> -       if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL)
>>> +       if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
>>> +               key->vs_prolog.as_ls = 1;
>>>                 key->vs_prolog.num_merged_next_stage_vgprs = 2;
>>> -       else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY)
>>> +       } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
>>>                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
>>> +       }
>>>
>>>         /* Set the instanceID flag. */
>>>         for (unsigned i = 0; i < info->num_inputs; i++)
>>>                 if (key->vs_prolog.states.instance_divisors[i])
>>>                         shader_out->info.uses_instanceid = true;
>>>  }
>>>
>>>  /**
>>>   * Compute the VS epilog key, which contains all the information needed
>>> to
>>>   * build the VS epilog function, and set the PrimitiveID output offset.
>>> @@ -8508,21 +8516,21 @@ static void si_build_vs_prolog_function(struct
>>> si_shader_context *ctx,
>>>         LLVMValueRef ret, func;
>>>         int last_sgpr, num_params, num_returns, i;
>>>         unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
>>>
>>> key->vs_prolog.num_merged_next_stage_vgprs;
>>>         unsigned num_input_vgprs =
>>> key->vs_prolog.num_merged_next_stage_vgprs + 4;
>>>         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
>>>                                       num_input_vgprs;
>>>         unsigned user_sgpr_base =
>>> key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
>>>
>>>         ctx->param_vertex_id = first_vs_vgpr;
>>> -       ctx->param_instance_id = first_vs_vgpr + 3;
>>> +       ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2
>>> : 1);
>>>
>>>         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
>>>         params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
>>>         returns = alloca((num_all_input_regs + key->vs_prolog.last_input +
>>> 1) *
>>>                          sizeof(LLVMTypeRef));
>>>         num_params = 0;
>>>         num_returns = 0;
>>>
>>>         /* Declare input and output SGPRs. */
>>>         num_params = 0;
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.h
>>> b/src/gallium/drivers/radeonsi/si_shader.h
>>> index 57685e0..6bca7f8 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.h
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.h
>>> @@ -430,20 +430,21 @@ struct si_ps_epilog_bits {
>>>         unsigned        clamp_color:1;
>>>  };
>>>
>>>  union si_shader_part_key {
>>>         struct {
>>>                 struct si_vs_prolog_bits states;
>>>                 unsigned        num_input_sgprs:6;
>>>                 /* For merged stages such as LS-HS, HS input VGPRs are
>>> first. */
>>>                 unsigned        num_merged_next_stage_vgprs:3;
>>>                 unsigned        last_input:4;
>>> +               unsigned        as_ls:1;
>>>                 /* Prologs for monolithic shaders shouldn't set EXEC. */
>>>                 unsigned        is_monolithic:1;
>>>         } vs_prolog;
>>>         struct {
>>>                 struct si_vs_epilog_bits states;
>>>                 unsigned        prim_id_param_offset:5;
>>>         } vs_epilog;
>>>         struct {
>>>                 struct si_tcs_epilog_bits states;
>>>         } tcs_epilog;
>>> diff --git a/src/gallium/drivers/radeonsi/si_state.c
>>> b/src/gallium/drivers/radeonsi/si_state.c
>>> index 39494cc..938e7fb 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>>> @@ -4334,20 +4334,21 @@ static void si_init_config(struct si_context
>>> *sctx)
>>>         if (sctx->b.chip_class <= VI) {
>>>                 si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
>>>                 si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
>>>         }
>>>         si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
>>>
>>>         si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
>>>         si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
>>>
>>>         si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
>>> +       si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
>>>         si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
>>>         if (sctx->b.chip_class < CIK)
>>>                 si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
>>> S_008A14_NUM_CLIP_SEQ(3) |
>>>                                S_008A14_CLIP_VTX_REORDER_ENA(1));
>>>
>>>         si_pm4_set_reg(pm4, R_028BD4_PA_SC_CENTROID_PRIORITY_0,
>>> 0x76543210);
>>>         si_pm4_set_reg(pm4, R_028BD8_PA_SC_CENTROID_PRIORITY_1,
>>> 0xfedcba98);
>>>
>>>         si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> b/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> index 5bbc037..0c997e8 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> @@ -450,22 +450,24 @@ static void si_shader_ls(struct si_screen *sscreen,
>>> struct si_shader *shader)
>>>         assert(sscreen->b.chip_class <= VI);
>>>
>>>         pm4 = si_get_shader_pm4_state(shader);
>>>         if (!pm4)
>>>                 return;
>>>
>>>         va = shader->bo->gpu_address;
>>>         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ,
>>> RADEON_PRIO_SHADER_BINARY);
>>>
>>>         /* We need at least 2 components for LS.
>>> -        * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
>>> -       vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
>>> +        * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0,
>>> InstanceID).
>>> +        * StepRate0 is set to 1, so that VGPR3 doesn't have to be loaded.
>>> +        */
>>> +       vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
>>>
>>>         si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
>>>         si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
>>>
>>>         shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs -
>>> 1) / 4) |
>>>                            S_00B528_SGPRS((shader->config.num_sgprs - 1) /
>>> 8) |
>>>                            S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
>>>                            S_00B528_DX10_CLAMP(1) |
>>>                            S_00B528_FLOAT_MODE(shader->config.float_mode);
>>>         shader->config.rsrc2 = S_00B52C_USER_SGPR(SI_VS_NUM_USER_SGPR) |
>>> @@ -483,22 +485,24 @@ static void si_shader_hs(struct si_screen *sscreen,
>>> struct si_shader *shader)
>>>                 return;
>>>
>>>         va = shader->bo->gpu_address;
>>>         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ,
>>> RADEON_PRIO_SHADER_BINARY);
>>>
>>>         if (sscreen->b.chip_class >= GFX9) {
>>>                 si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >>
>>> 8);
>>>                 si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, va >>
>>> 40);
>>>
>>>                 /* We need at least 2 components for LS.
>>> -                * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
>>> -               ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
>>> +                * VGPR0-3: (VertexID, RelAutoindex, InstanceID /
>>> StepRate0, InstanceID).
>>> +                * StepRate0 is set to 1, so that VGPR3 doesn't have to be
>>> loaded.
>>> +                */
>>> +               ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
>>>
>>>                 if (shader->config.scratch_bytes_per_wave) {
>>>                         fprintf(stderr, "HS: scratch buffer unsupported");
>>>                         abort();
>>>                 }
>>>
>>>                 shader->config.rsrc2 =
>>>                         S_00B42C_USER_SGPR(GFX9_TCS_NUM_USER_SGPR) |
>>>                         S_00B42C_USER_SGPR_MSB(GFX9_TCS_NUM_USER_SGPR >>
>>> 5) |
>>>
>>> S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
>>> @@ -536,21 +540,22 @@ static void si_shader_es(struct si_screen *sscreen,
>>> struct si_shader *shader)
>>>         assert(sscreen->b.chip_class <= VI);
>>>
>>>         pm4 = si_get_shader_pm4_state(shader);
>>>         if (!pm4)
>>>                 return;
>>>
>>>         va = shader->bo->gpu_address;
>>>         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ,
>>> RADEON_PRIO_SHADER_BINARY);
>>>
>>>         if (shader->selector->type == PIPE_SHADER_VERTEX) {
>>> -               vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
>>> +               /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
>>> +               vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
>>>                 num_user_sgprs = SI_VS_NUM_USER_SGPR;
>>>         } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
>>>                 vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 :
>>> 2;
>>>                 num_user_sgprs = SI_TES_NUM_USER_SGPR;
>>>         } else
>>>                 unreachable("invalid shader selector type");
>>>
>>>         oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 :
>>> 0;
>>>
>>>         si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
>>> @@ -751,21 +756,22 @@ static void si_shader_gs(struct si_screen *sscreen,
>>> struct si_shader *shader)
>>>         va = shader->bo->gpu_address;
>>>         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ,
>>> RADEON_PRIO_SHADER_BINARY);
>>>
>>>         if (sscreen->b.chip_class >= GFX9) {
>>>                 unsigned input_prim =
>>> sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
>>>                 unsigned es_type = shader->key.part.gs.es->type;
>>>                 unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
>>>                 struct gfx9_gs_info gs_info;
>>>
>>>                 if (es_type == PIPE_SHADER_VERTEX)
>>> -                       es_vgpr_comp_cnt = shader->info.uses_instanceid ?
>>> 3 : 0;
>>> +                       /* VGPR0-3: (VertexID, InstanceID / StepRate0,
>>> ...) */
>>> +                       es_vgpr_comp_cnt = shader->info.uses_instanceid ?
>>> 1 : 0;
>>>                 else if (es_type == PIPE_SHADER_TESS_EVAL)
>>>                         es_vgpr_comp_cnt =
>>> shader->key.part.gs.es->info.uses_primid ? 3 : 2;
>>>                 else
>>>                         unreachable("invalid shader selector type");
>>>
>>>                 /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored
>>> and
>>>                  * VGPR[0:4] are always loaded.
>>>                  */
>>>                 if (sel->info.uses_invocationid)
>>>                         gs_vgpr_comp_cnt = 3; /* VGPR3 contains
>>> InvocationID. */
>>> @@ -868,21 +874,25 @@ static void si_shader_vs(struct si_screen *sscreen,
>>> struct si_shader *shader,
>>>                 si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0);
>>>         }
>>>
>>>         va = shader->bo->gpu_address;
>>>         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ,
>>> RADEON_PRIO_SHADER_BINARY);
>>>
>>>         if (gs) {
>>>                 vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY.
>>> */
>>>                 num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
>>>         } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
>>> -               vgpr_comp_cnt = shader->info.uses_instanceid ? 3 :
>>> (enable_prim_id ? 2 : 0);
>>> +               /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID,
>>> InstanceID)
>>> +                * If PrimID is disabled, InstanceID / StepRate1 is loaded
>>> instead.
>>
>>
>> StepRate0.
>
> It's really StepRate1 (VGPR2 where PrimID is loaded). StepRate0 is
> applied to VGPR1.

Oh okay, thanks for clearing that up.

Cheers,
Nicolai

>
> Marek
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.