[Mesa-dev] [PATCH 2/6] radeonsi:optimizing SET_CONTEXT_REG for shaders GS
Marek Olšák
maraeo at gmail.com
Tue Sep 25 04:05:46 UTC 2018
For the remaining patches (3-6), it would be better to precompute the
register values in struct si_shader, so that the emit functions are as
cheap as possible.
Marek
On Mon, Sep 24, 2018 at 10:51 PM, Marek Olšák <maraeo at gmail.com> wrote:
> On Tue, Sep 18, 2018 at 4:21 PM, Sonny Jiang <sonny.jiang at amd.com> wrote:
>> Signed-off-by: Sonny Jiang <sonny.jiang at amd.com>
>> ---
>> src/gallium/drivers/radeonsi/si_gfx_cs.c | 15 ++-
>> src/gallium/drivers/radeonsi/si_state.h | 19 ++-
>> .../drivers/radeonsi/si_state_shaders.c | 112 ++++++++++++------
>> 3 files changed, 109 insertions(+), 37 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
>> index 99376c4847..39a97bf3da 100644
>> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
>> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
>> @@ -352,9 +352,22 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
>> ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
>> ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
>> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
>> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
>>
>> /* Set all saved registers state to saved. */
>> - ctx->tracked_regs.reg_saved = 0xffffffff;
>> + ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
>> } else {
>> /* Set all saved registers state to unknown. */
>> ctx->tracked_regs.reg_saved = 0;
>> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
>> index 7fb09ca953..9efb4be5a5 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.h
>> +++ b/src/gallium/drivers/radeonsi/si_state.h
>> @@ -279,11 +279,28 @@ enum si_tracked_reg {
>>
>> SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
>>
>> + SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 4 consecutive registers */
>> + SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
>> + SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
>> + SI_TRACKED_VGT_GS_OUT_PRIM_TYPE,
>> +
>> + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
>> + SI_TRACKED_VGT_GS_MAX_VERT_OUT,
>> +
>> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
>> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
>> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
>> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
>> +
>> + SI_TRACKED_VGT_GS_INSTANCE_CNT,
>> + SI_TRACKED_VGT_GS_ONCHIP_CNTL,
>> + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
>> +
>> SI_NUM_TRACKED_REGS,
>> };
>>
>> struct si_tracked_regs {
>> - uint32_t reg_saved;
>> + uint64_t reg_saved;
>
> This is good, but the functions in si_build_pm4.h also need to be
> updated. They compute the masks with "1 << ...", which results in a
> 32-bit number. You need to use "1ull << ..." instead.
>
>
>> uint32_t reg_value[SI_NUM_TRACKED_REGS];
>> uint32_t spi_ps_input_cntl[32];
>> };
>> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
>> index 0716021d85..87b58cd6e5 100644
>> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>> @@ -747,48 +747,99 @@ static void gfx9_get_gs_info(struct si_shader_selector *es,
>> assert(out->max_prims_per_subgroup <= max_out_prims);
>> }
>>
>> -static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
>> +static void si_emit_shader_gs(struct si_context *sctx)
>> {
>> + struct si_shader *shader = sctx->queued.named.gs->shader;
>> + if (!shader)
>> + return;
>> +
>> struct si_shader_selector *sel = shader->selector;
>> const ubyte *num_components = sel->info.num_stream_output_components;
>> unsigned gs_num_invocations = sel->gs_num_invocations;
>> - struct si_pm4_state *pm4;
>> uint64_t va;
>> unsigned max_stream = sel->max_gs_stream;
>> - unsigned offset;
>> + unsigned offset_1, offset_2, offset_3, vgt_gsvs_ring_itemsize;
>>
>> - pm4 = si_get_shader_pm4_state(shader);
>> - if (!pm4)
>> - return;
>> -
>> - offset = num_components[0] * sel->gs_max_out_vertices;
>> - si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
>> + offset_3 = offset_2 = offset_1 = num_components[0] * sel->gs_max_out_vertices;
>> if (max_stream >= 1)
>> - offset += num_components[1] * sel->gs_max_out_vertices;
>> - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
>> + offset_3 = offset_2 += num_components[1] * sel->gs_max_out_vertices;
>> if (max_stream >= 2)
>> - offset += num_components[2] * sel->gs_max_out_vertices;
>> - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
>> - si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
>> - si_conv_prim_to_gs_out(sel->gs_output_prim));
>> + offset_3 += num_components[2] * sel->gs_max_out_vertices;
>> +
>> +
>> + /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
>> + * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */
>> + radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
>> + SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
>> + offset_1, offset_2, offset_3,
>> + si_conv_prim_to_gs_out(sel->gs_output_prim));
>> +
>> + vgt_gsvs_ring_itemsize = offset_3;
>> if (max_stream >= 3)
>> - offset += num_components[3] * sel->gs_max_out_vertices;
>> - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
>> + vgt_gsvs_ring_itemsize += num_components[3] * sel->gs_max_out_vertices;
>>
>> /* The GSVS_RING_ITEMSIZE register takes 15 bits */
>> - assert(offset < (1 << 15));
>> + assert(vgt_gsvs_ring_itemsize < (1 << 15));
>> +
>> + /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
>> + radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
>> + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
>> + vgt_gsvs_ring_itemsize);
>> +
>> + /* R_028B38_VGT_GS_MAX_VERT_OUT */
>> + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
>> + SI_TRACKED_VGT_GS_MAX_VERT_OUT,
>> + sel->gs_max_out_vertices);
>> +
>> + /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
>> + * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
>> + radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE,
>> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
>> + num_components[0],
>> + (max_stream >= 1) ? num_components[1] : 0,
>> + (max_stream >= 2) ? num_components[2] : 0,
>> + (max_stream >= 3) ? num_components[3] : 0);
>> +
>> + /* R_028B90_VGT_GS_INSTANCE_CNT */
>> + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
>> + SI_TRACKED_VGT_GS_INSTANCE_CNT,
>> + S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
>> + S_028B90_ENABLE(gs_num_invocations > 0));
>>
>> - si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
>> + va = shader->bo->gpu_address;
>>
>> - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
>> - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
>> - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
>> - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
>> + if (sctx->chip_class >= GFX9) {
>> + struct gfx9_gs_info gs_info;
>> + gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
>
> This call is expensive. The register values should ideally be
> precomputed in si_shader_gs and saved in struct si_shader, so that
> emitting the values here is cheap.
>
> Marek
More information about the mesa-dev
mailing list