[Mesa-dev] [PATCH 13/13] radeonsi: remove 2 unused user SGPRs from merged TES-GS with 32-bit pointers
Marek Olšák
maraeo at gmail.com
Sat Feb 17 19:43:28 UTC 2018
From: Marek Olšák <marek.olsak at amd.com>
The effect of the last 13 commits on user SGPR counts:
64-bit pointers:
TCS: 14 -> 12
Merged VS-TCS: 24 -> 20
Merged VS-GS: 18 -> 16
Merged TES-GS: 18 -> 14
32-bit pointers:
TCS: 10 -> 8
Merged VS-TCS: 16 -> 12
Merged VS-GS: 11 -> 9
Merged TES-GS: 11 -> 6
---
src/gallium/drivers/radeonsi/si_descriptors.c | 2 +-
src/gallium/drivers/radeonsi/si_shader.c | 30 +++++++++++++++++++------
src/gallium/drivers/radeonsi/si_shader.h | 9 +++++++-
src/gallium/drivers/radeonsi/si_state_shaders.c | 5 +++--
4 files changed, 35 insertions(+), 11 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 7fdac23..a4dae44 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -2190,21 +2190,21 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx,
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
/* Find the location of the VB descriptor pointer. */
/* TODO: In the future, the pointer will be packed in unused
* bits of the first 2 VB descriptors. */
unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
if (sctx->b.chip_class >= GFX9) {
if (sctx->tes_shader.cso)
sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
else if (sctx->gs_shader.cso)
- sh_dw_offset = GFX9_GS_NUM_USER_SGPR;
+ sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR;
}
unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4;
si_emit_shader_pointer_head(cs, sh_offset, 1);
si_emit_shader_pointer_body(sctx->screen, cs,
sctx->vb_descriptors_buffer->gpu_address +
sctx->vb_descriptors_offset);
sctx->vertex_buffer_pointer_dirty = false;
}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index dfbb1f2..7b37765 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3413,21 +3413,26 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
8 + SI_SGPR_RW_BUFFERS);
ret = si_insert_input_ptr(ctx, ret,
ctx->param_bindless_samplers_and_images,
8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
#if !HAVE_32BIT_POINTERS
ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 1,
8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
#endif
- unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
+ unsigned vgpr;
+ if (ctx->type == PIPE_SHADER_VERTEX)
+ vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
+ else
+ vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+
for (unsigned i = 0; i < 5; i++) {
unsigned param = ctx->param_gs_vtx01_offset + i;
ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
}
ctx->return_value = ret;
}
static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
unsigned max_outputs,
LLVMValueRef *addrs)
@@ -4772,26 +4777,27 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
declare_global_desc_pointers(ctx, &fninfo);
declare_per_stage_desc_pointers(ctx, &fninfo,
(ctx->type == PIPE_SHADER_VERTEX ||
ctx->type == PIPE_SHADER_TESS_EVAL));
if (ctx->type == PIPE_SHADER_VERTEX) {
declare_vs_specific_input_sgprs(ctx, &fninfo);
} else {
- /* TESS_EVAL (and also GEOMETRY):
- * Declare as many input SGPRs as the VS has. */
ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+ if (!HAVE_32BIT_POINTERS) {
+ /* Declare as many input SGPRs as the VS has. */
+ add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+ ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+ }
}
if (!HAVE_32BIT_POINTERS) {
declare_samplers_and_images(ctx, &fninfo,
ctx->type == PIPE_SHADER_GEOMETRY);
}
if (ctx->type == PIPE_SHADER_VERTEX) {
ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
ac_array_in_const32_addr_space(ctx->v4i32));
}
@@ -4805,22 +4811,29 @@ static void create_function(struct si_shader_context *ctx)
if (ctx->type == PIPE_SHADER_VERTEX) {
declare_vs_input_vgprs(ctx, &fninfo,
&num_prolog_vgprs);
} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
declare_tes_input_vgprs(ctx, &fninfo);
}
if (ctx->type == PIPE_SHADER_VERTEX ||
ctx->type == PIPE_SHADER_TESS_EVAL) {
+ unsigned num_user_sgprs;
+
+ if (ctx->type == PIPE_SHADER_VERTEX)
+ num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
+ else
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
/* ES return values are inputs to GS. */
- for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
+ for (i = 0; i < 8 + num_user_sgprs; i++)
returns[num_returns++] = ctx->i32; /* SGPRs */
for (i = 0; i < 5; i++)
returns[num_returns++] = ctx->f32; /* VGPRs */
}
break;
case PIPE_SHADER_TESS_EVAL:
declare_global_desc_pointers(ctx, &fninfo);
declare_per_stage_desc_pointers(ctx, &fninfo, true);
ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -6325,21 +6338,24 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
{
unsigned num_sgprs, num_vgprs;
struct si_function_info fninfo;
LLVMBuilderRef builder = ctx->ac.builder;
LLVMTypeRef returns[48];
LLVMValueRef func, ret;
si_init_function_info(&fninfo);
if (ctx->screen->info.chip_class >= GFX9) {
- num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+ if (key->gs_prolog.states.gfx9_prev_is_vs)
+ num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
+ else
+ num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
num_vgprs = 5; /* ES inputs are not needed by GS */
} else {
num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
num_vgprs = 8;
}
for (unsigned i = 0; i < num_sgprs; ++i) {
add_arg(&fninfo, ARG_SGPR, ctx->i32);
returns[i] = ctx->i32;
}
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 471f2e9..f589789 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -205,21 +205,27 @@ enum {
GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
GFX9_SGPR_TCS_OUT_OFFSETS,
GFX9_SGPR_TCS_OUT_LAYOUT,
#if !HAVE_32BIT_POINTERS
GFX9_SGPR_align_for_vb_pointer,
#endif
GFX9_TCS_NUM_USER_SGPR,
/* GS limits */
GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
- GFX9_GS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR,
+#if HAVE_32BIT_POINTERS
+ GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
+ GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR,
+#else
+ GFX9_VSGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR,
+ GFX9_TESGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR,
+#endif
SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS + (HAVE_32BIT_POINTERS ? 1 : 2),
/* PS only */
SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
SI_PS_NUM_USER_SGPR,
};
/* LLVM function parameter indices */
enum {
SI_NUM_RESOURCE_PARAMS = 4,
@@ -418,20 +424,21 @@ struct si_vs_prolog_bits {
/* Common TCS bits between the shader key and the epilog key. */
struct si_tcs_epilog_bits {
unsigned prim_mode:3;
unsigned invoc0_tess_factors_are_def:1;
unsigned tes_reads_tess_factors:1;
};
struct si_gs_prolog_bits {
unsigned tri_strip_adj_fix:1;
+ unsigned gfx9_prev_is_vs:1;
};
/* Common PS bits between the shader key and the prolog key. */
struct si_ps_prolog_bits {
unsigned color_two_side:1;
unsigned flatshade_colors:1;
unsigned poly_stipple:1;
unsigned force_persp_sample_interp:1;
unsigned force_linear_sample_interp:1;
unsigned force_persp_center_interp:1;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4b1ff94..7cea13e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -759,23 +759,23 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
else if (sel->info.uses_primid)
gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
else if (input_prim >= PIPE_PRIM_TRIANGLES)
gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
else
gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
unsigned num_user_sgprs;
if (es_type == PIPE_SHADER_VERTEX)
- num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_GS_NUM_USER_SGPR);
+ num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
else
- num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, va >> 40);
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
S_00B228_DX10_CLAMP(1) |
@@ -1291,20 +1291,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
}
break;
case PIPE_SHADER_GEOMETRY:
if (sctx->b.chip_class >= GFX9) {
if (sctx->tes_shader.cso) {
key->part.gs.es = sctx->tes_shader.cso;
} else {
si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
key, &key->part.gs.vs_prolog);
key->part.gs.es = sctx->vs_shader.cso;
+ key->part.gs.prolog.gfx9_prev_is_vs = 1;
}
/* Merged ES-GS can have unbalanced wave usage.
*
* ES threads are per-vertex, while GS threads are
* per-primitive. So without any amplification, there
* are fewer GS threads than ES threads, which can result
* in empty (no-op) GS waves. With too much amplification,
* there are more GS threads than ES threads, which
* can result in empty (no-op) ES waves.
--
2.7.4
More information about the mesa-dev
mailing list