[Mesa-dev] [PATCH 36/61] radeonsi/gfx9: set registers and shader key for merged ES-GS

Fri Apr 28 11:19:11 UTC 2017

On 24.04.2017 10:45, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
>  src/gallium/drivers/radeonsi/si_shader.c        |   8 +
>  src/gallium/drivers/radeonsi/si_shader.h        |   2 +
>  src/gallium/drivers/radeonsi/si_state.c         |  22 +--
>  src/gallium/drivers/radeonsi/si_state_draw.c    |   3 +-
>  src/gallium/drivers/radeonsi/si_state_shaders.c | 210 ++++++++++++++++++++++--
>  5 files changed, 218 insertions(+), 27 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index ee8cae1..3a785c2 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -6905,20 +6905,28 @@ static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
>  		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
>  		fprintf(f, "  mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
>  		break;
>
>  	case PIPE_SHADER_TESS_EVAL:
>  		fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
>  		fprintf(f, "  as_es = %u\n", key->as_es);
>  		break;
>
>  	case PIPE_SHADER_GEOMETRY:
> +		if (shader->is_gs_copy_shader)
> +			break;
> +
> +		if (shader->selector->screen->b.chip_class >= GFX9 &&
> +		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
> +			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
> +					      "part.gs.vs_prolog", f);
> +		}
>  		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
>  		break;
>
>  	case PIPE_SHADER_COMPUTE:
>  		break;
>
>  	case PIPE_SHADER_FRAGMENT:
>  		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
>  		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
>  		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 75df99d..cc41174 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -412,20 +412,22 @@ struct si_shader_key {
>  		} vs;
>  		struct {
>  			struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
>  			struct si_shader_selector *ls;   /* for merged LS-HS */
>  			struct si_tcs_epilog_bits epilog;
>  		} tcs; /* tessellation control shader */
>  		struct {
>  			struct si_vs_epilog_bits epilog; /* same as VS */
>  		} tes; /* tessellation evaluation shader */
>  		struct {
> +			struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
> +			struct si_shader_selector *es;   /* for merged ES-GS */
>  			struct si_gs_prolog_bits prolog;
>  		} gs;
>  		struct {
>  			struct si_ps_prolog_bits prolog;
>  			struct si_ps_epilog_bits epilog;
>  		} ps;
>  	} part;
>
>  	/* These two are initially set according to the NEXT_SHADER property,
>  	 * or guessed if the property doesn't seem correct.
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index ec99326..39494cc 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -4324,22 +4324,24 @@ static void si_init_config(struct si_context *sctx)
>
>  	si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
>  	si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
>  	si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
>  	si_pm4_cmd_end(pm4, false);
>
>  	si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
>  	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
>
>  	/* FIXME calculate these values somehow ??? */
> -	si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
> -	si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
> +	if (sctx->b.chip_class <= VI) {
> +		si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
> +		si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
> +	}
>  	si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
>
>  	si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
>  	si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
>
>  	si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
>  	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
>  	if (sctx->b.chip_class < CIK)
>  		si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
>  			       S_008A14_CLIP_VTX_REORDER_ENA(1));
> @@ -4475,34 +4477,34 @@ static void si_init_config(struct si_context *sctx)
>  		si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
>  		si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
>  		si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
>  	} else {
>  		si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
>  		si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
>  		si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
>  	}
>
>  	if (sctx->b.chip_class >= CIK) {
> -		/* If this is 0, Bonaire can hang even if GS isn't being used.
> -		 * Other chips are unaffected. These are suboptimal values,
> -		 * but we don't use on-chip GS.
> -		 */
> -		si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
> -			       S_028A44_ES_VERTS_PER_SUBGRP(64) |
> -			       S_028A44_GS_PRIMS_PER_SUBGRP(4));
> -
>  		if (sctx->b.chip_class >= GFX9) {
>  			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
>  		} else {
>  			si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
>  			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
>  			si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
> +
> +			/* If this is 0, Bonaire can hang even if GS isn't being used.
> +			 * Other chips are unaffected. These are suboptimal values,
> +			 * but we don't use on-chip GS.
> +			 */
> +			si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
> +				       S_028A44_ES_VERTS_PER_SUBGRP(64) |
> +				       S_028A44_GS_PRIMS_PER_SUBGRP(4));
>  		}
>  		si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
>
>  		if (sscreen->b.info.num_good_compute_units /
>  		    (sscreen->b.info.max_se * sscreen->b.info.max_sh_per_se) <= 4) {
>  			/* Too few available compute units per SH. Disallowing
>  			 * VS to run on CU0 could hurt us more than late VS
>  			 * allocation would help.
>  			 *
>  			 * LATE_ALLOC_VS = 2 is the highest safe number.
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index de97c0e..393f64f 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -472,21 +472,22 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
>  		 (info->count_from_stream_output ||
>  		  si_num_prims_for_vertices(info) < primgroup_size));
>  	key.u.primitive_restart = info->primitive_restart;
>  	key.u.count_from_stream_output = info->count_from_stream_output != NULL;
>
>  	ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] |
>  			     S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
>
>  	if (sctx->gs_shader.cso) {
>  		/* GS requirement. */
> -		if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
> +		if (sctx->b.chip_class <= VI &&
> +		    SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
>  			ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
>
>  		/* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
>  		 * The hw doc says all multi-SE chips are affected, but Vulkan
>  		 * only applies it to Hawaii. Do what Vulkan does.
>  		 */
>  		if (sctx->b.family == CHIP_HAWAII &&
>  		    G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
>  		    (info->indirect ||
>  		     (info->instance_count > 1 &&
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 806d55b..b35bdfa 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -565,41 +565,149 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
>
>  	polaris_set_vgt_vertex_reuse(sscreen, shader, pm4);
>  }
>
>  /**
>   * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
>   * geometry shader.
>   */
>  static uint32_t si_vgt_gs_mode(struct si_shader_selector *sel)
>  {
> +	enum chip_class chip_class = sel->screen->b.chip_class;
>  	unsigned gs_max_vert_out = sel->gs_max_out_vertices;
>  	unsigned cut_mode;
>
>  	if (gs_max_vert_out <= 128) {
>  		cut_mode = V_028A40_GS_CUT_128;
>  	} else if (gs_max_vert_out <= 256) {
>  		cut_mode = V_028A40_GS_CUT_256;
>  	} else if (gs_max_vert_out <= 512) {
>  		cut_mode = V_028A40_GS_CUT_512;
>  	} else {
>  		assert(gs_max_vert_out <= 1024);
>  		cut_mode = V_028A40_GS_CUT_1024;
>  	}
>
>  	return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
>  	       S_028A40_CUT_MODE(cut_mode)|
> -	       S_028A40_ES_WRITE_OPTIMIZE(1) |
> -	       S_028A40_GS_WRITE_OPTIMIZE(1);
> +	       S_028A40_ES_WRITE_OPTIMIZE(chip_class <= VI) |
> +	       S_028A40_GS_WRITE_OPTIMIZE(1) |
> +	       S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
>  }
>
> -static void si_shader_gs(struct si_shader *shader)
> +struct gfx9_gs_info {
> +	unsigned es_verts_per_subgroup;
> +	unsigned gs_prims_per_subgroup;
> +	unsigned gs_inst_prims_in_subgroup;
> +	unsigned max_prims_per_subgroup;
> +	unsigned lds_size;
> +};
> +
> +static void gfx9_get_gs_info(struct si_shader_selector *es,
> +				   struct si_shader_selector *gs,
> +				   struct gfx9_gs_info *out)
> +{
> +	unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
> +	unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
> +	bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
> +			      input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
> +
> +	/* All these are in dwords: */
> +	/* We can't allow using the whole LDS, because GS waves compete with
> +	 * other shader stages for LDS space. */

Is this a strict requirement to prevent hangs? If so, couldn't the hang 
occur in other ways as well?

If it's just for performance, please note that in the comment.

> +	const unsigned max_lds_size = 8 * 1024;
> +	const unsigned esgs_itemsize = es->esgs_itemsize / 4;
> +	unsigned esgs_lds_size;
> +
> +	/* All these are per subgroup: */
> +	const unsigned max_out_prims = 32 * 1024;
> +	const unsigned max_es_verts = 255;

I assume the idea here is 4 waves to a CU, so why not 256? The hardware 
register even goes up to 2047.

> +	const unsigned ideal_gs_prims = 64;
> +	unsigned max_gs_prims, gs_prims;
> +	unsigned min_es_verts, es_verts, worst_case_es_verts;
> +
> +	assert(gs_num_invocations <= 32); /* GL maximum */
> +
> +	if (uses_adjacency || gs_num_invocations > 1)
> +		max_gs_prims = 127 / gs_num_invocations;
> +	else
> +		max_gs_prims = 255;

Same question as for max_es_verts here.

Also, why the different base number? For adjacency, I could imagine it's 
because you have basically double the number of vertices per primitive, 
so you have fewer GS invocations. But why the same reduction of the base 
number when gs_num_invocations > 1?

> +
> +	/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
> +	 * Make sure we don't go over the maximum value.
> +	 */
> +	max_gs_prims = MIN2(max_gs_prims,
> +			    max_out_prims /
> +			    (gs->gs_max_out_vertices * gs_num_invocations));
> +	assert(max_gs_prims > 0);
> +
> +	/* If the primitive has adjacency, halve the number of vertices
> +	 * that will be reused in multiple primitives.
> +	 */
> +	min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);

I don't understand this. In the worst case, you have e.g. a single 
triangle with adjacency which needs 6 ES vertices, and this is already 
reflected in gs_input_verts_per_prim.

I see another reference below about vertex re-use, but I don't see how 
that applies to LINES_ADJACENCY and TRIANGLES_ADJACENCY.

> +
> +	gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
> +	worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
> +
> +	/* Compute ESGS LDS size based on the worst case number of ES vertices
> +	 * needed to create the target number of GS prims per subgroup.
> +	 */
> +	esgs_lds_size = esgs_itemsize * worst_case_es_verts;
> +
> +	/* If total LDS usage is too big, refactor partitions based on ratio
> +	 * of ESGS item sizes.
> +	 */
> +	if (esgs_lds_size > max_lds_size) {
> +		/* Our target GS Prims Per Subgroup was too large. Calculate
> +		 * the maximum number of GS Prims Per Subgroup that will fit
> +		 * into LDS, capped by the maximum that the hardware can support.
> +		 */
> +		gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
> +				max_gs_prims);
> +		assert(gs_prims > 0);
> +		worst_case_es_verts = MIN2(min_es_verts * gs_prims,
> +					   max_es_verts);
> +
> +		esgs_lds_size = esgs_itemsize * worst_case_es_verts;
> +		assert(esgs_lds_size <= max_lds_size);
> +	}
> +
> +	/* Now calculate remaining ESGS information. */
> +	if (esgs_lds_size)
> +		es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
> +	else
> +		es_verts = max_es_verts;
> +
> +	/* Vertices for adjacency primitives are not always reused, so restore
> +	 * it for ES_VERTS_PER_SUBGRP.
> +	 */
> +	min_es_verts = gs->gs_input_verts_per_prim;
> +
> +	/* For normal primitives, the VGT only checks if they are past the ES

What are "normal" primitives?

> +	 * verts per subgroup after allocating a full GS primitive and if they
> +	 * are, kick off a new subgroup.  But if those additional ES verts are
> +	 * unique (e.g. not reused) we need to make sure there is enough LDS
> +	 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
> +	 */
> +	es_verts -= min_es_verts - 1;
> +
> +	out->es_verts_per_subgroup = es_verts;
> +	out->gs_prims_per_subgroup = gs_prims;
> +	out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
> +	out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup *
> +				      gs->gs_max_out_vertices;
> +	out->lds_size = align(esgs_lds_size, 128) / 128;
> +
> +	assert(out->max_prims_per_subgroup <= max_out_prims);
> +}
> +
> +static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
>  {
>  	struct si_shader_selector *sel = shader->selector;
>  	const ubyte *num_components = sel->info.num_stream_output_components;
>  	unsigned gs_num_invocations = sel->gs_num_invocations;
>  	struct si_pm4_state *pm4;
>  	uint64_t va;
>  	unsigned max_stream = sel->max_gs_stream;
>  	unsigned offset;
>
>  	pm4 = si_get_shader_pm4_state(shader);
> @@ -614,44 +722,99 @@ static void si_shader_gs(struct si_shader *shader)
>  	if (max_stream >= 2)
>  		offset += num_components[2] * sel->gs_max_out_vertices;
>  	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
>  	if (max_stream >= 3)
>  		offset += num_components[3] * sel->gs_max_out_vertices;
>  	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
>
>  	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
>  	assert(offset < (1 << 15));
>
> -	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices);
> +	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
>
>  	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
>  	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
>  	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
>  	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
>
>  	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
>  		       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
>  		       S_028B90_ENABLE(gs_num_invocations > 0));
>
>  	va = shader->bo->gpu_address;
>  	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
> -	si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
> -	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
> -
> -	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
> -		       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
> -		       S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
> -		       S_00B228_DX10_CLAMP(1) |
> -		       S_00B228_FLOAT_MODE(shader->config.float_mode));
> -	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
> -		       S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
> -		       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
> +
> +	if (sscreen->b.chip_class >= GFX9) {
> +		unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
> +		unsigned es_type = shader->key.part.gs.es->type;
> +		unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
> +		struct gfx9_gs_info gs_info;
> +
> +		if (es_type == PIPE_SHADER_VERTEX)
> +			es_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
> +		else if (es_type == PIPE_SHADER_TESS_EVAL)
> +			es_vgpr_comp_cnt = 3; /* all components are needed for TES */
> +		else
> +			unreachable("invalid shader selector type");
> +
> +		/* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
> +		 * VGPR[0:4] are always loaded.
> +		 */
> +		if (sel->info.uses_invocationid)
> +			gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
> +		else if (sel->info.uses_primid)
> +			gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
> +		else if (input_prim >= PIPE_PRIM_TRIANGLES)
> +			gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
> +		else
> +			gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
> +
> +		gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
> +
> +		si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
> +		si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, va >> 40);
> +
> +		si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
> +			       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
> +			       S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
> +			       S_00B228_DX10_CLAMP(1) |
> +			       S_00B228_FLOAT_MODE(shader->config.float_mode) |
> +			       S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
> +		si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
> +			       S_00B22C_USER_SGPR(GFX9_GS_NUM_USER_SGPR) |
> +			       S_00B22C_USER_SGPR_MSB(GFX9_GS_NUM_USER_SGPR >> 5) |
> +			       S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
> +			       S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
> +			       S_00B22C_LDS_SIZE(gs_info.lds_size) |
> +			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
> +
> +		si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
> +			       S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) |
> +			       S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) |
> +			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup));
> +		si_pm4_set_reg(pm4, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
> +			       S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup));
> +		si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
> +			       shader->key.part.gs.es->esgs_itemsize / 4);
> +	} else {
> +		si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
> +		si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
> +
> +		si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
> +			       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
> +			       S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
> +			       S_00B228_DX10_CLAMP(1) |
> +			       S_00B228_FLOAT_MODE(shader->config.float_mode));
> +		si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
> +			       S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
> +			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
> +	}
>  }
>
>  /**
>   * Compute the state for \p shader, which will run as a vertex shader on the
>   * hardware.
>   *
>   * If \p gs is non-NULL, it points to the geometry shader for which this shader
>   * is the copy shader.
>   */
>  static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
> @@ -961,21 +1124,21 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen,
>  	case PIPE_SHADER_TESS_CTRL:
>  		si_shader_hs(sscreen, shader);
>  		break;
>  	case PIPE_SHADER_TESS_EVAL:
>  		if (shader->key.as_es)
>  			si_shader_es(sscreen, shader);
>  		else
>  			si_shader_vs(sscreen, shader, NULL);
>  		break;
>  	case PIPE_SHADER_GEOMETRY:
> -		si_shader_gs(shader);
> +		si_shader_gs(sscreen, shader);
>  		break;
>  	case PIPE_SHADER_FRAGMENT:
>  		si_shader_ps(shader);
>  		break;
>  	default:
>  		assert(0);
>  	}
>  }
>
>  static unsigned si_get_alpha_test_func(struct si_context *sctx)
> @@ -1100,20 +1263,29 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
>  		if (sctx->gs_shader.cso)
>  			key->as_es = 1;
>  		else {
>  			si_shader_selector_key_hw_vs(sctx, sel, key);
>
>  			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
>  				key->part.tes.epilog.export_prim_id = 1;
>  		}
>  		break;
>  	case PIPE_SHADER_GEOMETRY:
> +		if (sctx->b.chip_class >= GFX9) {
> +			if (sctx->tes_shader.cso) {
> +				key->part.gs.es = sctx->tes_shader.cso;
> +			} else {
> +				si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
> +							  key, &key->part.gs.vs_prolog);
> +				key->part.gs.es = sctx->vs_shader.cso;
> +			}
> +		}
>  		key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
>  		break;
>  	case PIPE_SHADER_FRAGMENT: {
>  		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>  		struct si_state_blend *blend = sctx->queued.named.blend;
>
>  		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
>  		    sel->info.colors_written == 0x1)
>  			key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
>
> @@ -1721,20 +1893,26 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
>  				break;
>  			case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */
>  			case TGSI_SEMANTIC_EDGEFLAG:
>  				break;
>  			default:
>  				sel->outputs_written2 |=
>  					1u << si_shader_io_get_unique_index2(name, index);
>  			}
>  		}
>  		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
> +
> +		/* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
> +		 * conflicts, i.e. each vertex will start at a different bank.
> +		 */
> +		if (sctx->b.chip_class >= GFX9)
> +			sel->esgs_itemsize += 4;

Could this not be achieved by some form of rounding instead?

Cheers,
Nicolai

>  		break;
>
>  	case PIPE_SHADER_FRAGMENT:
>  		for (i = 0; i < sel->info.num_inputs; i++) {
>  			unsigned name = sel->info.input_semantic_name[i];
>  			unsigned index = sel->info.input_semantic_index[i];
>
>  			switch (name) {
>  			case TGSI_SEMANTIC_CLIPDIST:
>  			case TGSI_SEMANTIC_GENERIC:
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.