[Mesa-dev] [PATCH 31/31] radeonsi: generate GS prolog to (partially) fix triangle strip adjacency rotation

Nicolai Hähnle nhaehnle at gmail.com
Mon Oct 31 22:11:18 UTC 2016


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

Fixes GL45-CTS.geometry_shader.adjacency.adjacency_indiced_triangle_strip and
others.

This leaves the case of triangle strips with adjacency and primitive restarts
open. It seems that the only thing that cares about that is a piglit test.
Fixing this efficiently would be really involved, and I don't want to use the
hammer of degrading to software handling of indices because there may well
be software that uses this draw mode (without caring about the precise
rotation of triangles).
---
 src/gallium/drivers/radeonsi/si_pipe.c          |   1 +
 src/gallium/drivers/radeonsi/si_pipe.h          |   2 +
 src/gallium/drivers/radeonsi/si_shader.c        | 112 ++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_shader.h        |  10 +++
 src/gallium/drivers/radeonsi/si_state_draw.c    |  18 ++++
 src/gallium/drivers/radeonsi/si_state_shaders.c |   7 +-
 6 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index bf3b442..bc633bb 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -665,20 +665,21 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	return 0;
 }
 
 static void si_destroy_screen(struct pipe_screen* pscreen)
 {
 	struct si_screen *sscreen = (struct si_screen *)pscreen;
 	struct si_shader_part *parts[] = {
 		sscreen->vs_prologs,
 		sscreen->vs_epilogs,
 		sscreen->tcs_epilogs,
+		sscreen->gs_prologs,
 		sscreen->ps_prologs,
 		sscreen->ps_epilogs
 	};
 	unsigned i;
 
 	if (!sscreen)
 		return;
 
 	if (!sscreen->b.ws->unref(sscreen->b.ws))
 		return;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index e7617bc..8e6a94d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -89,20 +89,21 @@ struct si_screen {
 	bool				has_ds_bpermute;
 
 	/* Whether shaders are monolithic (1-part) or separate (3-part). */
 	bool				use_monolithic_shaders;
 	bool				record_llvm_ir;
 
 	pipe_mutex			shader_parts_mutex;
 	struct si_shader_part		*vs_prologs;
 	struct si_shader_part		*vs_epilogs;
 	struct si_shader_part		*tcs_epilogs;
+	struct si_shader_part		*gs_prologs;
 	struct si_shader_part		*ps_prologs;
 	struct si_shader_part		*ps_epilogs;
 
 	/* Shader cache in memory.
 	 *
 	 * Design & limitations:
 	 * - The shader cache is per screen (= per process), never saved to
 	 *   disk, and skips redundant shader compilations from TGSI to bytecode.
 	 * - It can only be used with one-variant-per-shader support, in which
 	 *   case only the main (typically middle) part of shaders is cached.
@@ -312,20 +313,21 @@ struct si_context {
 	int			last_sh_base_reg;
 	int			last_primitive_restart_en;
 	int			last_restart_index;
 	int			last_gs_out_prim;
 	int			last_prim;
 	int			last_multi_vgt_param;
 	int			last_rast_prim;
 	unsigned		last_sc_line_stipple;
 	int			last_vtx_reuse_depth;
 	int			current_rast_prim; /* primitive type after TES, GS */
+	bool			gs_tri_strip_adj_fix;
 	unsigned		last_gsvs_itemsize;
 
 	/* Scratch buffer */
 	struct r600_resource	*scratch_buffer;
 	bool			emit_scratch_reloc;
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
 
 	struct r600_resource	*compute_scratch_buffer;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index fe15420..9141d62 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -6740,20 +6740,92 @@ static void si_get_ps_epilog_key(struct si_shader *shader,
 	struct tgsi_shader_info *info = &shader->selector->info;
 	memset(key, 0, sizeof(*key));
 	key->ps_epilog.colors_written = info->colors_written;
 	key->ps_epilog.writes_z = info->writes_z;
 	key->ps_epilog.writes_stencil = info->writes_stencil;
 	key->ps_epilog.writes_samplemask = info->writes_samplemask;
 	key->ps_epilog.states = shader->key.ps.epilog;
 }
 
 /**
+ * Build the GS prolog: pass every input register through and, if the key's
+ * tri_strip_adj_fix bit is set, rotate the vertex inputs of odd primitives.
+ */
+static void si_build_gs_prolog_function(struct si_shader_context *ctx,
+					union si_shader_part_key *key)
+{
+	const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
+	const unsigned num_vgprs = 8; /* input VGPRs forwarded by the prolog */
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMTypeRef params[32];
+	LLVMTypeRef returns[32];
+	LLVMValueRef func, ret;
+
+	for (unsigned i = 0; i < num_sgprs; ++i) {
+		params[i] = ctx->i32;
+		returns[i] = ctx->i32;
+	}
+
+	for (unsigned i = 0; i < num_vgprs; ++i) {
+		params[num_sgprs + i] = ctx->i32;
+		returns[num_sgprs + i] = ctx->f32; /* VGPRs: i32 in, f32 out */
+	}
+
+	/* Create the function. */
+	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
+			   params, num_sgprs + num_vgprs, num_sgprs - 1);
+	func = ctx->main_fn;
+
+	/* Copy inputs to outputs. This should be no-op, as the registers match,
+	 * but it will prevent the compiler from overwriting them unintentionally.
+	 */
+	ret = ctx->return_value;
+	for (unsigned i = 0; i < num_sgprs; i++) {
+		LLVMValueRef p = LLVMGetParam(func, i);
+		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
+	}
+	for (unsigned i = 0; i < num_vgprs; i++) {
+		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
+		p = LLVMBuildBitCast(builder, p, ctx->f32, ""); /* match the f32 return slot */
+		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
+	}
+
+	if (key->gs_prolog.states.tri_strip_adj_fix) {
+		/* Remap the input vertices for every other primitive. */
+		const unsigned vtx_params[6] = {
+			num_sgprs,
+			num_sgprs + 1,
+			num_sgprs + 3, /* + 2 is skipped; it is read as prim_id below */
+			num_sgprs + 4,
+			num_sgprs + 5,
+			num_sgprs + 6
+		};
+		LLVMValueRef prim_id, rotate;
+
+		prim_id = LLVMGetParam(func, num_sgprs + 2);
+		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); /* low bit of prim_id */
+
+		for (unsigned i = 0; i < 6; ++i) {
+			LLVMValueRef base, rotated, actual;
+			base = LLVMGetParam(func, vtx_params[i]);
+			rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]); /* slot i takes slot (i + 4) mod 6 */
+			actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
+			actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
+			ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], ""); /* overwrites the pass-through copy */
+		}
+	}
+
+	LLVMBuildRet(builder, ret);
+}
+
+/**
  * Given a list of shader part functions, build a wrapper function that
  * runs them in sequence to form a monolithic shader.
  */
 static void si_build_wrapper_function(struct si_shader_context *ctx,
 				      LLVMValueRef *parts,
 				      unsigned num_parts,
 				      unsigned main_part)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMBuilderRef builder = ctx->gallivm.builder;
@@ -7012,20 +7084,32 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 		LLVMValueRef parts[2];
 		union si_shader_part_key epilog_key;
 
 		parts[0] = ctx.main_fn;
 
 		si_get_vs_epilog_key(shader, &shader->key.tes.epilog, &epilog_key);
 		si_build_vs_epilog_function(&ctx, &epilog_key);
 		parts[1] = ctx.main_fn;
 
 		si_build_wrapper_function(&ctx, parts, 2, 0);
+	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
+		LLVMValueRef parts[2];
+		union si_shader_part_key prolog_key;
+
+		parts[1] = ctx.main_fn;
+
+		memset(&prolog_key, 0, sizeof(prolog_key));
+		prolog_key.gs_prolog.states = shader->key.gs.prolog;
+		si_build_gs_prolog_function(&ctx, &prolog_key);
+		parts[0] = ctx.main_fn;
+
+		si_build_wrapper_function(&ctx, parts, 2, 1);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
 		LLVMValueRef parts[3];
 		union si_shader_part_key prolog_key;
 		union si_shader_part_key epilog_key;
 		bool need_prolog;
 
 		si_get_ps_prolog_key(shader, &prolog_key, false);
 		need_prolog = si_need_ps_prolog(&prolog_key);
 
 		parts[need_prolog ? 1 : 0] = ctx.main_fn;
@@ -7200,20 +7284,23 @@ si_get_shader_part(struct si_screen *sscreen,
 	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
 	ctx.type = type;
 
 	switch (type) {
 	case PIPE_SHADER_VERTEX:
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		assert(!prolog);
 		shader.key.tcs.epilog = key->tcs_epilog.states;
 		break;
+	case PIPE_SHADER_GEOMETRY:
+		assert(prolog);
+		break;
 	case PIPE_SHADER_FRAGMENT:
 		if (prolog)
 			shader.key.ps.prolog = key->ps_prolog.states;
 		else
 			shader.key.ps.epilog = key->ps_epilog.states;
 		break;
 	default:
 		unreachable("bad shader part");
 	}
 
@@ -7524,20 +7611,41 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
 
 	shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
 					    PIPE_SHADER_TESS_CTRL, false,
 					    &epilog_key, tm, debug,
 					    si_build_tcs_epilog_function,
 					    "Tessellation Control Shader Epilog");
 	return shader->epilog != NULL;
 }
 
 /**
+ * Select and compile (or reuse) the GS prolog; returns false on failure.
+ */
+static bool si_shader_select_gs_parts(struct si_screen *sscreen,
+				      LLVMTargetMachineRef tm,
+				      struct si_shader *shader,
+				      struct pipe_debug_callback *debug)
+{
+	union si_shader_part_key prolog_key;
+
+	memset(&prolog_key, 0, sizeof(prolog_key)); /* zero the whole union before filling the GS bits */
+	prolog_key.gs_prolog.states = shader->key.gs.prolog;
+
+	shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
+					    PIPE_SHADER_GEOMETRY, true, /* true: this part is a prolog */
+					    &prolog_key, tm, debug,
+					    si_build_gs_prolog_function,
+					    "Geometry Shader Prolog");
+	return shader->prolog != NULL;
+}
+
+/**
  * Build the pixel shader prolog function. This handles:
  * - two-side color selection and interpolation
  * - overriding interpolation parameters for the API PS
  * - polygon stippling
  *
  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
  * overriden by other states. (e.g. per-sample interpolation)
  * Interpolated colors are stored after the preloaded VGPRs.
  */
 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
@@ -8040,20 +8148,24 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_CTRL:
 			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_EVAL:
 			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
 				return -1;
 			break;
+		case PIPE_SHADER_GEOMETRY:
+			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
+				return - 1;
+			break;
 		case PIPE_SHADER_FRAGMENT:
 			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
 				return -1;
 
 			/* Make sure we have at least as many VGPRs as there
 			 * are allocated inputs.
 			 */
 			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
 							shader->info.num_input_vgprs);
 			break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 91f9cbf..d8ab2a4 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -318,20 +318,24 @@ struct si_vs_epilog_bits {
 	 *   if PS doesn't read them
 	 */
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
 	unsigned	prim_mode:3;
 	uint64_t	inputs_to_copy;
 };
 
+struct si_gs_prolog_bits { /* Common GS bits between the shader key and the prolog key. */
+	unsigned	tri_strip_adj_fix:1; /* rotate vertex inputs of every other primitive */
+};
+
 /* Common PS bits between the shader key and the prolog key. */
 struct si_ps_prolog_bits {
 	unsigned	color_two_side:1;
 	unsigned	flatshade_colors:1;
 	unsigned	poly_stipple:1;
 	unsigned	force_persp_sample_interp:1;
 	unsigned	force_linear_sample_interp:1;
 	unsigned	force_persp_center_interp:1;
 	unsigned	force_linear_center_interp:1;
 	unsigned	bc_optimize_for_persp:1;
@@ -356,20 +360,23 @@ union si_shader_part_key {
 		unsigned	last_input:4;
 	} vs_prolog;
 	struct {
 		struct si_vs_epilog_bits states;
 		unsigned	prim_id_param_offset:5;
 	} vs_epilog;
 	struct {
 		struct si_tcs_epilog_bits states;
 	} tcs_epilog;
 	struct {
+		struct si_gs_prolog_bits states;
+	} gs_prolog;
+	struct {
 		struct si_ps_prolog_bits states;
 		unsigned	num_input_sgprs:5;
 		unsigned	num_input_vgprs:5;
 		/* Color interpolation and two-side color selection. */
 		unsigned	colors_read:8; /* color input components read */
 		unsigned	num_interp_inputs:5; /* BCOLOR is at this location */
 		unsigned	face_vgpr_index:5;
 		unsigned	wqm:1;
 		char		color_attr_index[2];
 		char		color_interp_vgpr_index[2]; /* -1 == constant */
@@ -394,20 +401,23 @@ union si_shader_key {
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
 	} vs;
 	struct {
 		struct si_tcs_epilog_bits epilog;
 	} tcs; /* tessellation control shader */
 	struct {
 		struct si_vs_epilog_bits epilog; /* same as VS */
 		unsigned	as_es:1; /* export shader */
 	} tes; /* tessellation evaluation shader */
+	struct {
+		struct si_gs_prolog_bits prolog;
+	} gs;
 };
 
 struct si_shader_config {
 	unsigned			num_sgprs;
 	unsigned			num_vgprs;
 	unsigned			spilled_sgprs;
 	unsigned			spilled_vgprs;
 	unsigned			lds_size;
 	unsigned			spi_ps_input_ena;
 	unsigned			spi_ps_input_addr;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index c0e2642..b934100 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -975,20 +975,38 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	else if (sctx->tes_shader.cso)
 		rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	else
 		rast_prim = info->mode;
 
 	if (rast_prim != sctx->current_rast_prim) {
 		sctx->current_rast_prim = rast_prim;
 		sctx->do_update_shaders = true;
 	}
 
+	if (sctx->gs_shader.cso) {
+		/* Determine whether the GS triangle strip adjacency fix should
+		 * be applied. Rotate every other triangle if
+		 * - triangle strips with adjacency are fed to the GS and
+		 * - primitive restart is disabled (the rotation doesn't help
+		 *   when the restart occurs after an odd number of triangles).
+		 */
+		bool gs_tri_strip_adj_fix =
+			!sctx->tcs_shader.cso && !sctx->tes_shader.cso &&
+			info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
+			!info->primitive_restart;
+
+		if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
+			sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
+			sctx->do_update_shaders = true;
+		}
+	}
+
 	if (sctx->do_update_shaders && !si_update_shaders(sctx))
 		return;
 
 	if (!si_upload_graphics_shader_descriptors(sctx))
 		return;
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
 		pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer);
 		ib.user_buffer = sctx->index_buffer.user_buffer;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index bd217f3..62609cf 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -885,20 +885,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		if (sel == sctx->fixed_func_tcs_shader.cso)
 			key->tcs.epilog.inputs_to_copy = sctx->vs_shader.cso->outputs_written;
 		break;
 	case PIPE_SHADER_TESS_EVAL:
 		if (sctx->gs_shader.cso)
 			key->tes.as_es = 1;
 		else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.epilog.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
+		key->gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
 		break;
 	case PIPE_SHADER_FRAGMENT: {
 		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 		struct si_state_blend *blend = sctx->queued.named.blend;
 
 		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
 		    sel->info.colors_written == 0x1)
 			key->ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
 
 		if (blend) {
@@ -1144,22 +1145,21 @@ void si_init_shader_selector_async(void *job, int thread_index)
 		if (!debug->async)
 			debug = NULL;
 	} else {
 		tm = sel->tm;
 	}
 
 	/* Compile the main shader part for use with a prolog and/or epilog.
 	 * If this fails, the driver will try to compile a monolithic shader
 	 * on demand.
 	 */
-	if (sel->type != PIPE_SHADER_GEOMETRY &&
-	    !sscreen->use_monolithic_shaders) {
+	if (!sscreen->use_monolithic_shaders) {
 		struct si_shader *shader = CALLOC_STRUCT(si_shader);
 		void *tgsi_binary;
 
 		if (!shader) {
 			fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
 			return;
 		}
 
 		shader->selector = sel;
 		si_parse_next_shader_property(&sel->info, &shader->key);
@@ -1190,22 +1190,21 @@ void si_init_shader_selector_async(void *job, int thread_index)
 				if (!si_shader_cache_insert_shader(sscreen, tgsi_binary, shader))
 					FREE(tgsi_binary);
 				pipe_mutex_unlock(sscreen->shader_cache_mutex);
 			}
 		}
 
 		sel->main_shader_part = shader;
 	}
 
 	/* Pre-compilation. */
-	if (sel->type == PIPE_SHADER_GEOMETRY ||
-	    sscreen->b.debug_flags & DBG_PRECOMPILE) {
+	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
 		struct si_shader_ctx_state state = {sel};
 		union si_shader_key key;
 
 		memset(&key, 0, sizeof(key));
 		si_parse_next_shader_property(&sel->info, &key);
 
 		/* Set reasonable defaults, so that the shader key doesn't
 		 * cause any code to be eliminated.
 		 */
 		switch (sel->type) {
-- 
2.7.4



More information about the mesa-dev mailing list