[Mesa-dev] [PATCH 57/61] radeonsi: remove VS epilog code, compile VS with PrimID export on demand

Mon Apr 24 08:45:54 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

The use of PrimID in the pixel shader is too rare to justify such
a sizable amount of support code.

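The initial idea of the VS epilog was to move the clipping code there and
remove it based on states, but optimized variants are now used to do that
and are easier to support, so the VS epilog has turned out not to be very
useful.

The PrimID export is now emitted by the main shader part when the new
mono.vs_export_prim_id key bit is set. The bit is set for both VS and TES
when the pixel shader reads PrimID and no GS is bound.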
---
 src/gallium/drivers/radeonsi/si_pipe.c          |   1 -
 src/gallium/drivers/radeonsi/si_pipe.h          |  11 --
 src/gallium/drivers/radeonsi/si_shader.c        | 208 +++---------------------
 src/gallium/drivers/radeonsi/si_shader.h        |  15 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |   6 +-
 5 files changed, 31 insertions(+), 210 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 1b013c4..1a83564 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -691,21 +691,20 @@ static int si_get_shader_param(struct pipe_screen* pscreen,
 		return 0;
 	}
 	return 0;
 }
 
 static void si_destroy_screen(struct pipe_screen* pscreen)
 {
 	struct si_screen *sscreen = (struct si_screen *)pscreen;
 	struct si_shader_part *parts[] = {
 		sscreen->vs_prologs,
-		sscreen->vs_epilogs,
 		sscreen->tcs_epilogs,
 		sscreen->gs_prologs,
 		sscreen->ps_prologs,
 		sscreen->ps_epilogs
 	};
 	unsigned i;
 
 	if (!sscreen->b.ws->unref(sscreen->b.ws))
 		return;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 918aa0f..ea61e1e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -80,21 +80,20 @@ struct si_screen {
 	bool				has_draw_indirect_multi;
 	bool				has_ds_bpermute;
 	bool				has_msaa_sample_loc_bug;
 
 	/* Whether shaders are monolithic (1-part) or separate (3-part). */
 	bool				use_monolithic_shaders;
 	bool				record_llvm_ir;
 
 	mtx_t			shader_parts_mutex;
 	struct si_shader_part		*vs_prologs;
-	struct si_shader_part		*vs_epilogs;
 	struct si_shader_part		*tcs_epilogs;
 	struct si_shader_part		*gs_prologs;
 	struct si_shader_part		*ps_prologs;
 	struct si_shader_part		*ps_epilogs;
 
 	/* Shader cache in memory.
 	 *
 	 * Design & limitations:
 	 * - The shader cache is per screen (= per process), never saved to
 	 *   disk, and skips redundant shader compilations from TGSI to bytecode.
@@ -502,30 +501,20 @@ static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
 static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 {
 	if (sctx->gs_shader.current)
 		return sctx->gs_shader.cso->gs_copy_shader;
 	else if (sctx->tes_shader.current)
 		return sctx->tes_shader.current;
 	else
 		return sctx->vs_shader.current;
 }
 
-static inline bool si_vs_exports_prim_id(struct si_shader *shader)
-{
-	if (shader->selector->type == PIPE_SHADER_VERTEX)
-		return shader->key.part.vs.epilog.export_prim_id;
-	else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-		return shader->key.part.tes.epilog.export_prim_id;
-	else
-		return false;
-}
-
 static inline unsigned
 si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
 {
 	unsigned alignment, tcc_cache_line_size;
 
 	/* If the upload size is less than the cache line size (e.g. 16, 32),
 	 * the whole thing will fit into a cache line if we align it to its size.
 	 * The idea is that multiple small uploads can share a cache line.
 	 * If the upload size is greater, align it to the cache line size.
 	 */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ce509af..13cbd0a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -68,39 +68,32 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 				 struct lp_build_tgsi_context *bld_base,
 				 struct lp_build_emit_data *emit_data);
 
 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
 			       FILE *f);
 
 static unsigned llvm_get_type_size(LLVMTypeRef type);
 
 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key);
-static void si_build_vs_epilog_function(struct si_shader_context *ctx,
-					union si_shader_part_key *key);
 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
 					 union si_shader_part_key *key);
 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key);
 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key);
 
 /* Ideally pass the sample mask input to the PS epilog as v13, which
  * is its usual location, so that the shader doesn't have to add v_mov.
  */
 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
 
-/* The VS location of the PrimitiveID input is the same in the epilog,
- * so that the main shader part doesn't have to move it.
- */
-#define VS_EPILOG_PRIMID_LOC 2
-
 enum {
 	CONST_ADDR_SPACE = 2,
 	LOCAL_ADDR_SPACE = 3,
 };
 
 static bool is_merged_shader(struct si_shader *shader)
 {
 	if (shader->selector->screen->b.chip_class <= VI)
 		return false;
 
@@ -2982,33 +2975,39 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
 		outputs[i].semantic_index = info->output_semantic_index[i];
 
 		for (j = 0; j < 4; j++) {
 			outputs[i].values[j] =
 				LLVMBuildLoad(gallivm->builder,
 					      ctx->outputs[i][j],
 					      "");
 			outputs[i].vertex_stream[j] =
 				(info->output_streams[i] >> (2 * j)) & 3;
 		}
-
 	}
 
-	/* Return the primitive ID from the LLVM function. */
-	ctx->return_value =
-		LLVMBuildInsertValue(gallivm->builder,
-				     ctx->return_value,
-				     bitcast(bld_base, TGSI_TYPE_FLOAT,
-					     get_primitive_id(bld_base, 0)),
-				     VS_EPILOG_PRIMID_LOC, "");
-
 	if (ctx->shader->selector->so.num_outputs)
 		si_llvm_emit_streamout(ctx, outputs, i, 0);
+
+	/* Export PrimitiveID. */
+	if (ctx->shader->key.mono.vs_export_prim_id) {
+		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+		outputs[i].semantic_index = 0;
+		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+					       get_primitive_id(bld_base, 0));
+		for (j = 1; j < 4; j++)
+			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
+
+		memset(outputs[i].vertex_stream, 0,
+		       sizeof(outputs[i].vertex_stream));
+		i++;
+	}
+
 	si_llvm_export_vs(bld_base, outputs, i);
 	FREE(outputs);
 }
 
 struct si_ps_exports {
 	unsigned num;
 	struct ac_export_args args[10];
 };
 
 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
@@ -5916,27 +5915,20 @@ static void create_function(struct si_shader_context *ctx)
 			/* The locations of the other parameters are assigned dynamically. */
 			declare_streamout_params(ctx, &shader->selector->so,
 						 params, ctx->i32, &num_params);
 		}
 
 		last_sgpr = num_params-1;
 
 		/* VGPRs */
 		declare_vs_input_vgprs(ctx, params, &num_params,
 				       &num_prolog_vgprs);
-
-		/* PrimitiveID output. */
-		if (!shader->is_gs_copy_shader &&
-		    !shader->key.as_es && !shader->key.as_ls) {
-			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
-				returns[num_returns++] = ctx->f32;
-		}
 		break;
 
 	case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
 		declare_default_desc_pointers(ctx, params, &num_params);
 		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
 		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
 		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
 		params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
 		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
 		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
@@ -6076,25 +6068,20 @@ static void create_function(struct si_shader_context *ctx)
 		} else {
 			params[num_params++] = ctx->i32;
 			declare_streamout_params(ctx, &shader->selector->so,
 						 params, ctx->i32, &num_params);
 			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
 		}
 		last_sgpr = num_params - 1;
 
 		/* VGPRs */
 		declare_tes_input_vgprs(ctx, params, &num_params);
-
-		/* PrimitiveID output. */
-		if (!shader->key.as_es)
-			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
-				returns[num_returns++] = ctx->f32;
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
 		declare_default_desc_pointers(ctx, params, &num_params);
 		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
 		params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
 		last_sgpr = num_params - 1;
 
 		/* VGPRs */
 		params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
@@ -7036,36 +7023,37 @@ static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
 	struct si_shader_key *key = &shader->key;
 
 	fprintf(f, "SHADER KEY\n");
 
 	switch (processor) {
 	case PIPE_SHADER_VERTEX:
 		si_dump_shader_key_vs(key, &key->part.vs.prolog,
 				      "part.vs.prolog", f);
 		fprintf(f, "  as_es = %u\n", key->as_es);
 		fprintf(f, "  as_ls = %u\n", key->as_ls);
-		fprintf(f, "  part.vs.epilog.export_prim_id = %u\n",
-			key->part.vs.epilog.export_prim_id);
+		fprintf(f, "  mono.vs_export_prim_id = %u\n",
+			key->mono.vs_export_prim_id);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
 		if (shader->selector->screen->b.chip_class >= GFX9) {
 			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
 					      "part.tcs.ls_prolog", f);
 		}
 		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
 		fprintf(f, "  mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
 		break;
 
 	case PIPE_SHADER_TESS_EVAL:
-		fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
 		fprintf(f, "  as_es = %u\n", key->as_es);
+		fprintf(f, "  mono.vs_export_prim_id = %u\n",
+			key->mono.vs_export_prim_id);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
 		if (shader->is_gs_copy_shader)
 			break;
 
 		if (shader->selector->screen->b.chip_class >= GFX9 &&
 		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
 			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
 					      "part.gs.vs_prolog", f);
@@ -7518,42 +7506,20 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
 		key->vs_prolog.num_merged_next_stage_vgprs = 5;
 	}
 
 	/* Set the instanceID flag. */
 	for (unsigned i = 0; i < info->num_inputs; i++)
 		if (key->vs_prolog.states.instance_divisors[i])
 			shader_out->info.uses_instanceid = true;
 }
 
 /**
- * Compute the VS epilog key, which contains all the information needed to
- * build the VS epilog function, and set the PrimitiveID output offset.
- */
-static void si_get_vs_epilog_key(struct si_shader *shader,
-				 struct si_vs_epilog_bits *states,
-				 union si_shader_part_key *key)
-{
-	memset(key, 0, sizeof(*key));
-	key->vs_epilog.states = *states;
-
-	/* Set up the PrimitiveID output. */
-	if (shader->key.part.vs.epilog.export_prim_id) {
-		unsigned index = shader->selector->info.num_outputs;
-		unsigned offset = shader->info.nr_param_exports++;
-
-		key->vs_epilog.prim_id_param_offset = offset;
-		assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
-		shader->info.vs_output_param_offset[index] = offset;
-	}
-}
-
-/**
  * Compute the PS prolog key, which contains all the information needed to
  * build the PS prolog function, and set related bits in shader->config.
  */
 static void si_get_ps_prolog_key(struct si_shader *shader,
 				 union si_shader_part_key *key,
 				 bool separate_prolog)
 {
 	struct tgsi_shader_info *info = &shader->selector->info;
 
 	memset(key, 0, sizeof(*key));
@@ -8091,48 +8057,37 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	shader->info.uses_instanceid = sel->info.uses_instanceid;
 
 	ctx.load_system_value = declare_system_value;
 
 	if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
 		si_llvm_dispose(&ctx);
 		return -1;
 	}
 
 	if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
-		LLVMValueRef parts[3];
-		bool need_prolog;
-		bool need_epilog;
-
-		need_prolog = sel->vs_needs_prolog;
-		need_epilog = !shader->key.as_es && !shader->key.as_ls;
+		LLVMValueRef parts[2];
+		bool need_prolog = sel->vs_needs_prolog;
 
-		parts[need_prolog ? 1 : 0] = ctx.main_fn;
+		parts[1] = ctx.main_fn;
 
 		if (need_prolog) {
 			union si_shader_part_key prolog_key;
 			si_get_vs_prolog_key(&sel->info,
 					     shader->info.num_input_sgprs,
 					     &shader->key.part.vs.prolog,
 					     shader, &prolog_key);
 			si_build_vs_prolog_function(&ctx, &prolog_key);
 			parts[0] = ctx.main_fn;
 		}
 
-		if (need_epilog) {
-			union si_shader_part_key epilog_key;
-			si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
-			si_build_vs_epilog_function(&ctx, &epilog_key);
-			parts[need_prolog ? 2 : 1] = ctx.main_fn;
-		}
-
-		si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
-					  need_prolog ? 1 : 0, 0);
+		si_build_wrapper_function(&ctx, parts + !need_prolog,
+					  1 + need_prolog, need_prolog, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
 		if (sscreen->b.chip_class >= GFX9) {
 			struct si_shader_selector *ls = shader->key.part.tcs.ls;
 			LLVMValueRef parts[4];
 
 			/* TCS main part */
 			parts[2] = ctx.main_fn;
 
 			/* TCS epilog */
 			union si_shader_part_key tcs_epilog_key;
@@ -8182,32 +8137,20 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 
 			parts[0] = ctx.main_fn;
 
 			memset(&epilog_key, 0, sizeof(epilog_key));
 			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
 			si_build_tcs_epilog_function(&ctx, &epilog_key);
 			parts[1] = ctx.main_fn;
 
 			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
 		}
-	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
-		   !shader->key.as_es) {
-		LLVMValueRef parts[2];
-		union si_shader_part_key epilog_key;
-
-		parts[0] = ctx.main_fn;
-
-		si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
-		si_build_vs_epilog_function(&ctx, &epilog_key);
-		parts[1] = ctx.main_fn;
-
-		si_build_wrapper_function(&ctx, parts, 2, 0, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
 		if (ctx.screen->b.chip_class >= GFX9) {
 			struct si_shader_selector *es = shader->key.part.gs.es;
 			LLVMValueRef es_prolog = NULL;
 			LLVMValueRef es_main = NULL;
 			LLVMValueRef gs_prolog = NULL;
 			LLVMValueRef gs_main = ctx.main_fn;
 
 			/* GS prolog */
 			union si_shader_part_key gs_prolog_key;
@@ -8593,71 +8536,20 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 		}
 
 		index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
 					   num_params++, "");
 	}
 
 	si_llvm_build_ret(ctx, ret);
 }
 
-/**
- * Build the vertex shader epilog function. This is also used by the tessellation
- * evaluation shader compiled as VS.
- *
- * The input is PrimitiveID.
- *
- * If PrimitiveID is required by the pixel shader, export it.
- * Otherwise, do nothing.
- */
-static void si_build_vs_epilog_function(struct si_shader_context *ctx,
-					union si_shader_part_key *key)
-{
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-	LLVMTypeRef params[5];
-	int num_params, i;
-
-	/* Declare input VGPRs. */
-	num_params = key->vs_epilog.states.export_prim_id ?
-			   (VS_EPILOG_PRIMID_LOC + 1) : 0;
-	assert(num_params <= ARRAY_SIZE(params));
-
-	for (i = 0; i < num_params; i++)
-		params[i] = ctx->f32;
-
-	/* Create the function. */
-	si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
-
-	/* Emit exports. */
-	if (key->vs_epilog.states.export_prim_id) {
-		struct lp_build_context *base = &bld_base->base;
-		struct ac_export_args args;
-
-		args.enabled_channels = 0x1; /* enabled channels */
-		args.valid_mask = 0; /* whether the EXEC mask is valid */
-		args.done = 0; /* DONE bit */
-		args.target = V_008DFC_SQ_EXP_PARAM +
-			      key->vs_epilog.prim_id_param_offset;
-		args.compr = 0; /* COMPR flag (0 = 32-bit export) */
-		args.out[0] = LLVMGetParam(ctx->main_fn,
-				       VS_EPILOG_PRIMID_LOC); /* X */
-		args.out[1] = base->undef; /* Y */
-		args.out[2] = base->undef; /* Z */
-		args.out[3] = base->undef; /* W */
-
-		ac_build_export(&ctx->ac, &args);
-	}
-
-	LLVMBuildRetVoid(gallivm->builder);
-}
-
 static bool si_get_vs_prolog(struct si_screen *sscreen,
 			     LLVMTargetMachineRef tm,
 			     struct si_shader *shader,
 			     struct pipe_debug_callback *debug,
 			     struct si_shader *main_part,
 			     const struct si_vs_prolog_bits *key)
 {
 	struct si_shader_selector *vs = main_part->selector;
 
 	/* The prolog is a no-op if there are no inputs. */
@@ -8671,75 +8563,29 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
 
 	shader->prolog =
 		si_get_shader_part(sscreen, &sscreen->vs_prologs,
 				   PIPE_SHADER_VERTEX, true, &prolog_key, tm,
 				   debug, si_build_vs_prolog_function,
 				   "Vertex Shader Prolog");
 	return shader->prolog != NULL;
 }
 
 /**
- * Create & compile a vertex shader epilog. This a helper used by VS and TES.
- */
-static bool si_get_vs_epilog(struct si_screen *sscreen,
-			     LLVMTargetMachineRef tm,
-		             struct si_shader *shader,
-		             struct pipe_debug_callback *debug,
-			     struct si_vs_epilog_bits *states)
-{
-	union si_shader_part_key epilog_key;
-
-	si_get_vs_epilog_key(shader, states, &epilog_key);
-
-	shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
-					    PIPE_SHADER_VERTEX, true,
-					    &epilog_key, tm, debug,
-					    si_build_vs_epilog_function,
-					    "Vertex Shader Epilog");
-	return shader->epilog != NULL;
-}
-
-/**
  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
  */
 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
 				      LLVMTargetMachineRef tm,
 				      struct si_shader *shader,
 				      struct pipe_debug_callback *debug)
 {
-	if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
-			      &shader->key.part.vs.prolog))
-		return false;
-
-	/* Get the epilog. */
-	if (!shader->key.as_es && !shader->key.as_ls &&
-	    !si_get_vs_epilog(sscreen, tm, shader, debug,
-			      &shader->key.part.vs.epilog))
-		return false;
-
-	return true;
-}
-
-/**
- * Select and compile (or reuse) TES parts (epilog).
- */
-static bool si_shader_select_tes_parts(struct si_screen *sscreen,
-				       LLVMTargetMachineRef tm,
-				       struct si_shader *shader,
-				       struct pipe_debug_callback *debug)
-{
-	if (shader->key.as_es)
-		return true;
-
-	/* TES compiled as VS. */
-	return si_get_vs_epilog(sscreen, tm, shader, debug,
-				&shader->key.part.tes.epilog);
+	return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
+				&shader->key.part.vs.prolog);
 }
 
 /**
  * Compile the TCS epilog function. This writes tesselation factors to memory
  * based on the output primitive type of the tesselator (determined by TES).
  */
 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
 					 union si_shader_part_key *key)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
@@ -9389,22 +9235,20 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		switch (sel->type) {
 		case PIPE_SHADER_VERTEX:
 			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_CTRL:
 			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_EVAL:
-			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
-				return -1;
 			break;
 		case PIPE_SHADER_GEOMETRY:
 			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_FRAGMENT:
 			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
 				return -1;
 
 			/* Make sure we have at least as many VGPRs as there
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 6bca7f8..d051715 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -383,25 +383,20 @@ struct si_shader_selector {
  *      - with both: -> | HS  | ->  | GS | VS | PS
  *
  * -> = merged with the next stage
  */
 
 /* Common VS bits between the shader key and the prolog key. */
 struct si_vs_prolog_bits {
 	unsigned	instance_divisors[SI_MAX_ATTRIBS];
 };
 
-/* Common VS bits between the shader key and the epilog key. */
-struct si_vs_epilog_bits {
-	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
-};
-
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
 	unsigned	prim_mode:3;
 	unsigned	tes_reads_tess_factors:1;
 };
 
 struct si_gs_prolog_bits {
 	unsigned	tri_strip_adj_fix:1;
 };
 
@@ -435,24 +430,20 @@ union si_shader_part_key {
 		struct si_vs_prolog_bits states;
 		unsigned	num_input_sgprs:6;
 		/* For merged stages such as LS-HS, HS input VGPRs are first. */
 		unsigned	num_merged_next_stage_vgprs:3;
 		unsigned	last_input:4;
 		unsigned	as_ls:1;
 		/* Prologs for monolithic shaders shouldn't set EXEC. */
 		unsigned	is_monolithic:1;
 	} vs_prolog;
 	struct {
-		struct si_vs_epilog_bits states;
-		unsigned	prim_id_param_offset:5;
-	} vs_epilog;
-	struct {
 		struct si_tcs_epilog_bits states;
 	} tcs_epilog;
 	struct {
 		struct si_gs_prolog_bits states;
 		/* Prologs of monolithic shaders shouldn't set EXEC. */
 		unsigned	is_monolithic:1;
 	} gs_prolog;
 	struct {
 		struct si_ps_prolog_bits states;
 		unsigned	num_input_sgprs:6;
@@ -472,31 +463,27 @@ union si_shader_part_key {
 		unsigned	writes_stencil:1;
 		unsigned	writes_samplemask:1;
 	} ps_epilog;
 };
 
 struct si_shader_key {
 	/* Prolog and epilog flags. */
 	union {
 		struct {
 			struct si_vs_prolog_bits prolog;
-			struct si_vs_epilog_bits epilog;
 		} vs;
 		struct {
 			struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
 			struct si_shader_selector *ls;   /* for merged LS-HS */
 			struct si_tcs_epilog_bits epilog;
 		} tcs; /* tessellation control shader */
 		struct {
-			struct si_vs_epilog_bits epilog; /* same as VS */
-		} tes; /* tessellation evaluation shader */
-		struct {
 			struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
 			struct si_shader_selector *es;   /* for merged ES-GS */
 			struct si_gs_prolog_bits prolog;
 		} gs;
 		struct {
 			struct si_ps_prolog_bits prolog;
 			struct si_ps_epilog_bits epilog;
 		} ps;
 	} part;
 
@@ -504,20 +491,22 @@ struct si_shader_key {
 	 * or guessed if the property doesn't seem correct.
 	 */
 	unsigned as_es:1; /* export shader, which precedes GS */
 	unsigned as_ls:1; /* local shader, which precedes TCS */
 
 	/* Flags for monolithic compilation only. */
 	struct {
 		/* One byte for every input: SI_FIX_FETCH_* enums. */
 		uint8_t		vs_fix_fetch[SI_MAX_ATTRIBS];
 		uint64_t	ff_tcs_inputs_to_copy; /* for fixed-func TCS */
+		/* When PS needs PrimID and GS is disabled. */
+		unsigned	vs_export_prim_id:1;
 	} mono;
 
 	/* Optimization flags for asynchronous compilation only. */
 	struct {
 		struct {
 			uint64_t	kill_outputs; /* "get_unique_index" bits */
 			uint32_t	kill_outputs2; /* "get_unique_index2" bits */
 			unsigned	clip_disable:1;
 		} hw_vs; /* HW VS (it can be VS, TES, GS) */
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 0c997e8..f9cd5c3 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -845,21 +845,21 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                          struct si_shader_selector *gs)
 {
 	struct si_pm4_state *pm4;
 	unsigned num_user_sgprs;
 	unsigned nparams, vgpr_comp_cnt;
 	uint64_t va;
 	unsigned oc_lds_en;
 	unsigned window_space =
 	   shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-	bool enable_prim_id = si_vs_exports_prim_id(shader);
+	bool enable_prim_id = shader->key.mono.vs_export_prim_id;
 
 	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	/* We always write VGT_GS_MODE in the VS state, because every switch
 	 * between different shader pipelines involving a different GS or no
 	 * GS at all involves a switch of the VS (different GS use different
 	 * copy shaders). On the other hand, when the API switches from a GS to
 	 * no GS and then back to the same GS used originally, the GS state is
@@ -1263,21 +1263,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
 
 		if (sctx->tes_shader.cso)
 			key->as_ls = 1;
 		else if (sctx->gs_shader.cso)
 			key->as_es = 1;
 		else {
 			si_shader_selector_key_hw_vs(sctx, sel, key);
 
 			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-				key->part.vs.epilog.export_prim_id = 1;
+				key->mono.vs_export_prim_id = 1;
 		}
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		if (sctx->b.chip_class >= GFX9) {
 			si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
 						  key, &key->part.tcs.ls_prolog);
 			key->part.tcs.ls = sctx->vs_shader.cso;
 		}
 
 		key->part.tcs.epilog.prim_mode =
@@ -1288,21 +1288,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		if (sel == sctx->fixed_func_tcs_shader.cso)
 			key->mono.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
 		break;
 	case PIPE_SHADER_TESS_EVAL:
 		if (sctx->gs_shader.cso)
 			key->as_es = 1;
 		else {
 			si_shader_selector_key_hw_vs(sctx, sel, key);
 
 			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-				key->part.tes.epilog.export_prim_id = 1;
+				key->mono.vs_export_prim_id = 1;
 		}
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		if (sctx->b.chip_class >= GFX9) {
 			if (sctx->tes_shader.cso) {
 				key->part.gs.es = sctx->tes_shader.cso;
 			} else {
 				si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
 							  key, &key->part.gs.vs_prolog);
 				key->part.gs.es = sctx->vs_shader.cso;
-- 
2.7.4