[Mesa-dev] [PATCH 29/61] radeonsi/gfx9: add support for monolithic merged LS-HS

Mon Apr 24 08:45:26 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_shader.c | 143 +++++++++++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_shader.h |   2 +
 2 files changed, 128 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 823ffff..9c5dd5e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7592,34 +7592,37 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 	LLVMBuildRet(builder, ret);
 }
 
 /**
  * Given a list of shader part functions, build a wrapper function that
  * runs them in sequence to form a monolithic shader.
  */
 static void si_build_wrapper_function(struct si_shader_context *ctx,
 				      LLVMValueRef *parts,
 				      unsigned num_parts,
-				      unsigned main_part)
+				      unsigned main_part,
+				      unsigned next_shader_first_part)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMBuilderRef builder = ctx->gallivm.builder;
 	/* PS epilog has one arg per color component */
 	LLVMTypeRef param_types[48];
-	LLVMValueRef out[48];
+	LLVMValueRef initial[48], out[48];
 	LLVMTypeRef function_type;
 	unsigned num_params;
-	unsigned num_out;
+	unsigned num_out, initial_num_out;
 	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
+	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
 	unsigned num_sgprs, num_vgprs;
 	unsigned last_sgpr_param;
 	unsigned gprs;
+	struct lp_build_if_state if_state;
 
 	for (unsigned i = 0; i < num_parts; ++i) {
 		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
 		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
 	}
 
 	/* The parameters of the wrapper function correspond to those of the
 	 * first part in terms of SGPRs and VGPRs, but we use the types of the
 	 * main part to get the right types. This is relevant for the
 	 * dereferenceable attribute on descriptor table pointers.
@@ -7657,20 +7660,27 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
 		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
 		assert(gprs + size <= num_sgprs + num_vgprs &&
 		       (gprs >= num_sgprs || gprs + size <= num_sgprs));
 
 		gprs += size;
 	}
 
 	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
 
+	if (is_merged_shader(ctx->shader)) {
+		LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+		lp_build_intrinsic(ctx->gallivm.builder,
+				   "llvm.amdgcn.init.exec", ctx->voidt,
+				   &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+	}
+
 	/* Record the arguments of the function as if they were an output of
 	 * a previous part.
 	 */
 	num_out = 0;
 	num_out_sgpr = 0;
 
 	for (unsigned i = 0; i < num_params; ++i) {
 		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
 		LLVMTypeRef param_type = LLVMTypeOf(param);
 		LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
@@ -7693,30 +7703,52 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
 			for (unsigned j = 0; j < size; ++j)
 				out[num_out++] = LLVMBuildExtractElement(
 					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
 		}
 
 		if (i <= last_sgpr_param)
 			num_out_sgpr = num_out;
 	}
 
+	memcpy(initial, out, sizeof(out));
+	initial_num_out = num_out;
+	initial_num_out_sgpr = num_out_sgpr;
+
 	/* Now chain the parts. */
 	for (unsigned part = 0; part < num_parts; ++part) {
 		LLVMValueRef in[48];
 		LLVMValueRef ret;
 		LLVMTypeRef ret_type;
 		unsigned out_idx = 0;
 
 		num_params = LLVMCountParams(parts[part]);
 		assert(num_params <= ARRAY_SIZE(param_types));
 
+		/* Merged shaders are executed conditionally depending
+		 * on the number of enabled threads passed in the input SGPRs. */
+		if (is_merged_shader(ctx->shader) &&
+		    (part == 0 || part == next_shader_first_part)) {
+			LLVMValueRef ena, count = initial[3];
+
+			/* The thread count for the 2nd shader is at bit-offset 8. */
+			if (part == next_shader_first_part) {
+				count = LLVMBuildLShr(builder, count,
+						      LLVMConstInt(ctx->i32, 8, 0), "");
+			}
+			count = LLVMBuildAnd(builder, count,
+					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
+			ena = LLVMBuildICmp(builder, LLVMIntULT,
+					    ac_get_thread_id(&ctx->ac), count, "");
+			lp_build_if(&if_state, &ctx->gallivm, ena);
+		}
+
 		/* Derive arguments for the next part from outputs of the
 		 * previous one.
 		 */
 		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
 			LLVMValueRef param;
 			LLVMTypeRef param_type;
 			bool is_sgpr;
 			unsigned param_size;
 			LLVMValueRef arg = NULL;
 
@@ -7750,23 +7782,47 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 				} else {
 					arg = LLVMBuildBitCast(builder, arg, param_type, "");
 				}
 			}
 
 			in[param_idx] = arg;
 			out_idx += param_size;
 		}
 
 		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
-		ret_type = LLVMTypeOf(ret);
+
+		if (is_merged_shader(ctx->shader) &&
+		    (part + 1 == next_shader_first_part ||
+		     part + 1 == num_parts)) {
+			lp_build_endif(&if_state);
+
+			if (part + 1 == next_shader_first_part) {
+				/* A barrier is required between 2 merged shaders. */
+				si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
+
+				/* The second half of the merged shader should use
+				 * the inputs from the toplevel (wrapper) function,
+				 * not the return value from the last call.
+				 *
+				 * That's because the last call was executed
+				 * conditionally, so we can't consume it in the
+				 * main block.
+				 */
+				memcpy(out, initial, sizeof(initial));
+				num_out = initial_num_out;
+				num_out_sgpr = initial_num_out_sgpr;
+			}
+			continue;
+		}
 
 		/* Extract the returned GPRs. */
+		ret_type = LLVMTypeOf(ret);
 		num_out = 0;
 		num_out_sgpr = 0;
 
 		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
 			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
 
 			unsigned ret_size = LLVMCountStructElementTypes(ret_type);
 
 			for (unsigned i = 0; i < ret_size; ++i) {
 				LLVMValueRef val =
@@ -7840,78 +7896,130 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 		}
 
 		if (need_epilog) {
 			union si_shader_part_key epilog_key;
 			si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
 			si_build_vs_epilog_function(&ctx, &epilog_key);
 			parts[need_prolog ? 2 : 1] = ctx.main_fn;
 		}
 
 		si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
-					  need_prolog ? 1 : 0);
+					  need_prolog ? 1 : 0, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
-		LLVMValueRef parts[2];
-		union si_shader_part_key epilog_key;
+		if (sscreen->b.chip_class >= GFX9) {
+			struct si_shader_selector *ls = shader->key.part.tcs.ls;
+			LLVMValueRef parts[4];
+
+			/* TCS main part */
+			parts[2] = ctx.main_fn;
+
+			/* TCS epilog */
+			union si_shader_part_key tcs_epilog_key;
+			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
+			parts[3] = ctx.main_fn;
+
+			/* VS prolog */
+			if (ls->vs_needs_prolog) {
+				union si_shader_part_key vs_prolog_key;
+				si_get_vs_prolog_key(&ls->info,
+						     shader->info.num_input_sgprs,
+						     &shader->key.part.tcs.ls_prolog,
+						     shader, &vs_prolog_key);
+				vs_prolog_key.vs_prolog.is_monolithic = true;
+				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+				parts[0] = ctx.main_fn;
+			}
 
-		parts[0] = ctx.main_fn;
+			/* VS as LS main part */
+			struct si_shader shader_ls = {};
+			shader_ls.selector = ls;
+			shader_ls.key.as_ls = 1;
+			shader_ls.key.mono = shader->key.mono;
+			shader_ls.key.opt = shader->key.opt;
+			si_llvm_context_set_tgsi(&ctx, &shader_ls);
 
-		memset(&epilog_key, 0, sizeof(epilog_key));
-		epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-		si_build_tcs_epilog_function(&ctx, &epilog_key);
-		parts[1] = ctx.main_fn;
+			if (!si_compile_tgsi_main(&ctx, true)) {
+				si_llvm_dispose(&ctx);
+				return -1;
+			}
+			shader->info.uses_instanceid |= ls->info.uses_instanceid;
+			parts[1] = ctx.main_fn;
+
+			/* Reset the shader context. */
+			ctx.shader = shader;
+			ctx.type = PIPE_SHADER_TESS_CTRL;
 
-		si_build_wrapper_function(&ctx, parts, 2, 0);
+			si_build_wrapper_function(&ctx,
+						  parts + !ls->vs_needs_prolog,
+						  4 - !ls->vs_needs_prolog, 0,
+						  ls->vs_needs_prolog ? 2 : 1);
+		} else {
+			LLVMValueRef parts[2];
+			union si_shader_part_key epilog_key;
+
+			parts[0] = ctx.main_fn;
+
+			memset(&epilog_key, 0, sizeof(epilog_key));
+			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+			si_build_tcs_epilog_function(&ctx, &epilog_key);
+			parts[1] = ctx.main_fn;
+
+			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+		}
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
 		   !shader->key.as_es) {
 		LLVMValueRef parts[2];
 		union si_shader_part_key epilog_key;
 
 		parts[0] = ctx.main_fn;
 
 		si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
 		si_build_vs_epilog_function(&ctx, &epilog_key);
 		parts[1] = ctx.main_fn;
 
-		si_build_wrapper_function(&ctx, parts, 2, 0);
+		si_build_wrapper_function(&ctx, parts, 2, 0, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
 		LLVMValueRef parts[2];
 		union si_shader_part_key prolog_key;
 
 		parts[1] = ctx.main_fn;
 
 		memset(&prolog_key, 0, sizeof(prolog_key));
 		prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
 		si_build_gs_prolog_function(&ctx, &prolog_key);
 		parts[0] = ctx.main_fn;
 
-		si_build_wrapper_function(&ctx, parts, 2, 1);
+		si_build_wrapper_function(&ctx, parts, 2, 1, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
 		LLVMValueRef parts[3];
 		union si_shader_part_key prolog_key;
 		union si_shader_part_key epilog_key;
 		bool need_prolog;
 
 		si_get_ps_prolog_key(shader, &prolog_key, false);
 		need_prolog = si_need_ps_prolog(&prolog_key);
 
 		parts[need_prolog ? 1 : 0] = ctx.main_fn;
 
 		if (need_prolog) {
 			si_build_ps_prolog_function(&ctx, &prolog_key);
 			parts[0] = ctx.main_fn;
 		}
 
 		si_get_ps_epilog_key(shader, &epilog_key);
 		si_build_ps_epilog_function(&ctx, &epilog_key);
 		parts[need_prolog ? 2 : 1] = ctx.main_fn;
 
-		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
+		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
+					  need_prolog ? 1 : 0, 0);
 	}
 
 	/* Dump LLVM IR before any optimization passes */
 	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
 	    r600_can_dump_shader(&sscreen->b, ctx.type))
 		LLVMDumpModule(ctx.gallivm.module);
 
 	si_llvm_finalize_module(&ctx,
 				    r600_extra_shader_checks(&sscreen->b, ctx.type));
 
@@ -8157,21 +8265,22 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 
 	/* Vertex load indices. */
 	for (i = 0; i <= key->vs_prolog.last_input; i++)
 		returns[num_returns++] = ctx->f32;
 
 	/* Create the function. */
 	si_create_function(ctx, "vs_prolog", returns, num_returns, params,
 			   num_params, last_sgpr);
 	func = ctx->main_fn;
 
-	if (key->vs_prolog.num_merged_next_stage_vgprs)
+	if (key->vs_prolog.num_merged_next_stage_vgprs &&
+	    !key->vs_prolog.is_monolithic)
 		si_init_exec_from_input(ctx, 3, 0);
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
 	ret = ctx->return_value;
 	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
 		LLVMValueRef p = LLVMGetParam(func, i);
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index afbe547..e24b8b8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -348,20 +348,22 @@ struct si_ps_epilog_bits {
 	unsigned	clamp_color:1;
 };
 
 union si_shader_part_key {
 	struct {
 		struct si_vs_prolog_bits states;
 		unsigned	num_input_sgprs:6;
 		/* For merged stages such as LS-HS, HS input VGPRs are first. */
 		unsigned	num_merged_next_stage_vgprs:3;
 		unsigned	last_input:4;
+		/* Prologs for monolithic shaders shouldn't set EXEC. */
+		unsigned	is_monolithic:1;
 	} vs_prolog;
 	struct {
 		struct si_vs_epilog_bits states;
 		unsigned	prim_id_param_offset:5;
 	} vs_epilog;
 	struct {
 		struct si_tcs_epilog_bits states;
 	} tcs_epilog;
 	struct {
 		struct si_gs_prolog_bits states;
-- 
2.7.4