[Mesa-dev] [PATCH 4/8] radeonsi: enable the barycentric optimization in all cases

Marek Olšák maraeo at gmail.com
Thu Jun 30 23:28:43 UTC 2016


From: Marek Olšák <marek.olsak at amd.com>

Handle the bc_optimize SGPR bit if both CENTER and CENTROID are enabled.
This should increase the PS launch rate for big primitives with MSAA.
Based on a discussion with the SPI guys.
---
 src/gallium/drivers/radeonsi/si_shader.c        | 118 +++++++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_shader.h        |   7 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  18 ++--
 3 files changed, 125 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4652fe8..10dd12a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1435,6 +1435,56 @@ static void interp_fs_input(struct si_shader_context *ctx,
 	}
 }
 
+/* LLVMGetParam with bc_optimize resolved. */
+static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
+				     int interp_param_idx)
+{
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+	LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
+	LLVMValueRef param = NULL;
+
+	/* Handle PRIM_MASK[31] (bc_optimize). */
+	if (ctx->is_monolithic &&
+	    ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
+	      interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
+	     (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
+	      interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
+		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+		 * The hw doesn't compute CENTROID if the whole wave only
+		 * contains fully-covered quads.
+		 */
+		LLVMValueRef bc_optimize =
+			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
+		bc_optimize = LLVMBuildLShr(builder,
+					    bc_optimize,
+					    LLVMConstInt(ctx->i32, 31, 0), "");
+		bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
+
+		if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
+		    interp_param_idx == SI_PARAM_PERSP_CENTROID) {
+			param = LLVMBuildSelect(builder, bc_optimize,
+						LLVMGetParam(main_fn,
+							     SI_PARAM_PERSP_CENTER),
+						LLVMGetParam(main_fn,
+							     SI_PARAM_PERSP_CENTROID),
+						"");
+		}
+		if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
+		    interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
+			param = LLVMBuildSelect(builder, bc_optimize,
+						LLVMGetParam(main_fn,
+							     SI_PARAM_LINEAR_CENTER),
+						LLVMGetParam(main_fn,
+							     SI_PARAM_LINEAR_CENTROID),
+						"");
+		}
+	}
+
+	if (!param)
+		param = LLVMGetParam(main_fn, interp_param_idx);
+	return param;
+}
+
 static void declare_input_fs(
 	struct radeon_llvm_context *radeon_bld,
 	unsigned input_index,
@@ -1475,7 +1525,7 @@ static void declare_input_fs(
 	else if (interp_param_idx) {
 		interp_param_idx = select_interp_param(ctx,
 						       interp_param_idx);
-		interp_param = LLVMGetParam(main_fn, interp_param_idx);
+		interp_param = get_interp_param(ctx, interp_param_idx);
 	}
 
 	interp_fs_input(ctx, input_index, decl->Semantic.Name,
@@ -5041,7 +5091,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	if (interp_param_idx == -1)
 		return;
 	else if (interp_param_idx)
-		interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
+		interp_param = get_interp_param(ctx, interp_param_idx);
 	else
 		interp_param = NULL;
 
@@ -6410,6 +6460,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
 		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
 		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
+		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
+		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
 		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
 		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
 		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
@@ -7204,6 +7256,55 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
 		si_llvm_emit_polygon_stipple(&ctx, list, pos);
 	}
 
+	if (key->ps_prolog.states.bc_optimize_for_persp ||
+	    key->ps_prolog.states.bc_optimize_for_linear) {
+		unsigned i, base = key->ps_prolog.num_input_sgprs;
+		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
+
+		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+		 * The hw doesn't compute CENTROID if the whole wave only
+		 * contains fully-covered quads.
+		 *
+		 * PRIM_MASK is after user SGPRs.
+		 */
+		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
+					    LLVMConstInt(ctx.i32, 31, 0), "");
+		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
+					     ctx.i1, "");
+
+		if (key->ps_prolog.states.bc_optimize_for_persp) {
+			/* Read PERSP_CENTER. */
+			for (i = 0; i < 2; i++)
+				center[i] = LLVMGetParam(func, base + 2 + i);
+			/* Read PERSP_CENTROID. */
+			for (i = 0; i < 2; i++)
+				centroid[i] = LLVMGetParam(func, base + 4 + i);
+			/* Select PERSP_CENTROID. */
+			for (i = 0; i < 2; i++) {
+				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
+						      center[i], centroid[i], "");
+				ret = LLVMBuildInsertValue(gallivm->builder, ret,
+							   tmp, base + 4 + i, "");
+			}
+		}
+		if (key->ps_prolog.states.bc_optimize_for_linear) {
+			/* Read LINEAR_CENTER. */
+			for (i = 0; i < 2; i++)
+				center[i] = LLVMGetParam(func, base + 8 + i);
+			/* Read LINEAR_CENTROID. */
+			for (i = 0; i < 2; i++)
+				centroid[i] = LLVMGetParam(func, base + 10 + i);
+			/* Select LINEAR_CENTROID. */
+			for (i = 0; i < 2; i++) {
+				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
+						      center[i], centroid[i], "");
+				ret = LLVMBuildInsertValue(gallivm->builder, ret,
+							   tmp, base + 10 + i, "");
+			}
+		}
+	}
+
 	/* Interpolate colors. */
 	for (i = 0; i < 2; i++) {
 		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
@@ -7220,8 +7321,11 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
 			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
 					       key->ps_prolog.color_interp_vgpr_index[i];
 
-			interp[0] = LLVMGetParam(func, interp_vgpr);
-			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
+			/* Get the (i,j) updated by bc_optimize handling. */
+			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
+							  interp_vgpr, "");
+			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
+							  interp_vgpr + 1, "");
 			interp_ij = lp_build_gather_values(gallivm, interp, 2);
 			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
 						     ctx.v2i32, "");
@@ -7478,7 +7582,9 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
 		 prolog_key.ps_prolog.states.force_persp_sample_interp ||
 		 prolog_key.ps_prolog.states.force_linear_sample_interp ||
 		 prolog_key.ps_prolog.states.force_persp_center_interp ||
-		 prolog_key.ps_prolog.states.force_linear_center_interp);
+		 prolog_key.ps_prolog.states.force_linear_center_interp ||
+		 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
+		 prolog_key.ps_prolog.states.bc_optimize_for_linear);
 
 	if (info->colors_read) {
 		unsigned *color = shader->selector->color_attr_index;
@@ -7569,6 +7675,8 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
 	    prolog_key.ps_prolog.states.force_linear_sample_interp ||
 	    prolog_key.ps_prolog.states.force_persp_center_interp ||
 	    prolog_key.ps_prolog.states.force_linear_center_interp ||
+	    prolog_key.ps_prolog.states.bc_optimize_for_persp ||
+	    prolog_key.ps_prolog.states.bc_optimize_for_linear ||
 	    prolog_key.ps_prolog.states.poly_stipple) {
 		shader->prolog =
 			si_get_shader_part(sscreen, &sscreen->ps_prologs,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 0647736..3b7b3e1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -317,11 +317,8 @@ struct si_ps_prolog_bits {
 	unsigned	force_linear_sample_interp:1;
 	unsigned	force_persp_center_interp:1;
 	unsigned	force_linear_center_interp:1;
-	/* TODO:
-	 * - add force_center_interp_bc_optimize to force center interpolation
-	 *   based on the bc_optimize SGPR bit if MSAA is enabled, centroid is
-	 *   present and sample isn't present.
-	 */
+	unsigned	bc_optimize_for_persp:1;
+	unsigned	bc_optimize_for_linear:1;
 };
 
 /* Common PS bits between the shader key and the epilog key. */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index eb3c2f9..42ebf69 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -671,7 +671,6 @@ static void si_shader_ps(struct si_shader *shader)
 	unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
 	unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 	uint64_t va;
-	bool has_centroid;
 	unsigned input_ena = shader->config.spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
@@ -736,11 +735,7 @@ static void si_shader_ps(struct si_shader *shader)
 		       shader->config.spi_ps_input_addr);
 
 	/* Set interpolation controls. */
-	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
-		       G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
-
-	spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
-			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
+	spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader));
 
 	/* Set registers. */
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
@@ -953,8 +948,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				key->ps.prolog.force_linear_sample_interp =
 					sel->info.uses_linear_center ||
 					sel->info.uses_linear_centroid;
-			} else if (!rs->multisample_enable ||
-				   sctx->framebuffer.nr_samples <= 1) {
+			} else if (rs->multisample_enable &&
+				   sctx->framebuffer.nr_samples > 1) {
+				key->ps.prolog.bc_optimize_for_persp =
+					sel->info.uses_persp_center &&
+					sel->info.uses_persp_centroid;
+				key->ps.prolog.bc_optimize_for_linear =
+					sel->info.uses_linear_center &&
+					sel->info.uses_linear_centroid;
+			} else {
 				/* Make sure SPI doesn't compute more than 1 pair
 				 * of (i,j), which is the optimization here. */
 				key->ps.prolog.force_persp_center_interp =
-- 
2.7.4



More information about the mesa-dev mailing list