[Mesa-dev] [PATCH 3/6] radeonsi: skip TESSINNER/OUTER offchip stores if TES doesn't read them

Marek Olšák maraeo at gmail.com
Sun Feb 19 16:27:44 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

We were unconditionally storing these outputs to the offchip buffer,
sometimes even one component at a time, but apps never read them in the TES.

Move the TESSINNER/OUTER buffer stores into the TCS epilog, where a new
epilog key bit (tes_reads_tess_factors) lets us skip them whenever the
bound TES doesn't read the tess factors.
---
 src/gallium/drivers/radeonsi/si_shader.c        | 89 ++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_shader.h        |  1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c |  2 +
 3 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 65e3faf..cd537be 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -708,43 +708,44 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
  * - attribute 1 of patch 0 vertex 0
  * - attribute 1 of patch 0 vertex 1
  *   ...
  * - per patch attribute 0 of patch 0
  * - per patch attribute 0 of patch 1
  *   ...
  *
  * Note that every attribute has 4 components.
  */
 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
+					       LLVMValueRef rel_patch_id,
                                                LLVMValueRef vertex_index,
                                                LLVMValueRef param_index)
 {
 	struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
 	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 	LLVMValueRef param_stride, constant16;
 
 	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
 	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
 	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 	                              num_patches, "");
 
 	constant16 = lp_build_const_int32(gallivm, 16);
 	if (vertex_index) {
-		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
+		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
 		                         vertices_per_patch, "");
 
 		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 		                         vertex_index, "");
 
 		param_stride = total_vertices;
 	} else {
-		base_addr = get_rel_patch_id(ctx);
+		base_addr = rel_patch_id;
 		param_stride = num_patches;
 	}
 
 	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 	                         LLVMBuildMul(gallivm->builder, param_index,
 	                                      param_stride, ""), "");
 
 	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
 
 	if (!vertex_index) {
@@ -810,21 +811,22 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 		param_index = lp_build_const_int32(gallivm, 0);
 	}
 
 	param_index_base = si_shader_io_get_unique_index(name[param_base],
 	                                                 index[param_base]);
 
 	param_index = LLVMBuildAdd(gallivm->builder, param_index,
 	                           lp_build_const_int32(gallivm, param_index_base),
 	                           "");
 
-	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
+	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
+					  vertex_index, param_index);
 }
 
 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
                                 enum tgsi_opcode_type type, unsigned swizzle,
                                 LLVMValueRef buffer, LLVMValueRef offset,
                                 LLVMValueRef base)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef value, value2;
@@ -981,20 +983,21 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
 	unsigned chan_index;
 	LLVMValueRef dw_addr, stride;
 	LLVMValueRef rw_buffers, buffer, base, buf_addr;
 	LLVMValueRef values[4];
 	bool skip_lds_store;
+	bool is_tess_factor = false;
 
 	/* Only handle per-patch and per-vertex outputs here.
 	 * Vectors will be lowered to scalars and this function will be called again.
 	 */
 	if (reg->Register.File != TGSI_FILE_OUTPUT ||
 	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
 		si_llvm_emit_store(bld_base, inst, info, dst);
 		return;
 	}
 
@@ -1006,22 +1009,24 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
 	} else {
 		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
 		skip_lds_store = !sh_info->reads_perpatch_outputs;
 
 		if (!reg->Register.Indirect) {
 			int name = sh_info->output_semantic_name[reg->Register.Index];
 
 			/* Always write tess factors into LDS for the TCS epilog. */
 			if (name == TGSI_SEMANTIC_TESSINNER ||
-			    name == TGSI_SEMANTIC_TESSOUTER)
+			    name == TGSI_SEMANTIC_TESSOUTER) {
 				skip_lds_store = false;
+				is_tess_factor = true;
+			}
 		}
 	}
 
 	rw_buffers = LLVMGetParam(ctx->main_fn,
 				  SI_PARAM_RW_BUFFERS);
 	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
 			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
 
 	base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
 	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
@@ -1033,28 +1038,28 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
 		if (inst->Instruction.Saturate)
 			value = ac_emit_clamp(&ctx->ac, value);
 
 		/* Skip LDS stores if there is no LDS read of this output. */
 		if (!skip_lds_store)
 			lds_store(bld_base, chan_index, dw_addr, value);
 
 		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
 		values[chan_index] = value;
 
-		if (inst->Dst[0].Register.WriteMask != 0xF) {
+		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
 			ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 1,
 						      buf_addr, base,
 						      4 * chan_index);
 		}
 	}
 
-	if (inst->Dst[0].Register.WriteMask == 0xF) {
+	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
 		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
 		                                            values, 4);
 		ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 4, buf_addr,
 					      base, 0);
 	}
 }
 
 static LLVMValueRef fetch_input_gs(
 	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *reg,
@@ -1523,21 +1528,21 @@ static void declare_system_value(
 	{
 		LLVMValueRef rw_buffers, buffer, base, addr;
 		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
 
 		rw_buffers = LLVMGetParam(ctx->main_fn,
 					SI_PARAM_RW_BUFFERS);
 		buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
 		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
 
 		base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
-		addr = get_tcs_tes_buffer_address(ctx, NULL,
+		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
 		                          lp_build_const_int32(gallivm, param));
 
 		value = buffer_load(&radeon_bld->bld_base, TGSI_TYPE_FLOAT,
 		                    ~0, buffer, base, addr);
 
 		break;
 	}
 
 	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
 	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
@@ -2405,20 +2410,21 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
 
 	inputs = ctx->shader->key.mono.tcs.inputs_to_copy;
 	while (inputs) {
 		unsigned i = u_bit_scan64(&inputs);
 
 		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
 		                            lp_build_const_int32(gallivm, 4 * i),
 		                             "");
 
 		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
+					      get_rel_patch_id(ctx),
 		                              invocation_id,
 		                              lp_build_const_int32(gallivm, i));
 
 		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
 		                              lds_ptr);
 
 		ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 4, buffer_addr,
 					      buffer_offset, 0);
 	}
 }
@@ -2426,21 +2432,21 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 				  LLVMValueRef rel_patch_id,
 				  LLVMValueRef invocation_id,
 				  LLVMValueRef tcs_out_current_patch_data_offset)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	struct si_shader *shader = ctx->shader;
 	unsigned tess_inner_index, tess_outer_index;
 	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
-	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
+	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
 	unsigned stride, outer_comps, inner_comps, i;
 	struct lp_build_if_state if_ctx, inner_if_ctx;
 
 	si_llvm_emit_barrier(NULL, bld_base, NULL);
 
 	/* Do this only for invocation 0, because the tess levels are per-patch,
 	 * not per-vertex.
 	 *
 	 * This can't jump, because invocation 0 executes this. It should
 	 * at least mask out the loads and stores for other invocations.
@@ -2478,31 +2484,40 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
 
 	lds_base = tcs_out_current_patch_data_offset;
 	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
 				 lp_build_const_int32(gallivm,
 						      tess_inner_index * 4), "");
 	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
 				 lp_build_const_int32(gallivm,
 						      tess_outer_index * 4), "");
 
+	for (i = 0; i < 4; i++) {
+		inner[i] = LLVMGetUndef(ctx->i32);
+		outer[i] = LLVMGetUndef(ctx->i32);
+	}
+
 	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
 		/* For isolines, the hardware expects tess factors in the
 		 * reverse order from what GLSL / TGSI specify.
 		 */
-		out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
-		out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
+		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
+		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
 	} else {
-		for (i = 0; i < outer_comps; i++)
-			out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
-		for (i = 0; i < inner_comps; i++)
-			out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
+		for (i = 0; i < outer_comps; i++) {
+			outer[i] = out[i] =
+				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
+		}
+		for (i = 0; i < inner_comps; i++) {
+			inner[i] = out[outer_comps+i] =
+				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
+		}
 	}
 
 	/* Convert the outputs to vectors for stores. */
 	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
 	vec1 = NULL;
 
 	if (stride > 4)
 		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
 
 	/* Get the buffer. */
@@ -2527,28 +2542,65 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 				      1, lp_build_const_int32(gallivm, 0), tf_base, 0);
 
 	lp_build_endif(&inner_if_ctx);
 
 	/* Store the tessellation factors. */
 	ac_build_tbuffer_store_dwords(&ctx->ac, buffer, vec0,
 				      MIN2(stride, 4), byteoffset, tf_base, 4);
 	if (vec1)
 		ac_build_tbuffer_store_dwords(&ctx->ac, buffer, vec1,
 					      stride - 4, byteoffset, tf_base, 20);
+
+	/* Store the tess factors into the offchip buffer if TES reads them. */
+	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
+		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
+		LLVMValueRef tf_inner_offset;
+		unsigned param_outer, param_inner;
+
+		buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
+				LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+		base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
+
+		param_outer = si_shader_io_get_unique_index(
+				      TGSI_SEMANTIC_TESSOUTER, 0);
+		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+					LLVMConstInt(ctx->i32, param_outer, 0));
+
+		outer_vec = lp_build_gather_values(gallivm, outer,
+						   util_next_power_of_two(outer_comps));
+
+		ac_build_tbuffer_store_dwords(&ctx->ac, buf, outer_vec,
+					      outer_comps, tf_outer_offset,
+					      base, 0);
+		if (inner_comps) {
+			param_inner = si_shader_io_get_unique_index(
+					      TGSI_SEMANTIC_TESSINNER, 0);
+			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+					LLVMConstInt(ctx->i32, param_inner, 0));
+
+			inner_vec = inner_comps == 1 ? inner[0] :
+				    lp_build_gather_values(gallivm, inner, inner_comps);
+			ac_build_tbuffer_store_dwords(&ctx->ac, buf, inner_vec,
+						      inner_comps, tf_inner_offset,
+						      base, 0);
+		}
+	}
+
 	lp_build_endif(&if_ctx);
 }
 
 /* This only writes the tessellation factor levels. */
 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+	LLVMValueRef offchip_soffset, offchip_layout;
 
 	si_copy_tcs_inputs(bld_base);
 
 	rel_patch_id = get_rel_patch_id(ctx);
 	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
 	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
 	/* Return epilog parameters from this function. */
 	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
 	LLVMValueRef ret = ctx->return_value;
@@ -2560,23 +2612,30 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 				  SI_PARAM_RW_BUFFERS);
 	rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
 	rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
 	rw0 = LLVMBuildExtractElement(builder, rw_buffers,
 				      bld_base->uint_bld.zero, "");
 	rw1 = LLVMBuildExtractElement(builder, rw_buffers,
 				      bld_base->uint_bld.one, "");
 	ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
 	ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
 
-	/* Tess factor buffer soffset is after user SGPRs. */
+	/* Tess offchip and factor buffer soffset are after user SGPRs. */
+	offchip_layout = LLVMGetParam(ctx->main_fn,
+				      SI_PARAM_TCS_OFFCHIP_LAYOUT);
+	offchip_soffset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
 	tf_soffset = LLVMGetParam(ctx->main_fn,
 				  SI_PARAM_TESS_FACTOR_OFFSET);
+	ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
+				   SI_SGPR_TCS_OFFCHIP_LAYOUT, "");
+	ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
+				   SI_TCS_NUM_USER_SGPR, "");
 	ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
 				   SI_TCS_NUM_USER_SGPR + 1, "");
 
 	/* VGPRs */
 	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
 	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
 	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
 
 	vgpr = SI_TCS_NUM_USER_SGPR + 2;
 	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 0bb0f18..71cd95b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -345,20 +345,21 @@ struct si_vs_prolog_bits {
 };
 
 /* Common VS bits between the shader key and the epilog key. */
 struct si_vs_epilog_bits {
 	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
 	unsigned	prim_mode:3;
+	unsigned	tes_reads_tess_factors:1;
 };
 
 struct si_gs_prolog_bits {
 	unsigned	tri_strip_adj_fix:1;
 };
 
 /* Common PS bits between the shader key and the prolog key. */
 struct si_ps_prolog_bits {
 	unsigned	color_two_side:1;
 	unsigned	flatshade_colors:1;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4a81b56..727ff33 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -985,20 +985,22 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		else {
 			si_shader_selector_key_hw_vs(sctx, sel, key);
 
 			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 				key->part.vs.epilog.export_prim_id = 1;
 		}
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		key->part.tcs.epilog.prim_mode =
 			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+		key->part.tcs.epilog.tes_reads_tess_factors =
+			sctx->tes_shader.cso->info.reads_tess_factors;
 
 		if (sel == sctx->fixed_func_tcs_shader.cso)
 			key->mono.tcs.inputs_to_copy = sctx->vs_shader.cso->outputs_written;
 		break;
 	case PIPE_SHADER_TESS_EVAL:
 		if (sctx->gs_shader.cso)
 			key->as_es = 1;
 		else {
 			si_shader_selector_key_hw_vs(sctx, sel, key);
 
-- 
2.7.4



More information about the mesa-dev mailing list