[Mesa-dev] [PATCH 8/9] radeonsi: don't read tcs_out_lds_layout.vertex_size from an SGPR

Wed Sep 6 17:03:59 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

TCS outputs are usually not written to LDS, so no stats here.
---
 src/gallium/drivers/radeonsi/si_shader.c          | 21 +++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_shader_internal.h |  2 --
 src/gallium/drivers/radeonsi/si_state_draw.c      |  3 +--
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index d622304..1a9fce9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -319,20 +319,37 @@ get_tcs_in_patch_stride(struct si_shader_context *ctx)
 {
 	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 }
 
 static LLVMValueRef
 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 {
 	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 }
 
+static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) /* per-vertex stride of TCS outputs in dwords, computed from the shader key/selector */
+{
+	assert(ctx->type == PIPE_SHADER_TESS_CTRL); /* callers must be building a tessellation control shader */
+
+	if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) /* a non-zero inputs-to-copy mask takes precedence over outputs_written */
+		return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; /* presumably (index of highest set bit + 1) slots, 4 dwords each -- confirm util_last_bit64 */
+
+	return util_last_bit64(ctx->shader->selector->outputs_written) * 4; /* otherwise sized from the selector's outputs_written mask */
+}
+
+static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) /* wraps the constant stride as an LLVM value */
+{
+	unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+
+	return LLVMConstInt(ctx->i32, stride, 0); /* constant of type ctx->i32; final 0 = do not sign-extend */
+}
+
 static LLVMValueRef
 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 {
 	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 				unpack_param(ctx,
 					     ctx->param_tcs_out_lds_offsets,
 					     0, 16),
 				4);
 }
 
@@ -1079,21 +1096,21 @@ static LLVMValueRef fetch_input_tcs(
 
 static LLVMValueRef fetch_output_tcs(
 		struct lp_build_tgsi_context *bld_base,
 		const struct tgsi_full_src_register *reg,
 		enum tgsi_opcode_type type, unsigned swizzle)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMValueRef dw_addr, stride;
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
+		stride = get_tcs_out_vertex_dw_stride(ctx); /* vertex stride in dwords as an i32 constant */
 		dw_addr = get_tcs_out_current_patch_offset(ctx);
 		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 	} else {
 		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
 	}
 
 	return lds_load(bld_base, type, swizzle, dw_addr);
 }
 
@@ -1132,21 +1149,21 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
 	/* Only handle per-patch and per-vertex outputs here.
 	 * Vectors will be lowered to scalars and this function will be called again.
 	 */
 	if (reg->Register.File != TGSI_FILE_OUTPUT ||
 	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
 		si_llvm_emit_store(bld_base, inst, info, dst);
 		return;
 	}
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
+		stride = get_tcs_out_vertex_dw_stride(ctx);
 		dw_addr = get_tcs_out_current_patch_offset(ctx);
 		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
 		skip_lds_store = !sh_info->reads_pervertex_outputs;
 	} else {
 		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
 		skip_lds_store = !sh_info->reads_perpatch_outputs;
 
 		if (!reg->Register.Indirect) {
 			int name = sh_info->output_semantic_name[reg->Register.Index];
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 4ae8d85..023f9a6 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -154,22 +154,20 @@ struct si_shader_context {
 	/* API TCS */
 	/* Offsets where TCS outputs and TCS patch outputs live in LDS:
 	 *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
 	 *   [16:31] = TCS output patch0 offset for per-patch / 16
 	 *             max = (NUM_PATCHES + 1) * 32*32
 	 */
 	int param_tcs_out_lds_offsets;
 	/* Layout of TCS outputs / TES inputs:
 	 *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
 	 *            max = 32*32*4 + 32*4
-	 *   [13:20] = stride between output vertices in DW = num_inputs * 4
-	 *             max = 32*4
 	 *   [26:31] = gl_PatchVerticesIn, max = 32
 	 */
 	int param_tcs_out_lds_layout;
 	int param_tcs_offchip_addr_base64k;
 	int param_tcs_factor_addr_base64k;
 	int param_tcs_offchip_offset;
 	int param_tcs_factor_offset;
 	int param_tcs_patch_id;
 	int param_tcs_rel_ids;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 363a4ae..05ed854 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -226,22 +226,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	assert(((output_vertex_size / 4) & ~0xff) == 0);
 	assert(((input_patch_size / 4) & ~0x1fff) == 0);
 	assert(((output_patch_size / 4) & ~0x1fff) == 0);
 	assert(((output_patch0_offset / 16) & ~0xffff) == 0);
 	assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
 	assert(num_tcs_input_cp <= 32);
 	assert(num_tcs_output_cp <= 32);
 
 	tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
 			S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
-	tcs_out_layout = (output_patch_size / 4) |
-			 ((output_vertex_size / 4) << 13);
+	tcs_out_layout = output_patch_size / 4;
 	tcs_out_offsets = (output_patch0_offset / 16) |
 			  ((perpatch_output_offset / 16) << 16);
 	offchip_layout = *num_patches |
 			 (num_tcs_output_cp << 6) |
 			 (pervertex_output_patch_size * *num_patches << 12);
 
 	/* Compute the LDS size. */
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
 
 	if (sctx->b.chip_class >= CIK) {
-- 
2.7.4