[Mesa-dev] [PATCH 10/61] radeonsi: code shuffling in si_emit_derived_tess_state

Mon Apr 24 08:45:07 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_state_draw.c | 69 +++++++++++++++-------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index a354c69..c24d607 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -100,21 +100,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
 		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
 	unsigned num_tcs_input_cp = info->vertices_per_patch;
 	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
 	unsigned num_tcs_patch_outputs;
 	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
 	unsigned input_patch_size, output_patch_size, output_patch0_offset;
-	unsigned perpatch_output_offset, lds_size, ls_rsrc2;
+	unsigned perpatch_output_offset, lds_size;
 	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
 	unsigned offchip_layout, hardware_lds_size, ls_hs_config;
 
 	if (sctx->last_ls == ls->current &&
 	    sctx->last_tcs == tcs &&
 	    sctx->last_tes_sh_base == tes_sh_base &&
 	    sctx->last_num_tcs_input_cp == num_tcs_input_cp) {
 		*num_patches = sctx->last_num_patches;
 		return;
 	}
@@ -174,74 +174,81 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == SI) {
 		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 		*num_patches = MIN2(*num_patches, one_wave);
 	}
 
 	sctx->last_num_patches = *num_patches;
 
 	output_patch0_offset = input_patch_size * *num_patches;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
-	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->config.rsrc2;
-
-	if (sctx->b.chip_class >= CIK) {
-		assert(lds_size <= 65536);
-		lds_size = align(lds_size, 512) / 512;
-	} else {
-		assert(lds_size <= 32768);
-		lds_size = align(lds_size, 256) / 256;
-	}
-	si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
-	ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
-
-	/* Due to a hw bug, RSRC2_LS must be written twice with another
-	 * LS register written in between. */
-	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
-		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
-	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->config.rsrc1);
-	radeon_emit(cs, ls_rsrc2);
-
 	/* Compute userdata SGPRs. */
 	assert(((input_vertex_size / 4) & ~0xff) == 0);
 	assert(((output_vertex_size / 4) & ~0xff) == 0);
 	assert(((input_patch_size / 4) & ~0x1fff) == 0);
 	assert(((output_patch_size / 4) & ~0x1fff) == 0);
 	assert(((output_patch0_offset / 16) & ~0xffff) == 0);
 	assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
 	assert(num_tcs_input_cp <= 32);
 	assert(num_tcs_output_cp <= 32);
 
 	tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
 			S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
 	tcs_out_layout = (output_patch_size / 4) |
 			 ((output_vertex_size / 4) << 13);
 	tcs_out_offsets = (output_patch0_offset / 16) |
 			  ((perpatch_output_offset / 16) << 16);
 	offchip_layout = (pervertex_output_patch_size * *num_patches << 16) |
 			 (num_tcs_output_cp << 9) | *num_patches;
 
-	/* Set them for LS. */
+	/* Compute the LDS size. */
+	lds_size = output_patch0_offset + output_patch_size * *num_patches;
+
+	if (sctx->b.chip_class >= CIK) {
+		assert(lds_size <= 65536);
+		lds_size = align(lds_size, 512) / 512;
+	} else {
+		assert(lds_size <= 32768);
+		lds_size = align(lds_size, 256) / 256;
+	}
+
+	/* Set SI_SGPR_VS_STATE_BITS. */
 	sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE &
 				  C_VS_STATE_LS_OUT_VERTEX_SIZE;
 	sctx->current_vs_state |= tcs_in_layout;
 
-	/* Set them for TCS. */
-	radeon_set_sh_reg_seq(cs,
-		R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
-	radeon_emit(cs, offchip_layout);
-	radeon_emit(cs, tcs_out_offsets);
-	radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
-	radeon_emit(cs, tcs_in_layout);
+	if (sctx->b.chip_class >= GFX9) {
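+		/* TODO: GFX9 counterpart of the LS RSRC2 (LDS size) and TCS userdata SGPR programming done in the else branch. */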
+	} else {
+		unsigned ls_rsrc2 = ls->current->config.rsrc2;
+
+		si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
+		ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
+
+		/* Due to a hw bug, RSRC2_LS must be written twice with another
+		 * LS register written in between. */
+		if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
+			radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+		radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+		radeon_emit(cs, ls->current->config.rsrc1);
+		radeon_emit(cs, ls_rsrc2);
+
+		/* Set userdata SGPRs for TCS. */
+		radeon_set_sh_reg_seq(cs,
+			R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
+		radeon_emit(cs, offchip_layout);
+		radeon_emit(cs, tcs_out_offsets);
+		radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
+		radeon_emit(cs, tcs_in_layout);
+	}
 
-	/* Set them for TES. */
+	/* Set userdata SGPRs for TES. */
 	radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 1);
 	radeon_emit(cs, offchip_layout);
 
 	ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) |
 		       S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
 		       S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
 
 	if (sctx->b.chip_class >= CIK)
 		radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
 					   ls_hs_config);
-- 
2.7.4