[Mesa-dev] [PATCH 1/8] radeonsi: properly compute an LS-HS thread group size limit

Sat Jun 9 03:16:48 UTC 2018

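From: Marek Olšák <marek.olsak at amd.com>

"64 / max * 4" can be less than "64 * 4 / max" because the integer division
truncates before the multiplication (e.g. max = 3 gives 84 instead of 85).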
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 942cb3c7994..e7f8389caf3 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -139,21 +139,22 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
 
 	input_patch_size = num_tcs_input_cp * input_vertex_size;
 
 	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
 	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 
 	/* Ensure that we only need one wave per SIMD so we don't need to check
 	 * resource usage. Also ensures that the number of tcs in and out
 	 * vertices per threadgroup are at most 256.
 	 */
-	*num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+	unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
+	*num_patches = 256 / max_verts_per_patch;
 
 	/* Make sure that the data fits in LDS. This assumes the shaders only
 	 * use LDS for the inputs and outputs.
 	 *
 	 * While CIK can use 64K per threadgroup, there is a hang on Stoney
 	 * with 2 CUs if we use more than 32K. The closed Vulkan driver also
 	 * uses 32K at most on all GCN chips.
 	 */
 	hardware_lds_size = 32768;
 	*num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size +
@@ -166,21 +167,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
 
 	/* Not necessary for correctness, but improves performance. The
 	 * specific value is taken from the proprietary driver.
 	 */
 	*num_patches = MIN2(*num_patches, 40);
 
 	if (sctx->chip_class == SI) {
 		/* SI bug workaround, related to power management. Limit LS-HS
 		 * threadgroups to only one wave.
 		 */
-		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+		unsigned one_wave = 64 / max_verts_per_patch;
 		*num_patches = MIN2(*num_patches, one_wave);
 	}
 
 	/* The VGT HS block increments the patch ID unconditionally
 	 * within a single threadgroup. This results in incorrect
 	 * patch IDs when instanced draws are used.
 	 *
 	 * The intended solution is to restrict threadgroups to
 	 * a single instance by setting SWITCH_ON_EOI, which
 	 * should cause IA to split instances up. However, this
-- 
2.17.1