[Mesa-dev] [PATCH] radv: optimise compute dispatch to avoid looking up the sgpr repeatedly.

Wed Jun 7 03:48:05 UTC 2017

From: Dave Airlie <airlied at redhat.com>

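Compute the grid-size user SGPR register offset once at compute pipeline
creation and store it in the pipeline, instead of calling
radv_lookup_user_sgpr() on every dispatch. This is the same as we did
for draw dispatch and vertex sgprs.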
---
 src/amd/vulkan/radv_cmd_buffer.c | 23 +++++++++--------------
 src/amd/vulkan/radv_pipeline.c   |  6 ++++++
 src/amd/vulkan/radv_private.h    |  4 ++++
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index a069945..a4ddd7e 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2872,13 +2872,10 @@ void radv_CmdDispatch(
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10);
 
-	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
-							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
-	if (loc->sgpr_idx != -1) {
-		assert(!loc->indirect);
+	if (cmd_buffer->state.compute_pipeline->compute.cs_grid_size_sgpr) {
 		uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
-		assert(loc->num_sgprs == grid_used);
-		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, grid_used);
+		radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.compute_pipeline->compute.cs_grid_size_sgpr,
+				      grid_used);
 		radeon_emit(cmd_buffer->cs, x);
 		if (grid_used > 1)
 			radeon_emit(cmd_buffer->cs, y);
@@ -2912,9 +2909,9 @@ void radv_CmdDispatchIndirect(
 	radv_flush_compute_state(cmd_buffer);
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25);
-	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
-							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
-	if (loc->sgpr_idx != -1) {
+
+
+	if (cmd_buffer->state.compute_pipeline->compute.cs_grid_size_sgpr) {
 		uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
 		for (unsigned i = 0; i < grid_used; ++i) {
 			radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
@@ -2922,7 +2919,7 @@ void radv_CmdDispatchIndirect(
 				    COPY_DATA_DST_SEL(COPY_DATA_REG));
 			radeon_emit(cmd_buffer->cs, (va +  4 * i));
 			radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32);
-			radeon_emit(cmd_buffer->cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
+			radeon_emit(cmd_buffer->cs, (cmd_buffer->state.compute_pipeline->compute.cs_grid_size_sgpr >> 2) + i);
 			radeon_emit(cmd_buffer->cs, 0);
 		}
 	}
@@ -2984,11 +2981,9 @@ void radv_unaligned_dispatch(
 		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]) |
 		    S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
 
-	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
-							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
-	if (loc->sgpr_idx != -1) {
+	if (cmd_buffer->state.compute_pipeline->compute.cs_grid_size_sgpr) {
 		uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
-		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, grid_used);
+		radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.compute_pipeline->compute.cs_grid_size_sgpr, grid_used);
 		radeon_emit(cmd_buffer->cs, blocks[0]);
 		if (grid_used > 1)
 			radeon_emit(cmd_buffer->cs, blocks[1]);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index ccbe20d..bda4c74 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2375,6 +2375,12 @@ static VkResult radv_compute_pipeline_create(
 
 
 	pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
+
+	struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline,
+							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
+	if (loc->sgpr_idx != -1) {
+		pipeline->compute.cs_grid_size_sgpr = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
+	}
 	result = radv_pipeline_scratch_init(device, pipeline);
 	if (result != VK_SUCCESS) {
 		radv_pipeline_destroy(device, pipeline, pAllocator);
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 8f60d9b..29db05c 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1090,6 +1090,10 @@ struct radv_pipeline {
 			bool tess_partial_vs_wave;
 			bool partial_es_wave;
 		} graphics;
+
+		struct {
+			uint32_t cs_grid_size_sgpr;
+		} compute;
 	};
 
 	unsigned max_waves;
-- 
2.9.4