[Mesa-dev] [PATCH 1/2] radv: Put semaphore waits in preamble cs.

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Sat Sep 2 22:00:52 UTC 2017


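The separate flush CS gets in the way of batchchain: it had to be
submitted as an extra CS in front of the command buffers, and any submit
that needed the flush could not be patched. Emit the flush from a
dedicated initial preamble CS instead, so the device-level flush CSes
can be removed.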
---
 src/amd/vulkan/radv_device.c  | 94 +++++++++++++++++--------------------------
 src/amd/vulkan/radv_private.h |  3 +-
 2 files changed, 37 insertions(+), 60 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index a5898691df6..d4dd2dbf5ff 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1000,6 +1000,8 @@ radv_queue_finish(struct radv_queue *queue)
 	if (queue->hw_ctx)
 		queue->device->ws->ctx_destroy(queue->hw_ctx);
 
+	if (queue->initial_full_flush_preamble_cs)
+		queue->device->ws->cs_destroy(queue->initial_full_flush_preamble_cs);
 	if (queue->initial_preamble_cs)
 		queue->device->ws->cs_destroy(queue->initial_preamble_cs);
 	if (queue->continue_preamble_cs)
@@ -1178,41 +1180,6 @@ VkResult radv_CreateDevice(
 			break;
 		}
 		device->ws->cs_finalize(device->empty_cs[family]);
-
-		device->flush_cs[family] = device->ws->cs_create(device->ws, family);
-		switch (family) {
-		case RADV_QUEUE_GENERAL:
-		case RADV_QUEUE_COMPUTE:
-			si_cs_emit_cache_flush(device->flush_cs[family],
-					       false,
-			                       device->physical_device->rad_info.chip_class,
-					       NULL, 0,
-			                       family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
-			                       RADV_CMD_FLAG_INV_ICACHE |
-			                       RADV_CMD_FLAG_INV_SMEM_L1 |
-			                       RADV_CMD_FLAG_INV_VMEM_L1 |
-			                       RADV_CMD_FLAG_INV_GLOBAL_L2);
-			break;
-		}
-		device->ws->cs_finalize(device->flush_cs[family]);
-
-		device->flush_shader_cs[family] = device->ws->cs_create(device->ws, family);
-		switch (family) {
-		case RADV_QUEUE_GENERAL:
-		case RADV_QUEUE_COMPUTE:
-			si_cs_emit_cache_flush(device->flush_shader_cs[family],
-					       false,
-			                       device->physical_device->rad_info.chip_class,
-					       NULL, 0,
-			                       family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
-					       family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
-			                       RADV_CMD_FLAG_INV_ICACHE |
-			                       RADV_CMD_FLAG_INV_SMEM_L1 |
-			                       RADV_CMD_FLAG_INV_VMEM_L1 |
-			                       RADV_CMD_FLAG_INV_GLOBAL_L2);
-			break;
-		}
-		device->ws->cs_finalize(device->flush_shader_cs[family]);
 	}
 
 	if (getenv("RADV_TRACE_FILE")) {
@@ -1280,10 +1247,6 @@ void radv_DestroyDevice(
 			vk_free(&device->alloc, device->queues[i]);
 		if (device->empty_cs[i])
 			device->ws->cs_destroy(device->empty_cs[i]);
-		if (device->flush_cs[i])
-			device->ws->cs_destroy(device->flush_cs[i]);
-		if (device->flush_shader_cs[i])
-			device->ws->cs_destroy(device->flush_shader_cs[i]);
 	}
 	radv_device_finish_meta(device);
 
@@ -1576,6 +1539,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 		     uint32_t gsvs_ring_size,
 		     bool needs_tess_rings,
 		     bool needs_sample_positions,
+		     struct radeon_winsys_cs **initial_full_flush_preamble_cs,
                      struct radeon_winsys_cs **initial_preamble_cs,
                      struct radeon_winsys_cs **continue_preamble_cs)
 {
@@ -1586,7 +1550,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 	struct radeon_winsys_bo *gsvs_ring_bo = NULL;
 	struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
 	struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
-	struct radeon_winsys_cs *dest_cs[2] = {0};
+	struct radeon_winsys_cs *dest_cs[3] = {0};
 	bool add_tess_rings = false, add_sample_positions = false;
 	unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
 	unsigned max_offchip_buffers;
@@ -1611,6 +1575,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 	    gsvs_ring_size <= queue->gsvs_ring_size &&
 	    !add_tess_rings && !add_sample_positions &&
 	    queue->initial_preamble_cs) {
+		*initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs;
 		*initial_preamble_cs = queue->initial_preamble_cs;
 		*continue_preamble_cs = queue->continue_preamble_cs;
 		if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
@@ -1712,7 +1677,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 	} else
 		descriptor_bo = queue->descriptor_bo;
 
-	for(int i = 0; i < 2; ++i) {
+	for(int i = 0; i < 3; ++i) {
 		struct radeon_winsys_cs *cs = NULL;
 		cs = queue->device->ws->cs_create(queue->device->ws,
 						  queue->queue_family_index ? RING_COMPUTE : RING_GFX);
@@ -1831,7 +1796,19 @@ radv_get_preamble_cs(struct radv_queue *queue,
 			radeon_emit(cs, rsrc1);
 		}
 
-		if (!i) {
+		if (i == 0) {
+			si_cs_emit_cache_flush(cs,
+					       false,
+			                       queue->device->physical_device->rad_info.chip_class,
+					       NULL, 0,
+			                       queue->queue_family_index == RING_COMPUTE &&
+			                         queue->device->physical_device->rad_info.chip_class >= CIK,
+			                       (queue->queue_family_index == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
+			                       RADV_CMD_FLAG_INV_ICACHE |
+			                       RADV_CMD_FLAG_INV_SMEM_L1 |
+			                       RADV_CMD_FLAG_INV_VMEM_L1 |
+			                       RADV_CMD_FLAG_INV_GLOBAL_L2);
+		} else if (i == 1) {
 			si_cs_emit_cache_flush(cs,
 					       false,
 			                       queue->device->physical_device->rad_info.chip_class,
@@ -1848,14 +1825,18 @@ radv_get_preamble_cs(struct radv_queue *queue,
 			goto fail;
 	}
 
+	if (queue->initial_full_flush_preamble_cs)
+			queue->device->ws->cs_destroy(queue->initial_full_flush_preamble_cs);
+
 	if (queue->initial_preamble_cs)
 			queue->device->ws->cs_destroy(queue->initial_preamble_cs);
 
 	if (queue->continue_preamble_cs)
 			queue->device->ws->cs_destroy(queue->continue_preamble_cs);
 
-	queue->initial_preamble_cs = dest_cs[0];
-	queue->continue_preamble_cs = dest_cs[1];
+	queue->initial_full_flush_preamble_cs = dest_cs[0];
+	queue->initial_preamble_cs = dest_cs[1];
+	queue->continue_preamble_cs = dest_cs[2];
 
 	if (scratch_bo != queue->scratch_bo) {
 		if (queue->scratch_bo)
@@ -1904,6 +1885,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 	if (add_sample_positions)
 		queue->has_sample_positions = true;
 
+	*initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs;
 	*initial_preamble_cs = queue->initial_preamble_cs;
 	*continue_preamble_cs = queue->continue_preamble_cs;
 	if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
@@ -2028,7 +2010,7 @@ VkResult radv_QueueSubmit(
 	uint32_t scratch_size = 0;
 	uint32_t compute_scratch_size = 0;
 	uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
-	struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
+	struct radeon_winsys_cs *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
 	VkResult result;
 	bool fence_emitted = false;
 	bool tess_rings_needed = false;
@@ -2053,7 +2035,7 @@ VkResult radv_QueueSubmit(
 
 	result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
 	                              esgs_ring_size, gsvs_ring_size, tess_rings_needed,
-				      sample_positions_needed,
+				      sample_positions_needed, &initial_flush_preamble_cs,
 	                              &initial_preamble_cs, &continue_preamble_cs);
 	if (result != VK_SUCCESS)
 		return result;
@@ -2061,7 +2043,7 @@ VkResult radv_QueueSubmit(
 	for (uint32_t i = 0; i < submitCount; i++) {
 		struct radeon_winsys_cs **cs_array;
 		bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
-		bool can_patch = !do_flush;
+		bool can_patch = true;
 		uint32_t advance;
 		struct radv_winsys_sem_info sem_info;
 
@@ -2091,35 +2073,31 @@ VkResult radv_QueueSubmit(
 		}
 
 		cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
-					        (pSubmits[i].commandBufferCount + do_flush));
-
-		if(do_flush)
-			cs_array[0] = pSubmits[i].waitSemaphoreCount ?
-				queue->device->flush_shader_cs[queue->queue_family_index] :
-				queue->device->flush_cs[queue->queue_family_index];
+					        (pSubmits[i].commandBufferCount));
 
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
 			RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
 					 pSubmits[i].pCommandBuffers[j]);
 			assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-			cs_array[j + do_flush] = cmd_buffer->cs;
+			cs_array[j] = cmd_buffer->cs;
 			if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
 				can_patch = false;
 		}
 
-		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + do_flush; j += advance) {
+		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
+			struct radeon_winsys_cs *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
 			advance = MIN2(max_cs_submission,
-				       pSubmits[i].commandBufferCount + do_flush - j);
+				       pSubmits[i].commandBufferCount - j);
 
 			if (queue->device->trace_bo)
 				*queue->device->trace_id_ptr = 0;
 
 			sem_info.cs_emit_wait = j == 0;
-			sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount + do_flush;
+			sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
 
 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
-							advance, initial_preamble_cs, continue_preamble_cs,
+							advance, initial_preamble, continue_preamble_cs,
 							   &sem_info,
 							can_patch, base_fence);
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 28e70e69150..07e3a49a702 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -499,6 +499,7 @@ struct radv_queue {
 	struct radeon_winsys_bo *tess_factor_ring_bo;
 	struct radeon_winsys_bo *tess_offchip_ring_bo;
 	struct radeon_winsys_cs *initial_preamble_cs;
+	struct radeon_winsys_cs *initial_full_flush_preamble_cs;
 	struct radeon_winsys_cs *continue_preamble_cs;
 };
 
@@ -515,8 +516,6 @@ struct radv_device {
 	struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
 	int queue_count[RADV_MAX_QUEUE_FAMILIES];
 	struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];
-	struct radeon_winsys_cs *flush_cs[RADV_MAX_QUEUE_FAMILIES];
-	struct radeon_winsys_cs *flush_shader_cs[RADV_MAX_QUEUE_FAMILIES];
 	uint64_t debug_flags;
 
 	bool llvm_supports_spill;
-- 
2.14.1



More information about the mesa-dev mailing list