[Mesa-dev] [PATCH 27/31] radv: handle emitting ring size registers at start of cs

Dave Airlie airlied at gmail.com
Fri Jan 20 03:03:23 UTC 2017


From: Dave Airlie <airlied at redhat.com>

Work out the maximum sizes of the ESGS and GSVS rings needed by the
command buffer and patch the ring setup (size registers and ring
descriptors) in at the start of the primary buffer.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/amd/vulkan/radv_cmd_buffer.c | 206 +++++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_private.h    |   7 ++
 2 files changed, 213 insertions(+)
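
Notes for reviewers: BeginCommandBuffer reserves a fixed block of NOP
dwords at the head of the primary IB, and EndCommandBuffer overwrites
it in place once the worst-case ring sizes are known; whatever is not
overwritten stays NOPs. A self-contained C sketch of that pattern (the
array, emit() and the packet words are illustrative stand-ins, not the
radv winsys API):

	#include <stdint.h>
	#include <stdio.h>

	#define NOP_DWORD   0xffff1000u	/* zero-payload type-3 NOP */
	#define RESERVED_DW 8

	static uint32_t cs_buf[64];
	static unsigned cs_cdw;

	static void emit(uint32_t dw) { cs_buf[cs_cdw++] = dw; }

	int main(void)
	{
		/* "Begin": remember where the block starts and pad it. */
		unsigned patch_idx = cs_cdw;
		for (unsigned i = 0; i < RESERVED_DW; i++)
			emit(NOP_DWORD);

		emit(0x12345678);	/* stand-in for the rest of the stream */

		/* "End": patch real packets over the placeholders; any
		 * words left untouched stay NOPs and are skipped. */
		cs_buf[patch_idx + 0] = 0xc0027900;	/* packet header (illustrative) */
		cs_buf[patch_idx + 1] = 0x00000240;	/* register offset (illustrative) */
		cs_buf[patch_idx + 2] = 0x00000400;	/* esgs size >> 8 (illustrative) */
		cs_buf[patch_idx + 3] = 0x00000400;	/* gsvs size >> 8 (illustrative) */

		for (unsigned i = 0; i < cs_cdw; i++)
			printf("%u: 0x%08x\n", i, cs_buf[i]);
		return 0;
	}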

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 64c1507..814b12e 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1498,6 +1498,29 @@ VkResult radv_BeginCommandBuffer(
 		default:
 			break;
 		}
+
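+		/* BeginCommandBuffer reserves a block of NOP dwords at the
+		 * head of the primary IB; EndCommandBuffer patches the ring
+		 * setup over it once the final ring sizes are known.
+		 * 0xffff1000 is a zero-payload type-3 NOP; chips that must
+		 * pad with type-2 packets use 0x80000000 instead. */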
+		uint32_t pad_word = 0xffff1000U;
+		if (cmd_buffer->device->physical_device->rad_info.gfx_ib_pad_with_type2)
+			pad_word = 0x80000000;
+		cmd_buffer->ring_patch_idx = cmd_buffer->cs->cdw;
+		cmd_buffer->cs_to_patch_ring = cmd_buffer->cs->buf;
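+		/* 8 dwords cover the two EVENT_WRITEs plus one ring size
+		 * register packet; 4 dwords per stage cover one SET_SH_REG
+		 * descriptor pointer packet each. Unused words stay NOPs. */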
+		for (unsigned i = 0; i < 8; i++) {
+			radeon_emit(cmd_buffer->cs, pad_word);
+		}
+		for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+		}
 	}
 
 	if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
@@ -1635,6 +1658,176 @@ VkResult radv_EndCommandBuffer(
 
 	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
 		si_emit_cache_flush(cmd_buffer);
+
+	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
+	    (cmd_buffer->esgs_ring_size_needed ||
+	     cmd_buffer->gsvs_ring_size_needed)) {
+		uint64_t esgs_va = 0, gsvs_va = 0;
+		uint32_t ring_offset;
+		void *ring_ptr;
+
+		if (cmd_buffer->esgs_ring_size_needed) {
+			cmd_buffer->esgs_ring =
+				cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
+								      cmd_buffer->esgs_ring_size_needed,
+								      4096,
+								      RADEON_DOMAIN_VRAM,
+								      RADEON_FLAG_NO_CPU_ACCESS);
+			if (!cmd_buffer->esgs_ring) {
+				cmd_buffer->record_fail = true;
+				return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+			}
+			cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
+							      cmd_buffer->esgs_ring, 8);
+			esgs_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->esgs_ring);
+		}
+
+		if (cmd_buffer->gsvs_ring_size_needed) {
+			cmd_buffer->gsvs_ring =
+				cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
+								      cmd_buffer->gsvs_ring_size_needed,
+								      4096,
+								      RADEON_DOMAIN_VRAM,
+								      RADEON_FLAG_NO_CPU_ACCESS);
+			if (!cmd_buffer->gsvs_ring) {
+				cmd_buffer->record_fail = true;
+				return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+			}
+			cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
+							      cmd_buffer->gsvs_ring, 8);
+			gsvs_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gsvs_ring);
+		}
+
+		/* Four 4-dword buffer descriptors:
+		 * ES entry for the ES->GS ring
+		 * GS entry for the ES->GS ring
+		 * VS entry for the GS->VS ring
+		 * GS entry for the GS->VS ring (patched by the shader)
+		 */
+		radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4 * 4, 256,
+					     &ring_offset, &ring_ptr);
+		{
+			uint32_t *desc = (uint32_t *)ring_ptr;
+
+			/* ES entry for ES->GS ring */
+			/* stride 0, num_records = ring size, add_tid,
+			   swizzled, element size 4, index stride 64 */
+			desc[0] = esgs_va;
+			desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
+				S_008F04_STRIDE(0) |
+				S_008F04_SWIZZLE_ENABLE(true);
+			desc[2] = cmd_buffer->esgs_ring_size_needed;
+			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+				S_008F0C_ELEMENT_SIZE(1) |
+				S_008F0C_INDEX_STRIDE(3) |
+				S_008F0C_ADD_TID_ENABLE(true);
+
+			desc += 4;
+			/* GS entry for ES->GS ring */
+			/* stride 0, num_records = ring size,
+			   element size 0, index stride 0 */
+			desc[0] = esgs_va;
+				S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
+				S_008F04_STRIDE(0) |
+				S_008F04_SWIZZLE_ENABLE(false);
+			desc[2] = cmd_buffer->esgs_ring_size_needed;
+			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+				S_008F0C_ELEMENT_SIZE(0) |
+				S_008F0C_INDEX_STRIDE(0) |
+				S_008F0C_ADD_TID_ENABLE(false);
+
+			desc += 4;
+			/* VS entry for GS->VS ring */
+			/* stride 0, num_records = ring size,
+			   element size 0, index stride 0 */
+			desc[0] = gsvs_va;
+				S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32) |
+				S_008F04_STRIDE(0) |
+				S_008F04_SWIZZLE_ENABLE(false);
+			desc[2] = cmd_buffer->gsvs_ring_size_needed;
+			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+				S_008F0C_ELEMENT_SIZE(0) |
+				S_008F0C_INDEX_STRIDE(0) |
+				S_008F0C_ADD_TID_ENABLE(false);
+			desc += 4;
+
+			/* GS entry for GS->VS ring */
+			/* stride = gsvs_itemsize, num_records = 64,
+			   element size 4, index stride 16;
+			   the shader patches the stride and desc[2] */
+			desc[0] = gsvs_va;
+				S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32) |
+				S_008F04_STRIDE(0) |
+				S_008F04_SWIZZLE_ENABLE(true);
+			desc[2] = 0;
+			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+				S_008F0C_ELEMENT_SIZE(1) |
+				S_008F0C_INDEX_STRIDE(1) |
+				S_008F0C_ADD_TID_ENABLE(true);
+		}
+		int idx = cmd_buffer->ring_patch_idx;
+
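+		/* Drain outstanding VS work and flush the VGT before the
+		 * ring configuration changes. */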
+		cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cmd_buffer->cs_to_patch_ring[idx++] = EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+		cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cmd_buffer->cs_to_patch_ring[idx++] = EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0);
+
+		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
+			cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_SET_UCONFIG_REG, 2, 0);
+			cmd_buffer->cs_to_patch_ring[idx++] = (R_030900_VGT_ESGS_RING_SIZE - CIK_UCONFIG_REG_OFFSET) >> 2;
+			cmd_buffer->cs_to_patch_ring[idx++] = cmd_buffer->esgs_ring_size_needed >> 8;
+			cmd_buffer->cs_to_patch_ring[idx++] = cmd_buffer->gsvs_ring_size_needed >> 8;
+		} else {
+			cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_SET_CONFIG_REG, 2, 0);
+			cmd_buffer->cs_to_patch_ring[idx++] = (R_0088C8_VGT_ESGS_RING_SIZE - R600_CONFIG_REG_OFFSET) >> 2;
+			cmd_buffer->cs_to_patch_ring[idx++] = cmd_buffer->esgs_ring_size_needed >> 8;
+			cmd_buffer->cs_to_patch_ring[idx++] = cmd_buffer->gsvs_ring_size_needed >> 8;
+		}
+
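+		/* Point user SGPR pair 2/3 of the VS, GS and ES stages at
+		 * the ring descriptors uploaded above. */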
+		uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo) + ring_offset;
+		uint32_t reg_base = R_00B130_SPI_SHADER_USER_DATA_VS_0 + (2 * 4);
+		cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
+		cmd_buffer->cs_to_patch_ring[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
+		cmd_buffer->cs_to_patch_ring[idx++] = va;
+		cmd_buffer->cs_to_patch_ring[idx++] = va >> 32;
+
+		reg_base = R_00B230_SPI_SHADER_USER_DATA_GS_0 + (2 * 4);
+		cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
+		cmd_buffer->cs_to_patch_ring[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
+		cmd_buffer->cs_to_patch_ring[idx++] = va;
+		cmd_buffer->cs_to_patch_ring[idx++] = va >> 32;
+
+		reg_base = R_00B330_SPI_SHADER_USER_DATA_ES_0 + (2 * 4);
+		cmd_buffer->cs_to_patch_ring[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
+		cmd_buffer->cs_to_patch_ring[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
+		cmd_buffer->cs_to_patch_ring[idx++] = va;
+		cmd_buffer->cs_to_patch_ring[idx++] = va >> 32;
+	}
+
 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
 	    cmd_buffer->record_fail)
 		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -1715,6 +1908,13 @@ void radv_CmdBindPipeline(
 		radv_dynamic_state_copy(&cmd_buffer->state.dynamic,
 					&pipeline->dynamic_state,
 					pipeline->dynamic_state_mask);
+
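+		/* Track the worst-case ring sizes across all pipelines
+		 * bound to this command buffer. */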
+		if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
+			cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
+		if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
+			cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
 		break;
 	default:
 		assert(!"invalid bind point");
@@ -1862,6 +2062,12 @@ void radv_CmdExecuteCommands(
 	for (uint32_t i = 0; i < commandBufferCount; i++) {
 		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
 
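+		/* A primary command buffer must allocate rings large enough
+		 * for any secondary it executes. */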
+		if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
+			primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
+		if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
+			primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
 		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
 	}
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index aa6e09a..e4423fa 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -728,6 +728,13 @@ struct radv_cmd_buffer {
 	struct radv_cmd_buffer_upload upload;
 
 	bool record_fail;
+
+	uint32_t ring_patch_idx;
+	uint32_t *cs_to_patch_ring;
+	struct radeon_winsys_bo *esgs_ring;
+	struct radeon_winsys_bo *gsvs_ring;
+	uint32_t esgs_ring_size_needed;
+	uint32_t gsvs_ring_size_needed;
 };
 
 struct radv_image;
-- 
2.9.3