Mesa (main): radv: Implement task shader draw and payload rings.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Jun 8 09:12:36 UTC 2022
Module: Mesa
Branch: main
Commit: b730f91247844515aea0b7079738c690c4d0ca93
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=b730f91247844515aea0b7079738c690c4d0ca93
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Thu May 12 00:40:39 2022 +0200
radv: Implement task shader draw and payload rings.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>
---
src/amd/vulkan/radv_cmd_buffer.c | 9 +++
src/amd/vulkan/radv_device.c | 167 ++++++++++++++++++++++++++++++++++++---
src/amd/vulkan/radv_private.h | 3 +
3 files changed, 169 insertions(+), 10 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 9f6937f9d2d..ac6851dc908 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -508,6 +508,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->esgs_ring_size_needed = 0;
cmd_buffer->gsvs_ring_size_needed = 0;
cmd_buffer->tess_rings_needed = false;
+ cmd_buffer->task_rings_needed = false;
cmd_buffer->gds_needed = false;
cmd_buffer->gds_oa_needed = false;
cmd_buffer->sample_positions_needed = false;
@@ -5183,6 +5184,8 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
cmd_buffer->state.compute_pipeline = compute_pipeline;
cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->task_rings_needed |=
+ pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.uses_task_rings;
break;
}
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
@@ -5257,6 +5260,10 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
cmd_buffer->tess_rings_needed = true;
+
+ if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
+ cmd_buffer->task_rings_needed = true;
+ }
break;
}
default:
@@ -5792,6 +5799,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
if (secondary->tess_rings_needed)
primary->tess_rings_needed = true;
+ if (secondary->task_rings_needed)
+ primary->task_rings_needed = true;
if (secondary->sample_positions_needed)
primary->sample_positions_needed = true;
if (secondary->gds_needed)
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 4f3d7d730cc..eb66c4215cb 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2731,6 +2731,8 @@ radv_queue_state_finish(struct radv_queue_state *queue, struct radeon_winsys *ws
ws->buffer_destroy(ws, queue->gsvs_ring_bo);
if (queue->tess_rings_bo)
ws->buffer_destroy(ws, queue->tess_rings_bo);
+ if (queue->task_rings_bo)
+ ws->buffer_destroy(ws, queue->task_rings_bo);
if (queue->gds_bo)
ws->buffer_destroy(ws, queue->gds_bo);
if (queue->gds_oa_bo)
@@ -3602,7 +3604,8 @@ static void
radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sample_positions,
uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo,
uint32_t gsvs_ring_size, struct radeon_winsys_bo *gsvs_ring_bo,
- struct radeon_winsys_bo *tess_rings_bo)
+ struct radeon_winsys_bo *tess_rings_bo,
+ struct radeon_winsys_bo *task_rings_bo)
{
uint32_t *desc = &map[4];
@@ -3750,7 +3753,41 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
desc += 8;
- /* Reserved for task shader rings. */
+ if (task_rings_bo) {
+ uint64_t task_va = radv_buffer_get_va(task_rings_bo);
+ uint64_t task_draw_ring_va = task_va + device->physical_device->task_info.draw_ring_offset;
+ uint64_t task_payload_ring_va = task_va + device->physical_device->task_info.payload_ring_offset;
+
+ desc[0] = task_draw_ring_va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(task_draw_ring_va >> 32);
+ desc[2] = device->physical_device->task_info.num_entries * AC_TASK_DRAW_ENTRY_BYTES;
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (device->physical_device->rad_info.gfx_level >= GFX11) {
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
+ } else {
+ assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+ }
+
+ desc[4] = task_payload_ring_va;
+ desc[5] = S_008F04_BASE_ADDRESS_HI(task_payload_ring_va >> 32);
+ desc[6] = device->physical_device->task_info.num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES;
+ desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (device->physical_device->rad_info.gfx_level >= GFX11) {
+ desc[7] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
+ } else {
+ assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
+ desc[7] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+ }
+ }
desc += 8;
@@ -3829,6 +3866,57 @@ radv_emit_tess_factor_ring(struct radv_device *device, struct radeon_cmdbuf *cs,
}
}
+/* Write the initial contents of the task control buffer from the CPU.
+ *
+ * Maps task_rings_bo, stores nine dwords starting at the beginning of the
+ * mapping (write/read/dealloc pointers, entry count, draw ring address) and
+ * unmaps the BO again.
+ *
+ * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY when the BO cannot be mapped and
+ * VK_SUCCESS otherwise.
+ */
+static VkResult
+radv_initialise_task_control_buffer(struct radv_device *device,
+ struct radeon_winsys_bo *task_rings_bo)
+{
+ uint32_t *ptr = (uint32_t *)device->ws->buffer_map(task_rings_bo);
+ if (!ptr)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+ const uint32_t num_entries = device->physical_device->task_info.num_entries;
+ const uint64_t task_va = radv_buffer_get_va(task_rings_bo);
+ const uint64_t task_draw_ring_va = task_va + device->physical_device->task_info.draw_ring_offset;
+ /* The low 8 bits of the draw ring address must be zero (256-byte alignment). */
+ assert((task_draw_ring_va & 0xFFFFFF00) == (task_draw_ring_va & 0xFFFFFFFF));
+
+ /* NOTE(review): all three pointers start at num_entries rather than 0;
+ * presumably this is what the consumer of the control buffer expects - confirm.
+ */
+ /* 64-bit write_ptr */
+ ptr[0] = num_entries;
+ ptr[1] = 0;
+ /* 64-bit read_ptr */
+ ptr[2] = num_entries;
+ ptr[3] = 0;
+ /* 64-bit dealloc_ptr */
+ ptr[4] = num_entries;
+ ptr[5] = 0;
+ /* num_entries */
+ ptr[6] = num_entries;
+ /* 64-bit draw ring address (low dword first; the store truncates to 32 bits) */
+ ptr[7] = task_draw_ring_va;
+ ptr[8] = task_draw_ring_va >> 32;
+
+ device->ws->buffer_unmap(task_rings_bo);
+ return VK_SUCCESS;
+}
+
+/* Emit a DISPATCH_TASK_STATE_INIT packet pointing at the task control buffer.
+ *
+ * Does nothing when task_rings_bo is NULL. Otherwise the BO is passed to
+ * radv_cs_add_buffer() for this command stream and three dwords are emitted:
+ * the packet header followed by the low and high halves of the BO's virtual
+ * address. `compute` is normalised to 0/1 and placed in the header through
+ * PKT3_SHADER_TYPE_S().
+ */
+static void
+radv_emit_task_rings(struct radv_device *device, struct radeon_cmdbuf *cs,
+ struct radeon_winsys_bo *task_rings_bo, bool compute)
+{
+ if (!task_rings_bo)
+ return;
+
+ const uint64_t task_ctrlbuf_va = radv_buffer_get_va(task_rings_bo);
+ /* The packet only carries address bits [31:8] of the low dword, so the
+ * control buffer has to be 256-byte aligned.
+ */
+ assert(radv_is_aligned(task_ctrlbuf_va, 256));
+ radv_cs_add_buffer(device->ws, cs, task_rings_bo);
+
+ /* Tell the GPU where the task control buffer is. */
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_TASK_STATE_INIT, 1, 0) | PKT3_SHADER_TYPE_S(!!compute));
+ /* bits [31:8]: control buffer address lo, bits[7:0]: reserved (set to zero) */
+ radeon_emit(cs, task_ctrlbuf_va & 0xFFFFFF00);
+ /* bits [31:0]: control buffer address hi */
+ radeon_emit(cs, task_ctrlbuf_va >> 32);
+}
+
static void
radv_emit_graphics_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
uint32_t size_per_wave, uint32_t waves,
@@ -3896,6 +3984,22 @@ radv_emit_compute_scratch(struct radv_device *device, struct radeon_cmdbuf *cs,
S_00B860_WAVESIZE(round_up_u32(size_per_wave, info->gfx_level >= GFX11 ? 256 : 1024)));
}
+/* Emit the descriptor BO address as a compute shader user data pointer.
+ *
+ * Does nothing when descriptor_bo is NULL. Otherwise the BO is passed to
+ * radv_cs_add_buffer() for this command stream and its virtual address is
+ * emitted at R_00B908_COMPUTE_USER_DATA_2 through radv_emit_shader_pointer().
+ */
+static void
+radv_emit_compute_shader_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
+ struct radeon_winsys_bo *descriptor_bo)
+{
+ if (!descriptor_bo)
+ return;
+
+ uint64_t va = radv_buffer_get_va(descriptor_bo);
+ radv_cs_add_buffer(device->ws, cs, descriptor_bo);
+
+ /* Compute shader user data 0-1 have the scratch pointer (unlike GFX shaders),
+ * so emit the descriptor pointer to user data 2-3 instead (task_ring_offsets arg).
+ */
+ radv_emit_shader_pointer(device, cs, R_00B908_COMPUTE_USER_DATA_2, va, true);
+}
+
static void
radv_emit_graphics_shader_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
struct radeon_winsys_bo *descriptor_bo)
@@ -3978,6 +4082,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
struct radeon_winsys_bo *esgs_ring_bo = queue->esgs_ring_bo;
struct radeon_winsys_bo *gsvs_ring_bo = queue->gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo = queue->tess_rings_bo;
+ struct radeon_winsys_bo *task_rings_bo = queue->task_rings_bo;
struct radeon_winsys_bo *gds_bo = queue->gds_bo;
struct radeon_winsys_bo *gds_oa_bo = queue->gds_oa_bo;
struct radeon_cmdbuf *dest_cs[3] = {0};
@@ -4029,6 +4134,26 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
goto fail;
}
+ if (!queue->ring_info.task_rings && needs->task_rings) {
+ assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
+
+ /* We write the control buffer from the CPU, so need to grant CPU access to the BO.
+ * The draw ring needs to be zero-initialized otherwise the ready bits will be incorrect.
+ */
+ uint32_t task_rings_bo_flags =
+ RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM;
+
+ result = ws->buffer_create(ws, device->physical_device->task_info.bo_size_bytes, 256,
+ RADEON_DOMAIN_VRAM, task_rings_bo_flags, RADV_BO_PRIORITY_SCRATCH,
+ 0, &task_rings_bo);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ result = radv_initialise_task_control_buffer(device, task_rings_bo);
+ if (result != VK_SUCCESS)
+ goto fail;
+ }
+
if (!queue->ring_info.gds && needs->gds) {
assert(device->physical_device->rad_info.gfx_level >= GFX10);
@@ -4050,11 +4175,18 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
goto fail;
}
- if (scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo ||
+ /* Re-initialize the descriptor BO when any ring BOs changed.
+ *
+ * Additionally, make sure to create the descriptor BO for the compute queue
+ * when it uses the task shader rings. The task rings BO is shared between the
+ * GFX and compute queues and already initialized here.
+ */
+ if ((queue->qf == RADV_QUEUE_COMPUTE && !descriptor_bo && task_rings_bo) ||
+ scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo || tess_rings_bo != queue->tess_rings_bo ||
- add_sample_positions) {
+ task_rings_bo != queue->task_rings_bo || add_sample_positions) {
uint32_t size = 0;
- if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || add_sample_positions) {
+ if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions) {
size = 144; /* 2 dword + 2 padding + 4 dword * 8 */
if (add_sample_positions)
size += 128; /* 64+32+16+8 = 120 bytes */
@@ -4088,9 +4220,10 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
map[1] = rsrc1;
}
- if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || add_sample_positions)
+ if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions)
radv_fill_shader_rings(device, map, add_sample_positions, needs->esgs_ring_size,
- esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo);
+ esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo,
+ task_rings_bo);
ws->buffer_unmap(descriptor_bo);
}
@@ -4105,7 +4238,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
/* Continue preamble is unnecessary when no shader rings are used. */
if (!needs->scratch_size_per_wave && !needs->compute_scratch_size_per_wave &&
!needs->esgs_ring_size && !needs->gsvs_ring_size && !needs->tess_rings &&
- !needs->gds && !needs->gds_oa && !needs->sample_positions)
+ !needs->task_rings && !needs->gds && !needs->gds_oa && !needs->sample_positions)
continue;
}
@@ -4127,7 +4260,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
case RADV_QUEUE_GENERAL:
radv_init_graphics_state(cs, device);
- if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo) {
+ if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
@@ -4138,6 +4271,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
radv_emit_gs_ring_sizes(device, cs, esgs_ring_bo, needs->esgs_ring_size, gsvs_ring_bo,
needs->gsvs_ring_size);
radv_emit_tess_factor_ring(device, cs, tess_rings_bo);
+ radv_emit_task_rings(device, cs, task_rings_bo, false);
radv_emit_graphics_shader_pointers(device, cs, descriptor_bo);
radv_emit_compute_scratch(device, cs, needs->compute_scratch_size_per_wave,
needs->compute_scratch_waves, compute_scratch_bo);
@@ -4146,6 +4280,14 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
break;
case RADV_QUEUE_COMPUTE:
radv_init_compute_state(cs, device);
+
+ if (task_rings_bo) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ }
+
+ radv_emit_task_rings(device, cs, task_rings_bo, true);
+ radv_emit_compute_shader_pointers(device, cs, descriptor_bo);
radv_emit_compute_scratch(device, cs, needs->compute_scratch_size_per_wave,
needs->compute_scratch_waves, compute_scratch_bo);
break;
@@ -4225,6 +4367,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
}
queue->tess_rings_bo = tess_rings_bo;
+ queue->task_rings_bo = task_rings_bo;
queue->gds_bo = gds_bo;
queue->gds_oa_bo = gds_oa_bo;
queue->ring_info = *needs;
@@ -4245,6 +4388,8 @@ fail:
ws->buffer_destroy(ws, gsvs_ring_bo);
if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo)
ws->buffer_destroy(ws, tess_rings_bo);
+ if (task_rings_bo && task_rings_bo != queue->task_rings_bo)
+ ws->buffer_destroy(ws, task_rings_bo);
if (gds_bo && gds_bo != queue->gds_bo)
ws->buffer_destroy(ws, gds_bo);
if (gds_oa_bo && gds_oa_bo != queue->gds_oa_bo)
@@ -4393,6 +4538,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.esgs_ring_size = MAX2(needs.esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
needs.gsvs_ring_size = MAX2(needs.gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
needs.tess_rings |= cmd_buffer->tess_rings_needed;
+ needs.task_rings |= cmd_buffer->task_rings_needed;
needs.gds |= cmd_buffer->gds_needed;
needs.gds_oa |= cmd_buffer->gds_oa_needed;
needs.sample_positions |= cmd_buffer->sample_positions_needed;
@@ -4418,7 +4564,8 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
queue->ring_info.compute_scratch_waves == needs.compute_scratch_waves &&
queue->ring_info.esgs_ring_size == needs.esgs_ring_size &&
queue->ring_info.gsvs_ring_size == needs.gsvs_ring_size &&
- queue->ring_info.tess_rings == needs.tess_rings && queue->ring_info.gds == needs.gds &&
+ queue->ring_info.tess_rings == needs.tess_rings &&
+ queue->ring_info.task_rings == needs.task_rings && queue->ring_info.gds == needs.gds &&
queue->ring_info.gds_oa == needs.gds_oa &&
queue->ring_info.sample_positions == needs.sample_positions)
return VK_SUCCESS;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 2f01c6e573f..9cad5b92241 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -716,6 +716,7 @@ struct radv_queue_ring_info {
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
bool tess_rings;
+ bool task_rings;
bool gds;
bool gds_oa;
bool sample_positions;
@@ -731,6 +732,7 @@ struct radv_queue_state {
struct radeon_winsys_bo *esgs_ring_bo;
struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo;
+ struct radeon_winsys_bo *task_rings_bo;
struct radeon_winsys_bo *gds_bo;
struct radeon_winsys_bo *gds_oa_bo;
@@ -1565,6 +1567,7 @@ struct radv_cmd_buffer {
uint32_t esgs_ring_size_needed;
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
+ bool task_rings_needed;
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;
More information about the mesa-commit mailing list