Mesa (main): radv: Implement mesh shader scratch ring.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Jun 8 09:12:36 UTC 2022
Module: Mesa
Branch: main
Commit: 0280b526d58e85d65b53d3f9c8b0f7364d853751
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=0280b526d58e85d65b53d3f9c8b0f7364d853751
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Fri May 20 18:12:36 2022 +0200
radv: Implement mesh shader scratch ring.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>
---
src/amd/vulkan/radv_cmd_buffer.c | 6 +++++
src/amd/vulkan/radv_constants.h | 25 ++++++++++++++++-
src/amd/vulkan/radv_device.c | 54 +++++++++++++++++++++++++++++++------
src/amd/vulkan/radv_nir_lower_abi.c | 10 +++++++
src/amd/vulkan/radv_private.h | 3 +++
src/amd/vulkan/radv_shader.c | 1 +
src/amd/vulkan/radv_shader.h | 1 +
7 files changed, 91 insertions(+), 9 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index ac6851dc908..1caf50522f2 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -509,6 +509,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->gsvs_ring_size_needed = 0;
cmd_buffer->tess_rings_needed = false;
cmd_buffer->task_rings_needed = false;
+ cmd_buffer->mesh_scratch_ring_needed = false;
cmd_buffer->gds_needed = false;
cmd_buffer->gds_oa_needed = false;
cmd_buffer->sample_positions_needed = false;
@@ -5260,6 +5261,9 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
cmd_buffer->tess_rings_needed = true;
+ if (mesh_shading)
+ cmd_buffer->mesh_scratch_ring_needed |=
+ pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
cmd_buffer->task_rings_needed = true;
@@ -5801,6 +5805,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
primary->tess_rings_needed = true;
if (secondary->task_rings_needed)
primary->task_rings_needed = true;
+ if (secondary->mesh_scratch_ring_needed)
+ primary->mesh_scratch_ring_needed = true;
if (secondary->sample_positions_needed)
primary->sample_positions_needed = true;
if (secondary->gds_needed)
diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h
index 4b6d3b9667b..1953a5d2201 100644
--- a/src/amd/vulkan/radv_constants.h
+++ b/src/amd/vulkan/radv_constants.h
@@ -76,7 +76,8 @@
#define RING_HS_TESS_OFFCHIP 6
#define RING_TS_DRAW 7
#define RING_TS_PAYLOAD 8
-#define RING_PS_SAMPLE_POSITIONS 9
+#define RING_MS_SCRATCH 9
+#define RING_PS_SAMPLE_POSITIONS 10
/* max number of descriptor sets */
#define MAX_SETS 32
@@ -91,6 +92,28 @@
*/
#define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull
+/* Number of entries in the mesh shader scratch ring.
+ * This depends on VGT_GS_MAX_WAVE_ID, which is set by the kernel
+ * and is impossible to query. We size the ring for the maximum value,
+ * because real applications are unlikely to use the scratch ring.
+ *
+ * The maximum ID on GFX10.3 is 2047 (0x7ff), so we need 2048 entries.
+ */
+#define RADV_MESH_SCRATCH_NUM_ENTRIES 2048
+
+/* Size of each entry in the mesh shader scratch ring.
+ * We must ensure that the absolute maximum mesh shader output fits here.
+ *
+ * Mesh shaders can create up to 256 vertices/primitives per workgroup,
+ * and up to the following number of outputs:
+ * - 32 parameters
+ * - 4 positions (clip/cull distance, etc.)
+ * - 4 per-primitive built-in outputs (layer, view index, prim id, VRS rate)
+ * - primitive indices, which are always kept in LDS and are not counted below
+ * That is a total of 32+4+4=40 output slots x 16 bytes per slot x 256 = 160 KiB.
+ */
+#define RADV_MESH_SCRATCH_ENTRY_BYTES (160 * 1024)
+
/* Number of invocations in each subgroup. */
#define RADV_SUBGROUP_SIZE 64
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index eb66c4215cb..e92b56a8a05 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3605,7 +3605,8 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo,
uint32_t gsvs_ring_size, struct radeon_winsys_bo *gsvs_ring_bo,
struct radeon_winsys_bo *tess_rings_bo,
- struct radeon_winsys_bo *task_rings_bo)
+ struct radeon_winsys_bo *task_rings_bo,
+ struct radeon_winsys_bo *mesh_scratch_ring_bo)
{
uint32_t *desc = &map[4];
@@ -3791,6 +3792,27 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
desc += 8;
+ if (mesh_scratch_ring_bo) {
+ uint64_t va = radv_buffer_get_va(mesh_scratch_ring_bo);
+
+ desc[0] = va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
+ desc[2] = RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES;
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (device->physical_device->rad_info.gfx_level >= GFX11) {
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
+ } else {
+ assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+ }
+ }
+
+ desc += 4;
+
if (add_sample_positions) {
/* add sample positions after all rings */
memcpy(desc, device->sample_locations_1x, 8);
@@ -4083,6 +4105,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
struct radeon_winsys_bo *gsvs_ring_bo = queue->gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo = queue->tess_rings_bo;
struct radeon_winsys_bo *task_rings_bo = queue->task_rings_bo;
+ struct radeon_winsys_bo *mesh_scratch_ring_bo = queue->mesh_scratch_ring_bo;
struct radeon_winsys_bo *gds_bo = queue->gds_bo;
struct radeon_winsys_bo *gds_oa_bo = queue->gds_oa_bo;
struct radeon_cmdbuf *dest_cs[3] = {0};
@@ -4154,6 +4177,16 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
goto fail;
}
+ if (!queue->ring_info.mesh_scratch_ring && needs->mesh_scratch_ring) {
+ assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
+ result =
+ ws->buffer_create(ws, RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES, 256,
+ RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, &mesh_scratch_ring_bo);
+
+ if (result != VK_SUCCESS)
+ goto fail;
+ }
+
if (!queue->ring_info.gds && needs->gds) {
assert(device->physical_device->rad_info.gfx_level >= GFX10);
@@ -4184,10 +4217,11 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
if ((queue->qf == RADV_QUEUE_COMPUTE && !descriptor_bo && task_rings_bo) ||
scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo || tess_rings_bo != queue->tess_rings_bo ||
- task_rings_bo != queue->task_rings_bo || add_sample_positions) {
+ task_rings_bo != queue->task_rings_bo || mesh_scratch_ring_bo != queue->mesh_scratch_ring_bo ||
+ add_sample_positions) {
uint32_t size = 0;
- if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions) {
- size = 144; /* 2 dword + 2 padding + 4 dword * 8 */
+ if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions) {
+ size = 160; /* 2 dword + 2 padding + 4 dword * 9 */
if (add_sample_positions)
size += 128; /* 64+32+16+8 = 120 bytes */
} else if (scratch_bo) {
@@ -4220,10 +4254,10 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
map[1] = rsrc1;
}
- if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions)
+ if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions)
radv_fill_shader_rings(device, map, add_sample_positions, needs->esgs_ring_size,
esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo,
- task_rings_bo);
+ task_rings_bo, mesh_scratch_ring_bo);
ws->buffer_unmap(descriptor_bo);
}
@@ -4238,7 +4272,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
/* Continue preamble is unnecessary when no shader rings are used. */
if (!needs->scratch_size_per_wave && !needs->compute_scratch_size_per_wave &&
!needs->esgs_ring_size && !needs->gsvs_ring_size && !needs->tess_rings &&
- !needs->task_rings && !needs->gds && !needs->gds_oa && !needs->sample_positions)
+ !needs->task_rings && !needs->mesh_scratch_ring && !needs->gds && !needs->gds_oa && !needs->sample_positions)
continue;
}
@@ -4368,6 +4402,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
queue->tess_rings_bo = tess_rings_bo;
queue->task_rings_bo = task_rings_bo;
+ queue->mesh_scratch_ring_bo = mesh_scratch_ring_bo;
queue->gds_bo = gds_bo;
queue->gds_oa_bo = gds_oa_bo;
queue->ring_info = *needs;
@@ -4539,6 +4574,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.gsvs_ring_size = MAX2(needs.gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
needs.tess_rings |= cmd_buffer->tess_rings_needed;
needs.task_rings |= cmd_buffer->task_rings_needed;
+ needs.mesh_scratch_ring |= cmd_buffer->mesh_scratch_ring_needed;
needs.gds |= cmd_buffer->gds_needed;
needs.gds_oa |= cmd_buffer->gds_oa_needed;
needs.sample_positions |= cmd_buffer->sample_positions_needed;
@@ -4565,7 +4601,9 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
queue->ring_info.esgs_ring_size == needs.esgs_ring_size &&
queue->ring_info.gsvs_ring_size == needs.gsvs_ring_size &&
queue->ring_info.tess_rings == needs.tess_rings &&
- queue->ring_info.task_rings == needs.task_rings && queue->ring_info.gds == needs.gds &&
+ queue->ring_info.task_rings == needs.task_rings &&
+ queue->ring_info.mesh_scratch_ring == needs.mesh_scratch_ring &&
+ queue->ring_info.gds == needs.gds &&
queue->ring_info.gds_oa == needs.gds_oa &&
queue->ring_info.sample_positions == needs.sample_positions)
return VK_SUCCESS;
diff --git a/src/amd/vulkan/radv_nir_lower_abi.c b/src/amd/vulkan/radv_nir_lower_abi.c
index 0570a0f1d88..7b6231114c4 100644
--- a/src/amd/vulkan/radv_nir_lower_abi.c
+++ b/src/amd/vulkan/radv_nir_lower_abi.c
@@ -173,6 +173,14 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
case nir_intrinsic_load_ring_task_payload_amd:
return load_ring(b, RING_TS_PAYLOAD, s);
+ case nir_intrinsic_load_ring_mesh_scratch_amd:
+ return load_ring(b, RING_MS_SCRATCH, s);
+
+ case nir_intrinsic_load_ring_mesh_scratch_offset_amd:
+ /* gs_tg_info[0:11] is ordered_wave_id. Multiply by the ring entry size. */
+ return nir_imul_imm(b, nir_iand_imm(b, ac_nir_load_arg(b, &s->args->ac, s->args->ac.gs_tg_info), 0xfff),
+ RADV_MESH_SCRATCH_ENTRY_BYTES);
+
case nir_intrinsic_load_task_ring_entry_amd:
return ac_nir_load_arg(b, &s->args->ac, s->args->ac.task_ring_entry);
@@ -230,6 +238,8 @@ filter_abi_instr(const nir_instr *instr,
intrin->intrinsic == nir_intrinsic_load_viewport_y_offset ||
intrin->intrinsic == nir_intrinsic_load_ring_task_draw_amd ||
intrin->intrinsic == nir_intrinsic_load_ring_task_payload_amd ||
+ intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_amd ||
+ intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_offset_amd ||
intrin->intrinsic == nir_intrinsic_load_task_ring_entry_amd ||
intrin->intrinsic == nir_intrinsic_load_task_ib_addr ||
intrin->intrinsic == nir_intrinsic_load_task_ib_stride ||
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 9cad5b92241..0d409d49d58 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -717,6 +717,7 @@ struct radv_queue_ring_info {
uint32_t gsvs_ring_size;
bool tess_rings;
bool task_rings;
+ bool mesh_scratch_ring;
bool gds;
bool gds_oa;
bool sample_positions;
@@ -733,6 +734,7 @@ struct radv_queue_state {
struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo;
struct radeon_winsys_bo *task_rings_bo;
+ struct radeon_winsys_bo *mesh_scratch_ring_bo;
struct radeon_winsys_bo *gds_bo;
struct radeon_winsys_bo *gds_oa_bo;
@@ -1568,6 +1570,7 @@ struct radv_cmd_buffer {
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
bool task_rings_needed;
+ bool mesh_scratch_ring_needed;
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 3f4d1b3551a..f7e5470aec7 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1242,6 +1242,7 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_
} else if (nir->info.stage == MESA_SHADER_MESH) {
bool scratch_ring = false;
NIR_PASS_V(nir, ac_nir_lower_ngg_ms, &scratch_ring, info->wave_size, pl_key->has_multiview_view_index);
+ ngg_stage->info.ms.needs_ms_scratch_ring = scratch_ring;
} else {
unreachable("invalid SW stage passed to radv_lower_ngg");
}
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 0044d1e98d6..fbd43140184 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -364,6 +364,7 @@ struct radv_shader_info {
struct {
struct radv_vs_output_info outinfo;
enum shader_prim output_prim;
+ bool needs_ms_scratch_ring;
} ms;
struct radv_streamout_info so;
More information about the mesa-commit
mailing list