Mesa (main): radv: Implement mesh shading draw calls with task shaders.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Jul 20 19:31:56 UTC 2022
Module: Mesa
Branch: main
Commit: 2479b6286974c1467bec5120df96650d297dcb7f
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=2479b6286974c1467bec5120df96650d297dcb7f
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Tue Feb 8 00:28:44 2022 +0100
radv: Implement mesh shading draw calls with task shaders.
This implements NV_mesh_shader draw calls with task shaders.
- On the GFX side:
DISPATCH_TASKMESH_GFX for all draws
- On the ACE side:
DISPATCH_TASKMESH_DIRECT_ACE for direct draws
DISPATCH_TASKMESH_INDIRECT_MULTI_ACE for indirect draws
Additionally, the NV_mesh_shader indirect BO layout is
incompatible with AMD HW, so we add a function that copies
the indirect data into a layout the hardware can consume.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16531>
---
src/amd/vulkan/radv_cmd_buffer.c | 439 +++++++++++++++++++++++++++++++++++----
1 file changed, 398 insertions(+), 41 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 1b4304f7095..4053d33696b 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -6375,7 +6375,7 @@ radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
{
struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
- radv_foreach_stage(stage, pipeline->active_stages) {
+ radv_foreach_stage(stage, pipeline->active_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) {
radv_emit_view_index_per_stage(cmd_buffer->cs, pipeline, stage, index);
}
if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
@@ -6386,6 +6386,10 @@ radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
}
}
+ if (pipeline->active_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
+ radv_emit_view_index_per_stage(cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
+ index);
+ }
}
/**
@@ -6529,6 +6533,109 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool index
}
}
+ALWAYS_INLINE static void
+radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer,
+ const uint32_t x, const uint32_t y,
+ const uint32_t z)
+{
+ struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
+ struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
+ struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
+ const bool predicating = cmd_buffer->state.predicating;
+ const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
+ S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);
+
+ struct radv_userdata_info *ring_entry_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
+ assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
+
+ uint32_t ring_entry_reg =
+ (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, x);
+ radeon_emit(cs, y);
+ radeon_emit(cs, z);
+ radeon_emit(cs, dispatch_initiator);
+ radeon_emit(cs, ring_entry_reg & 0xFFFF);
+}
+
+ALWAYS_INLINE static void
+radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer,
+ uint64_t data_va, uint32_t draw_count,
+ uint64_t count_va, uint32_t stride)
+{
+ assert((data_va & 0x03) == 0);
+ assert((count_va & 0x03) == 0);
+
+ struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
+ struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
+ struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
+
+ const uint32_t count_indirect_enable = !!count_va;
+ const uint32_t xyz_dim_enable = compute_shader->info.cs.uses_grid_size;
+ const uint32_t draw_id_enable = compute_shader->info.vs.needs_draw_id;
+ const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
+ S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);
+
+ const struct radv_userdata_info *ring_entry_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
+ const struct radv_userdata_info *xyz_dim_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
+ const struct radv_userdata_info *draw_id_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);
+
+ assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
+ assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
+ assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));
+
+ const uint32_t ring_entry_reg =
+ (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
+ const uint32_t xyz_dim_reg =
+ !xyz_dim_enable
+ ? 0
+ : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
+ const uint32_t draw_id_reg =
+ !draw_id_enable
+ ? 0
+ : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, data_va);
+ radeon_emit(cs, data_va >> 32);
+ radeon_emit(cs, ring_entry_reg & 0xFFFF);
+ radeon_emit(cs, (count_indirect_enable << 1) | (draw_id_enable << 2) | (xyz_dim_enable << 3) |
+ (draw_id_reg << 16));
+ radeon_emit(cs, xyz_dim_reg & 0xFFFF);
+ radeon_emit(cs, draw_count);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, count_va >> 32);
+ radeon_emit(cs, stride);
+ radeon_emit(cs, dispatch_initiator);
+}
+
+ALWAYS_INLINE static void
+radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
+{
+ struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
+ struct radeon_cmdbuf *cs = cmd_buffer->cs;
+ bool predicating = cmd_buffer->state.predicating;
+
+ struct radv_userdata_info *ring_entry_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_MESH, AC_UD_TASK_RING_ENTRY);
+
+ assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1);
+
+ uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
+ uint32_t xyz_dim_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
+ uint32_t ring_entry_reg = ((base_reg + ring_entry_loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2;
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating));
+ radeon_emit(cs, (ring_entry_reg << 16) | (xyz_dim_reg & 0xFFFF));
+ radeon_emit(cs, 0);
+ radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+}
+
static inline void
radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info, const uint32_t vertex_offset)
@@ -6606,6 +6713,77 @@ radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer,
}
}
+ALWAYS_INLINE static void
+radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer *cmd_buffer)
+{
+ struct radv_cmd_state *state = &cmd_buffer->state;
+ struct radeon_cmdbuf *cs = cmd_buffer->cs;
+ struct radv_graphics_pipeline *pipeline = state->graphics_pipeline;
+ const bool uses_drawid = pipeline->uses_drawid;
+
+ radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr, 1);
+ radeon_emit(cs, 0);
+
+ if (uses_drawid) {
+ radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr + (pipeline->vtx_emit_num - 1) * 4, 1);
+ radeon_emit(cs, 0);
+ }
+}
+
+ALWAYS_INLINE static void
+radv_emit_userdata_task_ib_only(struct radv_cmd_buffer *cmd_buffer, uint64_t ib_va,
+ uint32_t ib_stride)
+{
+ struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
+ struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
+
+ struct radv_userdata_info *task_ib_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_IB);
+
+ if (task_ib_loc->sgpr_idx != -1) {
+ assert(task_ib_loc->num_sgprs == 3);
+ unsigned task_ib_reg = R_00B900_COMPUTE_USER_DATA_0 + task_ib_loc->sgpr_idx * 4;
+
+ radeon_set_sh_reg_seq(cs, task_ib_reg, 3);
+ radeon_emit(cs, ib_va);
+ radeon_emit(cs, ib_va >> 32);
+ radeon_emit(cs, ib_stride);
+ }
+}
+
+ALWAYS_INLINE static void
+radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z,
+ uint32_t draw_id, uint32_t first_task, uint64_t ib_va)
+{
+ struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
+ struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
+
+ struct radv_userdata_info *xyz_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
+ struct radv_userdata_info *draw_id_loc =
+ radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);
+
+ if (xyz_loc->sgpr_idx != -1) {
+ assert(xyz_loc->num_sgprs == 3);
+ unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;
+
+ radeon_set_sh_reg_seq(cs, xyz_reg, 3);
+ radeon_emit(cs, x);
+ radeon_emit(cs, y);
+ radeon_emit(cs, z);
+ }
+
+ if (draw_id_loc->sgpr_idx != -1) {
+ assert(draw_id_loc->num_sgprs == 1);
+ unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;
+
+ radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
+ radeon_emit(cs, draw_id);
+ }
+
+ radv_emit_userdata_task_ib_only(cmd_buffer, ib_va, first_task ? 8 : 0);
+}
+
ALWAYS_INLINE static void
radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info,
@@ -6802,6 +6980,83 @@ radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer,
}
}
+ALWAYS_INLINE static void
+radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y,
+ uint32_t z, uint32_t first_task)
+{
+ uint64_t fake_ib_va = 0;
+ const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
+ const unsigned num_views = MAX2(1, util_bitcount(view_mask));
+ unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
+
+ if (first_task) {
+ /* Pass this as the IB to the shader for emulating firstTask in task shaders. */
+ uint32_t fake_ib_dwords[2] = {x, first_task};
+ unsigned fake_ib_offset;
+ radv_cmd_buffer_upload_data(cmd_buffer, 8, fake_ib_dwords, &fake_ib_offset);
+ fake_ib_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + fake_ib_offset;
+ }
+
+ radv_emit_userdata_task(cmd_buffer, x, y, z, 0, first_task, fake_ib_va);
+ radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
+ radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
+ cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
+ ace_predication_size);
+
+ if (!view_mask) {
+ radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
+ radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
+ } else {
+ u_foreach_bit (view, view_mask) {
+ radv_emit_view_index(cmd_buffer, view);
+ radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
+ radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
+ }
+ }
+}
+
+static void
+radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer,
+ const struct radv_draw_info *info, uint64_t nv_ib_va,
+ uint32_t nv_ib_stride)
+{
+ const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
+ struct radeon_winsys *ws = cmd_buffer->device->ws;
+ const unsigned num_views = MAX2(1, util_bitcount(view_mask));
+ unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
+
+ const uint64_t va =
+ radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
+ const uint64_t count_va = !info->count_buffer
+ ? 0
+ : radv_buffer_get_va(info->count_buffer->bo) +
+ info->count_buffer->offset + info->count_buffer_offset;
+
+ if (count_va) {
+ radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->count_buffer->bo);
+ }
+
+ radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->indirect->bo);
+ radv_emit_userdata_task_ib_only(cmd_buffer, nv_ib_va, nv_ib_stride);
+ radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
+ radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
+ cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
+ ace_predication_size);
+
+ if (!view_mask) {
+ radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
+ count_va, info->stride);
+ radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
+ } else {
+ u_foreach_bit (view, view_mask) {
+ radv_emit_view_index(cmd_buffer, view);
+ radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
+ count_va, info->stride);
+ radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
+ }
+ }
+}
+
static void
radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info)
@@ -7382,6 +7637,77 @@ radv_nv_mesh_indirect_bo(struct radv_cmd_buffer *cmd_buffer,
return buf;
}
+static struct radv_buffer
+radv_nv_task_indirect_bo(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buffer,
+ VkDeviceSize offset, uint32_t draw_count, uint32_t stride)
+{
+ /* Translates the indirect BO format used by NV_mesh_shader API
+ * to the BO format used by DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
+ */
+
+ assert(draw_count);
+ static_assert(sizeof(VkDispatchIndirectCommand) == 12, "Incorrect size of taskmesh command.");
+
+ struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
+ struct radeon_winsys *ws = cmd_buffer->device->ws;
+
+ const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
+ const size_t dst_stride = sizeof(VkDispatchIndirectCommand);
+ const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
+ const size_t dst_off_x = offsetof(VkDispatchIndirectCommand, x);
+
+ const unsigned new_disp_size = dst_stride * draw_count;
+
+ const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
+ radv_cs_add_buffer(ws, cs, buffer->bo);
+
+ /* Fill the buffer with X=0, Y=1, Z=1. */
+ VkDispatchIndirectCommand *fill_data = (VkDispatchIndirectCommand *)alloca(new_disp_size);
+ for (unsigned i = 0; i < draw_count; ++i) {
+ fill_data[i].x = 0;
+ fill_data[i].y = 1;
+ fill_data[i].z = 1;
+ }
+
+ /* Allocate space in the upload BO. */
+ unsigned out_offset;
+ ASSERTED bool uploaded =
+ radv_cmd_buffer_upload_data(cmd_buffer, new_disp_size, fill_data, &out_offset);
+ const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
+ assert(uploaded);
+
+ /* Clamp draw count to fit the actual size of the buffer.
+ * This is to avoid potential out of bounds copies (eg. for draws with an indirect count buffer).
+ * The remaining indirect draws will stay filled with X=0, Y=1, Z=1 which is harmless.
+ */
+ draw_count = MIN2(draw_count, (buffer->vk.size - buffer->offset - offset) / src_stride);
+
+ ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 6 * draw_count + 2);
+
+ /* Copy taskCount from the NV API BO to the X dispatch size of the compatible BO. */
+ for (unsigned i = 0; i < draw_count; ++i) {
+ const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
+ const uint64_t dst_x = new_va + i * dst_stride + dst_off_x;
+
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+ COPY_DATA_WR_CONFIRM);
+ radeon_emit(cs, src_task_count);
+ radeon_emit(cs, src_task_count >> 32);
+ radeon_emit(cs, dst_x);
+ radeon_emit(cs, dst_x >> 32);
+ }
+
+ assert(cs->cdw <= cdw_max);
+
+ /* The draw packet can now use this buffer: */
+ struct radv_buffer buf = *buffer;
+ buf.bo = cmd_buffer->upload.upload_bo;
+ buf.offset = out_offset;
+
+ return buf;
+}
+
VKAPI_ATTR void VKAPI_CALL
radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
uint32_t firstVertex, uint32_t firstInstance)
@@ -7574,16 +7900,7 @@ VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint32_t firstTask)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-
- ASSERTED struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
- assert(!pipeline->base.shaders[MESA_SHADER_TASK]);
-
- /* Direct draw with mesh shader only.
- *
- * Use a non-indexed direct draw packet: DRAW_INDEX_AUTO.
- * As far as the HW is concerned: 1 input vertex = 1 mesh shader workgroup.
- */
-
+ struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
struct radv_draw_info info;
info.count = taskCount;
@@ -7598,7 +7915,12 @@ radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint3
if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1))
return;
- radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask);
+ if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ radv_emit_direct_taskmesh_draw_packets(cmd_buffer, taskCount, 1, 1, firstTask);
+ } else {
+ radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask);
+ }
+
radv_after_draw(cmd_buffer);
}
@@ -7609,28 +7931,13 @@ radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
- ASSERTED struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
- assert(!pipeline->base.shaders[MESA_SHADER_TASK]);
-
- /* Indirect draw with mesh shader only.
- *
- * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws.
- * We must use these draw commands for NV_mesh_shader because these
- * have firstVertex while the new DISPATCH_MESH_INDIRECT_MULTI
- * command doesn't have that.
- *
- * The indirect BO layout from the NV_mesh_shader API is not directly
- * compatible with AMD HW. To make it work, we allocate some space
- * in the upload buffer and copy the data to it.
- */
- struct radv_buffer buf = radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset,
- drawCount, stride);
+ struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
struct radv_draw_info info;
+ info.indirect = buffer;
+ info.indirect_offset = offset;
+ info.stride = stride;
info.count = drawCount;
- info.indirect = &buf;
- info.indirect_offset = 0;
- info.stride = sizeof(VkDrawIndirectCommand);
info.strmout_buffer = NULL;
info.count_buffer = NULL;
info.indexed = false;
@@ -7638,7 +7945,41 @@ radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount))
return;
- radv_emit_indirect_draw_packets(cmd_buffer, &info);
+
+ /* Indirect draw with mesh shader only:
+ * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws.
+ * Needed because DISPATCH_MESH_INDIRECT_MULTI doesn't support firstTask.
+ *
+ * Indirect draw with task + mesh shaders:
+ * Use DISPATCH_TASKMESH_INDIRECT_MULTI_ACE + DISPATCH_TASKMESH_GFX.
+ * These packets don't support firstTask so we implement that by
+ * reading the NV command's indirect buffer in the shader.
+ *
+ * The indirect BO layout from the NV_mesh_shader API is incompatible
+ * with AMD HW. To make it work, we allocate some space
+ * in the upload buffer and copy the data to it.
+ */
+
+ if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
+ uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
+ struct radv_buffer buf =
+ radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
+ info.indirect = &buf;
+ info.indirect_offset = 0;
+ info.stride = sizeof(VkDispatchIndirectCommand);
+
+ radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
+ } else {
+ struct radv_buffer buf =
+ radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
+ info.indirect = &buf;
+ info.indirect_offset = 0;
+ info.stride = sizeof(VkDrawIndirectCommand);
+
+ radv_emit_indirect_draw_packets(cmd_buffer, &info);
+ }
+
radv_after_draw(cmd_buffer);
}
@@ -7652,17 +7993,13 @@ radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _bu
RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
- ASSERTED struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
- assert(!pipeline->base.shaders[MESA_SHADER_TASK]);
-
- struct radv_buffer buf = radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset,
- maxDrawCount, stride);
+ struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
struct radv_draw_info info;
+ info.indirect = buffer;
+ info.indirect_offset = offset;
+ info.stride = stride;
info.count = maxDrawCount;
- info.indirect = &buf;
- info.indirect_offset = 0;
- info.stride = sizeof(VkDrawIndirectCommand);
info.strmout_buffer = NULL;
info.count_buffer = count_buffer;
info.count_buffer_offset = countBufferOffset;
@@ -7671,7 +8008,27 @@ radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _bu
if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount))
return;
- radv_emit_indirect_draw_packets(cmd_buffer, &info);
+
+ if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
+ uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
+ struct radv_buffer buf =
+ radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
+ info.indirect = &buf;
+ info.indirect_offset = 0;
+ info.stride = sizeof(VkDispatchIndirectCommand);
+
+ radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
+ } else {
+ struct radv_buffer buf =
+ radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
+ info.indirect = &buf;
+ info.indirect_offset = 0;
+ info.stride = sizeof(VkDrawIndirectCommand);
+
+ radv_emit_indirect_draw_packets(cmd_buffer, &info);
+ }
+
radv_after_draw(cmd_buffer);
}
More information about the mesa-commit
mailing list