Mesa (main): radv: Refactor predication for compute queues.

Wed Jul 20 19:31:56 UTC 2022

Module: Mesa
Branch: main
Commit: e10cbb5d980b8476911de2fdff3e2e501c325c4d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e10cbb5d980b8476911de2fdff3e2e501c325c4d

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Thu Jun 16 16:40:45 2022 +0200

radv: Refactor predication for compute queues.

Initialize the inverted predication VA only when it is used
for the first time.

This is needed to get conditional rendering work correctly with
task shaders because the internal compute cmdbuf may not exist
yet when conditional rendering starts.

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16531>

---

 src/amd/vulkan/radv_cmd_buffer.c | 123 ++++++++++++++++++++-------------------
 src/amd/vulkan/radv_private.h    |   3 +
 2 files changed, 67 insertions(+), 59 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index e17c7b230e1..be9a929e3d1 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -533,6 +533,16 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
       cmd_buffer->descriptors[i].push_dirty = false;
    }
 
+   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
+      uint32_t pred_value = 0;
+      uint32_t pred_offset;
+      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
+         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      cmd_buffer->mec_inv_pred_emitted = false;
+      cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
+   }
+
    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
        cmd_buffer->qf == RADV_QUEUE_GENERAL) {
       unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
@@ -6301,18 +6311,56 @@ radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
  * Emulates predication for MEC using COND_EXEC.
  * When the current command buffer is predicating, emit a COND_EXEC packet
  * so that the MEC skips the next few dwords worth of packets.
+ *
+ * To make it work with inverted conditional rendering, we allocate
+ * space in the upload BO and emit some packets to invert the condition.
  */
 static void
-radv_cs_emit_compute_predication(struct radv_cmd_buffer *cmd_buffer, unsigned dwords)
+radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs,
+                                 uint64_t inv_va, bool *inv_emitted, unsigned dwords)
 {
-   if (!cmd_buffer->state.predicating)
+   if (!state->predicating)
       return;
 
-   struct radeon_cmdbuf *cs = cmd_buffer->cs;
+   uint64_t va = state->predication_va;
+
+   if (!state->predication_type) {
+      /* Invert the condition the first time it is needed. */
+      if (!*inv_emitted) {
+         *inv_emitted = true;
+
+         /* Write 1 to the inverted predication VA. */
+         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+                            COPY_DATA_WR_CONFIRM);
+         radeon_emit(cs, 1);
+         radeon_emit(cs, 0);
+         radeon_emit(cs, inv_va);
+         radeon_emit(cs, inv_va >> 32);
+
+         /* If the API predication VA == 0, skip next command. */
+         radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
+         radeon_emit(cs, va);
+         radeon_emit(cs, va >> 32);
+         radeon_emit(cs, 0);
+         radeon_emit(cs, 6); /* 1x COPY_DATA size */
+
+         /* Write 0 to the new predication VA (when the API condition != 0) */
+         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+                            COPY_DATA_WR_CONFIRM);
+         radeon_emit(cs, 0);
+         radeon_emit(cs, 0);
+         radeon_emit(cs, inv_va);
+         radeon_emit(cs, inv_va >> 32);
+      }
+
+      va = inv_va;
+   }
 
    radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
-   radeon_emit(cs, cmd_buffer->state.predication_va);
-   radeon_emit(cs, cmd_buffer->state.predication_va >> 32);
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
    radeon_emit(cs, 0); /* Cache policy */
    radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
 }
@@ -7649,7 +7697,9 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
       }
 
       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
-         radv_cs_emit_compute_predication(cmd_buffer, 4 /* DISPATCH_INDIRECT size */);
+         radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
+                                          &cmd_buffer->mec_inv_pred_emitted,
+                                          4 /* DISPATCH_INDIRECT size */);
          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
          radeon_emit(cs, info->va);
          radeon_emit(cs, info->va >> 32);
@@ -7732,7 +7782,9 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
       }
 
       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
-         radv_cs_emit_compute_predication(cmd_buffer, 5 /* DISPATCH_DIRECT size */);
+         radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
+                                          &cmd_buffer->mec_inv_pred_emitted,
+                                          5 /* DISPATCH_DIRECT size */);
          predicating = false;
       }
 
@@ -9096,54 +9148,8 @@ radv_CmdBeginConditionalRenderingEXT(
       pred_op = PREDICATION_OP_BOOL64;
    }
 
-
-   if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
-      /* MEC does not support real predication, so we have to emulate that
-       * using COND_EXEC packets before each dispatch.
-       */
-
-      if (!draw_visible) {
-         /* COND_EXEC can only skip the next packet(s) when the condition is 0.
-          * When using inverted conditional rendering, we allocate some space in
-          * the upload BO and emit some packets to invert the condition.
-          */
-
-         uint64_t pred_value = 0, pred_va;
-         unsigned pred_offset;
-
-         radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset);
-         pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
-
-         /* Write 1 to the new predication VA. */
-         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
-                         COPY_DATA_WR_CONFIRM);
-         radeon_emit(cs, 1);
-         radeon_emit(cs, 0);
-         radeon_emit(cs, pred_va);
-         radeon_emit(cs, pred_va >> 32);
-
-         /* If the API predication VA == 0, skip next command. */
-         radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
-         radeon_emit(cs, va);
-         radeon_emit(cs, va >> 32);
-         radeon_emit(cs, 0);
-         radeon_emit(cs, 6); /* 1x COPY_DATA size */
-
-         /* Write 0 to the new predication VA (when the API condition != 0) */
-         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
-                         COPY_DATA_WR_CONFIRM);
-         radeon_emit(cs, 0);
-         radeon_emit(cs, 0);
-         radeon_emit(cs, pred_va);
-         radeon_emit(cs, pred_va >> 32);
-
-         va = pred_va;
-         draw_visible = true;
-      }
-   } else {
-      /* Enable predication for this command buffer. */
+   /* MEC doesn't support predication, we emulate it elsewhere. */
+   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
       si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
    }
 
@@ -9152,6 +9158,7 @@ radv_CmdBeginConditionalRenderingEXT(
    cmd_buffer->state.predication_type = draw_visible;
    cmd_buffer->state.predication_op = pred_op;
    cmd_buffer->state.predication_va = va;
+   cmd_buffer->mec_inv_pred_emitted = false;
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -9159,11 +9166,8 @@ radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   /* Note, MEC doesn't support predication, so we
-    * don't have to emit anything for MEC here.
-    */
+   /* MEC doesn't support predication, no need to emit anything here. */
    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
-      /* Disable predication for this command buffer. */
       si_emit_set_predication_state(cmd_buffer, false, 0, 0);
    }
 
@@ -9172,6 +9176,7 @@ radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
    cmd_buffer->state.predication_type = -1;
    cmd_buffer->state.predication_op = 0;
    cmd_buffer->state.predication_va = 0;
+   cmd_buffer->mec_inv_pred_emitted = false;
 }
 
 /* VK_EXT_transform_feedback */
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index ca1df44922c..1f3676844e5 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1634,6 +1634,9 @@ struct radv_cmd_buffer {
    uint32_t gfx9_fence_idx;
    uint64_t gfx9_eop_bug_va;
 
+   uint64_t mec_inv_pred_va;  /* For inverted predication when using MEC. */
+   bool mec_inv_pred_emitted; /* To ensure we don't have to repeat inverting the VA. */
+
    /**
     * Whether a query pool has been resetted and we have to flush caches.
     */