Mesa (main): tu: Properly handle waiting on an earlier pipeline stage

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Aug 20 18:21:37 UTC 2021


Module: Mesa
Branch: main
Commit: abf0ae014a878d063132a4bf2f2515dc7052f069
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=abf0ae014a878d063132a4bf2f2515dc7052f069

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Thu Aug 19 15:49:00 2021 +0200

tu: Properly handle waiting on an earlier pipeline stage

I never really implemented this properly, because I wasn't aware of the
clusters when doing the original pipeline barrier implementation. It
turns out that the Vulkan stages we get as part of the barriers are
actually good for something: the pipeline is split into stages, so
earlier stages can run ahead of later stages, and sometimes we need to
wait when an earlier stage depends on the result of a later stage. This
happens most often when a shader reads the result of a color/depth
attachment write, because attachment writes happen in a logically later
stage, but it could also happen for an FS -> VS dependency.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12475>
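
For readers unfamiliar with the cluster model, here is a minimal
standalone sketch of the core rule this commit adds (see
tu_flush_for_stage() in the diff below): a wait-for-idle is needed
exactly when the producing stage sits later in the pipeline than the
consuming stage. The enum mirrors the tu_stage enum added in
tu_private.h; needs_wfi() and the main() driver are hypothetical,
for illustration only.

   #include <stdbool.h>
   #include <stdio.h>

   /* Hardware pipeline stages, earliest to latest (mirrors enum
    * tu_stage from tu_private.h in the diff below). */
   enum tu_stage {
      TU_STAGE_CP,
      TU_STAGE_FE,
      TU_STAGE_SP_VS,
      TU_STAGE_PC_VS,
      TU_STAGE_GRAS,
      TU_STAGE_SP_PS,
      TU_STAGE_PS,
   };

   /* A barrier needs a WFI only when the waiting (dst) stage is earlier
    * than the producing (src) stage; later stages already observe
    * earlier stages' results because draws execute in order within a
    * stage. */
   static bool
   needs_wfi(enum tu_stage src_stage, enum tu_stage dst_stage)
   {
      return src_stage > dst_stage;
   }

   int
   main(void)
   {
      /* The FS -> VS dependency from the commit message: FS output
       * (TU_STAGE_SP_PS) consumed by a later draw's VS
       * (TU_STAGE_SP_VS). */
      printf("FS -> VS needs WFI: %d\n",
             needs_wfi(TU_STAGE_SP_PS, TU_STAGE_SP_VS));
      /* The common VS -> FS direction needs no wait. */
      printf("VS -> FS needs WFI: %d\n",
             needs_wfi(TU_STAGE_SP_VS, TU_STAGE_SP_PS));
      return 0;
   }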

---

 src/freedreno/ci/deqp-freedreno-a630-fails.txt |   7 --
 src/freedreno/vulkan/tu_clear_blit.c           |   4 +-
 src/freedreno/vulkan/tu_cmd_buffer.c           | 163 +++++++++++++++++--------
 src/freedreno/vulkan/tu_pass.c                 |  15 +--
 src/freedreno/vulkan/tu_private.h              |  98 +++++++++------
 5 files changed, 174 insertions(+), 113 deletions(-)

diff --git a/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/src/freedreno/ci/deqp-freedreno-a630-fails.txt
index e5790183e98..1552cd5d1e6 100644
--- a/src/freedreno/ci/deqp-freedreno-a630-fails.txt
+++ b/src/freedreno/ci/deqp-freedreno-a630-fails.txt
@@ -81,13 +81,6 @@ dEQP-VK.api.info.get_physical_device_properties2.properties,Fail
 dEQP-VK.api.object_management.alloc_callback_fail.device,Fail
 dEQP-VK.api.object_management.alloc_callback_fail.device_group,Fail
 
-# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3019
-# should be fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7745
-dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail
-dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail
-dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail
-dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail
-
 # "deqp-vk: ../src/freedreno/vulkan/tu_cs.h:186: tu_cs_reserve: Assertion `tu_cs_get_space(cs) >= reserved_size' failed."
 # https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8841
 dEQP-VK.spirv_assembly.instruction.compute.opphi.wide,Crash
diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index 5f80034e89e..2bf02ca56d7 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -1800,6 +1800,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
        */
       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
       tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
+      tu_cs_emit_wfi(cs);
 
       tu_image_view_copy(&staging, &staging_image, dst_format,
                          &staging_subresource, 0, false);
@@ -2573,9 +2574,6 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
    if (!attachment->clear_mask)
       return;
 
-   /* Wait for any flushes at the beginning of the renderpass to complete */
-   tu_cs_emit_wfi(cs);
-
    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
          clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index e8801b2340e..869a2c8fba4 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -68,7 +68,7 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                  enum tu_cmd_flush_bits flushes)
 {
    if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL))
-      flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_GPU_INVALIDATE;
+      flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE;
 
    if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW))
       flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES |
@@ -2575,26 +2575,16 @@ tu_flush_for_access(struct tu_cache_state *cache,
 {
    enum tu_cmd_flush_bits flush_bits = 0;
 
-   if (src_mask & TU_ACCESS_HOST_WRITE) {
-      /* Host writes are always visible to CP, so only invalidate GPU caches */
-      cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE;
-   }
-
    if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
-      /* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as
-       * well.
-       */
       cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
    }
 
    if (src_mask & TU_ACCESS_CP_WRITE) {
-      /* Flush the CP write queue. However a WFI shouldn't be necessary as
-       * WAIT_MEM_WRITES should cover it.
+      /* Flush the CP write queue.
        */
       cache->pending_flush_bits |=
          TU_CMD_FLAG_WAIT_MEM_WRITES |
-         TU_CMD_FLAG_GPU_INVALIDATE |
-         TU_CMD_FLAG_WAIT_FOR_ME;
+         TU_CMD_FLAG_ALL_INVALIDATE;
    }
 
 #define SRC_FLUSH(domain, flush, invalidate) \
@@ -2624,8 +2614,7 @@ tu_flush_for_access(struct tu_cache_state *cache,
    /* Treat host & sysmem write accesses the same, since the kernel implicitly
     * drains the queue before signalling completion to the host.
     */
-   if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE |
-                   TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) {
+   if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
       flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
    }
 
@@ -2656,30 +2645,26 @@ tu_flush_for_access(struct tu_cache_state *cache,
 
 #undef DST_INCOHERENT_FLUSH
 
-   if (dst_mask & TU_ACCESS_WFI_READ) {
-      flush_bits |= cache->pending_flush_bits &
-         (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE);
-   }
-
-   if (dst_mask & TU_ACCESS_WFM_READ) {
-      flush_bits |= cache->pending_flush_bits &
-         (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME);
-   }
-
    cache->flush_bits |= flush_bits;
    cache->pending_flush_bits &= ~flush_bits;
 }
 
-static enum tu_cmd_access_mask
-vk2tu_access(VkAccessFlags flags, bool gmem)
-{
-   enum tu_cmd_access_mask mask = 0;
+static void
+tu_flush_for_stage(struct tu_cache_state *cache,
+                   enum tu_stage src_stage, enum tu_stage dst_stage)
+{
+   /* As far as we know, flushes take place in the last stage so if there are
+    * any pending flushes then we have to move down the source stage, because
+    * the data only becomes available when the flush finishes. In particular
+    * this can matter when the CP writes something and we need to invalidate
+    * UCHE to read it.
+    */
+   if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE))
+      src_stage = TU_STAGE_PS;
 
-   /* If the GPU writes a buffer that is then read by an indirect draw
-    * command, we theoretically need to emit a WFI to wait for any cache
-    * flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI to
-    * complete. Waiting for the WFI to complete is performed as part of the
-    * draw by the firmware, so we just need to execute the WFI.
+   /* Note: if the destination stage is the CP, then the CP also has to wait
+    * for any WFI's to finish. This is already done for draw calls, including
+    * before indirect param reads, for the most part, so we just need to WFI.
     *
     * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
     * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
@@ -2692,13 +2677,14 @@ vk2tu_access(VkAccessFlags flags, bool gmem)
     * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
     * comparisons, then this will have to be dealt with.
     */
-   if (flags &
-       (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
-        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
-        VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT |
-        VK_ACCESS_MEMORY_READ_BIT)) {
-      mask |= TU_ACCESS_WFI_READ;
-   }
+   if (src_stage > dst_stage)
+      cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
+}
+
+static enum tu_cmd_access_mask
+vk2tu_access(VkAccessFlags flags, bool gmem)
+{
+   enum tu_cmd_access_mask mask = 0;
 
    if (flags &
        (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
@@ -2717,13 +2703,13 @@ vk2tu_access(VkAccessFlags flags, bool gmem)
    if (flags &
        (VK_ACCESS_HOST_READ_BIT |
         VK_ACCESS_MEMORY_WRITE_BIT)) {
-      mask |= TU_ACCESS_HOST_READ;
+      mask |= TU_ACCESS_SYSMEM_READ;
    }
 
    if (flags &
        (VK_ACCESS_HOST_WRITE_BIT |
         VK_ACCESS_MEMORY_WRITE_BIT)) {
-      mask |= TU_ACCESS_HOST_WRITE;
+      mask |= TU_ACCESS_SYSMEM_WRITE;
    }
 
    if (flags &
@@ -2792,13 +2778,6 @@ vk2tu_access(VkAccessFlags flags, bool gmem)
       }
    }
 
-   /* When the dst access is a transfer read/write, it seems we sometimes need
-    * to insert a WFI after any flushes, to guarantee that the flushes finish
-    * before the 2D engine starts. However the opposite (i.e. a WFI after
-    * CP_BLIT and before any subsequent flush) does not seem to be needed, and
-    * the blob doesn't emit such a WFI.
-    */
-
    if (flags &
        (VK_ACCESS_TRANSFER_WRITE_BIT |
         VK_ACCESS_MEMORY_WRITE_BIT)) {
@@ -2807,18 +2786,82 @@ vk2tu_access(VkAccessFlags flags, bool gmem)
       } else {
          mask |= TU_ACCESS_CCU_COLOR_WRITE;
       }
-      mask |= TU_ACCESS_WFI_READ;
    }
 
    if (flags &
        (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
         VK_ACCESS_MEMORY_READ_BIT)) {
-      mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;
+      mask |= TU_ACCESS_UCHE_READ;
    }
 
    return mask;
 }
 
+static enum tu_stage
+vk2tu_single_stage(VkPipelineStageFlags vk_stage, bool dst)
+{
+   switch (vk_stage) {
+   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+   case VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT:
+   case VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT:
+      return TU_STAGE_CP;
+   case VK_PIPELINE_STAGE_VERTEX_INPUT_BIT:
+      return TU_STAGE_FE;
+   case VK_PIPELINE_STAGE_VERTEX_SHADER_BIT:
+   case VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT:
+   case VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT:
+   case VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT:
+      return TU_STAGE_SP_VS;
+   case VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT:
+   case VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT:
+      return TU_STAGE_SP_PS;
+   case VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT: /* Yes, really */
+   /* See comment in TU_STAGE_GRAS about early fragment tests */
+   case VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT:
+   case VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT:
+   case VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT:
+   case VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT:
+      return TU_STAGE_PS;
+
+   case VK_PIPELINE_STAGE_TRANSFER_BIT:
+      /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */
+      return dst ? TU_STAGE_SP_PS : TU_STAGE_PS;
+
+   case VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT:
+   case VK_PIPELINE_STAGE_ALL_COMMANDS_BIT:
+      /* Be conservative */
+      return dst ? TU_STAGE_CP : TU_STAGE_PS;
+
+   case VK_PIPELINE_STAGE_HOST_BIT:
+      return dst ? TU_STAGE_PS : TU_STAGE_CP;
+   }
+
+   unreachable("unknown pipeline stage");
+}
+
+static enum tu_stage
+vk2tu_src_stage(VkPipelineStageFlags vk_stages)
+{
+   enum tu_stage stage = TU_STAGE_CP;
+   u_foreach_bit (bit, vk_stages) {
+      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); 
+      stage = MAX2(stage, new_stage);
+   }
+
+   return stage;
+}
+
+static enum tu_stage
+vk2tu_dst_stage(VkPipelineStageFlags vk_stages)
+{
+   enum tu_stage stage = TU_STAGE_PS;
+   u_foreach_bit (bit, vk_stages) {
+      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); 
+      stage = MIN2(stage, new_stage);
+   }
+
+   return stage;
+}
 
 VKAPI_ATTR void VKAPI_CALL
 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
@@ -3007,6 +3050,10 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
       src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
 
    tu_flush_for_access(cache, src_flags, dst_flags);
+
+   enum tu_stage src_stage = vk2tu_src_stage(barrier->src_stage_mask);
+   enum tu_stage dst_stage = vk2tu_dst_stage(barrier->dst_stage_mask);
+   tu_flush_for_stage(cache, src_stage, dst_stage);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -3048,7 +3095,13 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
          cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
 
          tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);
-         tu6_emit_event_write(cmd, &cmd->cs, PC_CCU_FLUSH_COLOR_TS);
+
+         /* Clearing writes via CCU color in the PS stage, and LRZ is read via
+          * UCHE in the earlier GRAS stage.
+          */
+         cmd->state.cache.flush_bits |=
+            TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
+            TU_CMD_FLAG_WAIT_FOR_IDLE;
       } else {
          cmd->state.lrz.valid = false;
       }
@@ -4625,6 +4678,10 @@ tu_barrier(struct tu_cmd_buffer *cmd,
       cmd->state.pass  ? &cmd->state.renderpass_cache : &cmd->state.cache;
    tu_flush_for_access(cache, src_flags, dst_flags);
 
+   enum tu_stage src_stage = vk2tu_src_stage(info->srcStageMask);
+   enum tu_stage dst_stage = vk2tu_dst_stage(info->dstStageMask);
+   tu_flush_for_stage(cache, src_stage, dst_stage);
+
    for (uint32_t i = 0; i < info->eventCount; i++) {
       TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);
 
diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c
index feef006c3cd..811e0a43c87 100644
--- a/src/freedreno/vulkan/tu_pass.c
+++ b/src/freedreno/vulkan/tu_pass.c
@@ -99,15 +99,6 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass,
    if (dep_invalid_for_gmem(dep))
       pass->gmem_pixels = 0;
 
-   struct tu_subpass_barrier *src_barrier;
-   if (src == VK_SUBPASS_EXTERNAL) {
-      src_barrier = &pass->subpasses[0].start_barrier;
-   } else if (src == pass->subpass_count - 1) {
-      src_barrier = &pass->end_barrier;
-   } else {
-      src_barrier = &pass->subpasses[src + 1].start_barrier;
-   }
-
    struct tu_subpass_barrier *dst_barrier;
    if (dst == VK_SUBPASS_EXTERNAL) {
       dst_barrier = &pass->end_barrier;
@@ -115,9 +106,9 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass,
       dst_barrier = &pass->subpasses[dst].start_barrier;
    }
 
-   if (dep->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
-      src_barrier->src_stage_mask |= dep->srcStageMask;
-   src_barrier->src_access_mask |= dep->srcAccessMask;
+   dst_barrier->src_stage_mask |= dep->srcStageMask;
+   dst_barrier->dst_stage_mask |= dep->dstStageMask;
+   dst_barrier->src_access_mask |= dep->srcAccessMask;
    dst_barrier->dst_access_mask |= dep->dstAccessMask;
 }
 
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 9ae3d161ff8..eac25e6f00f 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -763,41 +763,23 @@ enum tu_cmd_access_mask {
     * the location of a cache entry in CCU, to avoid conflicts. We assume that
     * any access in a renderpass after or before an access by a transfer needs
     * a flush/invalidate, and use the _INCOHERENT variants to represent access
-    * by a transfer.
+    * by a renderpass.
     */
    TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
    TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
    TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
    TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
 
-   /* Accesses by the host */
-   TU_ACCESS_HOST_READ = 1 << 10,
-   TU_ACCESS_HOST_WRITE = 1 << 11,
-
-   /* Accesses by a GPU engine which bypasses any cache. e.g. writes via
-    * CP_EVENT_WRITE::BLIT and the CP are SYSMEM_WRITE.
-    */
-   TU_ACCESS_SYSMEM_READ = 1 << 12,
-   TU_ACCESS_SYSMEM_WRITE = 1 << 13,
-
-   /* Set if a WFI is required. This can be required for:
-    * - 2D engine which (on some models) doesn't wait for flushes to complete
-    *   before starting
-    * - CP draw indirect opcodes, where we need to wait for any flushes to
-    *   complete but the CP implicitly waits for WFI's to complete and
-    *   therefore we only need a WFI after the flushes.
-    */
-   TU_ACCESS_WFI_READ = 1 << 14,
-
-   /* Set if a CP_WAIT_FOR_ME is required due to the data being read by the CP
-    * without it waiting for any WFI.
+   /* Accesses which bypass any cache, e.g. writes via the host,
+    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
     */
-   TU_ACCESS_WFM_READ = 1 << 15,
+   TU_ACCESS_SYSMEM_READ = 1 << 10,
+   TU_ACCESS_SYSMEM_WRITE = 1 << 11,
 
    /* Memory writes from the CP start in-order with draws and event writes,
     * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
     */
-   TU_ACCESS_CP_WRITE = 1 << 16,
+   TU_ACCESS_CP_WRITE = 1 << 12,
 
    TU_ACCESS_READ =
       TU_ACCESS_UCHE_READ |
@@ -805,10 +787,7 @@ enum tu_cmd_access_mask {
       TU_ACCESS_CCU_DEPTH_READ |
       TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
       TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
-      TU_ACCESS_HOST_READ |
-      TU_ACCESS_SYSMEM_READ |
-      TU_ACCESS_WFI_READ |
-      TU_ACCESS_WFM_READ,
+      TU_ACCESS_SYSMEM_READ,
 
    TU_ACCESS_WRITE =
       TU_ACCESS_UCHE_WRITE |
@@ -816,7 +795,6 @@ enum tu_cmd_access_mask {
       TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
       TU_ACCESS_CCU_DEPTH_WRITE |
       TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
-      TU_ACCESS_HOST_WRITE |
       TU_ACCESS_SYSMEM_WRITE |
       TU_ACCESS_CP_WRITE,
 
@@ -825,6 +803,57 @@ enum tu_cmd_access_mask {
       TU_ACCESS_WRITE,
 };
 
+/* Starting with a6xx, the pipeline is split into several "clusters" (really
+ * pipeline stages). Each stage has its own pair of register banks and can
+ * switch them independently, so that earlier stages can run ahead of later
+ * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
+ * the same time.
+ *
+ * As a result of this, we need to insert a WFI when an earlier stage depends
+ * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
+ * pending WFI's to complete before starting, and usually before reading
+ * indirect params even, so a WFI also acts as a full "pipeline stall".
+ *
+ * Note, the names of the stages come from CLUSTER_* in devcoredump. We
+ * include all the stages for completeness, even ones which do not read/write
+ * anything.
+ */
+
+enum tu_stage {
+   /* This doesn't correspond to a cluster, but we need it for tracking
+    * indirect draw parameter reads etc.
+    */
+   TU_STAGE_CP,
+
+   /* - Fetch index buffer
+    * - Fetch vertex attributes, dispatch VS
+    */
+   TU_STAGE_FE,
+
+   /* Execute all geometry stages (VS thru GS) */
+   TU_STAGE_SP_VS,
+
+   /* Write to VPC, do primitive assembly. */
+   TU_STAGE_PC_VS,
+
+   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
+    * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
+    * early depth testing is enabled before dispatching fragments? However
+    * GRAS reads and writes LRZ directly.
+    */
+   TU_STAGE_GRAS,
+
+   /* Execute FS */
+   TU_STAGE_SP_PS,
+
+   /* - Fragment tests
+    * - Write color/depth
+    * - Streamout writes (???)
+    * - Varying interpolation (???)
+    */
+   TU_STAGE_PS,
+};
+
 enum tu_cmd_flush_bits {
    TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
    TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
@@ -845,18 +874,10 @@ enum tu_cmd_flush_bits {
        */
       TU_CMD_FLAG_WAIT_MEM_WRITES,
 
-   TU_CMD_FLAG_GPU_INVALIDATE =
+   TU_CMD_FLAG_ALL_INVALIDATE =
       TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
       TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
       TU_CMD_FLAG_CACHE_INVALIDATE,
-
-   TU_CMD_FLAG_ALL_INVALIDATE =
-      TU_CMD_FLAG_GPU_INVALIDATE |
-      /* Treat the CP as a sort of "cache" which may need to be "invalidated"
-       * via waiting for UCHE/CCU flushes to land with WFI/WFM.
-       */
-      TU_CMD_FLAG_WAIT_FOR_IDLE |
-      TU_CMD_FLAG_WAIT_FOR_ME,
 };
 
 /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
@@ -1546,6 +1567,7 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
 
 struct tu_subpass_barrier {
    VkPipelineStageFlags src_stage_mask;
+   VkPipelineStageFlags dst_stage_mask;
    VkAccessFlags src_access_mask;
    VkAccessFlags dst_access_mask;
    bool incoherent_ccu_color, incoherent_ccu_depth;
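
As a worked example of how the new helpers combine, here is a hedged
standalone sketch of the MAX/MIN reduction performed by
vk2tu_src_stage() and vk2tu_dst_stage() above, with u_foreach_bit
replaced by a plain lowest-set-bit loop and the Vulkan stage table
abbreviated to three stand-in bits. Apart from the tu_stage names taken
from the patch, all names and values here are hypothetical.

   #include <stdint.h>
   #include <stdio.h>

   enum tu_stage { TU_STAGE_CP, TU_STAGE_FE, TU_STAGE_SP_VS,
                   TU_STAGE_PC_VS, TU_STAGE_GRAS, TU_STAGE_SP_PS,
                   TU_STAGE_PS };

   /* Stand-ins for a few VkPipelineStageFlagBits values. */
   #define STAGE_DRAW_INDIRECT   (1u << 1)
   #define STAGE_VERTEX_SHADER   (1u << 3)
   #define STAGE_FRAGMENT_SHADER (1u << 7)

   static enum tu_stage
   single_stage(uint32_t bit, int dst)
   {
      switch (bit) {
      case STAGE_DRAW_INDIRECT:   return TU_STAGE_CP;
      case STAGE_VERTEX_SHADER:   return TU_STAGE_SP_VS;
      case STAGE_FRAGMENT_SHADER: return TU_STAGE_SP_PS;
      default:                    return dst ? TU_STAGE_CP : TU_STAGE_PS;
      }
   }

   /* Source side: the dependency is only satisfied once the *latest*
    * named stage has finished, so reduce with MAX. */
   static enum tu_stage
   src_stage(uint32_t mask)
   {
      enum tu_stage s = TU_STAGE_CP;
      for (uint32_t m = mask; m; m &= m - 1) {
         enum tu_stage t = single_stage(m & ~(m - 1), 0);
         if (t > s)
            s = t;
      }
      return s;
   }

   /* Destination side: the *earliest* named stage must wait, so reduce
    * with MIN. */
   static enum tu_stage
   dst_stage(uint32_t mask)
   {
      enum tu_stage s = TU_STAGE_PS;
      for (uint32_t m = mask; m; m &= m - 1) {
         enum tu_stage t = single_stage(m & ~(m - 1), 1);
         if (t < s)
            s = t;
      }
      return s;
   }

   int
   main(void)
   {
      /* FS writes consumed by indirect-draw params and a VS: the dst
       * reduces to TU_STAGE_CP, the src to TU_STAGE_SP_PS, so a WFI is
       * required. */
      uint32_t src = STAGE_FRAGMENT_SHADER;
      uint32_t dst = STAGE_DRAW_INDIRECT | STAGE_VERTEX_SHADER;
      printf("WFI needed: %d\n", src_stage(src) > dst_stage(dst));
      return 0;
   }

The initial values are the identity elements of the two reductions,
matching how the patch maps VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT to
TU_STAGE_CP and VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT to TU_STAGE_PS:
neither endpoint can ever make src_stage exceed dst_stage on its own.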


