Mesa (main): turnip: Skip load/stores for tiles with no geometry

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Apr 29 09:50:53 UTC 2022


Module: Mesa
Branch: main
Commit: 0c489f18cb27d3c725f424f8f57d45636f4eb297
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=0c489f18cb27d3c725f424f8f57d45636f4eb297

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Thu Apr 14 17:19:21 2022 +0300

turnip: Skip load/stores for tiles with no geometry

When HW binning is used, tile loads/stores can be skipped
if there is no geometry in the tile.

Loads can be skipped when:
- The attachment won't be resolved; otherwise, if the load were skipped,
  there would be holes in the resolved attachment;
- There is no vkCmdClearAttachments afterwards, since such a clear is
  likely a partial clear done via a 2d blit (a 2d blit doesn't produce
  geometry).

Stores can be skipped when:
- The attachment was not cleared, which may happen via load_op or
  vkCmdClearAttachments;
- The store is not a resolve.

I chose to predicate each load/store separately to allow them to be
skipped when only some attachments are cleared or resolved.

Gmem loads are moved into a separate cs because whether to emit
CP_COND_REG_EXEC depends on HW binning being enabled and on the usage
of vkCmdClearAttachments.

The CP_COND_REG_EXEC predicate can be changed during draw_cs only by
a perf query; in such a case the predicate should be re-emitted.
(At the moment it is always re-emitted before stores.)

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15974>

---

 src/freedreno/vulkan/tu_clear_blit.c |  68 +++++++++++++++++++++-
 src/freedreno/vulkan/tu_cmd_buffer.c | 106 ++++++++++++++++++++++++++++++-----
 src/freedreno/vulkan/tu_pass.c       |  11 ++++
 src/freedreno/vulkan/tu_private.h    |  10 +++-
 src/freedreno/vulkan/tu_query.c      |   4 ++
 5 files changed, 182 insertions(+), 17 deletions(-)

diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index a96be2613e2..555b5edf26f 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -2280,6 +2280,8 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
          }
       }
+
+      cmd->state.attachment_cmd_clear[a] = true;
    }
 
    /* We may not know the multisample count if there are no attachments, so
@@ -2551,6 +2553,8 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
          if (a == VK_ATTACHMENT_UNUSED)
                continue;
 
+         cmd->state.attachment_cmd_clear[a] = true;
+
          tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                        &attachments[j].clearValue);
       }
@@ -2799,24 +2803,64 @@ blit_can_resolve(VkFormat format)
    return true;
 }
 
+static void
+tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
+                              struct tu_cs *cs, bool load)
+{
+   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
+}
+
+static void
+tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
+                            struct tu_cs *cs, bool load)
+{
+   tu_cond_exec_end(cs);
+}
+
 void
 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
+                        bool cond_exec_allowed,
                         bool force_load)
 {
    const struct tu_image_view *iview = cmd->state.attachments[a];
    const struct tu_render_pass_attachment *attachment =
       &cmd->state.pass->attachments[a];
 
+   bool load_common = attachment->load || force_load;
+   bool load_stencil =
+      attachment->load_stencil ||
+      (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
+
+   if (!load_common && !load_stencil)
+      return;
+
    trace_start_gmem_load(&cmd->trace, cs);
 
-   if (attachment->load || force_load)
+   /* If attachment will be cleared by vkCmdClearAttachments - it is likely
+    * that it would be partially cleared, and since it is done by 2d blit
+    * it doesn't produce geometry, so we have to unconditionally load.
+    *
+    * To simplify conditions treat partially cleared separate DS as fully
+    * cleared and don't emit cond_exec.
+    */
+   bool cond_exec = cond_exec_allowed &&
+                    !attachment->clear_mask &&
+                    !cmd->state.attachment_cmd_clear[a] &&
+                    !attachment->will_be_resolved;
+   if (cond_exec)
+      tu_begin_load_store_cond_exec(cmd, cs, true);
+
+   if (load_common)
       tu_emit_blit(cmd, cs, iview, attachment, false, false);
 
-   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
+   if (load_stencil)
       tu_emit_blit(cmd, cs, iview, attachment, false, true);
 
+   if (cond_exec)
+      tu_end_load_store_cond_exec(cmd, cs, true);
+
    trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
 }
 
@@ -2919,7 +2963,8 @@ void
 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t a,
-                         uint32_t gmem_a)
+                         uint32_t gmem_a,
+                         bool cond_exec_allowed)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const VkRect2D *render_area = &cmd->state.render_area;
@@ -2930,6 +2975,15 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
    if (!dst->store && !dst->store_stencil)
       return;
 
+   bool was_cleared = src->clear_mask || cmd->state.attachment_cmd_clear[a];
+   /* Unconditional store should happen only if attachment was cleared,
+    * which could have happened either by load_op or via vkCmdClearAttachments.
+    */
+   bool cond_exec = cond_exec_allowed && !was_cleared;
+   if (cond_exec) {
+      tu_begin_load_store_cond_exec(cmd, cs, false);
+   }
+
    uint32_t x1 = render_area->offset.x;
    uint32_t y1 = render_area->offset.y;
    uint32_t x2 = x1 + render_area->extent.width;
@@ -2971,6 +3025,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
       if (store_separate_stencil)
          tu_emit_blit(cmd, cs, iview, src, true, true);
 
+      if (cond_exec) {
+         tu_end_load_store_cond_exec(cmd, cs, false);
+      }
+
       trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
       return;
    }
@@ -3011,5 +3069,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
       }
    }
 
+   if (cond_exec) {
+      tu_end_load_store_cond_exec(cmd, cs, false);
+   }
+
    trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
 }
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index e97765ccdc8..3f1dd3831b6 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -632,6 +632,25 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
    return use_sysmem;
 }
 
+/* Optimization: there is no reason to load gmem if there is no
+ * geometry to process. COND_REG_EXEC predicate is set here,
+ * but the actual skip happens in tile_load_cs and tile_store_cs,
+ * for each blit separately.
+ */
+static void
+tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                              uint32_t pipe, uint32_t slot, bool wfm)
+{
+   if (use_hw_binning(cmd)) {
+      tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
+      tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) |
+                     A6XX_CP_REG_TEST_0_BIT(slot) |
+                     COND(wfm, A6XX_CP_REG_TEST_0_WAIT_FOR_ME));
+   } else {
+      /* COND_REG_EXECs are not emitted in non-binning case */
+   }
+}
+
 static void
 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
@@ -664,6 +683,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
       tu_cs_emit(cs, pipe * 4);
       tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);
 
+      tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, true);
+
       tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
       tu_cs_emit(cs, 0x0);
 
@@ -740,6 +761,15 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
    }
 }
 
+static void
+tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+   tu6_emit_blit_scissor(cmd, cs, true);
+
+   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
+      tu_load_gmem_attachment(cmd, cs, i, use_hw_binning(cmd), false);
+}
+
 static void
 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -756,7 +786,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
    for (uint32_t a = 0; a < pass->attachment_count; ++a) {
       if (pass->attachments[a].gmem_offset >= 0)
-         tu_store_gmem_attachment(cmd, cs, a, a);
+         tu_store_gmem_attachment(cmd, cs, a, a, use_hw_binning(cmd));
    }
 
    if (subpass->resolve_attachments) {
@@ -764,7 +794,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
          uint32_t a = subpass->resolve_attachments[i].attachment;
          if (a != VK_ATTACHMENT_UNUSED) {
             uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
-            tu_store_gmem_attachment(cmd, cs, a, gmem_a);
+            tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
          }
       }
    }
@@ -1220,11 +1250,6 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
 
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
 
-   tu6_emit_blit_scissor(cmd, cs, true);
-
-   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
-      tu_load_gmem_attachment(cmd, cs, i, false);
-
    tu6_emit_blit_scissor(cmd, cs, false);
 
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
@@ -1356,8 +1381,10 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 }
 
 static void
-tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                uint32_t pipe, uint32_t slot)
 {
+   tu_cs_emit_call(cs, &cmd->tile_load_cs);
    tu_cs_emit_call(cs, &cmd->draw_cs);
 
    if (use_hw_binning(cmd)) {
@@ -1365,6 +1392,10 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
       tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
    }
 
+   /* Predicate is changed in draw_cs so we have to re-emit it */
+   if (cmd->state.draw_cs_writes_to_cond_pred)
+      tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
+
    tu_cs_emit_call(cs, &cmd->tile_store_cs);
 
    if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) {
@@ -1418,7 +1449,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
                tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
 
                trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
-               tu6_render_tile(cmd, &cmd->cs);
+               tu6_render_tile(cmd, &cmd->cs, pipe, slot);
                trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
             }
          }
@@ -1491,6 +1522,7 @@ tu_create_cmd_buffer(struct tu_device *device,
    list_inithead(&cmd_buffer->renderpass_autotune_results);
 
    tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
+   tu_cs_init(&cmd_buffer->tile_load_cs, device, TU_CS_MODE_GROW, 2048);
    tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
    tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
    tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
@@ -1507,11 +1539,14 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
    list_del(&cmd_buffer->pool_link);
 
    tu_cs_finish(&cmd_buffer->cs);
+   tu_cs_finish(&cmd_buffer->tile_load_cs);
    tu_cs_finish(&cmd_buffer->draw_cs);
    tu_cs_finish(&cmd_buffer->tile_store_cs);
    tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
    tu_cs_finish(&cmd_buffer->sub_cs);
 
+   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear);
+
    u_trace_fini(&cmd_buffer->trace);
 
    tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
@@ -1535,11 +1570,15 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    cmd_buffer->record_result = VK_SUCCESS;
 
    tu_cs_reset(&cmd_buffer->cs);
+   tu_cs_reset(&cmd_buffer->tile_load_cs);
    tu_cs_reset(&cmd_buffer->draw_cs);
    tu_cs_reset(&cmd_buffer->tile_store_cs);
    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
    tu_cs_reset(&cmd_buffer->sub_cs);
 
+   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear);
+   cmd_buffer->state.attachment_cmd_clear = NULL;
+
    tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
 
    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
@@ -1678,6 +1717,7 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    cmd_buffer->usage_flags = pBeginInfo->flags;
 
    tu_cs_begin(&cmd_buffer->cs);
+   tu_cs_begin(&cmd_buffer->tile_load_cs);
    tu_cs_begin(&cmd_buffer->draw_cs);
    tu_cs_begin(&cmd_buffer->tile_store_cs);
    tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
@@ -1710,6 +1750,14 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
          cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
          cmd_buffer->state.subpass =
             &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
+         /* vkCmdClearAttachments is allowed in a secondary cmdbuf and we have to
+          * track it as in primary cmdbuf.
+          */
+         cmd_buffer->state.attachment_cmd_clear =
+            vk_zalloc(&cmd_buffer->pool->vk.alloc,
+                      cmd_buffer->state.pass->attachment_count *
+                         sizeof(cmd_buffer->state.attachment_cmd_clear[0]),
+                      8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
       } else {
          /* When executing in the middle of another command buffer, the CCU
           * state is unknown.
@@ -2245,6 +2293,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
    }
 
    tu_cs_end(&cmd_buffer->cs);
+   tu_cs_end(&cmd_buffer->tile_load_cs);
    tu_cs_end(&cmd_buffer->draw_cs);
    tu_cs_end(&cmd_buffer->tile_store_cs);
    tu_cs_end(&cmd_buffer->draw_epilogue_cs);
@@ -3061,7 +3110,7 @@ vk2tu_src_stage(VkPipelineStageFlags vk_stages)
 {
    enum tu_stage stage = TU_STAGE_CP;
    u_foreach_bit (bit, vk_stages) {
-      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); 
+      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
       stage = MAX2(stage, new_stage);
    }
 
@@ -3073,7 +3122,7 @@ vk2tu_dst_stage(VkPipelineStageFlags vk_stages)
 {
    enum tu_stage stage = TU_STAGE_PS;
    u_foreach_bit (bit, vk_stages) {
-      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); 
+      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
       stage = MIN2(stage, new_stage);
    }
 
@@ -3130,6 +3179,14 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
             cmd->state.has_subpass_predication = true;
          if (secondary->state.disable_gmem)
             cmd->state.disable_gmem = true;
+
+         cmd->state.draw_cs_writes_to_cond_pred |=
+            secondary->state.draw_cs_writes_to_cond_pred;
+
+         for (uint32_t i = 0; i < cmd->state.pass->attachment_count; i++) {
+            cmd->state.attachment_cmd_clear[i] |=
+               secondary->state.attachment_cmd_clear[i];
+         }
       } else {
          assert(tu_cs_is_empty(&secondary->draw_cs));
          assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@@ -3307,6 +3364,18 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
       return;
    }
 
+   cmd->state.attachment_cmd_clear =
+      vk_zalloc(&cmd->pool->vk.alloc, pass->attachment_count *
+               sizeof(cmd->state.attachment_cmd_clear[0]), 8,
+               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+   if (!cmd->state.attachment_cmd_clear) {
+      cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      return;
+   }
+
+   cmd->state.draw_cs_writes_to_cond_pred = false;
+
    for (unsigned i = 0; i < pass->attachment_count; i++) {
       cmd->state.attachments[i] = pAttachmentInfo ?
          tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
@@ -3400,7 +3469,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
 
          uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
 
-         tu_store_gmem_attachment(cmd, cs, a, gmem_a);
+         tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
 
          if (pass->attachments[a].gmem_offset < 0)
             continue;
@@ -3410,7 +3479,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
           * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
           */
          tu_finishme("missing GMEM->GMEM resolve path\n");
-         tu_load_gmem_attachment(cmd, cs, a, true);
+         tu_load_gmem_attachment(cmd, cs, a, false, true);
       }
    }
 
@@ -4627,8 +4696,15 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
 
+   /* GMEM loads are created after draw_cs in the separate cs
+    * because they need to know whether to allow their conditional
+    * execution, which is tied to a state that is known only at
+    * the end of the renderpass.
+    */
+   tu6_emit_tile_load(cmd_buffer, &cmd_buffer->tile_load_cs);
    tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs);
 
+   tu_cs_end(&cmd_buffer->tile_load_cs);
    tu_cs_end(&cmd_buffer->draw_cs);
    tu_cs_end(&cmd_buffer->tile_store_cs);
    tu_cs_end(&cmd_buffer->draw_epilogue_cs);
@@ -4649,6 +4725,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
 
    /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
       rendered */
+   tu_cs_discard_entries(&cmd_buffer->tile_load_cs);
+   tu_cs_begin(&cmd_buffer->tile_load_cs);
    tu_cs_discard_entries(&cmd_buffer->draw_cs);
    tu_cs_begin(&cmd_buffer->draw_cs);
    tu_cs_discard_entries(&cmd_buffer->tile_store_cs);
@@ -4661,6 +4739,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
    tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
 
    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
+   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear);
+   cmd_buffer->state.attachment_cmd_clear = NULL;
 
    cmd_buffer->state.pass = NULL;
    cmd_buffer->state.subpass = NULL;
diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c
index e7bc2c7da0f..64d9de2676f 100644
--- a/src/freedreno/vulkan/tu_pass.c
+++ b/src/freedreno/vulkan/tu_pass.c
@@ -800,6 +800,12 @@ tu_CreateRenderPass2(VkDevice _device,
          for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
             subpass->resolve_attachments[j].attachment =
                   desc->pResolveAttachments[j].attachment;
+
+            uint32_t src_a = desc->pColorAttachments[j].attachment;
+            if (src_a != VK_ATTACHMENT_UNUSED) {
+               pass->attachments[src_a].will_be_resolved =
+                  desc->pResolveAttachments[j].attachment != VK_ATTACHMENT_UNUSED;
+            }
          }
       }
 
@@ -808,6 +814,11 @@ tu_CreateRenderPass2(VkDevice _device,
          subpass->resolve_count++;
          uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
          subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a;
+
+         uint32_t src_a = desc->pDepthStencilAttachment->attachment;
+         if (src_a != VK_ATTACHMENT_UNUSED) {
+            pass->attachments[src_a].will_be_resolved = a != VK_ATTACHMENT_UNUSED;
+         }
       }
 
       uint32_t a = desc->pDepthStencilAttachment ?
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index b35647f9887..52b4fc3bccb 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1196,6 +1196,10 @@ struct tu_cmd_state
    VkRect2D render_area;
 
    const struct tu_image_view **attachments;
+   /* Tracks whether attachment was cleared by vkCmdClearAttachments */
+   bool *attachment_cmd_clear;
+   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
+   bool draw_cs_writes_to_cond_pred;
 
    bool xfb_used;
    bool has_tess;
@@ -1290,6 +1294,7 @@ struct tu_cmd_buffer
    VkResult record_result;
 
    struct tu_cs cs;
+   struct tu_cs tile_load_cs;
    struct tu_cs draw_cs;
    struct tu_cs tile_store_cs;
    struct tu_cs draw_epilogue_cs;
@@ -1576,6 +1581,7 @@ void
 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
+                        bool cond_exec_allowed,
                         bool force_load);
 
 /* expose this function to be able to emit load without checking LOAD_OP */
@@ -1587,7 +1593,8 @@ void
 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t a,
-                         uint32_t gmem_a);
+                         uint32_t gmem_a,
+                         bool cond_exec_allowed);
 
 enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
 
@@ -1857,6 +1864,7 @@ struct tu_render_pass_attachment
    bool load;
    bool store;
    int32_t gmem_offset;
+   bool will_be_resolved;
    /* for D32S8 separate stencil: */
    bool load_stencil;
    bool store_stencil;
diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c
index 7bf710f5e30..2b374b73626 100644
--- a/src/freedreno/vulkan/tu_query.c
+++ b/src/freedreno/vulkan/tu_query.c
@@ -874,6 +874,10 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
    uint32_t last_pass = ~0;
 
+   if (cmdbuf->state.pass) {
+      cmdbuf->state.draw_cs_writes_to_cond_pred = true;
+   }
+
    /* Querying perf counters happens in these steps:
     *
     *  0) There's a scratch reg to set a pass index for perf counters query.



More information about the mesa-commit mailing list