Mesa (main): tu: Read some input attachments directly

Tue Aug 10 15:13:38 UTC 2021

Module: Mesa
Branch: main
Commit: 380d4904ea6402f2829f9c0cf801c9d0996e61b8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=380d4904ea6402f2829f9c0cf801c9d0996e61b8

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Thu Aug  5 14:52:00 2021 +0200

tu: Read some input attachments directly

It can happen that the user reads an input attachment as the first use
of that attachment. In that case there are no subpass dependencies
required at all, because there could be a pipeline barrier before the
renderpass instead, and in any case we assume that dependencies with the
first subpass as a destination can be executed only once outside the
renderpass. The result is that we only do a CACHE_INVALIDATE once
before the entire renderpass, but it's actually required after each GMEM
load, because input attachments read GMEM through UCHE and the writes to
GMEM done by each load leave stale data in UCHE.

While we could add the missing CACHE_INVALIDATE "by hand" somehow, it
turns out it's actually just as easy to do an optimization the blob
does, where it simply doesn't patch those input attachments to point at
GMEM and reads them directly from the original image instead. This means
we can skip allocating memory in GMEM for them entirely in some
circumstances.

This fixes e.g.
dEQP-VK.api.copy_and_blit.core.resolve_image.whole_array_image.4_bit
with TU_DEBUG=forcebin.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12213>

---

 src/freedreno/ci/deqp-freedreno-a630-fails.txt |  7 ---
 src/freedreno/vulkan/tu_cmd_buffer.c           |  8 ++-
 src/freedreno/vulkan/tu_pass.c                 | 77 +++++++++++++++++++++++++-
 src/freedreno/vulkan/tu_private.h              | 10 ++++
 4 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/src/freedreno/ci/deqp-freedreno-a630-fails.txt
index 9d63a70ce95..c8a171583d9 100644
--- a/src/freedreno/ci/deqp-freedreno-a630-fails.txt
+++ b/src/freedreno/ci/deqp-freedreno-a630-fails.txt
@@ -11,13 +11,6 @@ KHR-GL33.transform_feedback.draw_xfb_stream_instanced_test,Crash
 KHR-GL33.transform_feedback.query_vertex_interleaved_test,Fail
 KHR-GL33.transform_feedback.query_vertex_separate_test,Fail
 
-# Fails with TU_DEBUG=forcebin
-# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12213
-dEQP-VK.api.copy_and_blit.core.resolve_image.whole_array_image.4_bit,Fail
-dEQP-VK.api.copy_and_blit.core.resolve_image.whole_array_image_one_region.4_bit,Fail
-dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_array_image.4_bit,Fail
-dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_array_image_one_region.4_bit,Fail
-
 # "Fail (createInstance returned VK_ERROR_INITIALIZATION_FAILED)"
 # happens inside the loader on anholt's debian system, and there are various
 # likely-looking fixes in later versions of the loader.
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 05a8a314288..31100c54d88 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1056,7 +1056,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
          gmem_offset = att->gmem_offset_stencil;
       }
 
-      if (!gmem)
+      if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
          continue;
 
       /* patched for gmem */
@@ -2924,6 +2924,9 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
       cmd->state.cache.pending_flush_bits;
    cmd->state.renderpass_cache.flush_bits = 0;
 
+   if (pass->subpasses[0].feedback_invalidate)
+      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
+
    /* Track LRZ valid state */
    uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
    if (a != VK_ATTACHMENT_UNUSED) {
@@ -3011,6 +3014,9 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
    /* Handle dependencies for the next subpass */
    tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
 
+   if (cmd->state.subpass->feedback_invalidate)
+      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
+
    /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
    tu6_emit_zs(cmd, cmd->state.subpass, cs);
    tu6_emit_mrt(cmd, cmd->state.subpass, cs);
diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c
index 7df0060c0d4..a76abe9fa5c 100644
--- a/src/freedreno/vulkan/tu_pass.c
+++ b/src/freedreno/vulkan/tu_pass.c
@@ -334,6 +334,75 @@ tu_render_pass_add_implicit_deps(struct tu_render_pass *pass,
    }
 }
 
+/* If an input attachment is used without an intervening write to the same
+ * attachment, then we can just use the original image, even in GMEM mode.
+ * This is an optimization, but it's also important because it allows us to
+ * avoid having to invalidate UCHE at the beginning of each tile due to it
+ * becoming invalid. The only reads of GMEM via UCHE should be after an
+ * earlier subpass modified it, which only works if there's already an
+ * appropriate dependency that will add the CACHE_INVALIDATE anyway. We
+ * don't consider this in the dependency code, so this is also required for
+ * correctness.
+ */
+static void
+tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
+{
+   bool written[pass->attachment_count];
+
+   memset(written, 0, sizeof(written));
+
+   for (unsigned i = 0; i < pass->subpass_count; i++) {
+      struct tu_subpass *subpass = &pass->subpasses[i];
+
+      for (unsigned j = 0; j < subpass->input_count; j++) {
+         uint32_t a = subpass->input_attachments[j].attachment;
+         if (a == VK_ATTACHMENT_UNUSED)
+            continue;
+         subpass->input_attachments[j].patch_input_gmem = written[a];
+      }
+
+      for (unsigned j = 0; j < subpass->color_count; j++) {
+         uint32_t a = subpass->color_attachments[j].attachment;
+         if (a == VK_ATTACHMENT_UNUSED)
+            continue;
+         written[a] = true;
+
+         for (unsigned k = 0; k < subpass->input_count; k++) {
+            if (subpass->input_attachments[k].attachment == a &&
+                !subpass->input_attachments[k].patch_input_gmem) {
+               /* For render feedback loops, we have no idea whether the use
+                * as a color attachment or input attachment will come first,
+                * so we have to always use GMEM in case the color attachment
+                * comes first and defensively invalidate UCHE in case the
+                * input attachment comes first.
+                */
+               subpass->feedback_invalidate = true;
+               subpass->input_attachments[k].patch_input_gmem = true;
+            }
+         }
+      }
+
+      for (unsigned j = 0; j < subpass->resolve_count; j++) {
+         uint32_t a = subpass->resolve_attachments[j].attachment;
+         if (a == VK_ATTACHMENT_UNUSED)
+            continue;
+         written[a] = true;
+      }
+
+      if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+         written[subpass->depth_stencil_attachment.attachment] = true;
+         for (unsigned k = 0; k < subpass->input_count; k++) {
+            if (subpass->input_attachments[k].attachment ==
+                subpass->depth_stencil_attachment.attachment &&
+                !subpass->input_attachments[k].patch_input_gmem) {
+               subpass->feedback_invalidate = true;
+               subpass->input_attachments[k].patch_input_gmem = true;
+            }
+         }
+      }
+   }
+}
+
 static void update_samples(struct tu_subpass *subpass,
                            VkSampleCountFlagBits samples)
 {
@@ -584,8 +653,10 @@ tu_CreateRenderPass2(VkDevice _device,
          for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
             uint32_t a = desc->pInputAttachments[j].attachment;
             subpass->input_attachments[j].attachment = a;
-            if (a != VK_ATTACHMENT_UNUSED)
-               pass->attachments[a].gmem_offset = 0;
+            /* Note: attachments only used as input attachments will be read
+             * directly instead of through gmem, so we don't mark input
+             * attachments as needing gmem.
+             */
          }
       }
 
@@ -635,6 +706,8 @@ tu_CreateRenderPass2(VkDevice _device,
       }
    }
 
+   tu_render_pass_patch_input_gmem(pass);
+
    /* disable unused attachments */
    for (uint32_t i = 0; i < pass->attachment_count; i++) {
       struct tu_render_pass_attachment *att = &pass->attachments[i];
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 9c867c916ff..d669595470f 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1542,6 +1542,12 @@ struct tu_subpass_barrier {
 struct tu_subpass_attachment
 {
    uint32_t attachment;
+
+   /* For input attachments, true if it needs to be patched to refer to GMEM
+    * in GMEM mode. This is false if it hasn't already been written as an
+    * attachment.
+    */
+   bool patch_input_gmem;
 };
 
 struct tu_subpass
@@ -1550,6 +1556,10 @@ struct tu_subpass
    uint32_t color_count;
    uint32_t resolve_count;
    bool resolve_depth_stencil;
+
+   /* True if we must invalidate UCHE thanks to a feedback loop. */
+   bool feedback_invalidate;
+
    struct tu_subpass_attachment *input_attachments;
    struct tu_subpass_attachment *color_attachments;
    struct tu_subpass_attachment *resolve_attachments;