Mesa (master): iris: Implement PIPE_FLUSH_DEFERRED support.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri May 1 19:13:07 UTC 2020


Module: Mesa
Branch: master
Commit: 1800e4b58caaa89acfe45c95d0d22e533b50ee03
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=1800e4b58caaa89acfe45c95d0d22e533b50ee03

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Mon Aug  5 13:18:39 2019 -0700

iris: Implement PIPE_FLUSH_DEFERRED support.

(Co-authored with Chris Wilson.)

Frequently, games create fences and later check them with a timeout of
0 to see if that work has completed yet.  They do not want the work to
be flushed immediately upon fence creation.
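
For illustration (not part of this commit), that pattern looks roughly
like this on the application side, using the standard GL sync objects
and assuming a current GL 3.2+ context:

   #include <stdbool.h>
   #include <epoxy/gl.h>   /* or any other GL 3.2+ loader */

   /* Drop a fence into the command stream after queueing some work. */
   static GLsync
   mark_work(void)
   {
      return glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
   }

   /* Poll it later.  No GL_SYNC_FLUSH_COMMANDS_BIT and a timeout of
    * zero means "just tell me", without forcing the driver to submit
    * anything.
    */
   static bool
   work_done(GLsync sync)
   {
      GLenum status = glClientWaitSync(sync, 0, 0);
      return status == GL_ALREADY_SIGNALED ||
             status == GL_CONDITION_SATISFIED;
   }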

This is what PIPE_FLUSH_DEFERRED does - it inhibits the flush at fence
creation time, but still guarantees that a flush will occur later,
once fence_finish() is called.
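
At the Gallium level that maps onto the flush flag directly.  A minimal
sketch of the interface usage (these are the stock pipe_context /
pipe_screen hooks; poll_deferred_fence() is just an illustrative name):

   #include "pipe/p_context.h"
   #include "pipe/p_defines.h"
   #include "pipe/p_screen.h"

   static bool
   poll_deferred_fence(struct pipe_context *ctx, struct pipe_screen *screen)
   {
      struct pipe_fence_handle *fence = NULL;

      /* Create the fence, but inhibit the flush for now. */
      ctx->flush(ctx, &fence, PIPE_FLUSH_DEFERRED);

      /* A timeout of zero polls; a blocking wait must eventually
       * perform the deferred flush itself, or the fence could never
       * signal.
       */
      bool done = screen->fence_finish(screen, ctx, fence, 0);

      screen->fence_reference(screen, &fence, NULL);
      return done;
   }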

Since syncpts can only occur at batch boundaries, when deferring a
flush we have to wait on the syncpt at the end of the batch being
constructed.  That is later than desired, but still safe as long as
we block.  To avoid extra delays, we additionally insert a
PIPE_CONTROL to write an availability bit at the exact point of the
fence.  We can poll this bit on the CPU, letting us check whether the
fence has gone by even if the batch as a whole hasn't completed, and
skip kernel calls entirely when it has.
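
In other words, the CPU-side check reduces to comparing a sequence
number that the PIPE_CONTROL wrote into a mapped, coherent buffer
against the value assigned to the fence.  A hypothetical sketch of the
idea (the names below are invented for illustration, not the actual
iris helpers):

   #include <stdbool.h>
   #include <stdint.h>

   struct seqno_fence {
      const volatile uint32_t *map;  /* CPU mapping of the status page */
      uint32_t seqno;                /* value the PIPE_CONTROL writes  */
   };

   static bool
   seqno_signaled(const struct seqno_fence *f)
   {
      /* Seqnos increase monotonically, so "already passed" is just a
       * comparison - no kernel call required.
       */
      return *f->map >= f->seqno;
   }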

Improves performance in Bioshock Infinite by 10% on Icelake GT2 with
-ForceCompatLevel=5 settings.  Thanks to Felix Degrood and Mark Janes
for helping notice the extraneous stalls and batches, to Marek Olšák
for adding deferred flush support to Gallium to solve this issue, and
to Chris Wilson for reworking much of the internals of this patch.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3802>

---

 src/gallium/drivers/iris/iris_fence.c | 96 ++++++++++++++++++++++++++++++++---
 1 file changed, 90 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_fence.c b/src/gallium/drivers/iris/iris_fence.c
index ff280599060..d4f11886f02 100644
--- a/src/gallium/drivers/iris/iris_fence.c
+++ b/src/gallium/drivers/iris/iris_fence.c
@@ -28,6 +28,7 @@
 
 #include <linux/sync_file.h>
 
+#include "util/u_debug.h"
 #include "util/u_inlines.h"
 #include "intel/common/gen_gem.h"
 
@@ -114,6 +115,9 @@ iris_batch_add_syncobj(struct iris_batch *batch,
 
 struct pipe_fence_handle {
    struct pipe_reference ref;
+
+   struct pipe_context *unflushed_ctx;
+
    struct iris_seqno *seqno[IRIS_BATCH_COUNT];
 };
 
@@ -170,6 +174,14 @@ iris_fence_flush(struct pipe_context *ctx,
    struct iris_screen *screen = (void *) ctx->screen;
    struct iris_context *ice = (struct iris_context *)ctx;
 
+   /* We require DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (kernel 5.2+) for
+    * deferred flushes.  Just ignore the request to defer on older kernels.
+    */
+   if (!(screen->kernel_features & KERNEL_HAS_WAIT_FOR_SUBMIT))
+      flags &= ~PIPE_FLUSH_DEFERRED;
+
+   const bool deferred = flags & PIPE_FLUSH_DEFERRED;
+
    if (flags & PIPE_FLUSH_END_OF_FRAME) {
       ice->frame++;
 
@@ -181,9 +193,10 @@ iris_fence_flush(struct pipe_context *ctx,
       }
    }
 
-   /* XXX PIPE_FLUSH_DEFERRED */
-   for (unsigned i = 0; i < IRIS_BATCH_COUNT; i++)
-      iris_batch_flush(&ice->batches[i]);
+   if (!deferred) {
+      for (unsigned i = 0; i < IRIS_BATCH_COUNT; i++)
+         iris_batch_flush(&ice->batches[i]);
+   }
 
    if (!out_fence)
       return;
@@ -194,13 +207,27 @@ iris_fence_flush(struct pipe_context *ctx,
 
    pipe_reference_init(&fence->ref, 1);
 
+   if (deferred)
+      fence->unflushed_ctx = ctx;
+
    for (unsigned b = 0; b < IRIS_BATCH_COUNT; b++) {
       struct iris_batch *batch = &ice->batches[b];
 
-      if (iris_seqno_signaled(batch->last_seqno))
-         continue;
+      if (deferred && iris_batch_bytes_used(batch) > 0) {
+         struct iris_seqno *seqno =
+            iris_seqno_new(batch, IRIS_SEQNO_BOTTOM_OF_PIPE);
+         iris_seqno_reference(screen, &fence->seqno[b], seqno);
+         iris_seqno_reference(screen, &seqno, NULL);
+      } else {
+         /* This batch has no commands queued up (perhaps we just flushed,
+          * or all the commands are on the other batch).  Wait for the last
+          * syncobj on this engine - unless it's already finished by now.
+          */
+         if (iris_seqno_signaled(batch->last_seqno))
+            continue;
 
-      iris_seqno_reference(screen, &fence->seqno[b], batch->last_seqno);
+         iris_seqno_reference(screen, &fence->seqno[b], batch->last_seqno);
+      }
    }
 
    iris_fence_reference(ctx->screen, out_fence, NULL);
@@ -213,6 +240,23 @@ iris_fence_await(struct pipe_context *ctx,
 {
    struct iris_context *ice = (struct iris_context *)ctx;
 
+   /* Unflushed fences from the same context are no-ops. */
+   if (ctx && ctx == fence->unflushed_ctx)
+      return;
+
+   /* XXX: We can't safely flush the other context, because it might be
+    *      bound to another thread, and poking at its internals wouldn't
+    *      be safe.  In the future we should use MI_SEMAPHORE_WAIT and
+    *      block until the other job has been submitted, relying on
+    *      kernel timeslicing to preempt us until the other job is
+    *      actually flushed and the seqno finally passes.
+    */
+   if (fence->unflushed_ctx) {
+      pipe_debug_message(&ice->dbg, CONFORMANCE, "%s",
+                         "glWaitSync on unflushed fence from another context "
+                         "is unlikely to work without kernel 5.8+\n");
+   }
+
    /* Flush any current work in our context as it doesn't need to wait
     * for this fence.  Any future work in our context must wait.
     */
@@ -263,8 +307,32 @@ iris_fence_finish(struct pipe_screen *p_screen,
                   struct pipe_fence_handle *fence,
                   uint64_t timeout)
 {
+   struct iris_context *ice = (struct iris_context *)ctx;
    struct iris_screen *screen = (struct iris_screen *)p_screen;
 
+   /* If we created the fence with PIPE_FLUSH_DEFERRED, we may not have
+    * flushed yet.  Check if our syncobj is the current batch's signalling
+    * syncobj - if so, we haven't flushed and need to now.
+    *
+    * The Gallium docs mention that a flush will occur if \p ctx matches
+    * the context the fence was created with.  It may be NULL, so we check
+    * that it matches first.
+    */
+   if (ctx && ctx == fence->unflushed_ctx) {
+      for (unsigned i = 0; i < IRIS_BATCH_COUNT; i++) {
+         struct iris_seqno *seqno = fence->seqno[i];
+
+         if (iris_seqno_signaled(seqno))
+            continue;
+
+         if (seqno->syncobj == iris_batch_get_signal_syncobj(&ice->batches[i]))
+            iris_batch_flush(&ice->batches[i]);
+      }
+
+      /* The fence is no longer deferred. */
+      fence->unflushed_ctx = NULL;
+   }
+
    unsigned int handle_count = 0;
    uint32_t handles[ARRAY_SIZE(fence->seqno)];
    for (unsigned i = 0; i < ARRAY_SIZE(fence->seqno); i++) {
@@ -285,6 +353,18 @@ iris_fence_finish(struct pipe_screen *p_screen,
       .timeout_nsec = rel2abs(timeout),
       .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL
    };
+
+   if (fence->unflushed_ctx) {
+      /* This fence had a deferred flush from another context.  We can't
+       * safely flush it here, because the context might be bound to a
+       * different thread, and poking at its internals wouldn't be safe.
+       *
+       * Instead, use the WAIT_FOR_SUBMIT flag to block and hope that
+       * another thread submits the work.
+       */
+      args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
+   }
+
    return gen_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) == 0;
 }
 
@@ -317,6 +397,10 @@ iris_fence_get_fd(struct pipe_screen *p_screen,
    struct iris_screen *screen = (struct iris_screen *)p_screen;
    int fd = -1;
 
+   /* Deferred fences aren't supported. */
+   if (fence->unflushed_ctx)
+      return -1;
+
    for (unsigned i = 0; i < ARRAY_SIZE(fence->seqno); i++) {
       struct iris_seqno *seqno = fence->seqno[i];
 



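For reference, the wait at the bottom of iris_fence_finish() boils down
to the standalone pattern below.  This is a sketch: rel2abs() is written
out as the usual relative-to-absolute CLOCK_MONOTONIC conversion, and
the EINTR-retrying gen_ioctl() wrapper is replaced by a plain ioctl():

   #include <stdbool.h>
   #include <stdint.h>
   #include <time.h>
   #include <sys/ioctl.h>
   #include <drm/drm.h>   /* kernel uAPI; packaged by libdrm */

   /* The syncobj ioctl takes an absolute CLOCK_MONOTONIC deadline in
    * nanoseconds; zero stays zero, which makes the call a pure poll.
    */
   static uint64_t
   rel2abs(uint64_t timeout_ns)
   {
      if (timeout_ns == 0)
         return 0;

      struct timespec ts;
      clock_gettime(CLOCK_MONOTONIC, &ts);

      uint64_t now = (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
      return timeout_ns > INT64_MAX - now ? INT64_MAX : now + timeout_ns;
   }

   static bool
   wait_syncobjs(int drm_fd, const uint32_t *handles, uint32_t count,
                 uint64_t timeout_ns, bool wait_for_submit)
   {
      struct drm_syncobj_wait args = {
         .handles = (uintptr_t)handles,
         .timeout_nsec = rel2abs(timeout_ns),
         .count_handles = count,
         .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL |
                  (wait_for_submit ? DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT
                                   : 0),
      };

      /* WAIT_FOR_SUBMIT also waits for the syncobj to receive a fence
       * at all, which is what covers the "another context still has to
       * flush" deferred case above.
       */
      return ioctl(drm_fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) == 0;
   }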