Mesa (main): iris: Extend the cache tracker to handle L3 flushes and invalidates

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Apr 13 09:39:36 UTC 2022


Module: Mesa
Branch: main
Commit: 43e3747eea17659b5209dff48d337eff739ec3a0
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=43e3747eea17659b5209dff48d337eff739ec3a0

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Mon Aug  2 14:50:19 2021 -0700

iris: Extend the cache tracker to handle L3 flushes and invalidates

Most clients are L3-coherent these days.  However, there are some
notable exceptions, such as push constants, stream output, and command
streamer memory reads and writes.

With the advent of the tile cache, flushing the render or depth caches
alone is no longer sufficient for memory to become globally-observable.
For those, we need to flush the tile cache as well.  However, we'd like
to avoid that for L3-coherent clients, as it shouldn't be necessary,
and is expensive.

Reviewed-by: Rohan Garg <rohan.garg at intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15275>

---

 src/gallium/drivers/iris/iris_batch.h        | 52 +++++++++++++++++++++++++---
 src/gallium/drivers/iris/iris_context.h      |  4 +++
 src/gallium/drivers/iris/iris_pipe_control.c | 41 ++++++++++++++++++++--
 src/gallium/drivers/iris/iris_state.c        | 30 +++++++++++++++-
 4 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h
index 7f0179002ab..cf34e7f3472 100644
--- a/src/gallium/drivers/iris/iris_batch.h
+++ b/src/gallium/drivers/iris/iris_batch.h
@@ -162,6 +162,13 @@ struct iris_batch {
     */
    uint64_t coherent_seqnos[NUM_IRIS_DOMAINS][NUM_IRIS_DOMAINS];
 
+   /**
+    * A vector representing the cache coherency status of the L3.  For each
+    * cache domain i, l3_coherent_seqnos[i] denotes the seqno of the most
+    * recent flush of that domain which is visible to L3 clients.
+    */
+   uint64_t l3_coherent_seqnos[NUM_IRIS_DOMAINS];
+
    /**
     * Sequence number used to track the completion of any subsequent memory
     * operations in the batch until the next sync boundary.
@@ -351,7 +358,12 @@ static inline void
 iris_batch_mark_flush_sync(struct iris_batch *batch,
                            enum iris_domain access)
 {
-   batch->coherent_seqnos[access][access] = batch->next_seqno - 1;
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   if (iris_domain_is_l3_coherent(devinfo, access))
+      batch->l3_coherent_seqnos[access] = batch->next_seqno - 1;
+   else
+      batch->coherent_seqnos[access][access] = batch->next_seqno - 1;
 }
 
 /**
@@ -363,8 +375,38 @@ static inline void
 iris_batch_mark_invalidate_sync(struct iris_batch *batch,
                                 enum iris_domain access)
 {
-   for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++)
-      batch->coherent_seqnos[access][i] = batch->coherent_seqnos[i][i];
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
+      if (i == access)
+         continue;
+
+      if (iris_domain_is_l3_coherent(devinfo, access)) {
+         if (iris_domain_is_read_only(access)) {
+            /* Invalidating a L3-coherent read-only domain "access" also
+             * triggers an invalidation of any matching L3 cachelines as well.
+             *
+             * If domain 'i' is L3-coherent, it sees the latest data in L3,
+             * otherwise it sees the latest globally-observable data.
+             */
+            batch->coherent_seqnos[access][i] =
+               iris_domain_is_l3_coherent(devinfo, i) ?
+               batch->l3_coherent_seqnos[i] : batch->coherent_seqnos[i][i];
+         } else {
+            /* Invalidating L3-coherent write domains does not trigger
+             * an invalidation of any matching L3 cachelines, however.
+             *
+             * It sees the latest data from domain i visible to L3 clients.
+             */
+            batch->coherent_seqnos[access][i] = batch->l3_coherent_seqnos[i];
+         }
+      } else {
+         /* "access" isn't L3-coherent, so invalidating it means it sees the
+          * most recent globally-observable data from domain i.
+          */
+         batch->coherent_seqnos[access][i] = batch->coherent_seqnos[i][i];
+      }
+   }
 }
 
 /**
@@ -375,9 +417,11 @@ iris_batch_mark_invalidate_sync(struct iris_batch *batch,
 static inline void
 iris_batch_mark_reset_sync(struct iris_batch *batch)
 {
-   for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++)
+   for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
+      batch->l3_coherent_seqnos[i] = batch->next_seqno - 1;
       for (unsigned j = 0; j < NUM_IRIS_DOMAINS; j++)
          batch->coherent_seqnos[i][j] = batch->next_seqno - 1;
+   }
 }
 
 const char *
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 91f128d121b..7b73c7be06b 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -358,6 +358,10 @@ enum pipe_control_flags
     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
     PIPE_CONTROL_INSTRUCTION_INVALIDATE)
 
+#define PIPE_CONTROL_L3_RO_INVALIDATE_BITS       \
+   (PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE | \
+    PIPE_CONTROL_CONST_CACHE_INVALIDATE)
+
 enum iris_predicate_state {
    /* The first two states are used if we can determine whether to draw
     * without having to look at the values in the query object buffer. This
diff --git a/src/gallium/drivers/iris/iris_pipe_control.c b/src/gallium/drivers/iris/iris_pipe_control.c
index e9ed766e26e..bf6cf5909b1 100644
--- a/src/gallium/drivers/iris/iris_pipe_control.c
+++ b/src/gallium/drivers/iris/iris_pipe_control.c
@@ -184,8 +184,11 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
                              struct iris_bo *bo,
                              enum iris_domain access)
 {
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
    const struct brw_compiler *compiler = batch->screen->compiler;
 
+   const bool access_via_l3 = iris_domain_is_l3_coherent(devinfo, access);
+
    const uint32_t all_flush_bits = (PIPE_CONTROL_CACHE_FLUSH_BITS |
                                     PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                     PIPE_CONTROL_FLUSH_ENABLE);
@@ -211,6 +214,11 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
           PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE :
           PIPE_CONTROL_DATA_CACHE_FLUSH),
    };
+   const uint32_t l3_flush_bits[NUM_IRIS_DOMAINS] = {
+      [IRIS_DOMAIN_RENDER_WRITE] = PIPE_CONTROL_TILE_CACHE_FLUSH,
+      [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_TILE_CACHE_FLUSH,
+      [IRIS_DOMAIN_DATA_WRITE] = PIPE_CONTROL_DATA_CACHE_FLUSH,
+   };
    uint32_t bits = 0;
 
    /* Iterate over all read/write domains first in order to handle RaW
@@ -219,6 +227,8 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
     */
    for (unsigned i = 0; i < IRIS_DOMAIN_OTHER_WRITE; i++) {
       assert(!iris_domain_is_read_only(i));
+      assert(iris_domain_is_l3_coherent(devinfo, i));
+
       if (i != access) {
          const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]);
 
@@ -230,8 +240,19 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
          if (seqno > batch->coherent_seqnos[access][i]) {
             bits |= invalidate_bits[access];
 
-            if (seqno > batch->coherent_seqnos[i][i])
-               bits |= flush_bits[i];
+            if (access_via_l3) {
+               /* Both domains share L3.  If the most recent read/write access
+                * in domain `i' isn't visible to L3, then flush it to L3.
+                */
+               if (seqno > batch->l3_coherent_seqnos[i])
+                  bits |= flush_bits[i];
+            } else {
+               /* Domain `i` is L3 coherent but the specified domain is not.
+                * Flush both this cache and L3 out to memory.
+                */
+               if (seqno > batch->coherent_seqnos[i][i])
+                  bits |= flush_bits[i] | l3_flush_bits[i];
+            }
          }
       }
    }
@@ -246,10 +267,14 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
          assert(iris_domain_is_read_only(i));
          const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]);
 
+         const uint64_t last_visible_seqno =
+            iris_domain_is_l3_coherent(devinfo, i) ?
+            batch->l3_coherent_seqnos[i] : batch->coherent_seqnos[i][i];
+
          /* Flush if the most recent access from this domain occurred
           * after its most recent flush.
           */
-         if (seqno > batch->coherent_seqnos[i][i])
+         if (seqno > last_visible_seqno)
             bits |= flush_bits[i];
       }
    }
@@ -262,6 +287,8 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
    const unsigned i = IRIS_DOMAIN_OTHER_WRITE;
    const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]);
 
+   assert(!iris_domain_is_l3_coherent(devinfo, i));
+
    /* Invalidate unless the most recent read/write access from this
     * domain is already guaranteed to be visible to the specified
     * domain.  Flush if the most recent access from this domain
@@ -270,6 +297,14 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch,
    if (seqno > batch->coherent_seqnos[access][i]) {
       bits |= invalidate_bits[access];
 
+      /* There is a non-L3-coherent write that isn't visible to the
+       * specified domain.  If the access is via L3, then it might see
+       * stale L3 data that was loaded before that write.  In this case,
+       * we try to invalidate all read-only sections of the L3 cache.
+       */
+      if (access_via_l3 && seqno > batch->l3_coherent_seqnos[i])
+         bits |= PIPE_CONTROL_L3_RO_INVALIDATE_BITS;
+
       if (seqno > batch->coherent_seqnos[i][i])
          bits |= flush_bits[i];
    }
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 45c60aa8166..58cf5badb72 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -7592,6 +7592,8 @@ iris_rebind_buffer(struct iris_context *ice,
 static void
 batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
 {
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
    iris_batch_sync_boundary(batch);
 
    if ((flags & PIPE_CONTROL_CS_STALL)) {
@@ -7601,8 +7603,24 @@ batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
       if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
 
-      if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH))
+      if ((flags & PIPE_CONTROL_TILE_CACHE_FLUSH)) {
+         /* A tile cache flush makes any C/Z data in L3 visible to memory. */
+         const unsigned c = IRIS_DOMAIN_RENDER_WRITE;
+         const unsigned z = IRIS_DOMAIN_DEPTH_WRITE;
+         batch->coherent_seqnos[c][c] = batch->l3_coherent_seqnos[c];
+         batch->coherent_seqnos[z][z] = batch->l3_coherent_seqnos[z];
+      }
+
+      if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH)) {
+         /* HDC and DC flushes both flush the data cache out to L3 */
          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
+      }
+
+      if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) {
+         /* A DC flush also flushes L3 data cache lines out to memory. */
+         const unsigned i = IRIS_DOMAIN_DATA_WRITE;
+         batch->coherent_seqnos[i][i] = batch->l3_coherent_seqnos[i];
+      }
 
       if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
@@ -7652,6 +7670,16 @@ batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
 
    /* IRIS_DOMAIN_OTHER_READ no longer uses any caches. */
+
+   if ((flags & PIPE_CONTROL_L3_RO_INVALIDATE_BITS) == PIPE_CONTROL_L3_RO_INVALIDATE_BITS) {
+      /* If we just invalidated the read-only lines of L3, then writes from non-L3-coherent
+       * domains will now be visible to those L3 clients.
+       */
+      for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
+         if (!iris_domain_is_l3_coherent(devinfo, i))
+            batch->l3_coherent_seqnos[i] = batch->coherent_seqnos[i][i];
+      }
+   }
 }
 
 static unsigned



More information about the mesa-commit mailing list