[Mesa-dev] [PATCH 7/7] i965: Use persistent CPU mappings for the program cache even on non-LLC.

Kenneth Graunke kenneth at whitecape.org
Tue Jan 17 07:14:17 UTC 2017


The non-LLC story was a horror show.  We uploaded data via pwrite
(drm_intel_bo_subdata), which would stall if the cache BO was in
use (being read) by the GPU.  Obviously, we wanted to avoid that.
So, we tried to detect whether the buffer was busy, and if so, we'd
allocate a new BO, map the old one read-only (hopefully not stalling),
copy all shaders compiled since the dawn of time to the new buffer,
upload our new one, toss the old BO, and let the state upload code
know that our program cache BO changed.  This was a lot of extra data
copying, and flagging BRW_NEW_PROGRAM_CACHE would also cause a new
STATE_BASE_ADDRESS to be emitted, stalling the entire pipeline.

Not only that, but our rudimentary busy tracking consisted of a flag
set at execbuf time and not cleared until we threw out the program
cache BO.  So, the first shader upload after any drawing would hit this
"abandon the cache and start over" copying path.

None of this is necessary - it's just ancient crufty code.  We can
use the same persistent mapping paths on all platforms.  On non-LLC
platforms, we simply need to clflush after memcpy'ing in new shaders,
so they're visible to the GPU.  This is not only better, but the code
is also significantly simpler.
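
For reference, the upload path after this change boils down to the
following condensed sketch.  It is not part of the patch; upload_program
is a made-up name for illustration, while brw_clflush_range is the flush
helper the patch below relies on:

   static void
   upload_program(struct brw_cache *cache, bool has_llc,
                  uint32_t offset, const void *data, size_t size)
   {
      /* The cache BO is mapped for writing once at creation time and
       * that mapping is kept for the lifetime of the cache.
       */
      char *dst = (char *) cache->bo->virtual + offset;

      memcpy(dst, data, size);

      /* Without LLC, the CPU caches are not coherent with the GPU, so
       * flush the written range before the GPU reads the new program.
       */
      if (!has_llc)
         brw_clflush_range(dst, size);
   }

The persistent CPU pointer comes from a single drm_intel_bo_map(bo, true)
at cache creation time; no pwrite or BO swapping is involved anymore.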

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.h       |  1 -
 src/mesa/drivers/dri/i965/brw_program_cache.c | 58 ++++++++-------------------
 src/mesa/drivers/dri/i965/intel_batchbuffer.c |  6 ---
 3 files changed, 16 insertions(+), 49 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b032d511a1e..70f240383b1 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -461,7 +461,6 @@ struct brw_cache {
    GLuint size, n_items;
 
    uint32_t next_offset;
-   bool bo_used_by_gpu;
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
index 0c1da172588..b76ff12e9fe 100644
--- a/src/mesa/drivers/dri/i965/brw_program_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
@@ -207,32 +207,31 @@ brw_search_cache(struct brw_cache *cache,
 }
 
 static void
+memcpy_coherent(bool has_llc, void *dest, const void *src, size_t n)
+{
+   memcpy(dest, src, n);
+   if (!has_llc)
+      brw_clflush_range(dest, n);
+}
+
+static void
 brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
 {
    struct brw_context *brw = cache->brw;
    drm_intel_bo *new_bo;
 
    new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
-   if (brw->has_llc)
-      drm_intel_bo_map(new_bo, true);
+   drm_intel_bo_map(new_bo, true);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
-      if (brw->has_llc) {
-         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
-      } else {
-         drm_intel_bo_map(cache->bo, false);
-         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
-                              cache->bo->virtual);
-         drm_intel_bo_unmap(cache->bo);
-      }
+      memcpy_coherent(brw->has_llc, new_bo->virtual, cache->bo->virtual,
+                      cache->next_offset);
    }
 
-   if (brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
+   drm_intel_bo_unmap(cache->bo);
    drm_intel_bo_unreference(cache->bo);
    cache->bo = new_bo;
-   cache->bo_used_by_gpu = false;
 
    /* Since we have a new BO in place, we need to signal the units
     * that depend on it (state base address on gen5+, or unit state before).
@@ -249,7 +248,6 @@ brw_lookup_prog(const struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   const struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
 
@@ -260,11 +258,7 @@ brw_lookup_prog(const struct brw_cache *cache,
          if (item->cache_id != cache_id || item->size != data_size)
             continue;
 
-         if (!brw->has_llc)
-            drm_intel_bo_map(cache->bo, false);
          ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
-         if (!brw->has_llc)
-            drm_intel_bo_unmap(cache->bo);
          if (ret)
             continue;
 
@@ -279,7 +273,6 @@ static uint32_t
 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
    uint32_t offset;
-   struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
    if (cache->next_offset + size > cache->bo->size) {
@@ -291,14 +284,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
       brw_cache_new_bo(cache, new_size);
    }
 
-   /* If we would block on writing to an in-use program BO, just
-    * recreate it.
-    */
-   if (!brw->has_llc && cache->bo_used_by_gpu) {
-      perf_debug("Copying busy program cache buffer.\n");
-      brw_cache_new_bo(cache, cache->bo->size);
-   }
-
    offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
@@ -363,11 +348,8 @@ brw_upload_cache(struct brw_cache *cache,
       item->offset = brw_alloc_item_data(cache, data_size);
 
       /* Copy data to the buffer */
-      if (brw->has_llc) {
-         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
-      } else {
-         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
-      }
+      memcpy_coherent(brw->has_llc, cache->bo->virtual + item->offset,
+                      data, data_size);
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -404,8 +386,7 @@ brw_init_caches(struct brw_context *brw)
       calloc(cache->size, sizeof(struct brw_cache_item *));
 
    cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache",  4096, 64);
-   if (brw->has_llc)
-      drm_intel_bo_map(cache->bo, true);
+   drm_intel_bo_map(cache->bo, true);
 }
 
 static void
@@ -482,8 +463,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    DBG("%s\n", __func__);
 
-   if (brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
+   drm_intel_bo_unmap(cache->bo);
    drm_intel_bo_unreference(cache->bo);
    cache->bo = NULL;
    brw_clear_cache(brw, cache);
@@ -532,9 +512,6 @@ brw_print_program_cache(struct brw_context *brw)
    const struct brw_cache *cache = &brw->cache;
    struct brw_cache_item *item;
 
-   if (!brw->has_llc)
-      drm_intel_bo_map(cache->bo, false);
-
    for (unsigned i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
          fprintf(stderr, "%s:\n", cache_name(i));
@@ -542,7 +519,4 @@ brw_print_program_cache(struct brw_context *brw)
                          item->offset, item->size, stderr);
       }
    }
-
-   if (!brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index d1b9317a8c1..29ff49d06f1 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -265,12 +265,6 @@ brw_finish_batch(struct brw_context *brw)
                                           PIPE_CONTROL_CS_STALL);
       }
    }
-
-   /* Mark that the current program cache BO has been used by the GPU.
-    * It will be reallocated if we need to put new programs in for the
-    * next batch.
-    */
-   brw->cache.bo_used_by_gpu = true;
 }
 
 static void
-- 
2.11.0


