Mesa (master): i965: Drop non-LLC lunacy in the program cache code.

Sun Jul 23 02:35:24 UTC 2017

Module: Mesa
Branch: master
Commit: 0044de931f9a82bcf33a4c12851709788361d990
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=0044de931f9a82bcf33a4c12851709788361d990

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Tue Jul 11 14:19:46 2017 -0700

i965: Drop non-LLC lunacy in the program cache code.

The non-LLC story was a horror show.  We uploaded data via pwrite
(drm_intel_bo_subdata), which would stall if the cache BO was in
use (being read) by the GPU.  Obviously, we wanted to avoid that.
So, we tried to detect whether the buffer was busy, and if so, we'd
allocate a new BO, map the old one read-only (hopefully not stalling),
copy all shaders compiled since the dawn of time to the new buffer,
upload our new one, toss the old BO, and let the state upload code
know that our program cache BO changed.  This was a lot of extra data
copying, and flagging BRW_NEW_PROGRAM_CACHE would also cause a new
STATE_BASE_ADDRESS to be emitted, stalling the entire pipeline.

Not only that, but our rudimentary busy tracking consistented of a flag
set at execbuf time, and not cleared until we threw out the program
cache BO.  So, the first shader upload after any drawing would hit this
"abandon the cache and start over" copying path.

This is largely unnecessary - it's just ancient and crufty code.  We can
use the same persistent mapping paths on all platforms.  On non-ancient
kernels, this will use a write combining map, which should be reasonably
fast.

One aspect that is worse: we do occasionally grow the program cache BO,
and copy the old contents to the newer BO.  This will suffer from UC
readback performance now.  To mitigate this, we use the MOVNTDQA based
streaming memcpy on platforms with SSE 4.1 (all Gen7+ atoms).  Gen4-5
are unfortunately going to be penalized.

v2: Add MOVNTDQA path, rebase on other map flag changes.
v3: Drop cache->bo_used_by_gpu too (caught by Chris Wilson).

Reviewed-by: Matt Turner <mattst88 at gmail.com>

---

 src/mesa/drivers/dri/i965/brw_context.h       |  1 -
 src/mesa/drivers/dri/i965/brw_program_cache.c | 84 +++++++--------------------
 src/mesa/drivers/dri/i965/intel_batchbuffer.c |  6 --
 3 files changed, 21 insertions(+), 70 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2acebaa820..a4a04daaa9 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -373,7 +373,6 @@ struct brw_cache {
    GLuint size, n_items;
 
    uint32_t next_offset;
-   bool bo_used_by_gpu;
 };
 
 /* Considered adding a member to this struct to document which flags
diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
index 04682bef34..4dcfd5234d 100644
--- a/src/mesa/drivers/dri/i965/brw_program_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
@@ -45,6 +45,8 @@
  */
 
 #include "main/imports.h"
+#include "main/streaming-load-memcpy.h"
+#include "x86/common_x86_asm.h"
 #include "intel_batchbuffer.h"
 #include "brw_state.h"
 #include "brw_wm.h"
@@ -214,33 +216,28 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
 {
    struct brw_context *brw = cache->brw;
    struct brw_bo *new_bo;
-   void *llc_map;
 
    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
    if (can_do_exec_capture(brw->screen))
       new_bo->kflags = EXEC_OBJECT_CAPTURE;
-   if (brw->has_llc) {
-      llc_map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
-                                        MAP_ASYNC | MAP_PERSISTENT);
-   }
+
+   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
+                                       MAP_ASYNC | MAP_PERSISTENT);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
-      if (brw->has_llc) {
-         memcpy(llc_map, cache->map, cache->next_offset);
-      } else {
-         void *map = brw_bo_map(brw, cache->bo, MAP_READ);
-         brw_bo_subdata(new_bo, 0, cache->next_offset, map);
-         brw_bo_unmap(cache->bo);
-      }
+#ifdef USE_SSE41
+      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
+         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
+      else
+#endif
+         memcpy(map, cache->map, cache->next_offset);
    }
 
-   if (brw->has_llc)
-      brw_bo_unmap(cache->bo);
+   brw_bo_unmap(cache->bo);
    brw_bo_unreference(cache->bo);
    cache->bo = new_bo;
-   cache->map = brw->has_llc ? llc_map : NULL;
-   cache->bo_used_by_gpu = false;
+   cache->map = map;
 
    /* Since we have a new BO in place, we need to signal the units
     * that depend on it (state base address on gen5+, or unit state before).
@@ -257,27 +254,13 @@ brw_lookup_prog(const struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
-         int ret;
-
-         if (item->cache_id != cache_id || item->size != data_size)
-            continue;
-
-         void *map;
-         if (!brw->has_llc)
-            map = brw_bo_map(brw, cache->bo, MAP_READ);
-         else
-            map = cache->map;
-
-         ret = memcmp(map + item->offset, data, item->size);
-         if (!brw->has_llc)
-            brw_bo_unmap(cache->bo);
-         if (ret)
+         if (item->cache_id != cache_id || item->size != data_size ||
+             memcmp(cache->map + item->offset, data, item->size) != 0)
             continue;
 
          return item;
@@ -291,7 +274,6 @@ static uint32_t
 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
    uint32_t offset;
-   struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
    if (cache->next_offset + size > cache->bo->size) {
@@ -303,14 +285,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
       brw_cache_new_bo(cache, new_size);
    }
 
-   /* If we would block on writing to an in-use program BO, just
-    * recreate it.
-    */
-   if (!brw->has_llc && cache->bo_used_by_gpu) {
-      perf_debug("Copying busy program cache buffer.\n");
-      brw_cache_new_bo(cache, cache->bo->size);
-   }
-
    offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
@@ -348,7 +322,6 @@ brw_upload_cache(struct brw_cache *cache,
                  uint32_t *out_offset,
                  void *out_aux)
 {
-   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    const struct brw_cache_item *matching_data =
       brw_lookup_prog(cache, cache_id, data, data_size);
@@ -375,11 +348,7 @@ brw_upload_cache(struct brw_cache *cache,
       item->offset = brw_alloc_item_data(cache, data_size);
 
       /* Copy data to the buffer */
-      if (brw->has_llc) {
-         memcpy(cache->map + item->offset, data, data_size);
-      } else {
-         brw_bo_subdata(cache->bo, item->offset, data_size, data);
-      }
+      memcpy(cache->map + item->offset, data, data_size);
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -418,10 +387,9 @@ brw_init_caches(struct brw_context *brw)
    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache",  4096, 64);
    if (can_do_exec_capture(brw->screen))
       cache->bo->kflags = EXEC_OBJECT_CAPTURE;
-   if (brw->has_llc) {
-      cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
-                                              MAP_ASYNC | MAP_PERSISTENT);
-   }
+
+   cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
+                                           MAP_ASYNC | MAP_PERSISTENT);
 }
 
 static void
@@ -500,8 +468,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    /* This can be NULL if context creation failed early on */
    if (cache->bo) {
-      if (brw->has_llc)
-         brw_bo_unmap(cache->bo);
+      brw_bo_unmap(cache->bo);
       brw_bo_unreference(cache->bo);
       cache->bo = NULL;
       cache->map = NULL;
@@ -551,21 +518,12 @@ brw_print_program_cache(struct brw_context *brw)
 {
    const struct brw_cache *cache = &brw->cache;
    struct brw_cache_item *item;
-   void *map;
-
-   if (!brw->has_llc)
-      map = brw_bo_map(brw, cache->bo, MAP_READ);
-   else
-      map = cache->map;
 
    for (unsigned i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
          fprintf(stderr, "%s:\n", cache_name(i));
-         brw_disassemble(&brw->screen->devinfo, map,
+         brw_disassemble(&brw->screen->devinfo, cache->map,
                          item->offset, item->size, stderr);
       }
    }
-
-   if (!brw->has_llc)
-      brw_bo_unmap(cache->bo);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 4461a59b80..e2f208a3d1 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -465,12 +465,6 @@ brw_finish_batch(struct brw_context *brw)
                                           PIPE_CONTROL_CS_STALL);
       }
    }
-
-   /* Mark that the current program cache BO has been used by the GPU.
-    * It will be reallocated if we need to put new programs in for the
-    * next batch.
-    */
-   brw->cache.bo_used_by_gpu = true;
 }
 
 static void