[Mesa-dev] [PATCH 7/7] i965: Drop non-LLC lunacy in the program cache code.

Kenneth Graunke kenneth at whitecape.org
Fri Jul 21 23:17:47 UTC 2017


The non-LLC story was a horror show.  We uploaded data via pwrite
(drm_intel_bo_subdata), which would stall if the cache BO was in
use (being read) by the GPU.  Obviously, we wanted to avoid that.
So, we tried to detect whether the buffer was busy, and if so, we'd
allocate a new BO, map the old one read-only (hopefully not stalling),
copy all shaders compiled since the dawn of time to the new buffer,
upload our new one, toss the old BO, and let the state upload code
know that our program cache BO changed.  This was a lot of extra data
copying, and flagging BRW_NEW_PROGRAM_CACHE would also cause a new
STATE_BASE_ADDRESS to be emitted, stalling the entire pipeline.

Not only that, but our rudimentary busy tracking consistented of a flag
set at execbuf time, and not cleared until we threw out the program
cache BO.  So, the first shader upload after any drawing would hit this
"abandon the cache and start over" copying path.

This is largely unnecessary - it's just ancient and crufty code.  We can
use the same persistent mapping paths on all platforms.  On non-ancient
kernels, this will use a write combining map, which should be reasonably
fast.

One aspect that is worse: we do occasionally grow the program cache BO,
and copy the old contents to the newer BO.  This will suffer from UC
readback performance now.  To mitigate this, we use the MOVNTDQA based
streaming memcpy on platforms with SSE 4.1 (all Gen7+ atoms).  Gen4-5
are unfortunately going to be penalized.

v2: Add MOVNTDQA path, rebase on other map flag changes.
---
 src/mesa/drivers/dri/i965/brw_program_cache.c | 83 +++++++--------------------
 1 file changed, 21 insertions(+), 62 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
index 04682bef34c..8d83af71d3a 100644
--- a/src/mesa/drivers/dri/i965/brw_program_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
@@ -45,6 +45,8 @@
  */
 
 #include "main/imports.h"
+#include "main/streaming-load-memcpy.h"
+#include "x86/common_x86_asm.h"
 #include "intel_batchbuffer.h"
 #include "brw_state.h"
 #include "brw_wm.h"
@@ -214,32 +216,28 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
 {
    struct brw_context *brw = cache->brw;
    struct brw_bo *new_bo;
-   void *llc_map;
 
    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
    if (can_do_exec_capture(brw->screen))
       new_bo->kflags = EXEC_OBJECT_CAPTURE;
-   if (brw->has_llc) {
-      llc_map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
-                                        MAP_ASYNC | MAP_PERSISTENT);
-   }
+
+   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
+                                       MAP_ASYNC | MAP_PERSISTENT);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
-      if (brw->has_llc) {
-         memcpy(llc_map, cache->map, cache->next_offset);
-      } else {
-         void *map = brw_bo_map(brw, cache->bo, MAP_READ);
-         brw_bo_subdata(new_bo, 0, cache->next_offset, map);
-         brw_bo_unmap(cache->bo);
-      }
+#ifdef USE_SSE41
+      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
+         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
+      else
+#endif
+         memcpy(map, cache->map, cache->next_offset);
    }
 
-   if (brw->has_llc)
-      brw_bo_unmap(cache->bo);
+   brw_bo_unmap(cache->bo);
    brw_bo_unreference(cache->bo);
    cache->bo = new_bo;
-   cache->map = brw->has_llc ? llc_map : NULL;
+   cache->map = map;
    cache->bo_used_by_gpu = false;
 
    /* Since we have a new BO in place, we need to signal the units
@@ -257,27 +255,13 @@ brw_lookup_prog(const struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
-         int ret;
-
-         if (item->cache_id != cache_id || item->size != data_size)
-            continue;
-
-         void *map;
-         if (!brw->has_llc)
-            map = brw_bo_map(brw, cache->bo, MAP_READ);
-         else
-            map = cache->map;
-
-         ret = memcmp(map + item->offset, data, item->size);
-         if (!brw->has_llc)
-            brw_bo_unmap(cache->bo);
-         if (ret)
+         if (item->cache_id != cache_id || item->size != data_size ||
+             memcmp(cache->map + item->offset, data, item->size) != 0)
             continue;
 
          return item;
@@ -291,7 +275,6 @@ static uint32_t
 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
    uint32_t offset;
-   struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
    if (cache->next_offset + size > cache->bo->size) {
@@ -303,14 +286,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
       brw_cache_new_bo(cache, new_size);
    }
 
-   /* If we would block on writing to an in-use program BO, just
-    * recreate it.
-    */
-   if (!brw->has_llc && cache->bo_used_by_gpu) {
-      perf_debug("Copying busy program cache buffer.\n");
-      brw_cache_new_bo(cache, cache->bo->size);
-   }
-
    offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
@@ -348,7 +323,6 @@ brw_upload_cache(struct brw_cache *cache,
                  uint32_t *out_offset,
                  void *out_aux)
 {
-   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    const struct brw_cache_item *matching_data =
       brw_lookup_prog(cache, cache_id, data, data_size);
@@ -375,11 +349,7 @@ brw_upload_cache(struct brw_cache *cache,
       item->offset = brw_alloc_item_data(cache, data_size);
 
       /* Copy data to the buffer */
-      if (brw->has_llc) {
-         memcpy(cache->map + item->offset, data, data_size);
-      } else {
-         brw_bo_subdata(cache->bo, item->offset, data_size, data);
-      }
+      memcpy(cache->map + item->offset, data, data_size);
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -418,10 +388,9 @@ brw_init_caches(struct brw_context *brw)
    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache",  4096, 64);
    if (can_do_exec_capture(brw->screen))
       cache->bo->kflags = EXEC_OBJECT_CAPTURE;
-   if (brw->has_llc) {
-      cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
-                                              MAP_ASYNC | MAP_PERSISTENT);
-   }
+
+   cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
+                                           MAP_ASYNC | MAP_PERSISTENT);
 }
 
 static void
@@ -500,8 +469,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    /* This can be NULL if context creation failed early on */
    if (cache->bo) {
-      if (brw->has_llc)
-         brw_bo_unmap(cache->bo);
+      brw_bo_unmap(cache->bo);
       brw_bo_unreference(cache->bo);
       cache->bo = NULL;
       cache->map = NULL;
@@ -551,21 +519,12 @@ brw_print_program_cache(struct brw_context *brw)
 {
    const struct brw_cache *cache = &brw->cache;
    struct brw_cache_item *item;
-   void *map;
-
-   if (!brw->has_llc)
-      map = brw_bo_map(brw, cache->bo, MAP_READ);
-   else
-      map = cache->map;
 
    for (unsigned i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
          fprintf(stderr, "%s:\n", cache_name(i));
-         brw_disassemble(&brw->screen->devinfo, map,
+         brw_disassemble(&brw->screen->devinfo, cache->map,
                          item->offset, item->size, stderr);
       }
    }
-
-   if (!brw->has_llc)
-      brw_bo_unmap(cache->bo);
 }
-- 
2.13.3



More information about the mesa-dev mailing list