[Mesa-dev] [PATCH 7/7] i965: Use persistent CPU mappings for the program cache even on non-LLC.
Eduardo Lima Mitev
elima at igalia.com
Tue Jan 17 09:40:26 UTC 2017
Patch 5-7 look good, but I prefer that more experienced eyes take a look
too.
Acked-by: Eduardo Lima Mitev <elima at igalia.com>
On 01/17/2017 08:14 AM, Kenneth Graunke wrote:
> The non-LLC story was a horror show. We uploaded data via pwrite
> (drm_intel_bo_subdata), which would stall if the cache BO was in
> use (being read) by the GPU. Obviously, we wanted to avoid that.
> So, we tried to detect whether the buffer was busy, and if so, we'd
> allocate a new BO, map the old one read-only (hopefully not stalling),
> copy all shaders compiled since the dawn of time to the new buffer,
> upload our new one, toss the old BO, and let the state upload code
> know that our program cache BO changed. This was a lot of extra data
> copying, and flagging BRW_NEW_PROGRAM_CACHE would also cause a new
> STATE_BASE_ADDRESS to be emitted, stalling the entire pipeline.
>
> Not only that, but our rudimentary busy tracking consisted of a flag
> set at execbuf time, and not cleared until we threw out the program
> cache BO. So, the first shader upload after any drawing would hit this
> "abandon the cache and start over" copying path.
>
> None of this is necessary - it's just ancient crufty code. We can
> use the same persistent mapping paths on all platforms. On non-LLC
> platforms, we simply need to clflush after memcpy'ing in new shaders,
> so they're visible to the GPU. This is not only better, but the code
> is also significantly simpler.
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 1 -
> src/mesa/drivers/dri/i965/brw_program_cache.c | 58 ++++++++-------------------
> src/mesa/drivers/dri/i965/intel_batchbuffer.c | 6 ---
> 3 files changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index b032d511a1e..70f240383b1 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -461,7 +461,6 @@ struct brw_cache {
> GLuint size, n_items;
>
> uint32_t next_offset;
> - bool bo_used_by_gpu;
> };
>
>
> diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
> index 0c1da172588..b76ff12e9fe 100644
> --- a/src/mesa/drivers/dri/i965/brw_program_cache.c
> +++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
> @@ -207,32 +207,31 @@ brw_search_cache(struct brw_cache *cache,
> }
>
> static void
> +memcpy_coherent(bool has_llc, void *dest, const void *src, size_t n)
> +{
> + memcpy(dest, src, n);
> + if (!has_llc)
> + brw_clflush_range(dest, n);
> +}
> +
> +static void
> brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
> {
> struct brw_context *brw = cache->brw;
> drm_intel_bo *new_bo;
>
> new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
> - if (brw->has_llc)
> - drm_intel_bo_map(new_bo, true);
> + drm_intel_bo_map(new_bo, true);
>
> /* Copy any existing data that needs to be saved. */
> if (cache->next_offset != 0) {
> - if (brw->has_llc) {
> - memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
> - } else {
> - drm_intel_bo_map(cache->bo, false);
> - drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
> - cache->bo->virtual);
> - drm_intel_bo_unmap(cache->bo);
> - }
> + memcpy_coherent(brw->has_llc, new_bo->virtual, cache->bo->virtual,
> + cache->next_offset);
> }
>
> - if (brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> + drm_intel_bo_unmap(cache->bo);
> drm_intel_bo_unreference(cache->bo);
> cache->bo = new_bo;
> - cache->bo_used_by_gpu = false;
>
> /* Since we have a new BO in place, we need to signal the units
> * that depend on it (state base address on gen5+, or unit state before).
> @@ -249,7 +248,6 @@ brw_lookup_prog(const struct brw_cache *cache,
> enum brw_cache_id cache_id,
> const void *data, unsigned data_size)
> {
> - const struct brw_context *brw = cache->brw;
> unsigned i;
> const struct brw_cache_item *item;
>
> @@ -260,11 +258,7 @@ brw_lookup_prog(const struct brw_cache *cache,
> if (item->cache_id != cache_id || item->size != data_size)
> continue;
>
> - if (!brw->has_llc)
> - drm_intel_bo_map(cache->bo, false);
> ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
> - if (!brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> if (ret)
> continue;
>
> @@ -279,7 +273,6 @@ static uint32_t
> brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
> {
> uint32_t offset;
> - struct brw_context *brw = cache->brw;
>
> /* Allocate space in the cache BO for our new program. */
> if (cache->next_offset + size > cache->bo->size) {
> @@ -291,14 +284,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
> brw_cache_new_bo(cache, new_size);
> }
>
> - /* If we would block on writing to an in-use program BO, just
> - * recreate it.
> - */
> - if (!brw->has_llc && cache->bo_used_by_gpu) {
> - perf_debug("Copying busy program cache buffer.\n");
> - brw_cache_new_bo(cache, cache->bo->size);
> - }
> -
> offset = cache->next_offset;
>
> /* Programs are always 64-byte aligned, so set up the next one now */
> @@ -363,11 +348,8 @@ brw_upload_cache(struct brw_cache *cache,
> item->offset = brw_alloc_item_data(cache, data_size);
>
> /* Copy data to the buffer */
> - if (brw->has_llc) {
> - memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
> - } else {
> - drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
> - }
> + memcpy_coherent(brw->has_llc, cache->bo->virtual + item->offset,
> + data, data_size);
> }
>
> /* Set up the memory containing the key and aux_data */
> @@ -404,8 +386,7 @@ brw_init_caches(struct brw_context *brw)
> calloc(cache->size, sizeof(struct brw_cache_item *));
>
> cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
> - if (brw->has_llc)
> - drm_intel_bo_map(cache->bo, true);
> + drm_intel_bo_map(cache->bo, true);
> }
>
> static void
> @@ -482,8 +463,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
>
> DBG("%s\n", __func__);
>
> - if (brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> + drm_intel_bo_unmap(cache->bo);
> drm_intel_bo_unreference(cache->bo);
> cache->bo = NULL;
> brw_clear_cache(brw, cache);
> @@ -532,9 +512,6 @@ brw_print_program_cache(struct brw_context *brw)
> const struct brw_cache *cache = &brw->cache;
> struct brw_cache_item *item;
>
> - if (!brw->has_llc)
> - drm_intel_bo_map(cache->bo, false);
> -
> for (unsigned i = 0; i < cache->size; i++) {
> for (item = cache->items[i]; item; item = item->next) {
> fprintf(stderr, "%s:\n", cache_name(i));
> @@ -542,7 +519,4 @@ brw_print_program_cache(struct brw_context *brw)
> item->offset, item->size, stderr);
> }
> }
> -
> - if (!brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> }
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index d1b9317a8c1..29ff49d06f1 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -265,12 +265,6 @@ brw_finish_batch(struct brw_context *brw)
> PIPE_CONTROL_CS_STALL);
> }
> }
> -
> - /* Mark that the current program cache BO has been used by the GPU.
> - * It will be reallocated if we need to put new programs in for the
> - * next batch.
> - */
> - brw->cache.bo_used_by_gpu = true;
> }
>
> static void
>
More information about the mesa-dev
mailing list