[Mesa-dev] [PATCH 7/7] i965: Use persistent CPU mappings for the program cache even on non-LLC.
Eduardo Lima Mitev
elima at igalia.com
Tue Jan 17 09:40:26 UTC 2017
Patch 5-7 look good, but I prefer that more experienced eyes take a look
too.
Acked-by: Eduardo Lima Mitev <elima at igalia.com>
On 01/17/2017 08:14 AM, Kenneth Graunke wrote:
> The non-LLC story was a horror show. We uploaded data via pwrite
> (drm_intel_bo_subdata), which would stall if the cache BO was in
> use (being read) by the GPU. Obviously, we wanted to avoid that.
> So, we tried to detect whether the buffer was busy, and if so, we'd
> allocate a new BO, map the old one read-only (hopefully not stalling),
> copy all shaders compiled since the dawn of time to the new buffer,
> upload our new one, toss the old BO, and let the state upload code
> know that our program cache BO changed. This was a lot of extra data
> copying, and flagging BRW_NEW_PROGRAM_CACHE would also cause a new
> STATE_BASE_ADDRESS to be emitted, stalling the entire pipeline.
>
> Not only that, but our rudimentary busy tracking consisted of a flag
> set at execbuf time, and not cleared until we threw out the program
> cache BO. So, the first shader upload after any drawing would hit this
> "abandon the cache and start over" copying path.
>
> None of this is necessary - it's just ancient crufty code. We can
> use the same persistent mapping paths on all platforms. On non-LLC
> platforms, we simply need to clflush after memcpy'ing in new shaders,
> so they're visible to the GPU. This is not only better, but the code
> is also significantly simpler.
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 1 -
> src/mesa/drivers/dri/i965/brw_program_cache.c | 58 ++++++++-------------------
> src/mesa/drivers/dri/i965/intel_batchbuffer.c | 6 ---
> 3 files changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index b032d511a1e..70f240383b1 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -461,7 +461,6 @@ struct brw_cache {
> GLuint size, n_items;
>
> uint32_t next_offset;
> - bool bo_used_by_gpu;
> };
>
>
> diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
> index 0c1da172588..b76ff12e9fe 100644
> --- a/src/mesa/drivers/dri/i965/brw_program_cache.c
> +++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
> @@ -207,32 +207,31 @@ brw_search_cache(struct brw_cache *cache,
> }
>
> static void
> +memcpy_coherent(bool has_llc, void *dest, const void *src, size_t n)
> +{
> + memcpy(dest, src, n);
> + if (!has_llc)
> + brw_clflush_range(dest, n);
> +}
> +
> +static void
> brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
> {
> struct brw_context *brw = cache->brw;
> drm_intel_bo *new_bo;
>
> new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
> - if (brw->has_llc)
> - drm_intel_bo_map(new_bo, true);
> + drm_intel_bo_map(new_bo, true);
>
> /* Copy any existing data that needs to be saved. */
> if (cache->next_offset != 0) {
> - if (brw->has_llc) {
> - memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
> - } else {
> - drm_intel_bo_map(cache->bo, false);
> - drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
> - cache->bo->virtual);
> - drm_intel_bo_unmap(cache->bo);
> - }
> + memcpy_coherent(brw->has_llc, new_bo->virtual, cache->bo->virtual,
> + cache->next_offset);
> }
>
> - if (brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> + drm_intel_bo_unmap(cache->bo);
> drm_intel_bo_unreference(cache->bo);
> cache->bo = new_bo;
> - cache->bo_used_by_gpu = false;
>
> /* Since we have a new BO in place, we need to signal the units
> * that depend on it (state base address on gen5+, or unit state before).
> @@ -249,7 +248,6 @@ brw_lookup_prog(const struct brw_cache *cache,
> enum brw_cache_id cache_id,
> const void *data, unsigned data_size)
> {
> - const struct brw_context *brw = cache->brw;
> unsigned i;
> const struct brw_cache_item *item;
>
> @@ -260,11 +258,7 @@ brw_lookup_prog(const struct brw_cache *cache,
> if (item->cache_id != cache_id || item->size != data_size)
> continue;
>
> - if (!brw->has_llc)
> - drm_intel_bo_map(cache->bo, false);
> ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
> - if (!brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> if (ret)
> continue;
>
> @@ -279,7 +273,6 @@ static uint32_t
> brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
> {
> uint32_t offset;
> - struct brw_context *brw = cache->brw;
>
> /* Allocate space in the cache BO for our new program. */
> if (cache->next_offset + size > cache->bo->size) {
> @@ -291,14 +284,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
> brw_cache_new_bo(cache, new_size);
> }
>
> - /* If we would block on writing to an in-use program BO, just
> - * recreate it.
> - */
> - if (!brw->has_llc && cache->bo_used_by_gpu) {
> - perf_debug("Copying busy program cache buffer.\n");
> - brw_cache_new_bo(cache, cache->bo->size);
> - }
> -
> offset = cache->next_offset;
>
> /* Programs are always 64-byte aligned, so set up the next one now */
> @@ -363,11 +348,8 @@ brw_upload_cache(struct brw_cache *cache,
> item->offset = brw_alloc_item_data(cache, data_size);
>
> /* Copy data to the buffer */
> - if (brw->has_llc) {
> - memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
> - } else {
> - drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
> - }
> + memcpy_coherent(brw->has_llc, cache->bo->virtual + item->offset,
> + data, data_size);
> }
>
> /* Set up the memory containing the key and aux_data */
> @@ -404,8 +386,7 @@ brw_init_caches(struct brw_context *brw)
> calloc(cache->size, sizeof(struct brw_cache_item *));
>
> cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
> - if (brw->has_llc)
> - drm_intel_bo_map(cache->bo, true);
> + drm_intel_bo_map(cache->bo, true);
> }
>
> static void
> @@ -482,8 +463,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
>
> DBG("%s\n", __func__);
>
> - if (brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> + drm_intel_bo_unmap(cache->bo);
> drm_intel_bo_unreference(cache->bo);
> cache->bo = NULL;
> brw_clear_cache(brw, cache);
> @@ -532,9 +512,6 @@ brw_print_program_cache(struct brw_context *brw)
> const struct brw_cache *cache = &brw->cache;
> struct brw_cache_item *item;
>
> - if (!brw->has_llc)
> - drm_intel_bo_map(cache->bo, false);
> -
> for (unsigned i = 0; i < cache->size; i++) {
> for (item = cache->items[i]; item; item = item->next) {
> fprintf(stderr, "%s:\n", cache_name(i));
> @@ -542,7 +519,4 @@ brw_print_program_cache(struct brw_context *brw)
> item->offset, item->size, stderr);
> }
> }
> -
> - if (!brw->has_llc)
> - drm_intel_bo_unmap(cache->bo);
> }
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index d1b9317a8c1..29ff49d06f1 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -265,12 +265,6 @@ brw_finish_batch(struct brw_context *brw)
> PIPE_CONTROL_CS_STALL);
> }
> }
> -
> - /* Mark that the current program cache BO has been used by the GPU.
> - * It will be reallocated if we need to put new programs in for the
> - * next batch.
> - */
> - brw->cache.bo_used_by_gpu = true;
> }
>
> static void
>
More information about the mesa-dev
mailing list