[PATCH 14/14] drm/i915/gem: Make relocations atomic within execbuf
Chris Wilson
chris@chris-wilson.co.uk
Sun May 24 21:00:28 UTC 2020
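Now that the relocations are no longer serialised by struct_mutex, a
concurrent execbuf may observe the batch while its relocations are
still being written. Perform the GPU relocations while holding the
reservation locks of every object being relocated (lock_relocs),
install a proxy fence as the exclusive fence on each of those objects,
and only resolve that proxy to the final execbuf request once it has
been constructed (eb_reloc_signal). The batch parser and the execbuf
request both wait upon the relocation request, so the relocation and
the execution of the batch appear atomic to other users of the
objects.
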
Testcase: igt/gem_exec_reloc/basic-concurrent
Fixes: ef398881d27d ("drm/i915/gem: Limit struct_mutex to eb_reserve")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
.../gpu/drm/i915/gem/i915_gem_execbuffer.c | 202 ++++++++++++------
.../i915/gem/selftests/i915_gem_execbuffer.c | 35 +--
2 files changed, 154 insertions(+), 83 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index b7a827966ce9..4a466019e2fb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -5,6 +5,7 @@
*/
#include <linux/intel-iommu.h>
+#include <linux/dma-fence-proxy.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>
@@ -259,7 +260,8 @@ struct i915_execbuffer {
bool has_fence : 1;
bool needs_unfenced : 1;
- struct i915_vma *target;
+ struct dma_fence *fence;
+
struct i915_request *rq;
struct i915_vma *rq_vma;
u32 *rq_cmd;
@@ -556,16 +558,6 @@ eb_add_vma(struct i915_execbuffer *eb,
ev->exec = entry;
ev->flags = entry->flags;
- if (eb->lut_size > 0) {
- ev->handle = entry->handle;
- hlist_add_head(&ev->node,
- &eb->buckets[hash_32(entry->handle,
- eb->lut_size)]);
- }
-
- if (entry->relocation_count)
- list_add_tail(&ev->reloc_link, &eb->relocs);
-
/*
* SNA is doing fancy tricks with compressing batch buffers, which leads
* to negative relocation deltas. Usually that works out ok since the
@@ -582,9 +574,21 @@ eb_add_vma(struct i915_execbuffer *eb,
if (eb->reloc_cache.has_fence)
ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
+ INIT_LIST_HEAD(&ev->reloc_link);
+
eb->batch = ev;
}
+ if (entry->relocation_count)
+ list_add_tail(&ev->reloc_link, &eb->relocs);
+
+ if (eb->lut_size > 0) {
+ ev->handle = entry->handle;
+ hlist_add_head(&ev->node,
+ &eb->buckets[hash_32(entry->handle,
+ eb->lut_size)]);
+ }
+
if (eb_pin_vma(eb, entry, ev)) {
if (entry->offset != vma->node.start) {
entry->offset = vma->node.start | UPDATE;
@@ -924,7 +928,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
cache->has_fence = cache->gen < 4;
cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
cache->node.flags = 0;
- cache->target = NULL;
+ cache->fence = NULL;
}
static inline void *unmask_page(unsigned long p)
@@ -1023,13 +1027,9 @@ static unsigned int reloc_bb_flags(const struct reloc_cache *cache)
static int reloc_gpu_flush(struct reloc_cache *cache)
{
- struct i915_request *rq;
+ struct i915_request *rq = cache->rq;
int err;
- rq = fetch_and_zero(&cache->rq);
- if (!rq)
- return 0;
-
if (cache->rq_vma) {
struct drm_i915_gem_object *obj = cache->rq_vma->obj;
@@ -1053,31 +1053,12 @@ static int reloc_gpu_flush(struct reloc_cache *cache)
i915_request_set_error_once(rq, err);
intel_gt_chipset_flush(rq->engine->gt);
+ i915_request_get(rq);
i915_request_add(rq);
return err;
}
-static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
-{
- struct drm_i915_gem_object *obj = vma->obj;
- int err;
-
- i915_vma_lock(vma);
-
- if (obj->cache_dirty & ~obj->cache_coherent)
- i915_gem_clflush_object(obj, 0);
- obj->write_domain = 0;
-
- err = i915_request_await_object(rq, vma->obj, true);
- if (err == 0)
- err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
-
- i915_vma_unlock(vma);
-
- return err;
-}
-
static int
__reloc_gpu_alloc(struct i915_execbuffer *eb, struct intel_engine_cs *engine)
{
@@ -1177,16 +1158,6 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
u32 *cmd;
int err;
- if (vma != cache->target) {
- err = reloc_move_to_gpu(cache->rq, vma);
- if (unlikely(err)) {
- i915_request_set_error_once(cache->rq, err);
- return ERR_PTR(err);
- }
-
- cache->target = vma;
- }
-
if (unlikely(cache->rq_size + len >
PAGE_SIZE / sizeof(u32) - RELOC_TAIL)) {
err = reloc_gpu_chain(cache);
@@ -1385,16 +1356,6 @@ eb_relocate_entry(struct i915_execbuffer *eb,
return -EINVAL;
}
- /*
- * If we write into the object, we need to force the synchronisation
- * barrier, either with an asynchronous clflush or if we executed the
- * patching using the GPU (though that should be serialised by the
- * timeline). To be completely sure, and since we are required to
- * do relocations we are already stalling, disable the user's opt
- * out of our synchronisation.
- */
- ev->flags &= ~EXEC_OBJECT_ASYNC;
-
/* and update the user's relocation entry */
return relocate_entry(eb, ev->vma, reloc, target->vma);
}
@@ -1477,6 +1438,76 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
return 0;
}
+static int reloc_move_to_gpu(struct reloc_cache *cache, struct eb_vma *ev)
+{
+ struct i915_request *rq = cache->rq;
+ struct i915_vma *vma = ev->vma;
+ struct drm_i915_gem_object *obj = vma->obj;
+ int err;
+
+ if (obj->cache_dirty & ~obj->cache_coherent)
+ i915_gem_clflush_object(obj, 0);
+
+ obj->write_domain = I915_GEM_DOMAIN_RENDER;
+ obj->read_domains = I915_GEM_DOMAIN_RENDER;
+
+ err = i915_request_await_object(rq, obj, true);
+ if (err == 0) {
+ dma_resv_add_excl_fence(vma->resv, cache->fence);
+ ev->flags |= EXEC_OBJECT_ASYNC;
+
+ err = __i915_vma_move_to_active(vma, rq);
+ }
+
+ return err;
+}
+
+static int
+lock_relocs(struct i915_execbuffer *eb)
+{
+ struct ww_acquire_ctx acquire;
+ struct eb_vma *ev;
+ int err = 0;
+
+ ww_acquire_init(&acquire, &reservation_ww_class);
+
+ list_for_each_entry(ev, &eb->relocs, reloc_link) {
+ struct i915_vma *vma = ev->vma;
+
+ err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire);
+ if (err == -EDEADLK) {
+ struct eb_vma *unlock = ev, *en;
+
+ list_for_each_entry_safe_continue_reverse(unlock, en,
+ &eb->relocs,
+ reloc_link) {
+ ww_mutex_unlock(&unlock->vma->resv->lock);
+ list_move_tail(&unlock->reloc_link,
+ &eb->relocs);
+ }
+
+ GEM_BUG_ON(!list_is_first(&ev->reloc_link,
+ &eb->relocs));
+ err = ww_mutex_lock_slow_interruptible(&vma->resv->lock,
+ &acquire);
+ }
+ if (err)
+ break;
+ }
+
+ ww_acquire_done(&acquire);
+
+ list_for_each_entry_continue_reverse(ev, &eb->relocs, reloc_link) {
+ if (err == 0)
+ err = reloc_move_to_gpu(&eb->reloc_cache, ev);
+ ww_mutex_unlock(&ev->vma->resv->lock);
+ }
+
+ ww_acquire_fini(&acquire);
+
+ return err;
+}
+
static bool reloc_can_use_engine(const struct intel_engine_cs *engine)
{
return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
@@ -1495,7 +1526,7 @@ static int reloc_gpu_alloc(struct i915_execbuffer *eb)
return __reloc_gpu_alloc(eb, engine);
}
-static int eb_relocate(struct i915_execbuffer *eb)
+static int eb_reloc(struct i915_execbuffer *eb)
{
int err;
@@ -1514,15 +1545,24 @@ static int eb_relocate(struct i915_execbuffer *eb)
struct eb_vma *ev;
int flush;
+ eb->reloc_cache.fence = __dma_fence_create_proxy(0, 0);
+ if (!eb->reloc_cache.fence)
+ return -ENOMEM;
+
err = reloc_gpu_alloc(eb);
- if (err)
+ if (err) {
+ dma_fence_put(eb->reloc_cache.fence);
+ eb->reloc_cache.fence = NULL;
return err;
+ }
+ GEM_BUG_ON(!eb->reloc_cache.rq);
+ err = lock_relocs(eb);
list_for_each_entry(ev, &eb->relocs, reloc_link) {
- err = eb_relocate_vma(eb, ev);
- if (err)
- break;
+ if (err == 0)
+ err = eb_relocate_vma(eb, ev);
}
+ GEM_BUG_ON(dma_fence_is_signaled(eb->reloc_cache.fence));
flush = reloc_gpu_flush(&eb->reloc_cache);
if (!err)
@@ -1532,6 +1572,15 @@ static int eb_relocate(struct i915_execbuffer *eb)
return err;
}
+static void eb_reloc_signal(struct i915_execbuffer *eb, struct i915_request *rq)
+{
+ dma_fence_proxy_set_real(eb->reloc_cache.fence, &rq->fence);
+ i915_request_put(eb->reloc_cache.rq);
+
+ dma_fence_put(eb->reloc_cache.fence);
+ eb->reloc_cache.fence = NULL;
+}
+
static int eb_move_to_gpu(struct i915_execbuffer *eb)
{
const unsigned int count = eb->buffer_count;
@@ -1775,10 +1824,15 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
if (err)
goto err_batch_unlock;
- /* Wait for all writes (and relocs) into the batch to complete */
- err = i915_sw_fence_await_reservation(&pw->base.chain,
- pw->batch->resv, NULL, false,
- 0, I915_FENCE_GFP);
+ /* Wait for all writes (or relocs) into the batch to complete */
+ if (!eb->reloc_cache.fence || list_empty(&eb->batch->reloc_link))
+ err = i915_sw_fence_await_reservation(&pw->base.chain,
+ pw->batch->resv, NULL,
+ false, 0, I915_FENCE_GFP);
+ else
+ err = i915_sw_fence_await_dma_fence(&pw->base.chain,
+ &eb->reloc_cache.rq->fence,
+ 0, I915_FENCE_GFP);
if (err < 0)
goto err_batch_unlock;
@@ -1903,6 +1957,15 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
{
int err;
+ if (eb->reloc_cache.fence) {
+ err = i915_request_await_dma_fence(eb->request,
+ &eb->reloc_cache.rq->fence);
+ if (err)
+ return err;
+
+ eb_reloc_signal(eb, eb->request);
+ }
+
err = eb_move_to_gpu(eb);
if (err)
return err;
@@ -2427,7 +2490,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
if (unlikely(err))
goto err_context;
- err = eb_relocate(&eb);
+ err = eb_reloc(&eb);
if (err) {
/*
* If the user expects the execobject.offset and
@@ -2460,7 +2523,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
err = eb_parse(&eb);
if (err)
- goto err_vma;
+ goto err_reloc;
/*
* snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
@@ -2561,6 +2624,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
err_parse:
if (batch->private)
intel_gt_buffer_pool_put(batch->private);
+err_reloc:
+ if (eb.reloc_cache.fence)
+ eb_reloc_signal(&eb, eb.reloc_cache.rq);
err_vma:
if (eb.trampoline)
i915_vma_unpin(eb.trampoline);
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
index f0ec23632a78..e0900846de38 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
@@ -23,16 +23,15 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
const u64 mask =
GENMASK_ULL(eb->reloc_cache.use_64bit_reloc ? 63 : 31, 0);
const u32 *map = page_mask_bits(obj->mm.mapping);
- struct i915_request *rq;
- struct i915_vma *vma;
+ struct eb_vma ev;
int err;
int i;
- vma = i915_vma_instance(obj, eb->context->vm, NULL);
- if (IS_ERR(vma))
- return PTR_ERR(vma);
+ ev.vma = i915_vma_instance(obj, eb->context->vm, NULL);
+ if (IS_ERR(ev.vma))
+ return PTR_ERR(ev.vma);
- err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
+ err = i915_vma_pin(ev.vma, 0, 0, PIN_USER | PIN_HIGH);
if (err)
return err;
@@ -40,13 +39,21 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
if (err)
goto unpin_vma;
+ eb->reloc_cache.fence = &eb->reloc_cache.rq->fence;
+
+ i915_vma_lock(ev.vma);
+ err = reloc_move_to_gpu(&eb->reloc_cache, &ev);
+ i915_vma_unlock(ev.vma);
+ if (err)
+ goto unpin_vma;
+
/* 8-Byte aligned */
- err = __reloc_entry_gpu(eb, vma, offsets[0] * sizeof(u32), 0);
+ err = __reloc_entry_gpu(eb, ev.vma, offsets[0] * sizeof(u32), 0);
if (err)
goto unpin_vma;
/* !8-Byte aligned */
- err = __reloc_entry_gpu(eb, vma, offsets[1] * sizeof(u32), 1);
+ err = __reloc_entry_gpu(eb, ev.vma, offsets[1] * sizeof(u32), 1);
if (err)
goto unpin_vma;
@@ -58,16 +65,13 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
eb->reloc_cache.rq_size += i;
/* Force batch chaining */
- err = __reloc_entry_gpu(eb, vma, offsets[2] * sizeof(u32), 2);
+ err = __reloc_entry_gpu(eb, ev.vma, offsets[2] * sizeof(u32), 2);
if (err)
goto unpin_vma;
- GEM_BUG_ON(!eb->reloc_cache.rq);
- rq = i915_request_get(eb->reloc_cache.rq);
err = reloc_gpu_flush(&eb->reloc_cache);
if (err)
goto put_rq;
- GEM_BUG_ON(eb->reloc_cache.rq);
err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
if (err) {
@@ -75,7 +79,7 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
goto put_rq;
}
- if (!i915_request_completed(rq)) {
+ if (!i915_request_completed(eb->reloc_cache.rq)) {
pr_err("%s: did not wait for relocations!\n", eb->engine->name);
err = -EINVAL;
goto put_rq;
@@ -94,9 +98,10 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
igt_hexdump(map, 4096);
put_rq:
- i915_request_put(rq);
+ i915_request_put(eb->reloc_cache.rq);
+ eb->reloc_cache.rq = NULL;
unpin_vma:
- i915_vma_unpin(vma);
+ i915_vma_unpin(ev.vma);
return err;
}
--
2.20.1