[Intel-gfx] [PATCH 03/20] drm/i915/gem: Don't drop the timeline lock during execbuf
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Jul 8 16:54:51 UTC 2020
On 06/07/2020 07:19, Chris Wilson wrote:
> Our timeline lock is our defence against a concurrent execbuf
> interrupting our request construction. We need to hold it throughout or,
> for example, a second thread may interject a relocation request in
> between our own relocation request and execution in the ring.
>
> A second, major benefit is that it allows us to preserve a large chunk
> of the ringbuffer for our exclusive use, which should virtually
> eliminate the threat of hitting a wait_for_space during request
> construction -- although we should have already dropped other
> contentious locks at that point.
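Just to check my understanding, the invariant this gives us is roughly
(my sketch of the end result, not lifted from the patch):

	/* eb_pin_engine() */
	mutex_lock(&ce->timeline->mutex);
	intel_context_enter(ce);

	rq = __i915_request_create(ce, GFP_KERNEL);
	/* relocations, the batch and fences all go into the same ring */
	__i915_request_commit(rq);
	__i915_request_queue(rq, &eb->gem_context->sched);

	/* eb_unpin_engine() */
	intel_context_exit(ce);
	mutex_unlock(&ce->timeline->mutex);

so a second execbuf on the same context cannot emit anything into the
ring between our relocation request and the batch itself.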
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 241 ++++++++++++------
> .../i915/gem/selftests/i915_gem_execbuffer.c | 24 +-
> 2 files changed, 186 insertions(+), 79 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index dd15a799f9d6..e4d06db3f313 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -259,6 +259,8 @@ struct i915_execbuffer {
> bool has_fence : 1;
> bool needs_unfenced : 1;
>
> + struct intel_context *ce;
> +
> struct i915_vma *target;
> struct i915_request *rq;
> struct i915_vma *rq_vma;
> @@ -639,6 +641,35 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
> return 0;
> }
>
> +static void retire_requests(struct intel_timeline *tl, struct i915_request *end)
> +{
> + struct i915_request *rq, *rn;
> +
> + list_for_each_entry_safe(rq, rn, &tl->requests, link)
> + if (rq == end || !i915_request_retire(rq))
> + break;
> +}
> +
> +static int wait_for_timeline(struct intel_timeline *tl)
> +{
> + do {
> + struct dma_fence *fence;
> + int err;
> +
> + fence = i915_active_fence_get(&tl->last_request);
> + if (!fence)
> + return 0;
> +
> + err = dma_fence_wait(fence, true);
> + dma_fence_put(fence);
> + if (err)
> + return err;
> +
> + /* Retiring may trigger a barrier, requiring an extra pass */
> + retire_requests(tl, NULL);
> + } while (1);
> +}
> +
> static int eb_reserve(struct i915_execbuffer *eb)
> {
> const unsigned int count = eb->buffer_count;
> @@ -646,7 +677,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
> struct list_head last;
> struct eb_vma *ev;
> unsigned int i, pass;
> - int err = 0;
>
> /*
> * Attempt to pin all of the buffers into the GTT.
> @@ -662,18 +692,22 @@ static int eb_reserve(struct i915_execbuffer *eb)
> * room for the earlier objects *unless* we need to defragment.
> */
>
> - if (mutex_lock_interruptible(&eb->i915->drm.struct_mutex))
> - return -EINTR;
> -
> pass = 0;
> do {
> + int err = 0;
> +
> + if (mutex_lock_interruptible(&eb->i915->drm.struct_mutex))
> + return -EINTR;
Recently you explained to me why we still use struct_mutex here, so
maybe, while moving the code, document that in a comment.
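Even a placeholder would help, something along these lines (wording to
be filled in by you, since I don't remember the exact reasoning):

	/*
	 * XXX struct_mutex is still needed here because ... (reason as
	 * per our earlier discussion).
	 */
	if (mutex_lock_interruptible(&eb->i915->drm.struct_mutex))
		return -EINTR;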
> +
> list_for_each_entry(ev, &eb->unbound, bind_link) {
> err = eb_reserve_vma(eb, ev, pin_flags);
> if (err)
> break;
> }
> - if (!(err == -ENOSPC || err == -EAGAIN))
> - break;
> + if (!(err == -ENOSPC || err == -EAGAIN)) {
> + mutex_unlock(&eb->i915->drm.struct_mutex);
> + return err;
> + }
>
> /* Resort *all* the objects into priority order */
> INIT_LIST_HEAD(&eb->unbound);
> @@ -702,11 +736,10 @@ static int eb_reserve(struct i915_execbuffer *eb)
> list_add_tail(&ev->bind_link, &last);
> }
> list_splice_tail(&last, &eb->unbound);
> + mutex_unlock(&eb->i915->drm.struct_mutex);
>
> if (err == -EAGAIN) {
> - mutex_unlock(&eb->i915->drm.struct_mutex);
> flush_workqueue(eb->i915->mm.userptr_wq);
> - mutex_lock(&eb->i915->drm.struct_mutex);
> continue;
> }
>
> @@ -715,25 +748,23 @@ static int eb_reserve(struct i915_execbuffer *eb)
> break;
>
> case 1:
> - /* Too fragmented, unbind everything and retry */
> - mutex_lock(&eb->context->vm->mutex);
> - err = i915_gem_evict_vm(eb->context->vm);
> - mutex_unlock(&eb->context->vm->mutex);
> + /*
> + * Too fragmented, retire everything on the timeline
> + * and so make it all [contexts included] available to
> + * evict.
> + */
> + err = wait_for_timeline(eb->context->timeline);
> if (err)
> - goto unlock;
> + return err;
> +
> break;
>
> default:
> - err = -ENOSPC;
> - goto unlock;
> + return -ENOSPC;
> }
>
> pin_flags = PIN_USER;
> } while (1);
> -
> -unlock:
> - mutex_unlock(&eb->i915->drm.struct_mutex);
> - return err;
> }
>
> static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
> @@ -1007,13 +1038,44 @@ static int reloc_gpu_chain(struct reloc_cache *cache)
> return err;
> }
>
> +static struct i915_request *
> +nested_request_create(struct intel_context *ce)
> +{
> + struct i915_request *rq;
> +
> + /* XXX This only works once; replace with shared timeline */
Once as in an attempt to use the same local intel_context from another
eb would upset lockdep? Not a problem, I think.
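If I got the nesting right, what we end up with is (my reading of the
end result, not copied from the patch):

	/* outer: eb->context->timeline, taken in eb_pin_engine() */
	mutex_lock(&eb->context->timeline->mutex);
	...
	/* inner: cache->ce->timeline, taken for the reloc request */
	mutex_lock_nested(&cache->ce->timeline->mutex,
			  SINGLE_DEPTH_NESTING);
	...
	/* dropped again in reloc_gpu_flush() if the timelines differ */
	mutex_unlock(&cache->ce->timeline->mutex);
	...
	mutex_unlock(&eb->context->timeline->mutex);

which lockdep is happy with for a single level of nesting.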
> + mutex_lock_nested(&ce->timeline->mutex, SINGLE_DEPTH_NESTING);
> + intel_context_enter(ce);
> +
> + rq = __i915_request_create(ce, GFP_KERNEL);
> +
> + intel_context_exit(ce);
> + if (IS_ERR(rq))
> + mutex_unlock(&ce->timeline->mutex);
> +
> + return rq;
> +}
> +
> +static void __i915_request_add(struct i915_request *rq,
> + struct i915_sched_attr *attr)
> +{
> + struct intel_timeline * const tl = i915_request_timeline(rq);
> +
> + lockdep_assert_held(&tl->mutex);
> + lockdep_unpin_lock(&tl->mutex, rq->cookie);
> +
> + __i915_request_commit(rq);
> + __i915_request_queue(rq, attr);
> +}
> +
> static unsigned int reloc_bb_flags(const struct reloc_cache *cache)
> {
> return cache->gen > 5 ? 0 : I915_DISPATCH_SECURE;
> }
>
> -static int reloc_gpu_flush(struct reloc_cache *cache)
> +static int reloc_gpu_flush(struct i915_execbuffer *eb)
> {
> + struct reloc_cache *cache = &eb->reloc_cache;
> struct i915_request *rq;
> int err;
>
> @@ -1044,7 +1106,9 @@ static int reloc_gpu_flush(struct reloc_cache *cache)
> i915_request_set_error_once(rq, err);
>
> intel_gt_chipset_flush(rq->engine->gt);
> - i915_request_add(rq);
> + __i915_request_add(rq, &eb->gem_context->sched);
> + if (i915_request_timeline(rq) != eb->context->timeline)
> + mutex_unlock(&i915_request_timeline(rq)->mutex);
>
> return err;
> }
> @@ -1103,27 +1167,15 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
> if (err)
> goto err_unmap;
>
> - if (engine == eb->context->engine) {
> - rq = i915_request_create(eb->context);
> - } else {
> - struct intel_context *ce;
> -
> - ce = intel_context_create(engine);
> - if (IS_ERR(ce)) {
> - err = PTR_ERR(ce);
> - goto err_unpin;
> - }
> -
> - i915_vm_put(ce->vm);
> - ce->vm = i915_vm_get(eb->context->vm);
> -
> - rq = intel_context_create_request(ce);
> - intel_context_put(ce);
> - }
> + if (cache->ce == eb->context)
> + rq = __i915_request_create(cache->ce, GFP_KERNEL);
> + else
> + rq = nested_request_create(cache->ce);
> if (IS_ERR(rq)) {
> err = PTR_ERR(rq);
> goto err_unpin;
> }
> + rq->cookie = lockdep_pin_lock(&i915_request_timeline(rq)->mutex);
>
> err = intel_gt_buffer_pool_mark_active(pool, rq);
> if (err)
> @@ -1151,7 +1203,9 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
> skip_request:
> i915_request_set_error_once(rq, err);
> err_request:
> - i915_request_add(rq);
> + __i915_request_add(rq, &eb->gem_context->sched);
> + if (i915_request_timeline(rq) != eb->context->timeline)
> + mutex_unlock(&i915_request_timeline(rq)->mutex);
> err_unpin:
> i915_vma_unpin(batch);
> err_unmap:
> @@ -1161,11 +1215,6 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
> return err;
> }
>
> -static bool reloc_can_use_engine(const struct intel_engine_cs *engine)
> -{
> - return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> -}
> -
> static u32 *reloc_gpu(struct i915_execbuffer *eb,
> struct i915_vma *vma,
> unsigned int len)
> @@ -1177,12 +1226,6 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> if (unlikely(!cache->rq)) {
> struct intel_engine_cs *engine = eb->engine;
>
> - if (!reloc_can_use_engine(engine)) {
> - engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> - if (!engine)
> - return ERR_PTR(-ENODEV);
> - }
> -
> err = __reloc_gpu_alloc(eb, engine, len);
> if (unlikely(err))
> return ERR_PTR(err);
> @@ -1513,7 +1556,7 @@ static int eb_relocate(struct i915_execbuffer *eb)
> break;
> }
>
> - flush = reloc_gpu_flush(&eb->reloc_cache);
> + flush = reloc_gpu_flush(eb);
> if (!err)
> err = flush;
> }
> @@ -1737,21 +1780,17 @@ parser_mark_active(struct eb_parse_work *pw, struct intel_timeline *tl)
> {
> int err;
>
> - mutex_lock(&tl->mutex);
> -
> err = __parser_mark_active(pw->shadow, tl, &pw->base.dma);
> if (err)
> - goto unlock;
> + return err;
>
> if (pw->trampoline) {
> err = __parser_mark_active(pw->trampoline, tl, &pw->base.dma);
> if (err)
> - goto unlock;
> + return err;
> }
>
> -unlock:
> - mutex_unlock(&tl->mutex);
> - return err;
> + return 0;
> }
>
> static int eb_parse_pipeline(struct i915_execbuffer *eb,
> @@ -2044,6 +2083,54 @@ static struct i915_request *eb_throttle(struct intel_context *ce)
> return i915_request_get(rq);
> }
>
> +static bool reloc_can_use_engine(const struct intel_engine_cs *engine)
> +{
> + return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> +}
> +
> +static int __eb_pin_reloc_engine(struct i915_execbuffer *eb)
> +{
> + struct intel_engine_cs *engine = eb->engine;
> + struct intel_context *ce;
> + int err;
> +
> + if (reloc_can_use_engine(engine)) {
> + eb->reloc_cache.ce = eb->context;
> + return 0;
> + }
> +
> + engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> + if (!engine)
> + return -ENODEV;
> +
> + ce = intel_context_create(engine);
> + if (IS_ERR(ce))
> + return PTR_ERR(ce);
> +
> + /* Reuse eb->context->timeline with scheduler! */
> +
> + i915_vm_put(ce->vm);
> + ce->vm = i915_vm_get(eb->context->vm);
> +
> + err = intel_context_pin(ce);
> + if (err)
> + return err;
> +
> + eb->reloc_cache.ce = ce;
> + return 0;
> +}
> +
> +static void __eb_unpin_reloc_engine(struct i915_execbuffer *eb)
> +{
> + struct intel_context *ce = eb->reloc_cache.ce;
> +
> + if (ce == eb->context)
> + return;
> +
> + intel_context_unpin(ce);
> + intel_context_put(ce);
> +}
> +
> static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce)
> {
> struct intel_timeline *tl;
> @@ -2087,9 +2174,7 @@ static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce)
> intel_context_enter(ce);
> rq = eb_throttle(ce);
>
> - intel_context_timeline_unlock(tl);
> -
> - if (rq) {
> + while (rq) {
> bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
> long timeout;
>
> @@ -2097,23 +2182,34 @@ static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce)
> if (nonblock)
> timeout = 0;
>
> + mutex_unlock(&tl->mutex);
"Don't drop the timeline lock during execbuf"? Is the "during execbuf"
actually a smaller subset
> +
> timeout = i915_request_wait(rq,
> I915_WAIT_INTERRUPTIBLE,
> timeout);
> i915_request_put(rq);
>
> + mutex_lock(&tl->mutex);
> +
> if (timeout < 0) {
> err = nonblock ? -EWOULDBLOCK : timeout;
> goto err_exit;
> }
> +
> + retire_requests(tl, NULL);
> + rq = eb_throttle(ce);
An alternative to avoid the two call sites to eb_throttle:
while (rq = eb_throttle(ce)) {
Or does checkpatch not like it?
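In full it would read something like this (untested, just the shape I
have in mind, and with the extra parentheses to keep checkpatch quiet
about the assignment in the loop condition):

	intel_context_enter(ce);
	while ((rq = eb_throttle(ce))) {
		bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
		long timeout = nonblock ? 0 : MAX_SCHEDULE_TIMEOUT;

		mutex_unlock(&tl->mutex);
		timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
					    timeout);
		i915_request_put(rq);
		mutex_lock(&tl->mutex);

		if (timeout < 0) {
			err = nonblock ? -EWOULDBLOCK : timeout;
			goto err_exit;
		}

		retire_requests(tl, NULL);
	}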
> }
>
> eb->engine = ce->engine;
> eb->context = ce;
> +
> + err = __eb_pin_reloc_engine(eb);
> + if (err)
> + goto err_exit;
> +
> return 0;
>
> err_exit:
> - mutex_lock(&tl->mutex);
> intel_context_exit(ce);
> intel_context_timeline_unlock(tl);
> err_unpin:
> @@ -2124,11 +2220,11 @@ static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce)
> static void eb_unpin_engine(struct i915_execbuffer *eb)
> {
> struct intel_context *ce = eb->context;
> - struct intel_timeline *tl = ce->timeline;
>
> - mutex_lock(&tl->mutex);
> + __eb_unpin_reloc_engine(eb);
> +
> intel_context_exit(ce);
> - mutex_unlock(&tl->mutex);
> + intel_context_timeline_unlock(ce->timeline);
>
> intel_context_unpin(ce);
> }
> @@ -2330,15 +2426,6 @@ signal_fence_array(struct i915_execbuffer *eb,
> }
> }
>
> -static void retire_requests(struct intel_timeline *tl, struct i915_request *end)
> -{
> - struct i915_request *rq, *rn;
> -
> - list_for_each_entry_safe(rq, rn, &tl->requests, link)
> - if (rq == end || !i915_request_retire(rq))
> - break;
> -}
> -
> static void eb_request_add(struct i915_execbuffer *eb)
> {
> struct i915_request *rq = eb->request;
> @@ -2367,8 +2454,6 @@ static void eb_request_add(struct i915_execbuffer *eb)
> /* Try to clean up the client's timeline after submitting the request */
> if (prev)
> retire_requests(tl, prev);
> -
> - mutex_unlock(&tl->mutex);
> }
>
> static int
> @@ -2455,6 +2540,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> err = eb_pin_engine(&eb, file, args);
> if (unlikely(err))
> goto err_context;
> + lockdep_assert_held(&eb.context->timeline->mutex);
>
> err = eb_relocate(&eb);
> if (err) {
> @@ -2522,11 +2608,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> GEM_BUG_ON(eb.reloc_cache.rq);
>
> /* Allocate a request for this batch buffer nice and early. */
> - eb.request = i915_request_create(eb.context);
> + eb.request = __i915_request_create(eb.context, GFP_KERNEL);
> if (IS_ERR(eb.request)) {
> err = PTR_ERR(eb.request);
> goto err_batch_unpin;
> }
> + eb.request->cookie = lockdep_pin_lock(&eb.context->timeline->mutex);
>
> if (in_fence) {
> if (args->flags & I915_EXEC_FENCE_SUBMIT)
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> index 57c14d3340cd..992d46db1b33 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> @@ -7,6 +7,9 @@
>
> #include "gt/intel_engine_pm.h"
> #include "selftests/igt_flush_test.h"
> +#include "selftests/mock_drm.h"
> +
> +#include "mock_context.h"
>
> static u64 read_reloc(const u32 *map, int x, const u64 mask)
> {
> @@ -60,7 +63,7 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
>
> GEM_BUG_ON(!eb->reloc_cache.rq);
> rq = i915_request_get(eb->reloc_cache.rq);
> - err = reloc_gpu_flush(&eb->reloc_cache);
> + err = reloc_gpu_flush(eb);
> if (err)
> goto put_rq;
> GEM_BUG_ON(eb->reloc_cache.rq);
> @@ -100,14 +103,22 @@ static int igt_gpu_reloc(void *arg)
> {
> struct i915_execbuffer eb;
> struct drm_i915_gem_object *scratch;
> + struct file *file;
> int err = 0;
> u32 *map;
>
> + file = mock_file(arg);
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> eb.i915 = arg;
> + eb.gem_context = live_context(arg, file);
> + if (IS_ERR(eb.gem_context))
> + goto err_file;
>
> scratch = i915_gem_object_create_internal(eb.i915, 4096);
> if (IS_ERR(scratch))
> - return PTR_ERR(scratch);
> + goto err_file;
>
> map = i915_gem_object_pin_map(scratch, I915_MAP_WC);
> if (IS_ERR(map)) {
> @@ -130,8 +141,15 @@ static int igt_gpu_reloc(void *arg)
> if (err)
> goto err_put;
>
> + mutex_lock(&eb.context->timeline->mutex);
> + intel_context_enter(eb.context);
> + eb.reloc_cache.ce = eb.context;
> +
> err = __igt_gpu_reloc(&eb, scratch);
>
> + intel_context_exit(eb.context);
> + mutex_unlock(&eb.context->timeline->mutex);
> +
> intel_context_unpin(eb.context);
> err_put:
> intel_context_put(eb.context);
> @@ -146,6 +164,8 @@ static int igt_gpu_reloc(void *arg)
>
> err_scratch:
> i915_gem_object_put(scratch);
> +err_file:
> + fput(file);
> return err;
> }
>
>
Regards,
Tvrtko