[Intel-gfx] [PATCH] drm/i915: Serialise i915_active_fence_set() with itself
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Nov 27 14:53:38 UTC 2019
On 27/11/2019 13:45, Chris Wilson wrote:
> The expected downside to commit 58b4c1a07ada ("drm/i915: Reduce nested
> prepare_remote_context() to a trylock") was that it would need to return
> -EAGAIN to userspace in order to resolve potential mutex inversion. Such
> an unsightly round trip is unnecessary if we could atomically insert a
> barrier into the i915_active_fence, so make it happen.
>
> Currently, we use the timeline->mutex (or some other named outer lock)
> to order insertion into the i915_active_fence (and so individual nodes
> of i915_active). Inside __i915_active_fence_set, we then only need to
> serialise with the interrupt handler in order to claim the timeline for
> ourselves.
>
> However, if we remove the outer lock, we need to ensure the order is
> intact between not only multiple threads trying to insert themselves
> into the timeline, but also with the interrupt handler completing the
> previous occupant. We use xchg() on insert so that we have an ordered
> sequence of insertions (and each caller knows the previous fence on
> which to wait, preserving the chain of all fences in the timeline), but
> we then have to cmpxchg() in the interrupt handler to avoid overwriting
> the new occupant. The only nasty side-effect is having to temporarily
> strip off the RCU-annotations to apply the atomic operations, otherwise
> the rules are much more conventional!
>
> Fixes: 58b4c1a07ada ("drm/i915: Reduce nested prepare_remote_context() to a trylock")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_context.c | 19 ----
> drivers/gpu/drm/i915/gt/intel_engine_pm.c | 2 +-
> drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +-
> .../gpu/drm/i915/gt/selftests/mock_timeline.c | 2 +-
> drivers/gpu/drm/i915/i915_active.c | 107 ++++++++++--------
> drivers/gpu/drm/i915/i915_active.h | 17 +--
> drivers/gpu/drm/i915/i915_active_types.h | 15 ---
> 7 files changed, 62 insertions(+), 102 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index ef7bc41ffffa..b5e9c35ec6b8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -310,27 +310,8 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
> GEM_BUG_ON(rq->hw_context == ce);
>
> if (rcu_access_pointer(rq->timeline) != tl) { /* timeline sharing! */
> - /*
> - * Ideally, we just want to insert our foreign fence as
> - * a barrier into the remove context, such that this operation
> - * occurs after all current operations in that context, and
> - * all future operations must occur after this.
> - *
> - * Currently, the timeline->last_request tracking is guarded
> - * by its mutex and so we must obtain that to atomically
> - * insert our barrier. However, since we already hold our
> - * timeline->mutex, we must be careful against potential
> - * inversion if we are the kernel_context as the remote context
> - * will itself poke at the kernel_context when it needs to
> - * unpin. Ergo, if already locked, we drop both locks and
> - * try again (through the magic of userspace repeating EAGAIN).
> - */
> - if (!mutex_trylock(&tl->mutex))
> - return -EAGAIN;
> -
> /* Queue this switch after current activity by this context. */
> err = i915_active_fence_set(&tl->last_request, rq);
> - mutex_unlock(&tl->mutex);
> if (err)
> return err;
> }
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 0e1ad4a4bd97..21183ad87368 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -183,7 +183,7 @@ static void call_idle_barriers(struct intel_engine_cs *engine)
> container_of((struct list_head *)node,
> typeof(*cb), node);
>
> - cb->func(NULL, cb);
> + cb->func(ERR_PTR(-EAGAIN), cb);
> }
> }
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index c1d2419444f8..038e05a6336c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -254,7 +254,7 @@ int intel_timeline_init(struct intel_timeline *timeline,
>
> mutex_init(&timeline->mutex);
>
> - INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
> + INIT_ACTIVE_FENCE(&timeline->last_request);
> INIT_LIST_HEAD(&timeline->requests);
>
> i915_syncmap_init(&timeline->sync);
> diff --git a/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
> index 2a77c051f36a..aeb1d1f616e8 100644
> --- a/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
> @@ -15,7 +15,7 @@ void mock_timeline_init(struct intel_timeline *timeline, u64 context)
>
> mutex_init(&timeline->mutex);
>
> - INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
> + INIT_ACTIVE_FENCE(&timeline->last_request);
> INIT_LIST_HEAD(&timeline->requests);
>
> i915_syncmap_init(&timeline->sync);
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index dca15ace88f6..e5809727aab2 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -186,18 +186,33 @@ active_retire(struct i915_active *ref)
> __active_retire(ref);
> }
>
> +static inline struct dma_fence **
> +__active_fence_slot(struct i915_active_fence *active)
> +{
> + return (struct dma_fence ** __force)&active->fence;
> +}
> +
> +static inline bool
> +active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> +{
> + struct i915_active_fence *active =
> + container_of(cb, typeof(*active), cb);
> +
> + return cmpxchg(__active_fence_slot(active), fence, NULL) == fence;
> +}
> +
> static void
> node_retire(struct dma_fence *fence, struct dma_fence_cb *cb)
> {
> - i915_active_fence_cb(fence, cb);
> - active_retire(container_of(cb, struct active_node, base.cb)->ref);
> + if (active_fence_cb(fence, cb))
> + active_retire(container_of(cb, struct active_node, base.cb)->ref);
> }
>
> static void
> excl_retire(struct dma_fence *fence, struct dma_fence_cb *cb)
> {
> - i915_active_fence_cb(fence, cb);
> - active_retire(container_of(cb, struct i915_active, excl.cb));
> + if (active_fence_cb(fence, cb))
> + active_retire(container_of(cb, struct i915_active, excl.cb));
> }
>
> static struct i915_active_fence *
> @@ -244,7 +259,7 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl)
> }
>
> node = prealloc;
> - __i915_active_fence_init(&node->base, &tl->mutex, NULL, node_retire);
> + __i915_active_fence_init(&node->base, NULL, node_retire);
> node->ref = ref;
> node->timeline = idx;
>
> @@ -281,7 +296,7 @@ void __i915_active_init(struct i915_active *ref,
> init_llist_head(&ref->preallocated_barriers);
> atomic_set(&ref->count, 0);
> __mutex_init(&ref->mutex, "i915_active", key);
> - __i915_active_fence_init(&ref->excl, &ref->mutex, NULL, excl_retire);
> + __i915_active_fence_init(&ref->excl, NULL, excl_retire);
> INIT_WORK(&ref->work, active_work);
> }
>
> @@ -376,15 +391,8 @@ void i915_active_set_exclusive(struct i915_active *ref, struct dma_fence *f)
> /* We expect the caller to manage the exclusive timeline ordering */
> GEM_BUG_ON(i915_active_is_idle(ref));
>
> - /*
> - * As we don't know which mutex the caller is using, we told a small
> - * lie to the debug code that it is using the i915_active.mutex;
> - * and now we must stick to that lie.
> - */
> - mutex_acquire(&ref->mutex.dep_map, 0, 0, _THIS_IP_);
> if (!__i915_active_fence_set(&ref->excl, f))
> atomic_inc(&ref->count);
> - mutex_release(&ref->mutex.dep_map, 0, _THIS_IP_);
> }
>
> bool i915_active_acquire_if_busy(struct i915_active *ref)
> @@ -615,10 +623,6 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref,
> goto unwind;
> }
>
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - node->base.lock =
> - &engine->kernel_context->timeline->mutex;
> -#endif
> RCU_INIT_POINTER(node->base.fence, NULL);
> node->base.cb.func = node_retire;
> node->timeline = idx;
> @@ -639,6 +643,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref,
> node->base.cb.node.prev = (void *)engine;
> atomic_inc(&ref->count);
> }
> + GEM_BUG_ON(rcu_access_pointer(node->base.fence) != ERR_PTR(-EAGAIN));
>
> GEM_BUG_ON(barrier_to_engine(node) != engine);
> llist_add(barrier_to_ll(node), &ref->preallocated_barriers);
> @@ -702,6 +707,11 @@ void i915_active_acquire_barrier(struct i915_active *ref)
> }
> }
>
> +static struct dma_fence **ll_to_fence_slot(struct llist_node *node)
> +{
> + return __active_fence_slot(&barrier_from_ll(node)->base);
> +}
> +
> void i915_request_add_active_barriers(struct i915_request *rq)
> {
> struct intel_engine_cs *engine = rq->engine;
> @@ -721,19 +731,13 @@ void i915_request_add_active_barriers(struct i915_request *rq)
> */
> spin_lock_irqsave(&rq->lock, flags);
> llist_for_each_safe(node, next, node) {
> - RCU_INIT_POINTER(barrier_from_ll(node)->base.fence, &rq->fence);
> - smp_wmb(); /* serialise with reuse_idle_barrier */
> + /* serialise with reuse_idle_barrier */
> + smp_store_mb(*ll_to_fence_slot(node), &rq->fence);
> list_add_tail((struct list_head *)node, &rq->fence.cb_list);
> }
> spin_unlock_irqrestore(&rq->lock, flags);
> }
>
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> -#define active_is_held(active) lockdep_is_held((active)->lock)
> -#else
> -#define active_is_held(active) true
> -#endif
> -
> /*
> * __i915_active_fence_set: Update the last active fence along its timeline
> * @active: the active tracker
> @@ -744,7 +748,7 @@ void i915_request_add_active_barriers(struct i915_request *rq)
> * fence onto this one. Returns the previous fence (if not already completed),
> * which the caller must ensure is executed before the new fence. To ensure
> * that the order of fences within the timeline of the i915_active_fence is
> - * maintained, it must be locked by the caller.
> + * understood, it should be locked by the caller.
> */
> struct dma_fence *
> __i915_active_fence_set(struct i915_active_fence *active,
> @@ -753,34 +757,41 @@ __i915_active_fence_set(struct i915_active_fence *active,
> struct dma_fence *prev;
> unsigned long flags;
>
> - /* NB: must be serialised by an outer timeline mutex (active->lock) */
> - spin_lock_irqsave(fence->lock, flags);
> + if (fence == rcu_access_pointer(active->fence))
> + return fence;
> +
> GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags));
>
> - prev = rcu_dereference_protected(active->fence, active_is_held(active));
> + /*
> + * Consider that we have two threads arriving (A and B), with
> + * C already resident as the active->fence.
> + *
> + * A does the xchg first, and so it sees C or NULL depending
> + * on the timing of the interrupt handler. If it is NULL, the
> + * previous fence must have been signaled and we know that
> + * we are first on the timeline. If it is still present,
> + * we acquire the lock on that fence and serialise with the interrupt
> + * handler, in the process removing it from any future interrupt
> + * callback. A will then wait on C before executing (if present).
> + *
> + * As B is second, it sees A as the previous fence and so waits for
> + * it to complete its transition and takes over the occupancy for
> + * itself -- remembering that it needs to wait on A before executing.
> + *
> + * Note the strong ordering of the timeline also provides consistent
> + * nesting rules for the fence->lock; the outer lock is always the
> + * older lock.
> + */
> + spin_lock_irqsave(fence->lock, flags);
> + prev = xchg(__active_fence_slot(active), fence);
> if (prev) {
> GEM_BUG_ON(prev == fence);
> spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING);
> __list_del_entry(&active->cb.node);
> spin_unlock(prev->lock); /* serialise with prev->cb_list */
> -
> - /*
> - * active->fence is reset by the callback from inside
> - * interrupt context. We need to serialise our list
> - * manipulation with the fence->lock to prevent the prev
> - * being lost inside an interrupt (it can't be replaced as
> - * no other caller is allowed to enter __i915_active_fence_set
> - * as we hold the timeline lock). After serialising with
> - * the callback, we need to double check which ran first,
> - * our list_del() [decoupling prev from the callback] or
> - * the callback...
> - */
> - prev = rcu_access_pointer(active->fence);
> }
> -
> - rcu_assign_pointer(active->fence, fence);
> + GEM_BUG_ON(rcu_access_pointer(active->fence) != fence);
> list_add_tail(&active->cb.node, &fence->cb_list);
> -
> spin_unlock_irqrestore(fence->lock, flags);
>
> return prev;
> @@ -792,10 +803,6 @@ int i915_active_fence_set(struct i915_active_fence *active,
> struct dma_fence *fence;
> int err = 0;
>
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - lockdep_assert_held(active->lock);
> -#endif
> -
> /* Must maintain timeline ordering wrt previous active requests */
> rcu_read_lock();
> fence = __i915_active_fence_set(active, &rq->fence);
> @@ -812,7 +819,7 @@ int i915_active_fence_set(struct i915_active_fence *active,
>
> void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb)
> {
> - i915_active_fence_cb(fence, cb);
> + active_fence_cb(fence, cb);
> }
>
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index 5dd62323b92a..3208cc2e8c1a 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -61,19 +61,15 @@ void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb);
> */
> static inline void
> __i915_active_fence_init(struct i915_active_fence *active,
> - struct mutex *lock,
> void *fence,
> dma_fence_func_t fn)
> {
> RCU_INIT_POINTER(active->fence, fence);
> active->cb.func = fn ?: i915_active_noop;
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - active->lock = lock;
> -#endif
> }
>
> -#define INIT_ACTIVE_FENCE(A, LOCK) \
> - __i915_active_fence_init((A), (LOCK), NULL, NULL)
> +#define INIT_ACTIVE_FENCE(A) \
> + __i915_active_fence_init((A), NULL, NULL)
>
> struct dma_fence *
> __i915_active_fence_set(struct i915_active_fence *active,
> @@ -127,15 +123,6 @@ i915_active_fence_isset(const struct i915_active_fence *active)
> return rcu_access_pointer(active->fence);
> }
>
> -static inline void
> -i915_active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> -{
> - struct i915_active_fence *active =
> - container_of(cb, typeof(*active), cb);
> -
> - RCU_INIT_POINTER(active->fence, NULL);
> -}
> -
> /*
> * GPU activity tracking
> *
> diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
> index 96aed0ee700a..6360c3e4b765 100644
> --- a/drivers/gpu/drm/i915/i915_active_types.h
> +++ b/drivers/gpu/drm/i915/i915_active_types.h
> @@ -20,21 +20,6 @@
> struct i915_active_fence {
> struct dma_fence __rcu *fence;
> struct dma_fence_cb cb;
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - /*
> - * Incorporeal!
> - *
> - * Updates to the i915_active_request must be serialised under a lock
> - * to ensure that the timeline is ordered. Normally, this is the
> - * timeline->mutex, but another mutex may be used so long as it is
> - * done so consistently.
> - *
> - * For lockdep tracking of the above, we store the lock we intend
> - * to always use for updates of this i915_active_request during
> - * construction and assert that is held on every update.
> - */
> - struct mutex *lock;
> -#endif
> };
>
> struct active_node;
>
I think I understand it: xchg() guarantees that only one thread (or the
irq handler) can consume active->fence, and after that it's easy.
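
For the archives, roughly the pattern as I read it, modelled in plain
userspace C with GCC builtins standing in for the kernel's xchg()/cmpxchg().
The names and the test harness are mine, this is a simplified sketch and
not the driver code:

    #include <stdio.h>
    #include <stddef.h>

    /* Stand-in for struct i915_active_fence: just the tracked pointer. */
    struct slot { void *fence; };

    /*
     * Writer side (cf. __i915_active_fence_set): atomically publish the
     * new fence and learn which fence previously occupied the slot, so
     * the caller knows what to wait on.  Because the exchange is atomic,
     * concurrent writers see each other in order and the chain of fences
     * along the timeline is preserved.
     */
    static void *slot_set(struct slot *s, void *fence)
    {
            return __atomic_exchange_n(&s->fence, fence, __ATOMIC_SEQ_CST);
    }

    /*
     * Completion side (cf. active_fence_cb from the interrupt handler):
     * clear the slot only if it still holds the fence that just signaled.
     * If a writer raced in and already replaced it, the cmpxchg fails and
     * the new occupant is left untouched.
     */
    static int slot_retire(struct slot *s, void *fence)
    {
            void *expected = fence;

            return __atomic_compare_exchange_n(&s->fence, &expected, NULL,
                                               0, __ATOMIC_SEQ_CST,
                                               __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
            struct slot s = { NULL };
            int a, b;

            void *prev = slot_set(&s, &a);  /* first insert: nothing to wait on */
            printf("prev=%p (expect NULL)\n", prev);

            prev = slot_set(&s, &b);        /* second insert must wait on &a */
            printf("prev=%p (expect %p)\n", prev, (void *)&a);

            /* &a signals late: the cmpxchg fails, &b is not overwritten */
            printf("retire(&a)=%d (expect 0)\n", slot_retire(&s, &a));
            printf("retire(&b)=%d (expect 1)\n", slot_retire(&s, &b));
            return 0;
    }

The "retire only if the cmpxchg wins" part is what the
if (active_fence_cb(...)) guards in node_retire()/excl_retire() above do.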
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko