[Intel-gfx] [PATCH] drm/i915: Serialise i915_active_fence_set() with itself
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Nov 27 14:53:38 UTC 2019
On 27/11/2019 13:45, Chris Wilson wrote:
> The expected downside to commit 58b4c1a07ada ("drm/i915: Reduce nested
> prepare_remote_context() to a trylock") was that it would need to return
> -EAGAIN to userspace in order to resolve potential mutex inversion. Such
> an unsightly round trip is unnecessary if we could atomically insert a
> barrier into the i915_active_fence, so make it happen.
>
> Currently, we use the timeline->mutex (or some other named outer lock)
> to order insertion into the i915_active_fence (and so individual nodes
> of i915_active). Inside __i915_active_fence_set, we then only need to
> serialise with the interrupt handler in order to claim the timeline for
> ourselves.
>
> However, if we remove the outer lock, we need to ensure the order is
> intact between not only multiple threads trying to insert themselves
> into the timeline, but also with the interrupt handler completing the
> previous occupant. We use xchg() on insert so that we have an ordered
> sequence of insertions (and each caller knows the previous fence on
> which to wait, preserving the chain of all fences in the timeline), but
> we then have to cmpxchg() in the interrupt handler to avoid overwriting
> the new occupant. The only nasty side-effect is having to temporarily
> strip off the RCU-annotations to apply the atomic operations, otherwise
> the rules are much more conventional!
>
> Fixes: 58b4c1a07ada ("drm/i915: Reduce nested prepare_remote_context() to a trylock")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_context.c | 19 ----
> drivers/gpu/drm/i915/gt/intel_engine_pm.c | 2 +-
> drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +-
> .../gpu/drm/i915/gt/selftests/mock_timeline.c | 2 +-
> drivers/gpu/drm/i915/i915_active.c | 107 ++++++++++--------
> drivers/gpu/drm/i915/i915_active.h | 17 +--
> drivers/gpu/drm/i915/i915_active_types.h | 15 ---
> 7 files changed, 62 insertions(+), 102 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index ef7bc41ffffa..b5e9c35ec6b8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -310,27 +310,8 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
> GEM_BUG_ON(rq->hw_context == ce);
>
> if (rcu_access_pointer(rq->timeline) != tl) { /* timeline sharing! */
> - /*
> - * Ideally, we just want to insert our foreign fence as
> - * a barrier into the remove context, such that this operation
> - * occurs after all current operations in that context, and
> - * all future operations must occur after this.
> - *
> - * Currently, the timeline->last_request tracking is guarded
> - * by its mutex and so we must obtain that to atomically
> - * insert our barrier. However, since we already hold our
> - * timeline->mutex, we must be careful against potential
> - * inversion if we are the kernel_context as the remote context
> - * will itself poke at the kernel_context when it needs to
> - * unpin. Ergo, if already locked, we drop both locks and
> - * try again (through the magic of userspace repeating EAGAIN).
> - */
> - if (!mutex_trylock(&tl->mutex))
> - return -EAGAIN;
> -
> /* Queue this switch after current activity by this context. */
> err = i915_active_fence_set(&tl->last_request, rq);
> - mutex_unlock(&tl->mutex);
> if (err)
> return err;
> }
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 0e1ad4a4bd97..21183ad87368 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -183,7 +183,7 @@ static void call_idle_barriers(struct intel_engine_cs *engine)
> container_of((struct list_head *)node,
> typeof(*cb), node);
>
> - cb->func(NULL, cb);
> + cb->func(ERR_PTR(-EAGAIN), cb);
> }
> }
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index c1d2419444f8..038e05a6336c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -254,7 +254,7 @@ int intel_timeline_init(struct intel_timeline *timeline,
>
> mutex_init(&timeline->mutex);
>
> - INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
> + INIT_ACTIVE_FENCE(&timeline->last_request);
> INIT_LIST_HEAD(&timeline->requests);
>
> i915_syncmap_init(&timeline->sync);
> diff --git a/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
> index 2a77c051f36a..aeb1d1f616e8 100644
> --- a/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
> @@ -15,7 +15,7 @@ void mock_timeline_init(struct intel_timeline *timeline, u64 context)
>
> mutex_init(&timeline->mutex);
>
> - INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
> + INIT_ACTIVE_FENCE(&timeline->last_request);
> INIT_LIST_HEAD(&timeline->requests);
>
> i915_syncmap_init(&timeline->sync);
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index dca15ace88f6..e5809727aab2 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -186,18 +186,33 @@ active_retire(struct i915_active *ref)
> __active_retire(ref);
> }
>
> +static inline struct dma_fence **
> +__active_fence_slot(struct i915_active_fence *active)
> +{
> + return (struct dma_fence ** __force)&active->fence;
> +}
> +
> +static inline bool
> +active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> +{
> + struct i915_active_fence *active =
> + container_of(cb, typeof(*active), cb);
> +
> + return cmpxchg(__active_fence_slot(active), fence, NULL) == fence;
> +}
> +
> static void
> node_retire(struct dma_fence *fence, struct dma_fence_cb *cb)
> {
> - i915_active_fence_cb(fence, cb);
> - active_retire(container_of(cb, struct active_node, base.cb)->ref);
> + if (active_fence_cb(fence, cb))
> + active_retire(container_of(cb, struct active_node, base.cb)->ref);
> }
>
> static void
> excl_retire(struct dma_fence *fence, struct dma_fence_cb *cb)
> {
> - i915_active_fence_cb(fence, cb);
> - active_retire(container_of(cb, struct i915_active, excl.cb));
> + if (active_fence_cb(fence, cb))
> + active_retire(container_of(cb, struct i915_active, excl.cb));
> }
>
> static struct i915_active_fence *
> @@ -244,7 +259,7 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl)
> }
>
> node = prealloc;
> - __i915_active_fence_init(&node->base, &tl->mutex, NULL, node_retire);
> + __i915_active_fence_init(&node->base, NULL, node_retire);
> node->ref = ref;
> node->timeline = idx;
>
> @@ -281,7 +296,7 @@ void __i915_active_init(struct i915_active *ref,
> init_llist_head(&ref->preallocated_barriers);
> atomic_set(&ref->count, 0);
> __mutex_init(&ref->mutex, "i915_active", key);
> - __i915_active_fence_init(&ref->excl, &ref->mutex, NULL, excl_retire);
> + __i915_active_fence_init(&ref->excl, NULL, excl_retire);
> INIT_WORK(&ref->work, active_work);
> }
>
> @@ -376,15 +391,8 @@ void i915_active_set_exclusive(struct i915_active *ref, struct dma_fence *f)
> /* We expect the caller to manage the exclusive timeline ordering */
> GEM_BUG_ON(i915_active_is_idle(ref));
>
> - /*
> - * As we don't know which mutex the caller is using, we told a small
> - * lie to the debug code that it is using the i915_active.mutex;
> - * and now we must stick to that lie.
> - */
> - mutex_acquire(&ref->mutex.dep_map, 0, 0, _THIS_IP_);
> if (!__i915_active_fence_set(&ref->excl, f))
> atomic_inc(&ref->count);
> - mutex_release(&ref->mutex.dep_map, 0, _THIS_IP_);
> }
>
> bool i915_active_acquire_if_busy(struct i915_active *ref)
> @@ -615,10 +623,6 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref,
> goto unwind;
> }
>
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - node->base.lock =
> - &engine->kernel_context->timeline->mutex;
> -#endif
> RCU_INIT_POINTER(node->base.fence, NULL);
> node->base.cb.func = node_retire;
> node->timeline = idx;
> @@ -639,6 +643,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref,
> node->base.cb.node.prev = (void *)engine;
> atomic_inc(&ref->count);
> }
> + GEM_BUG_ON(rcu_access_pointer(node->base.fence) != ERR_PTR(-EAGAIN));
>
> GEM_BUG_ON(barrier_to_engine(node) != engine);
> llist_add(barrier_to_ll(node), &ref->preallocated_barriers);
> @@ -702,6 +707,11 @@ void i915_active_acquire_barrier(struct i915_active *ref)
> }
> }
>
> +static struct dma_fence **ll_to_fence_slot(struct llist_node *node)
> +{
> + return __active_fence_slot(&barrier_from_ll(node)->base);
> +}
> +
> void i915_request_add_active_barriers(struct i915_request *rq)
> {
> struct intel_engine_cs *engine = rq->engine;
> @@ -721,19 +731,13 @@ void i915_request_add_active_barriers(struct i915_request *rq)
> */
> spin_lock_irqsave(&rq->lock, flags);
> llist_for_each_safe(node, next, node) {
> - RCU_INIT_POINTER(barrier_from_ll(node)->base.fence, &rq->fence);
> - smp_wmb(); /* serialise with reuse_idle_barrier */
> + /* serialise with reuse_idle_barrier */
> + smp_store_mb(*ll_to_fence_slot(node), &rq->fence);
> list_add_tail((struct list_head *)node, &rq->fence.cb_list);
> }
> spin_unlock_irqrestore(&rq->lock, flags);
> }
>
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> -#define active_is_held(active) lockdep_is_held((active)->lock)
> -#else
> -#define active_is_held(active) true
> -#endif
> -
> /*
> * __i915_active_fence_set: Update the last active fence along its timeline
> * @active: the active tracker
> @@ -744,7 +748,7 @@ void i915_request_add_active_barriers(struct i915_request *rq)
> * fence onto this one. Returns the previous fence (if not already completed),
> * which the caller must ensure is executed before the new fence. To ensure
> * that the order of fences within the timeline of the i915_active_fence is
> - * maintained, it must be locked by the caller.
> + * understood, it should be locked by the caller.
> */
> struct dma_fence *
> __i915_active_fence_set(struct i915_active_fence *active,
> @@ -753,34 +757,41 @@ __i915_active_fence_set(struct i915_active_fence *active,
> struct dma_fence *prev;
> unsigned long flags;
>
> - /* NB: must be serialised by an outer timeline mutex (active->lock) */
> - spin_lock_irqsave(fence->lock, flags);
> + if (fence == rcu_access_pointer(active->fence))
> + return fence;
> +
> GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags));
>
> - prev = rcu_dereference_protected(active->fence, active_is_held(active));
> + /*
> + * Consider that we have two threads arriving (A and B), with
> + * C already resident as the active->fence.
> + *
> + * A does the xchg first, and so it sees C or NULL depending
> + * on the timing of the interrupt handler. If it is NULL, the
> + * previous fence must have been signaled and we know that
> + * we are first on the timeline. If it is still present,
> + * we acquire the lock on that fence and serialise with the interrupt
> + * handler, in the process removing it from any future interrupt
> + * callback. A will then wait on C before executing (if present).
> + *
> + * As B is second, it sees A as the previous fence and so waits for
> + * it to complete its transition and takes over the occupancy for
> + * itself -- remembering that it needs to wait on A before executing.
> + *
> + * Note the strong ordering of the timeline also provides consistent
> + * nesting rules for the fence->lock; the outer lock is always the
> + * older lock.
> + */
> + spin_lock_irqsave(fence->lock, flags);
> + prev = xchg(__active_fence_slot(active), fence);
> if (prev) {
> GEM_BUG_ON(prev == fence);
> spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING);
> __list_del_entry(&active->cb.node);
> spin_unlock(prev->lock); /* serialise with prev->cb_list */
> -
> - /*
> - * active->fence is reset by the callback from inside
> - * interrupt context. We need to serialise our list
> - * manipulation with the fence->lock to prevent the prev
> - * being lost inside an interrupt (it can't be replaced as
> - * no other caller is allowed to enter __i915_active_fence_set
> - * as we hold the timeline lock). After serialising with
> - * the callback, we need to double check which ran first,
> - * our list_del() [decoupling prev from the callback] or
> - * the callback...
> - */
> - prev = rcu_access_pointer(active->fence);
> }
> -
> - rcu_assign_pointer(active->fence, fence);
> + GEM_BUG_ON(rcu_access_pointer(active->fence) != fence);
> list_add_tail(&active->cb.node, &fence->cb_list);
> -
> spin_unlock_irqrestore(fence->lock, flags);
>
> return prev;
> @@ -792,10 +803,6 @@ int i915_active_fence_set(struct i915_active_fence *active,
> struct dma_fence *fence;
> int err = 0;
>
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - lockdep_assert_held(active->lock);
> -#endif
> -
> /* Must maintain timeline ordering wrt previous active requests */
> rcu_read_lock();
> fence = __i915_active_fence_set(active, &rq->fence);
> @@ -812,7 +819,7 @@ int i915_active_fence_set(struct i915_active_fence *active,
>
> void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb)
> {
> - i915_active_fence_cb(fence, cb);
> + active_fence_cb(fence, cb);
> }
>
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index 5dd62323b92a..3208cc2e8c1a 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -61,19 +61,15 @@ void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb);
> */
> static inline void
> __i915_active_fence_init(struct i915_active_fence *active,
> - struct mutex *lock,
> void *fence,
> dma_fence_func_t fn)
> {
> RCU_INIT_POINTER(active->fence, fence);
> active->cb.func = fn ?: i915_active_noop;
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - active->lock = lock;
> -#endif
> }
>
> -#define INIT_ACTIVE_FENCE(A, LOCK) \
> - __i915_active_fence_init((A), (LOCK), NULL, NULL)
> +#define INIT_ACTIVE_FENCE(A) \
> + __i915_active_fence_init((A), NULL, NULL)
>
> struct dma_fence *
> __i915_active_fence_set(struct i915_active_fence *active,
> @@ -127,15 +123,6 @@ i915_active_fence_isset(const struct i915_active_fence *active)
> return rcu_access_pointer(active->fence);
> }
>
> -static inline void
> -i915_active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> -{
> - struct i915_active_fence *active =
> - container_of(cb, typeof(*active), cb);
> -
> - RCU_INIT_POINTER(active->fence, NULL);
> -}
> -
> /*
> * GPU activity tracking
> *
> diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
> index 96aed0ee700a..6360c3e4b765 100644
> --- a/drivers/gpu/drm/i915/i915_active_types.h
> +++ b/drivers/gpu/drm/i915/i915_active_types.h
> @@ -20,21 +20,6 @@
> struct i915_active_fence {
> struct dma_fence __rcu *fence;
> struct dma_fence_cb cb;
> -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> - /*
> - * Incorporeal!
> - *
> - * Updates to the i915_active_request must be serialised under a lock
> - * to ensure that the timeline is ordered. Normally, this is the
> - * timeline->mutex, but another mutex may be used so long as it is
> - * done so consistently.
> - *
> - * For lockdep tracking of the above, we store the lock we intend
> - * to always use for updates of this i915_active_request during
> - * construction and assert that is held on every update.
> - */
> - struct mutex *lock;
> -#endif
> };
>
> struct active_node;
>
I think I understand it: xchg() guarantees that only one thread (or the
irq handler) can consume active->fence, and after that it's easy.
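
For the archives, roughly the pattern as I read it, modelled in plain
userspace C with GCC builtins standing in for the kernel's xchg()/cmpxchg().
The names and the test harness are mine, this is a simplified sketch and
not the driver code:

    #include <stdio.h>
    #include <stddef.h>

    /* Stand-in for struct i915_active_fence: just the tracked pointer. */
    struct slot { void *fence; };

    /*
     * Writer side (cf. __i915_active_fence_set): atomically publish the
     * new fence and learn which fence previously occupied the slot, so
     * the caller knows what to wait on.  Because the exchange is atomic,
     * concurrent writers see each other in order and the chain of fences
     * along the timeline is preserved.
     */
    static void *slot_set(struct slot *s, void *fence)
    {
            return __atomic_exchange_n(&s->fence, fence, __ATOMIC_SEQ_CST);
    }

    /*
     * Completion side (cf. active_fence_cb from the interrupt handler):
     * clear the slot only if it still holds the fence that just signaled.
     * If a writer raced in and already replaced it, the cmpxchg fails and
     * the new occupant is left untouched.
     */
    static int slot_retire(struct slot *s, void *fence)
    {
            void *expected = fence;

            return __atomic_compare_exchange_n(&s->fence, &expected, NULL,
                                               0, __ATOMIC_SEQ_CST,
                                               __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
            struct slot s = { NULL };
            int a, b;

            void *prev = slot_set(&s, &a);  /* first insert: nothing to wait on */
            printf("prev=%p (expect NULL)\n", prev);

            prev = slot_set(&s, &b);        /* second insert must wait on &a */
            printf("prev=%p (expect %p)\n", prev, (void *)&a);

            /* &a signals late: the cmpxchg fails, &b is not overwritten */
            printf("retire(&a)=%d (expect 0)\n", slot_retire(&s, &a));
            printf("retire(&b)=%d (expect 1)\n", slot_retire(&s, &b));
            return 0;
    }

The "retire only if the cmpxchg wins" part is what the
if (active_fence_cb(...)) guards in node_retire()/excl_retire() above do.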
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko