[Intel-gfx] [PATCH] drm/i915/breadcrumbs: Drop request reference for the signaler thread

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Wed Jan 31 17:18:55 UTC 2018


On 22/01/2018 15:41, Chris Wilson wrote:
> If we remember to cancel the signaler on a request when retiring it
> (after we know that the request has been signaled), we do not need to
> carry an additional request in the signaler itself. This prevents an
> issue whereby the signaler threads may be delayed and hold on to
> thousands of request references, causing severe memory fragmentation and
> premature oom (most noticeable on 32b snb due to the limited GFP_KERNEL
> and frequent use of inter-engine fences).
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_request.c  |   6 +-
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 149 +++++++++++++++++--------------
>   2 files changed, 85 insertions(+), 70 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index a0f451b4a4e8..e4eca6ba0d05 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -441,7 +441,10 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>   	spin_lock_irq(&request->lock);
>   	if (request->waitboost)
>   		atomic_dec(&request->i915->gt_pm.rps.num_waiters);
> -	dma_fence_signal_locked(&request->fence);
> +	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> +		dma_fence_signal_locked(&request->fence);
> +	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> +		intel_engine_cancel_signaling(request);
>   	spin_unlock_irq(&request->lock);
>   
>   	i915_priotree_fini(request->i915, &request->priotree);
> @@ -738,6 +741,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>   
>   	/* No zalloc, must clear what we need by hand */
>   	req->global_seqno = 0;
> +	req->signaling.wait.seqno = 0;
>   	req->file_priv = NULL;
>   	req->batch = NULL;
>   	req->capture_list = NULL;
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 86acac010bb8..e3667dc1e96d 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -235,7 +235,7 @@ void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
>   	struct intel_wait *wait, *n;
>   
>   	if (!b->irq_armed)
> -		goto wakeup_signaler;
> +		return;
>   
>   	/*
>   	 * We only disarm the irq when we are idle (all requests completed),
> @@ -260,14 +260,6 @@ void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
>   	b->waiters = RB_ROOT;
>   
>   	spin_unlock_irq(&b->rb_lock);
> -
> -	/*
> -	 * The signaling thread may be asleep holding a reference to a request,
> -	 * that had its signaling cancelled prior to being preempted. We need
> -	 * to kick the signaler, just in case, to release any such reference.
> -	 */
> -wakeup_signaler:
> -	wake_up_process(b->signaler);
>   }
>   
>   static bool use_fake_irq(const struct intel_breadcrumbs *b)
> @@ -644,6 +636,62 @@ static void signaler_set_rtpriority(void)
>   	 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
>   }
>   
> +static void __intel_engine_remove_signal(struct intel_engine_cs *engine,
> +					 struct drm_i915_gem_request *request)
> +{
> +	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> +
> +	lockdep_assert_held(&b->rb_lock);
> +
> +	/*
> +	 * Wake up all other completed waiters and select the
> +	 * next bottom-half for the next user interrupt.
> +	 */
> +	__intel_engine_remove_wait(engine, &request->signaling.wait);
> +
> +	/*
> +	 * Find the next oldest signal. Note that as we have
> +	 * not been holding the lock, another client may

Which lock, the request->lock? b->first_signal seems to always be
updated under b->rb_lock, so I am not sure about the
request->signaling.wait.seqno check.

wait.seqno is also updated without the request->lock below, so maybe you 
were talking about the rb_lock after all?

How does wait.seqno become 0 outside the rb_lock? Is it due to the RCU
business this patch adds? Like the request being retired before the
signaler runs?
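
Trying to map it for myself, my reading of who touches
signaling.wait.seqno after this patch (correct me if I misread the
diff):

  writers, under b->rb_lock:
    intel_engine_enable_signaling()   wait.seqno = seqno, or 0 on the
                                      already-completed path
    __intel_engine_remove_signal()    wait.seqno = 0

  lockless readers:
    intel_breadcrumbs_signaler()      if (request->signaling.wait.seqno)
    intel_engine_cancel_signaling()   if (request->signaling.wait.seqno)
                                      (under request->lock, not rb_lock)

So the readers peek without the lock that serialises the writers, hence
the questions above.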

> +	 * have installed an even older signal than the one
> +	 * we just completed - so double check we are still
> +	 * the oldest before picking the next one.
> +	 */
> +	if (request->signaling.wait.seqno) {
> +		if (request == rcu_access_pointer(b->first_signal)) {
> +			struct rb_node *rb = rb_next(&request->signaling.node);
> +			rcu_assign_pointer(b->first_signal,
> +					   rb ? to_signaler(rb) : NULL);
> +		}
> +
> +		rb_erase(&request->signaling.node, &b->signals);
> +		request->signaling.wait.seqno = 0;
> +	}
> +}
> +
> +static struct drm_i915_gem_request *first_signal(struct intel_breadcrumbs *b)
> +{
> +	/*
> +	 * See the big warnings for i915_gem_active_get_rcu() and similarly
> +	 * for dma_fence_get_rcu_safe() that explain the intricacies involved
> +	 * here with defeating CPU/compiler speculation and enforcing
> +	 * the required memory barriers.
> +	 */
> +	do {
> +		struct drm_i915_gem_request *request;
> +
> +		request = rcu_dereference(b->first_signal);
> +		if (request)
> +			request = i915_gem_request_get_rcu(request);
> +
> +		barrier();
> +
> +		if (!request || request == rcu_access_pointer(b->first_signal))
> +			return rcu_pointer_handoff(request);
> +
> +		i915_gem_request_put(request);
> +	} while (1);
> +}
> +
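
For my own reference, the generic shape of the pattern this helper
follows, with a hypothetical kref-based object (just a sketch, not
i915 code):

  struct obj { struct kref ref; };
  static void obj_free(struct kref *ref);

  /* caller must hold rcu_read_lock() */
  static struct obj *obj_get_rcu(struct obj __rcu **slot)
  {
          do {
                  struct obj *o;

                  o = rcu_dereference(*slot);
                  if (o && !kref_get_unless_zero(&o->ref))
                          return NULL; /* mid-free; caller re-peeks later */

                  barrier(); /* don't let the re-read below be hoisted */

                  /* hand over only if *slot still points at what we got */
                  if (!o || o == rcu_access_pointer(*slot))
                          return rcu_pointer_handoff(o);

                  kref_put(&o->ref, obj_free); /* lost a race, try again */
          } while (1);
  }
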
>   static int intel_breadcrumbs_signaler(void *arg)
>   {
>   	struct intel_engine_cs *engine = arg;
> @@ -667,41 +715,21 @@ static int intel_breadcrumbs_signaler(void *arg)
>   		 * a new client.
>   		 */
>   		rcu_read_lock();
> -		request = rcu_dereference(b->first_signal);
> -		if (request)
> -			request = i915_gem_request_get_rcu(request);
> +		request = first_signal(b);

A get_ prefix would be good to signify a get versus a peek. Maybe even
an _rcu suffix.
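
Something like this, perhaps (hypothetical name):

  static struct drm_i915_gem_request *
  get_first_signal_rcu(struct intel_breadcrumbs *b);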

>   		rcu_read_unlock();
>   		if (signal_complete(request)) {
> -			local_bh_disable();
> -			dma_fence_signal(&request->fence);
> -			local_bh_enable(); /* kick start the tasklets */
> -
> -			spin_lock_irq(&b->rb_lock);
> -
> -			/* Wake up all other completed waiters and select the
> -			 * next bottom-half for the next user interrupt.
> -			 */
> -			__intel_engine_remove_wait(engine,
> -						   &request->signaling.wait);
> -
> -			/* Find the next oldest signal. Note that as we have
> -			 * not been holding the lock, another client may
> -			 * have installed an even older signal than the one
> -			 * we just completed - so double check we are still
> -			 * the oldest before picking the next one.
> -			 */
> -			if (request == rcu_access_pointer(b->first_signal)) {
> -				struct rb_node *rb =
> -					rb_next(&request->signaling.node);
> -				rcu_assign_pointer(b->first_signal,
> -						   rb ? to_signaler(rb) : NULL);
> +			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> +				      &request->fence.flags)) {
> +				local_bh_disable();
> +				dma_fence_signal(&request->fence);
> +				local_bh_enable(); /* kick start the tasklets */
>   			}
> -			rb_erase(&request->signaling.node, &b->signals);
> -			RB_CLEAR_NODE(&request->signaling.node);
> -
> -			spin_unlock_irq(&b->rb_lock);
>   
> -			i915_gem_request_put(request);
> +			if (request->signaling.wait.seqno) {
> +				spin_lock_irq(&b->rb_lock);
> +				__intel_engine_remove_signal(engine, request);
> +				spin_unlock_irq(&b->rb_lock);

This is safe against double invocation on a single request (a race
between retire and the signaler) because of the RB_EMPTY_NODE guard in
__intel_engine_remove_wait?
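
Asking because both paths can get past the unlocked seqno check before
either takes the lock, so the loser relies on the re-checks inside
__intel_engine_remove_signal. The interleaving I have in mind (sketch):

  signaler                              retire -> cancel_signaling

  if (request->signaling.wait.seqno)    if (request->signaling.wait.seqno)
  spin_lock_irq(&b->rb_lock)            spin_lock(&b->rb_lock) /* spins */
  __intel_engine_remove_signal()
    __intel_engine_remove_wait()
    rb_erase(); wait.seqno = 0
  spin_unlock_irq(&b->rb_lock)
                                        __intel_engine_remove_signal()
                                          __intel_engine_remove_wait()
                                            /* RB_EMPTY_NODE, no-op? */
                                          wait.seqno == 0, rb_erase skipped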

> +			}
>   
>   			/* If the engine is saturated we may be continually
>   			 * processing completed requests. This angers the
> @@ -712,19 +740,17 @@ static int intel_breadcrumbs_signaler(void *arg)
>   			 */
>   			do_schedule = need_resched();
>   		}
> +		i915_gem_request_put(request);
>   
>   		if (unlikely(do_schedule)) {
>   			if (kthread_should_park())
>   				kthread_parkme();
>   
> -			if (unlikely(kthread_should_stop())) {
> -				i915_gem_request_put(request);
> +			if (unlikely(kthread_should_stop()))
>   				break;
> -			}
>   
>   			schedule();
>   		}
> -		i915_gem_request_put(request);
>   	} while (1);
>   	__set_current_state(TASK_RUNNING);
>   
> @@ -753,12 +779,12 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
>   	if (!seqno)
>   		return;
>   
> +	spin_lock(&b->rb_lock);
> +
> +	GEM_BUG_ON(request->signaling.wait.seqno);
>   	request->signaling.wait.tsk = b->signaler;
>   	request->signaling.wait.request = request;
>   	request->signaling.wait.seqno = seqno;
> -	i915_gem_request_get(request);
> -
> -	spin_lock(&b->rb_lock);
>   
>   	/* First add ourselves into the list of waiters, but register our
>   	 * bottom-half as the signaller thread. As per usual, only the oldest
> @@ -797,7 +823,7 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
>   			rcu_assign_pointer(b->first_signal, request);
>   	} else {
>   		__intel_engine_remove_wait(engine, &request->signaling.wait);
> -		i915_gem_request_put(request);
> +		request->signaling.wait.seqno = 0;
>   		wakeup = false;
>   	}
>   
> @@ -809,32 +835,17 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
>   
>   void intel_engine_cancel_signaling(struct drm_i915_gem_request *request)
>   {
> -	struct intel_engine_cs *engine = request->engine;
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
>   	GEM_BUG_ON(!irqs_disabled());
>   	lockdep_assert_held(&request->lock);
> -	GEM_BUG_ON(!request->signaling.wait.seqno);
>   
> -	spin_lock(&b->rb_lock);
> +	if (request->signaling.wait.seqno) {

Do you need some special annotation like READ_ONCE on wait.seqno every 
time it is used for decisions like this one?
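
i.e. at minimum something like:

  if (READ_ONCE(request->signaling.wait.seqno)) {

here and for the check in the signaler loop, to make the lockless peeks
explicit.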

> +		struct intel_engine_cs *engine = request->engine;
> +		struct intel_breadcrumbs *b = &engine->breadcrumbs;
>   
> -	if (!RB_EMPTY_NODE(&request->signaling.node)) {
> -		if (request == rcu_access_pointer(b->first_signal)) {
> -			struct rb_node *rb =
> -				rb_next(&request->signaling.node);
> -			rcu_assign_pointer(b->first_signal,
> -					   rb ? to_signaler(rb) : NULL);
> -		}
> -		rb_erase(&request->signaling.node, &b->signals);
> -		RB_CLEAR_NODE(&request->signaling.node);
> -		i915_gem_request_put(request);
> +		spin_lock(&b->rb_lock);
> +		__intel_engine_remove_signal(engine, request);
> +		spin_unlock(&b->rb_lock);
>   	}
> -
> -	__intel_engine_remove_wait(engine, &request->signaling.wait);
> -
> -	spin_unlock(&b->rb_lock);
> -
> -	request->signaling.wait.seqno = 0;
>   }
>   
>   int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
> 

Not sure yet, because I dread adding more complex RCU rules. I still
can't understand how submission can overtake interrupt processing and
signaling. I would have guessed the submission path is much more
CPU-heavy and also lower priority.

At minimum I need another pass over it. But anyway, just so you know,
I've started looking at it.

Regards,

Tvrtko