[Intel-gfx] [PATCH 1/2] drm/i915: signal first fence from irq handler if complete

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Wed Feb 22 16:53:35 UTC 2017


On 17/02/2017 15:51, Chris Wilson wrote:
> As execlists and other non-semaphore multi-engine devices coordinate
> between engines using interrupts, we can shave off a few tens of
> microseconds of scheduling latency by doing the fence signaling from the
> interrupt as opposed to a RT kthread. (Realistically the delay adds
> about 1% to an individual cross-engine workload.) We only signal the
> first fence in order to limit the amount of work we move into the
> interrupt handler. We also have to remember that our breadcrumbs may be
> unordered with respect to the interrupt and so we still require the
> waiter process to perform some heavyweight coherency fixups, as well as
> traversing the tree of waiters.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h          | 19 +++++++++-------
>  drivers/gpu/drm/i915/i915_gem_request.c  |  1 +
>  drivers/gpu/drm/i915/i915_gem_request.h  |  2 ++
>  drivers/gpu/drm/i915/i915_irq.c          | 38 ++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_breadcrumbs.c | 10 +--------
>  drivers/gpu/drm/i915/intel_ringbuffer.h  | 30 +++++++++----------------
>  6 files changed, 62 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5005922f267b..2592a15d7727 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -4005,7 +4005,10 @@ static inline bool
>  __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  {
>  	struct intel_engine_cs *engine = req->engine;
> -	u32 seqno = i915_gem_request_global_seqno(req);
> +	u32 seqno;
> +
> +	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &req->fence.flags))
> +		return true;
>
>  	/* The request was dequeued before we were awoken. We check after
>  	 * inspecting the hw to confirm that this was the same request
> @@ -4013,6 +4016,7 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  	 * the request execution are sufficient to ensure that a check
>  	 * after reading the value from hw matches this request.
>  	 */
> +	seqno = i915_gem_request_global_seqno(req);
>  	if (!seqno)
>  		return false;
>
> @@ -4034,9 +4038,8 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  	 * is woken.
>  	 */
>  	if (engine->irq_seqno_barrier &&
> -	    rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current &&
>  	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
> -		struct task_struct *tsk;
> +		unsigned long flags;
>
>  		/* The ordering of irq_posted versus applying the barrier
>  		 * is crucial. The clearing of the current irq_posted must
> @@ -4058,17 +4061,17 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  		 * the seqno before we believe it coherent since they see
>  		 * irq_posted == false but we are still running).
>  		 */
> -		rcu_read_lock();
> -		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
> -		if (tsk && tsk != current)
> +		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
> +		if (engine->breadcrumbs.first_wait &&
> +		    engine->breadcrumbs.first_wait->tsk != current)
>  			/* Note that if the bottom-half is changed as we
>  			 * are sending the wake-up, the new bottom-half will
>  			 * be woken by whomever made the change. We only have
>  			 * to worry about when we steal the irq-posted for
>  			 * ourself.
>  			 */
> -			wake_up_process(tsk);
> -		rcu_read_unlock();
> +			wake_up_process(engine->breadcrumbs.first_wait->tsk);
> +		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);

Worth caching &engine->breadcrumbs in a local variable here, maybe?
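Untested sketch, just the hunk above with the local added:

	struct intel_breadcrumbs *b = &engine->breadcrumbs;

	spin_lock_irqsave(&b->lock, flags);
	/* Wake the bottom-half unless we stole the irq-posted for ourself */
	if (b->first_wait && b->first_wait->tsk != current)
		wake_up_process(b->first_wait->tsk);
	spin_unlock_irqrestore(&b->lock, flags);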

>
>  		if (__i915_gem_request_completed(req, seqno))
>  			return true;
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index e22eacec022d..2e7bdb0cf069 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -1083,6 +1083,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
>  	}
>
>  	wait.tsk = current;
> +	wait.request = req;

I guess this was in the preemption prep tree already. If you decide to
keep the intel_wait_init helper, it could take req and set this for you.
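Something along these lines, maybe (just a sketch, ignoring however the
seqno ends up being assigned):

	static inline void intel_wait_init(struct intel_wait *wait,
					   struct drm_i915_gem_request *rq)
	{
		/* Record both the waiting task and the request it waits on */
		wait->tsk = current;
		wait->request = rq;
	}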

>
>  restart:
>  	do {
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 5f73d8c0a38a..0efee879df23 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -32,10 +32,12 @@
>
>  struct drm_file;
>  struct drm_i915_gem_object;
> +struct drm_i915_gem_request;
>
>  struct intel_wait {
>  	struct rb_node node;
>  	struct task_struct *tsk;
> +	struct drm_i915_gem_request *request;
>  	u32 seqno;
>  };
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 57fa1bf78a85..0c370c687c2a 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1033,10 +1033,44 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
>
>  static void notify_ring(struct intel_engine_cs *engine)
>  {
> +	struct drm_i915_gem_request *rq = NULL;
> +	struct intel_wait *wait;
> +
> +	if (!intel_engine_has_waiter(engine))
> +		return;
> +
> +	trace_i915_gem_request_notify(engine);
>  	atomic_inc(&engine->irq_count);
>  	set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
> -	if (intel_engine_wakeup(engine))
> -		trace_i915_gem_request_notify(engine);
> +
> +	rcu_read_lock();

Not sure this is required from an irq handler. But I don't think it
harms either, so maybe it has some value as documentation.

> +
> +	spin_lock(&engine->breadcrumbs.lock);
> +	wait = engine->breadcrumbs.first_wait;
> +	if (wait) {
> +		/* We use a callback from the dma-fence to submit
> +		 * requests after waiting on our own requests. To
> +		 * ensure minimum delay in queuing the next request to
> +		 * hardware, signal the fence now rather than wait for
> +		 * the signaler to be woken up. We still wake up the
> +		 * waiter in order to handle the irq-seqno coherency
> +		 * issues (we may receive the interrupt before the
> +		 * seqno is written, see __i915_request_irq_complete())
> +		 * and to handle coalescing of multiple seqno updates
> +		 * and many waiters.
> +		 */
> +		if (i915_seqno_passed(intel_engine_get_seqno(engine),
> +				      wait->seqno))
> +			rq = wait->request;
> +
> +		wake_up_process(wait->tsk);
> +	}
> +	spin_unlock(&engine->breadcrumbs.lock);
> +
> +	if (rq)
> +		dma_fence_signal(&rq->fence);
> +
> +	rcu_read_unlock();
>  }
>
>  static void vlv_c0_read(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 3a080c7345d5..860372653a59 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -276,7 +276,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  	}
>  	rb_link_node(&wait->node, parent, p);
>  	rb_insert_color(&wait->node, &b->waiters);
> -	GEM_BUG_ON(!first && !rcu_access_pointer(b->irq_seqno_bh));
>
>  	if (completed) {
>  		struct rb_node *next = rb_next(completed);
> @@ -285,7 +284,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  		if (next && next != &wait->node) {
>  			GEM_BUG_ON(first);
>  			b->first_wait = to_wait(next);
> -			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
>  			/* As there is a delay between reading the current
>  			 * seqno, processing the completed tasks and selecting
>  			 * the next waiter, we may have missed the interrupt
> @@ -312,7 +310,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  	if (first) {
>  		GEM_BUG_ON(rb_first(&b->waiters) != &wait->node);
>  		b->first_wait = wait;
> -		rcu_assign_pointer(b->irq_seqno_bh, wait->tsk);
>  		/* After assigning ourselves as the new bottom-half, we must
>  		 * perform a cursory check to prevent a missed interrupt.
>  		 * Either we miss the interrupt whilst programming the hardware,
> @@ -323,7 +320,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  		 */
>  		__intel_breadcrumbs_enable_irq(b);
>  	}
> -	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh));
>  	GEM_BUG_ON(!b->first_wait);
>  	GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node);
>
> @@ -371,8 +367,6 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
>  		const int priority = wakeup_priority(b, wait->tsk);
>  		struct rb_node *next;
>
> -		GEM_BUG_ON(rcu_access_pointer(b->irq_seqno_bh) != wait->tsk);
> -
>  		/* We are the current bottom-half. Find the next candidate,
>  		 * the first waiter in the queue on the remaining oldest
>  		 * request. As multiple seqnos may complete in the time it
> @@ -414,13 +408,11 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
>  			 * exception rather than a seqno completion.
>  			 */
>  			b->first_wait = to_wait(next);
> -			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
>  			if (b->first_wait->seqno != wait->seqno)
>  				__intel_breadcrumbs_enable_irq(b);
>  			wake_up_process(b->first_wait->tsk);
>  		} else {
>  			b->first_wait = NULL;
> -			rcu_assign_pointer(b->irq_seqno_bh, NULL);
>  			__intel_breadcrumbs_disable_irq(b);
>  		}
>  	} else {
> @@ -434,7 +426,6 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
>  	GEM_BUG_ON(b->first_wait == wait);
>  	GEM_BUG_ON(rb_first(&b->waiters) !=
>  		   (b->first_wait ? &b->first_wait->node : NULL));
> -	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh) ^ RB_EMPTY_ROOT(&b->waiters));
>  }
>
>  void intel_engine_remove_wait(struct intel_engine_cs *engine,
> @@ -598,6 +589,7 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request)
>  		return;
>
>  	request->signaling.wait.tsk = b->signaler;
> +	request->signaling.wait.request = request;
>  	request->signaling.wait.seqno = seqno;
>  	i915_gem_request_get(request);
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 45d2c2fa946e..1271b6ebdd4d 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -235,8 +235,6 @@ struct intel_engine_cs {
>  	 * the overhead of waking that client is much preferred.
>  	 */
>  	struct intel_breadcrumbs {
> -		struct task_struct __rcu *irq_seqno_bh; /* bh for interrupts */
> -
>  		spinlock_t lock; /* protects the lists of requests; irqsafe */
>  		struct rb_root waiters; /* sorted by retirement, priority */
>  		struct rb_root signals; /* sorted by retirement */
> @@ -602,31 +600,25 @@ void intel_engine_cancel_signaling(struct drm_i915_gem_request *request);
>
>  static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine)
>  {
> -	return rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh);
> +	return READ_ONCE(engine->breadcrumbs.first_wait);
>  }
>
> -static inline bool intel_engine_wakeup(const struct intel_engine_cs *engine)
> +static inline bool intel_engine_wakeup(struct intel_engine_cs *engine)
>  {
> -	bool wakeup = false;
> +	struct intel_wait *wait = NULL;
> +	unsigned long flags;
>
> -	/* Note that for this not to dangerously chase a dangling pointer,
> -	 * we must hold the rcu_read_lock here.
> -	 *
> -	 * Also note that tsk is likely to be in !TASK_RUNNING state so an
> -	 * early test for tsk->state != TASK_RUNNING before wake_up_process()
> -	 * is unlikely to be beneficial.
> -	 */
>  	if (intel_engine_has_waiter(engine)) {
> -		struct task_struct *tsk;
> +		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
> +
> +		wait = engine->breadcrumbs.first_wait;
> +		if (wait)
> +			wake_up_process(wait->tsk);
>
> -		rcu_read_lock();
> -		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
> -		if (tsk)
> -			wakeup = wake_up_process(tsk);
> -		rcu_read_unlock();
> +		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
>  	}
>
> -	return wakeup;
> +	return wait;
>  }
>
>  void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine);
>

Looks OK.

Last time I tested it, it was a definite latency improvement, but also a
slight throughput regression on crazy microbenchmarks like gem_latency,
IIRC. But I think it is overall a good thing for more realistic workloads.

I leave the ordering w.r.t. the preemption prep series to you. I can
apply my r-b at that point; not sure if it makes sense now?

Regards,

Tvrtko


