[Intel-gfx] [PATCH 10/10] drm/i915/breadcrumbs: Reduce signaler rbtree to a sorted list

Chris Wilson chris at chris-wilson.co.uk
Mon Jan 15 21:24:55 UTC 2018


The goal here is to try and reduce the latency of signaling additional
requests following the wakeup from interrupt by reducing the list of
to-be-signaled requests from an rbtree to a sorted linked list. The
original choice of using an rbtree was to facilitate random insertions
of requests into the signaler while maintaining a sorted list. However,
if we assume that most new requests are added when they are submitted,
we see those new requests in execution order, making an insertion sort
fast and the reduction in overhead of each signaler iteration
significant.

Since commit 56299fb7d904 ("drm/i915: Signal first fence from irq handler
if complete"), we signal most fences directly from notify_ring() in the
interrupt handler greatly reducing the amount of work that actually
needs to be done by the signaler kthread. All the thread is then
required to do is operate as the bottom-half, cleaning up after the
interrupt handler and preparing the next waiter. This includes signaling
all later completed fences in a saturated system, but on a mostly idle
system we only have to rebuild the wait rbtree in time for the next
interrupt. With this de-emphasis of the signaler's role, we want to
rejig its data structures to reduce the amount of work we require to
both set up the signal tree and maintain it on every interrupt.

References: 56299fb7d904 ("drm/i915: Signal first fence from irq handler if complete")
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.h  |   2 +-
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 281 +++++++++++++------------------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |   4 +-
 3 files changed, 117 insertions(+), 170 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 6c607f8dbf92..a2dd3d71c0f8 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -44,8 +44,8 @@ struct intel_wait {
 };
 
 struct intel_signal_node {
-	struct rb_node node;
 	struct intel_wait wait;
+	struct list_head link;
 };
 
 struct i915_dependency {
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 7c82cfe23922..8a69bd69070f 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -350,7 +350,8 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
 	lockdep_assert_held(&b->rb_lock);
 	GEM_BUG_ON(b->irq_wait == wait);
 
-	/* This request is completed, so remove it from the tree, mark it as
+	/*
+	 * This request is completed, so remove it from the tree, mark it as
 	 * complete, and *then* wake up the associated task. N.B. when the
 	 * task wakes up, it will find the empty rb_node, discern that it
 	 * has already been removed from the tree and skip the serialisation
@@ -361,7 +362,8 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
 	rb_erase(&wait->node, &b->waiters);
 	RB_CLEAR_NODE(&wait->node);
 
-	wake_up_process(wait->tsk); /* implicit smp_wmb() */
+	if (wait->tsk->state != TASK_RUNNING)
+		wake_up_process(wait->tsk); /* implicit smp_wmb() */
 }
 
 static inline void __intel_breadcrumbs_next(struct intel_engine_cs *engine,
@@ -602,36 +604,6 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine,
 	spin_unlock_irq(&b->rb_lock);
 }
 
-static bool signal_valid(const struct drm_i915_gem_request *request)
-{
-	return intel_wait_check_request(&request->signaling.wait, request);
-}
-
-static bool signal_complete(const struct drm_i915_gem_request *request)
-{
-	if (!request)
-		return false;
-
-	/* If another process served as the bottom-half it may have already
-	 * signalled that this wait is already completed.
-	 */
-	if (intel_wait_complete(&request->signaling.wait))
-		return signal_valid(request);
-
-	/* Carefully check if the request is complete, giving time for the
-	 * seqno to be visible or if the GPU hung.
-	 */
-	if (__i915_request_irq_complete(request))
-		return true;
-
-	return false;
-}
-
-static struct drm_i915_gem_request *to_signaler(struct rb_node *rb)
-{
-	return rb_entry(rb, struct drm_i915_gem_request, signaling.node);
-}
-
 static void signaler_set_rtpriority(void)
 {
 	 struct sched_param param = { .sched_priority = 1 };
@@ -639,77 +611,25 @@ static void signaler_set_rtpriority(void)
 	 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
 }
 
-static void __intel_engine_remove_signal(struct intel_engine_cs *engine,
-					 struct drm_i915_gem_request *request)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	lockdep_assert_held(&b->rb_lock);
-
-	/*
-	 * Wake up all other completed waiters and select the
-	 * next bottom-half for the next user interrupt.
-	 */
-	__intel_engine_remove_wait(engine, &request->signaling.wait);
-
-	/*
-	 * Find the next oldest signal. Note that as we have
-	 * not been holding the lock, another client may
-	 * have installed an even older signal than the one
-	 * we just completed - so double check we are still
-	 * the oldest before picking the next one.
-	 */
-	if (request->signaling.wait.seqno) {
-		if (request == rcu_access_pointer(b->first_signal)) {
-			struct rb_node *rb = rb_next(&request->signaling.node);
-			rcu_assign_pointer(b->first_signal,
-					   rb ? to_signaler(rb) : NULL);
-		}
-
-		rb_erase(&request->signaling.node, &b->signals);
-		request->signaling.wait.seqno = 0;
-	}
-}
-
-static struct drm_i915_gem_request *first_signal(struct intel_breadcrumbs *b)
-{
-	/*
-	 * See the big warnings for i915_gem_active_get_rcu() and similarly
-	 * for dma_fence_get_rcu_safe() that explain the intricacies involved
-	 * here with defeating CPU/compiler speculation and enforcing
-	 * the required memory barriers.
-	 */
-	do {
-		struct drm_i915_gem_request *request;
-
-		request = rcu_dereference(b->first_signal);
-		if (request)
-			request = i915_gem_request_get_rcu(request);
-
-		barrier();
-
-		if (!request || request == rcu_access_pointer(b->first_signal))
-			return rcu_pointer_handoff(request);
-
-		i915_gem_request_put(request);
-	} while (1);
-}
-
 static int intel_breadcrumbs_signaler(void *arg)
 {
 	struct intel_engine_cs *engine = arg;
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct drm_i915_gem_request *request;
+	struct drm_i915_gem_request *rq, *n;
 
 	/* Install ourselves with high priority to reduce signalling latency */
 	signaler_set_rtpriority();
 
 	do {
-		bool do_schedule = true;
+		LIST_HEAD(list);
+		u32 seqno;
 
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&b->signals))
+			goto sleep;
 
-		/* We are either woken up by the interrupt bottom-half,
+		/*
+		 * We are either woken up by the interrupt bottom-half,
 		 * or by a client adding a new signaller. In both cases,
 		 * the GPU seqno may have advanced beyond our oldest signal.
 		 * If it has, propagate the signal, remove the waiter and
@@ -717,35 +637,60 @@ static int intel_breadcrumbs_signaler(void *arg)
 		 * need to wait for a new interrupt from the GPU or for
 		 * a new client.
 		 */
-		rcu_read_lock();
-		request = first_signal(b);
-		rcu_read_unlock();
-		if (signal_complete(request)) {
-			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-				      &request->fence.flags)) {
-				local_bh_disable();
-				dma_fence_signal(&request->fence);
-				local_bh_enable(); /* kick start the tasklets */
+		seqno = intel_engine_get_seqno(engine);
+
+		spin_lock_irq(&b->rb_lock);
+		list_for_each_entry_safe(rq, n, &b->signals, signaling.link) {
+			u32 this = rq->signaling.wait.seqno;
+
+			GEM_BUG_ON(!rq->signaling.wait.seqno);
+
+			if (!i915_seqno_passed(seqno, this))
+				break;
+
+			if (this == i915_gem_request_global_seqno(rq)) {
+				__intel_engine_remove_wait(engine,
+							   &rq->signaling.wait);
+
+				rq->signaling.wait.seqno = 0;
+				__list_del_entry(&rq->signaling.link);
+
+				if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+					      &rq->fence.flags)) {
+					list_add_tail(&rq->signaling.link,
+						      &list);
+					i915_gem_request_get(rq);
+				}
 			}
+		}
+		spin_unlock_irq(&b->rb_lock);
 
-			if (request->signaling.wait.seqno) {
-				spin_lock_irq(&b->rb_lock);
-				__intel_engine_remove_signal(engine, request);
-				spin_unlock_irq(&b->rb_lock);
+		if (!list_empty(&list)) {
+			local_bh_disable();
+			list_for_each_entry_safe(rq, n, &list, signaling.link) {
+				dma_fence_signal(&rq->fence);
+				i915_gem_request_put(rq);
 			}
+			local_bh_enable(); /* kick start the tasklets */
+		}
 
-			/* If the engine is saturated we may be continually
-			 * processing completed requests. This angers the
-			 * NMI watchdog if we never let anything else
-			 * have access to the CPU. Let's pretend to be nice
-			 * and relinquish the CPU if we burn through the
-			 * entire RT timeslice!
-			 */
-			do_schedule = need_resched();
+		if (engine->irq_seqno_barrier &&
+		    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB,
+				       &engine->irq_posted)) {
+			engine->irq_seqno_barrier(engine);
+			intel_engine_wakeup(engine);
 		}
-		i915_gem_request_put(request);
 
-		if (unlikely(do_schedule)) {
+		/*
+		 * If the engine is saturated we may be continually
+		 * processing completed requests. This angers the
+		 * NMI watchdog if we never let anything else
+		 * have access to the CPU. Let's pretend to be nice
+		 * and relinquish the CPU if we burn through the
+		 * entire RT timeslice!
+		 */
+		if (list_empty(&list) || need_resched()) {
+sleep:
 			if (kthread_should_park())
 				kthread_parkme();
 
@@ -760,6 +705,32 @@ static int intel_breadcrumbs_signaler(void *arg)
 	return 0;
 }
 
+static void insert_signal(struct intel_breadcrumbs *b,
+			  struct drm_i915_gem_request *request,
+			  const u32 seqno)
+{
+	struct drm_i915_gem_request *iter;
+
+	lockdep_assert_held(&b->rb_lock);
+
+	/*
+	 * The reasonable assumption is that we are called to add signals
+	 * in sequence, as the requests are submitted for execution and
+	 * assigned a global_seqno. This will be the case for the majority
+	 * of internally generated signals (inter-engine signaling).
+	 *
+	 * Out of order waiters triggering random signaling enabling will
+	 * be more problematic, but hopefully rare enough and the list
+	 * small enough that the O(N) insertion sort is not an issue.
+	 */
+
+	list_for_each_entry_reverse(iter, &b->signals, signaling.link)
+		if (i915_seqno_passed(seqno, iter->signaling.wait.seqno))
+			break;
+
+	list_add(&request->signaling.link, &iter->signaling.link);
+}
+
 void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
 				   bool wakeup)
 {
@@ -767,7 +738,8 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 	u32 seqno;
 
-	/* Note that we may be called from an interrupt handler on another
+	/*
+	 * Note that we may be called from an interrupt handler on another
 	 * device (e.g. nouveau signaling a fence completion causing us
 	 * to submit a request, and so enable signaling). As such,
 	 * we need to make sure that all other users of b->rb_lock protect
@@ -779,17 +751,16 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
 	lockdep_assert_held(&request->lock);
 
 	seqno = i915_gem_request_global_seqno(request);
-	if (!seqno)
+	if (!seqno) /* will be enabled later upon execution */
 		return;
 
-	spin_lock(&b->rb_lock);
-
 	GEM_BUG_ON(request->signaling.wait.seqno);
 	request->signaling.wait.tsk = b->signaler;
 	request->signaling.wait.request = request;
 	request->signaling.wait.seqno = seqno;
 
-	/* First add ourselves into the list of waiters, but register our
+	/*
+	 * Add ourselves into the list of waiters, but registering our
 	 * bottom-half as the signaller thread. As per usual, only the oldest
 	 * waiter (not just signaller) is tasked as the bottom-half waking
 	 * up all completed waiters after the user interrupt.
@@ -797,39 +768,9 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
 	 * If we are the oldest waiter, enable the irq (after which we
 	 * must double check that the seqno did not complete).
 	 */
+	spin_lock(&b->rb_lock);
+	insert_signal(b, request, seqno);
 	wakeup &= __intel_engine_add_wait(engine, &request->signaling.wait);
-
-	if (!__i915_gem_request_completed(request, seqno)) {
-		struct rb_node *parent, **p;
-		bool first;
-
-		/* Now insert ourselves into the retirement ordered list of
-		 * signals on this engine. We track the oldest seqno as that
-		 * will be the first signal to complete.
-		 */
-		parent = NULL;
-		first = true;
-		p = &b->signals.rb_node;
-		while (*p) {
-			parent = *p;
-			if (i915_seqno_passed(seqno,
-					      to_signaler(parent)->signaling.wait.seqno)) {
-				p = &parent->rb_right;
-				first = false;
-			} else {
-				p = &parent->rb_left;
-			}
-		}
-		rb_link_node(&request->signaling.node, parent, p);
-		rb_insert_color(&request->signaling.node, &b->signals);
-		if (first)
-			rcu_assign_pointer(b->first_signal, request);
-	} else {
-		__intel_engine_remove_wait(engine, &request->signaling.wait);
-		request->signaling.wait.seqno = 0;
-		wakeup = false;
-	}
-
 	spin_unlock(&b->rb_lock);
 
 	if (wakeup)
@@ -838,17 +779,20 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
 
 void intel_engine_cancel_signaling(struct drm_i915_gem_request *request)
 {
+	struct intel_engine_cs *engine = request->engine;
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&request->lock);
 
-	if (request->signaling.wait.seqno) {
-		struct intel_engine_cs *engine = request->engine;
-		struct intel_breadcrumbs *b = &engine->breadcrumbs;
+	if (!request->signaling.wait.seqno)
+		return;
 
-		spin_lock(&b->rb_lock);
-		__intel_engine_remove_signal(engine, request);
-		spin_unlock(&b->rb_lock);
-	}
+	spin_lock(&b->rb_lock);
+	__intel_engine_remove_wait(engine, &request->signaling.wait);
+	if (fetch_and_zero(&request->signaling.wait.seqno))
+		__list_del_entry(&request->signaling.link);
+	spin_unlock(&b->rb_lock);
 }
 
 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -862,6 +806,8 @@ int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
 	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
 	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
 
+	INIT_LIST_HEAD(&b->signals);
+
 	/* Spawn a thread to provide a common bottom-half for all signals.
 	 * As this is an asynchronous interface we cannot steal the current
 	 * task for handling the bottom-half to the user interrupt, therefore
@@ -921,8 +867,7 @@ void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 	/* The engines should be idle and all requests accounted for! */
 	WARN_ON(READ_ONCE(b->irq_wait));
 	WARN_ON(!RB_EMPTY_ROOT(&b->waiters));
-	WARN_ON(rcu_access_pointer(b->first_signal));
-	WARN_ON(!RB_EMPTY_ROOT(&b->signals));
+	WARN_ON(!list_empty(&b->signals));
 
 	if (!IS_ERR_OR_NULL(b->signaler))
 		kthread_stop(b->signaler);
@@ -935,20 +880,22 @@ bool intel_breadcrumbs_busy(struct intel_engine_cs *engine)
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 	bool busy = false;
 
-	spin_lock_irq(&b->rb_lock);
-
 	if (b->irq_wait) {
-		wake_up_process(b->irq_wait->tsk);
-		busy = true;
+		spin_lock_irq(&b->irq_lock);
+
+		if (b->irq_wait) {
+			wake_up_process(b->irq_wait->tsk);
+			busy = true;
+		}
+
+		spin_unlock_irq(&b->irq_lock);
 	}
 
-	if (rcu_access_pointer(b->first_signal)) {
+	if (!busy && !list_empty(&b->signals)) {
 		wake_up_process(b->signaler);
 		busy = true;
 	}
 
-	spin_unlock_irq(&b->rb_lock);
-
 	return busy;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index cea2092d25d9..fedf298de036 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -331,9 +331,9 @@ struct intel_engine_cs {
 
 		spinlock_t rb_lock; /* protects the rb and wraps irq_lock */
 		struct rb_root waiters; /* sorted by retirement, priority */
-		struct rb_root signals; /* sorted by retirement */
+		struct list_head signals; /* sorted by retirement */
 		struct task_struct *signaler; /* used for fence signalling */
-		struct drm_i915_gem_request __rcu *first_signal;
+
 		struct timer_list fake_irq; /* used after a missed interrupt */
 		struct timer_list hangcheck; /* detect missed interrupts */
 
-- 
2.15.1



More information about the Intel-gfx mailing list