[Intel-gfx] [PATCH 15/32] drm/i915: Slaughter the thundering i915_wait_request herd

Fri Dec 11 03:33:11 PST 2015

One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.

Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.

v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin at intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
Cc: Dave Gordon <david.s.gordon at intel.com>
---
 drivers/gpu/drm/i915/Makefile            |   1 +
 drivers/gpu/drm/i915/i915_debugfs.c      |  19 ++-
 drivers/gpu/drm/i915/i915_drv.h          |   3 +-
 drivers/gpu/drm/i915/i915_gem.c          | 152 ++++++++---------
 drivers/gpu/drm/i915/i915_gpu_error.c    |   2 +-
 drivers/gpu/drm/i915/i915_irq.c          |  14 +-
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 274 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c         |   5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c  |   5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  63 ++++++-
 10 files changed, 436 insertions(+), 102 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 0851de07bd13..d3b9d3618719 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \
 	  i915_gem_userptr.o \
 	  i915_gpu_error.o \
 	  i915_trace_points.o \
+	  intel_breadcrumbs.o \
 	  intel_lrc.o \
 	  intel_mocs.o \
 	  intel_ringbuffer.o \
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d5f66bbdb160..48e574247a30 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -730,10 +730,22 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
 static void i915_ring_seqno_info(struct seq_file *m,
 				 struct intel_engine_cs *ring)
 {
+	struct rb_node *rb;
+
 	if (ring->get_seqno) {
 		seq_printf(m, "Current sequence (%s): %x\n",
 			   ring->name, ring->get_seqno(ring, false));
 	}
+
+	spin_lock(&ring->breadcrumbs.lock);
+	for (rb = rb_first(&ring->breadcrumbs.requests);
+	     rb != NULL;
+	     rb = rb_next(rb)) {
+		struct intel_breadcrumb *b = container_of(rb, typeof(*b), node);
+		seq_printf(m, "Waiting (%s): %s [%d] on %x\n",
+			   ring->name, b->task->comm, b->task->pid, b->seqno);
+	}
+	spin_unlock(&ring->breadcrumbs.lock);
 }
 
 static int i915_gem_seqno_info(struct seq_file *m, void *data)
@@ -1356,8 +1368,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 
 	for_each_ring(ring, dev_priv, i) {
 		seq_printf(m, "%s:\n", ring->name);
-		seq_printf(m, "\tseqno = %x [current %x]\n",
-			   ring->hangcheck.seqno, seqno[i]);
+		seq_printf(m, "\tseqno = %x [current %x], waiters? %d\n",
+			   ring->hangcheck.seqno, seqno[i],
+			   intel_engine_has_waiter(ring));
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)ring->hangcheck.acthd,
 			   (long long)acthd[i]);
@@ -2322,7 +2335,7 @@ static int count_irq_waiters(struct drm_i915_private *i915)
 	int i;
 
 	for_each_ring(ring, i915, i)
-		count += ring->irq_refcount;
+		count += intel_engine_has_waiter(ring);
 
 	return count;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f82e8fb19c9b..830d760aa562 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1382,7 +1382,7 @@ struct i915_gpu_error {
 #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
 
 	/* For missed irq/seqno simulation. */
-	unsigned int test_irq_rings;
+	unsigned long test_irq_rings;
 };
 
 enum modeset_restore {
@@ -2825,7 +2825,6 @@ ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
 	ibx_display_interrupt_update(dev_priv, bits, 0);
 }
 
-
 /* i915_gem.c */
 int i915_gem_create_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0340a5fe9cda..fa0cf6c9f4d0 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1123,17 +1123,6 @@ i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
 	return 0;
 }
 
-static void fake_irq(unsigned long data)
-{
-	wake_up_process((struct task_struct *)data);
-}
-
-static bool missed_irq(struct drm_i915_private *dev_priv,
-		       struct intel_engine_cs *ring)
-{
-	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
-}
-
 static unsigned long local_clock_us(unsigned *cpu)
 {
 	unsigned long t;
@@ -1166,7 +1155,9 @@ static bool busywait_stop(unsigned long timeout, unsigned cpu)
 	return this_cpu != cpu;
 }
 
-static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
+static bool __i915_spin_request(struct drm_i915_gem_request *req,
+				struct intel_breadcrumb *wait,
+				int state)
 {
 	unsigned long timeout;
 	unsigned cpu;
@@ -1181,31 +1172,30 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 	 * takes to sleep on a request, on the order of a microsecond.
 	 */
 
-	if (req->ring->irq_refcount)
-		return -EBUSY;
-
 	/* Only spin if we know the GPU is processing this request */
 	if (!i915_gem_request_started(req, true))
-		return -EAGAIN;
+		return false;
 
 	timeout = local_clock_us(&cpu) + 5;
-	while (!need_resched()) {
+	do {
 		if (i915_gem_request_completed(req, true))
-			return 0;
+			return true;
 
-		if (signal_pending_state(state, current))
+		if (signal_pending_state(state, wait->task))
 			break;
 
 		if (busywait_stop(timeout, cpu))
 			break;
 
 		cpu_relax_lowlatency();
-	}
 
-	if (i915_gem_request_completed(req, false))
-		return 0;
+		/* Break the loop if we have consumed the timeslice (or been
+		 * preempted) or when either the background thread has
+		 * enabled the interrupt, or the IRQ has fired.
+		 */
+	} while (!need_resched() && wait->task->state == state);
 
-	return -EAGAIN;
+	return false;
 }
 
 /**
@@ -1229,18 +1219,13 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 			s64 *timeout,
 			struct intel_rps_client *rps)
 {
-	struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	const bool irq_test_in_progress =
-		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
 	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-	DEFINE_WAIT(wait);
-	unsigned long timeout_expire;
+	struct intel_breadcrumb wait;
+	unsigned long timeout_remain;
 	s64 before, now;
-	int ret;
+	int ret = 0;
 
-	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
+	might_sleep();
 
 	if (list_empty(&req->list))
 		return 0;
@@ -1248,7 +1233,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	if (i915_gem_request_completed(req, true))
 		return 0;
 
-	timeout_expire = 0;
+	timeout_remain = MAX_SCHEDULE_TIMEOUT;
 	if (timeout) {
 		if (WARN_ON(*timeout < 0))
 			return -EINVAL;
@@ -1256,83 +1241,80 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 		if (*timeout == 0)
 			return -ETIME;
 
-		timeout_expire = jiffies + nsecs_to_jiffies_timeout(*timeout);
+		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
 	}
 
-	if (INTEL_INFO(dev_priv)->gen >= 6)
-		gen6_rps_boost(dev_priv, rps, req->emitted_jiffies);
-
 	/* Record current time in case interrupted by signal, or wedged */
 	trace_i915_gem_request_wait_begin(req);
 	before = ktime_get_raw_ns();
 
-	/* Optimistic spin for the next jiffie before touching IRQs */
-	ret = __i915_spin_request(req, state);
-	if (ret == 0)
-		goto out;
+	if (INTEL_INFO(req->i915)->gen >= 6)
+		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
 
-	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
-		ret = -ENODEV;
-		goto out;
-	}
+	intel_breadcrumb_init(&wait, req->seqno);
+	set_task_state(wait.task, state);
 
-	for (;;) {
-		struct timer_list timer;
+	/* Optimistic spin for the next ~jiffie before touching IRQs */
+	if (intel_engine_add_breadcrumb(req->ring, &wait)) {
+		if (__i915_spin_request(req, &wait, state))
+			goto out;
 
-		prepare_to_wait(&ring->irq_queue, &wait, state);
+		intel_engine_enable_breadcrumb_irq(req->ring);
 
-		/* We need to check whether any gpu reset happened in between
-		 * the request being submitted and now. If a reset has occurred,
-		 * the request is effectively complete (we either are in the
-		 * process of or have discarded the rendering and completely
-		 * reset the GPU. The results of the request are lost and we
-		 * are free to continue on with the original operation.
+		/* In order to check that we haven't missed the interrupt
+		 * as we enabled it, we need to kick ourselves to do a
+		 * coherent check on the seqno before we sleep.
 		 */
-		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
-			ret = 0;
-			break;
-		}
-
-		if (i915_gem_request_completed(req, false)) {
-			ret = 0;
-			break;
-		}
+		goto wakeup;
+	}
 
-		if (signal_pending_state(state, current)) {
+	for (;;) {
+		if (signal_pending_state(state, wait.task)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
 
-		if (timeout && time_after_eq(jiffies, timeout_expire)) {
+		/* Ensure that even if the GPU hangs, we get woken up. */
+		i915_queue_hangcheck(req->i915);
+
+		timeout_remain = io_schedule_timeout(timeout_remain);
+		if (timeout_remain == 0) {
 			ret = -ETIME;
 			break;
 		}
 
-		/* Ensure that even if the GPU hangs, we get woken up. */
-		i915_queue_hangcheck(dev_priv);
-
-		timer.function = NULL;
-		if (timeout || missed_irq(dev_priv, ring)) {
-			unsigned long expire;
+		if (intel_breadcrumb_complete(&wait))
+			break;
 
-			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
-			expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
-			mod_timer(&timer, expire);
-		}
+wakeup:		set_task_state(wait.task, state);
 
-		io_schedule();
+		/* Ensure our read of the seqno is coherent so that we
+		 * do not "miss an interrupt" (i.e. if this is the last
+		 * request and the seqno write from the GPU is not visible
+		 * by the time the interrupt fires, we will see that the
+		 * request is incomplete and go back to sleep awaiting
+		 * another interrupt that will never come.)
+		 *
+		 * Strictly, we only need to do this once after an interrupt,
+		 * but it is easier and safer to do it every time the waiter
+		 * is woken.
+		 */
+		if (i915_gem_request_completed(req, false))
+			break;
 
-		if (timer.function) {
-			del_singleshot_timer_sync(&timer);
-			destroy_timer_on_stack(&timer);
-		}
+		/* We need to check whether any gpu reset happened in between
+		 * the request being submitted and now. If a reset has occurred,
+		 * the request is effectively complete (we either are in the
+		 * process of or have discarded the rendering and completely
+		 * reset the GPU. The results of the request are lost and we
+		 * are free to continue on with the original operation.
+		 */
+		if (req->reset_counter != i915_reset_counter(&req->i915->gpu_error))
+			break;
 	}
-	if (!irq_test_in_progress)
-		ring->irq_put(ring);
-
-	finish_wait(&ring->irq_queue, &wait);
-
 out:
+	intel_engine_remove_breadcrumb(req->ring, &wait);
+	__set_task_state(wait.task, TASK_RUNNING);
 	now = ktime_get_raw_ns();
 	trace_i915_gem_request_wait_end(req);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 06ca4082735b..f805d117f3d1 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -900,7 +900,7 @@ static void i915_record_ring_state(struct drm_device *dev,
 		ering->instdone = I915_READ(GEN2_INSTDONE);
 	}
 
-	ering->waiting = waitqueue_active(&ring->irq_queue);
+	ering->waiting = intel_engine_has_waiter(ring);
 	ering->instpm = I915_READ(RING_INSTPM(ring->mmio_base));
 	ering->seqno = ring->get_seqno(ring, false);
 	ering->acthd = intel_ring_get_active_head(ring);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5f88869e2207..d250b4721a6a 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1000,8 +1000,7 @@ static void notify_ring(struct intel_engine_cs *ring)
 		return;
 
 	trace_i915_gem_request_notify(ring);
-
-	wake_up_all(&ring->irq_queue);
+	intel_engine_wakeup(ring);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
@@ -1083,7 +1082,7 @@ static bool any_waiters(struct drm_i915_private *dev_priv)
 	int i;
 
 	for_each_ring(ring, dev_priv, i)
-		if (ring->irq_refcount)
+		if (intel_engine_has_waiter(ring))
 			return true;
 
 	return false;
@@ -2411,7 +2410,7 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 
 	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
 	for_each_ring(ring, dev_priv, i)
-		wake_up_all(&ring->irq_queue);
+		intel_engine_wakeup(ring);
 
 	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
 	wake_up_all(&dev_priv->pending_flip_queue);
@@ -2986,16 +2985,17 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 			if (ring_idle(ring, seqno)) {
 				ring->hangcheck.action = HANGCHECK_IDLE;
 
-				if (waitqueue_active(&ring->irq_queue)) {
+				if (intel_engine_has_waiter(ring)) {
 					/* Issue a wake-up to catch stuck h/w. */
 					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
+						if (!test_bit(ring->id, &dev_priv->gpu_error.test_irq_rings))
 							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
 								  ring->name);
 						else
 							DRM_INFO("Fake missed irq on %s\n",
 								 ring->name);
-						wake_up_all(&ring->irq_queue);
+
+						intel_engine_enable_fake_irq(ring);
 					}
 					/* Safeguard against driver failure */
 					ring->hangcheck.score += BUSY;
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
new file mode 100644
index 000000000000..a9a199bca2c2
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "i915_drv.h"
+
+static void intel_breadcrumbs_fake_irq(unsigned long data)
+{
+	struct intel_breadcrumbs *b = (struct intel_breadcrumbs *)data;
+	struct task_struct *task;
+
+	/*
+	 * The timer persists in case we cannot enable interrupts,
+	 * or if we have previously seen seqno/interrupt incoherency
+	 * ("missed interrupt" syndrome). Here the worker will wake up
+	 * every jiffie in order to kick the oldest waiter to do the
+	 * coherent seqno check.
+	 */
+
+	task = READ_ONCE(b->first_waiter);
+	if (task) {
+		wake_up_process(task);
+		mod_timer(&b->fake_irq, jiffies + 1);
+	}
+}
+
+static void irq_enable(struct intel_engine_cs *engine)
+{
+	WARN_ON(!engine->irq_get(engine));
+}
+
+static void irq_disable(struct intel_engine_cs *engine)
+{
+	engine->irq_put(engine);
+}
+
+static void __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
+{
+	struct intel_engine_cs *engine =
+		container_of(b, struct intel_engine_cs, breadcrumbs);
+	bool noirq;
+
+	if (b->rpm_wakelock)
+		return;
+
+	/* Since we are waiting on a request, the GPU should be busy
+	 * and should have its own rpm reference. For completeness,
+	 * record an rpm reference for ourselves to cover the
+	 * interrupt we unmask.
+	 */
+	intel_runtime_pm_get_noresume(engine->i915);
+	b->rpm_wakelock = true;
+
+	/* No interrupts? Kick the waiter every jiffie! */
+	noirq = true;
+	if (intel_irqs_enabled(engine->i915)) {
+		noirq = test_bit(engine->id,
+				 &engine->i915->gpu_error.missed_irq_rings);
+		if (!test_bit(engine->id,
+			      &engine->i915->gpu_error.test_irq_rings)) {
+			irq_enable(engine);
+			b->irq_enabled = true;
+		}
+	}
+	if (noirq)
+		mod_timer(&b->fake_irq, jiffies + 1);
+}
+
+static void __intel_breadcrumbs_disable_irq(struct intel_breadcrumbs *b)
+{
+	struct intel_engine_cs *engine =
+		container_of(b, struct intel_engine_cs, breadcrumbs);
+
+	if (!b->rpm_wakelock)
+		return;
+
+	if (b->irq_enabled) {
+		irq_disable(engine);
+		b->irq_enabled = false;
+	}
+
+	intel_runtime_pm_put(engine->i915);
+	b->rpm_wakelock = false;
+}
+
+inline struct intel_breadcrumb *to_crumb(struct rb_node *node)
+{
+	return container_of(node, struct intel_breadcrumb, node);
+}
+
+bool intel_engine_add_breadcrumb(struct intel_engine_cs *engine,
+				 struct intel_breadcrumb *wait)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+	u32 seqno = engine->get_seqno(engine, true);
+	struct rb_node **p, *parent, *completed;
+	bool first;
+
+	spin_lock(&b->lock);
+
+	/* Insert the request into the retirment ordered list
+	 * of waiters by walking the rbtree. If we are the oldest
+	 * seqno in the tree (the first to be retired), then
+	 * set ourselves as the bottom-half.
+	 *
+	 * As we descend the tree, prune completed branches since we hold the
+	 * spinlock we know that the first_waiter must be delayed and can
+	 * reduce some of the sequential wake up latency if we take action
+	 * ourselves and wake up the copmleted tasks in parallel.
+	 */
+	first = true;
+	parent = NULL;
+	completed = NULL;
+	p = &b->requests.rb_node;
+	while (*p) {
+		parent = *p;
+		if (i915_seqno_passed(wait->seqno, to_crumb(parent)->seqno)) {
+			p = &parent->rb_right;
+			if (i915_seqno_passed(seqno, to_crumb(parent)->seqno))
+				completed = parent;
+			else
+				first = false;
+		} else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&wait->node, parent, p);
+	rb_insert_color(&wait->node, &b->requests);
+
+	if (completed != NULL) {
+		struct rb_node *next = rb_next(completed);
+
+		if (next && next != &wait->node) {
+			smp_store_mb(b->first_waiter, to_crumb(next)->task);
+			__intel_breadcrumbs_enable_irq(b);
+			wake_up_process(to_crumb(next)->task);
+		}
+
+		do {
+			struct intel_breadcrumb *crumb = to_crumb(completed);
+			completed = rb_prev(completed);
+
+			rb_erase(&crumb->node, &b->requests);
+			RB_CLEAR_NODE(&crumb->node);
+			wake_up_process(crumb->task);
+		} while (completed != NULL);
+	}
+
+	if (first)
+		smp_store_mb(b->first_waiter, wait->task);
+	BUG_ON(b->first_waiter == NULL);
+
+	spin_unlock(&b->lock);
+
+	return first;
+}
+
+void intel_engine_enable_breadcrumb_irq(struct intel_engine_cs *engine)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	spin_lock(&b->lock);
+	__intel_breadcrumbs_enable_irq(b);
+	spin_unlock(&b->lock);
+}
+
+void intel_engine_enable_fake_irq(struct intel_engine_cs *engine)
+{
+	mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1);
+}
+
+void intel_engine_remove_breadcrumb(struct intel_engine_cs *engine,
+				    struct intel_breadcrumb *wait)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	/* Quick check to see if this waiter was already decoupled from
+	 * the tree by the bottom-half to avoid contention on the spinlock
+	 * by the herd.
+	 */
+	if (RB_EMPTY_NODE(&wait->node))
+		return;
+
+	spin_lock(&b->lock);
+
+	if (b->first_waiter == wait->task) {
+		struct rb_node *next;
+		struct task_struct *task;
+
+		/* We are the current bottom-half. Find the next candidate,
+		 * the first waiter in the queue on the remaining oldest
+		 * request. As multiple seqnos may complete in the time it
+		 * takes us to wake up and find the next waiter, we have to
+		 * wake up that waiter for it to perform its own coherent
+		 * completion check.
+		 */
+		next = rb_next(&wait->node);
+		if (next) {
+			/* If the next waiter is already complete,
+			 * wake it up and continue onto the next waiter. So
+			 * if have a small herd, they will wake up in parallel
+			 * rather than sequentially, which should reduce
+			 * the overall latency in waking all the completed
+			 * clients.
+			 */
+			u32 seqno = engine->get_seqno(engine, true);
+			while (i915_seqno_passed(seqno,
+						 to_crumb(next)->seqno)) {
+				struct rb_node *n = rb_next(next);
+
+				rb_erase(next, &b->requests);
+				RB_CLEAR_NODE(next);
+				wake_up_process(to_crumb(next)->task);
+
+				next = n;
+				if (next == NULL)
+					break;
+			}
+		}
+		task = next ? to_crumb(next)->task : NULL;
+
+		smp_store_mb(b->first_waiter, task);
+		if (task) {
+			/* In our haste, we may have completed the first waiter
+			 * before we enabled the interrupt. Do so now as we
+			 * have a second waiter for a future seqno. Afterwards,
+			 * we have to wake up that waiter in case we missed
+			 * the interrupt, or if we have to handle an
+			 * exception rather than a seqno completion.
+			 */
+			__intel_breadcrumbs_enable_irq(b);
+			wake_up_process(task);
+		} else
+			__intel_breadcrumbs_disable_irq(b);
+	}
+
+	if (!RB_EMPTY_NODE(&wait->node))
+		rb_erase(&wait->node, &b->requests);
+	spin_unlock(&b->lock);
+}
+
+void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	spin_lock_init(&b->lock);
+	setup_timer(&b->fake_irq, intel_breadcrumbs_fake_irq, (unsigned long)b);
+}
+
+void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	del_timer_sync(&b->fake_irq);
+}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 7030ff526941..780aa6d390aa 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1898,6 +1898,8 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
 	i915_cmd_parser_fini_ring(ring);
 	i915_gem_batch_pool_fini(&ring->batch_pool);
 
+	intel_engine_fini_breadcrumbs(ring);
+
 	if (ring->status_page.obj) {
 		kunmap(sg_page(ring->status_page.obj->pages->sgl));
 		ring->status_page.obj = NULL;
@@ -1915,10 +1917,11 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 	ring->buffer = NULL;
 
 	ring->dev = dev;
+	ring->i915 = to_i915(dev);
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
-	init_waitqueue_head(&ring->irq_queue);
+	intel_engine_init_breadcrumbs(ring);
 
 	INIT_LIST_HEAD(&ring->buffers);
 	INIT_LIST_HEAD(&ring->execlist_queue);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 69dd69e46fa9..e90c28e2da31 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2152,6 +2152,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	WARN_ON(ring->buffer);
 
 	ring->dev = dev;
+	ring->i915 = to_i915(dev);
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	INIT_LIST_HEAD(&ring->execlist_queue);
@@ -2159,7 +2160,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
 	memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
 
-	init_waitqueue_head(&ring->irq_queue);
+	intel_engine_init_breadcrumbs(ring);
 
 	ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
 	if (IS_ERR(ringbuf)) {
@@ -2223,6 +2224,8 @@ void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
 
 	i915_cmd_parser_fini_ring(ring);
 	i915_gem_batch_pool_fini(&ring->batch_pool);
+	intel_engine_fini_breadcrumbs(ring);
+
 	ring->dev = NULL;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 49574ffe54bc..281ed2c92b91 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -157,9 +157,35 @@ struct  intel_engine_cs {
 #define LAST_USER_RING (VECS + 1)
 	u32		mmio_base;
 	struct		drm_device *dev;
+	struct drm_i915_private *i915;
 	struct intel_ringbuffer *buffer;
 	struct list_head buffers;
 
+	/* Rather than have every client wait upon all user interrupts,
+	 * with the herd waking after every interrupt and each doing the
+	 * heavyweight seqno dance, we delegate the task (of being the
+	 * bottom-half of the user interrupt) to the first client. After
+	 * every interrupt, we wake up one client, who does the heavyweight
+	 * coherent seqno read and either goes back to sleep (if incomplete),
+	 * or wakes up all the completed clients in parallel, before then
+	 * transferring the bottom-half status to the next client in the queue.
+	 *
+	 * Compared to walking the entire list of waiters in a single dedicated
+	 * bottom-half, we reduce the latency of the first waiter by avoiding
+	 * a context switch, but incur additional coherent seqno reads when
+	 * following the chain of request breadcrumbs. Since it is most likely
+	 * that we have a single client waiting on each seqno, then reducing
+	 * the overhead of waking that client is much preferred.
+	 */
+	struct intel_breadcrumbs {
+		spinlock_t lock; /* protects the lists of requests */
+		struct rb_root requests; /* sorted by retirement */
+		struct task_struct *first_waiter; /* bh for user interrupts */
+		struct timer_list fake_irq; /* used after a missed interrupt */
+		bool irq_enabled;
+		bool rpm_wakelock;
+	} breadcrumbs;
+
 	/*
 	 * A pool of objects to use as shadow copies of client batch buffers
 	 * when the command parser is enabled. Prevents the client from
@@ -303,8 +329,6 @@ struct  intel_engine_cs {
 
 	bool gpu_caches_dirty;
 
-	wait_queue_head_t irq_queue;
-
 	struct intel_context *default_context;
 	struct intel_context *last_context;
 
@@ -510,4 +534,39 @@ void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf);
 /* Legacy ringbuffer specific portion of reservation code: */
 int intel_ring_reserve_space(struct drm_i915_gem_request *request);
 
+/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
+struct intel_breadcrumb {
+	struct rb_node node;
+	struct task_struct *task;
+	u32 seqno;
+};
+void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
+static inline void intel_breadcrumb_init(struct intel_breadcrumb *wait,
+					 u32 seqno)
+{
+	wait->task = current;
+	wait->seqno = seqno;
+}
+static inline bool intel_breadcrumb_complete(struct intel_breadcrumb *wait)
+{
+	return RB_EMPTY_NODE(&wait->node);
+}
+bool intel_engine_add_breadcrumb(struct intel_engine_cs *engine,
+				 struct intel_breadcrumb *wait);
+void intel_engine_remove_breadcrumb(struct intel_engine_cs *engine,
+				    struct intel_breadcrumb *wait);
+static inline bool intel_engine_has_waiter(struct intel_engine_cs *engine)
+{
+	return READ_ONCE(engine->breadcrumbs.first_waiter);
+}
+static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
+{
+	struct task_struct *task = READ_ONCE(engine->breadcrumbs.first_waiter);
+	if (task)
+		wake_up_process(task);
+}
+void intel_engine_enable_breadcrumb_irq(struct intel_engine_cs *engine);
+void intel_engine_enable_fake_irq(struct intel_engine_cs *engine);
+void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
+
 #endif /* _INTEL_RINGBUFFER_H_ */
-- 
2.6.3