[Intel-gfx] [PATCH v7] drm/i915: Slaughter the thundering i915_wait_request herd

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Mon Dec 7 07:08:28 PST 2015


Hi,

On 03/12/15 16:22, Chris Wilson wrote:
> One particularly stressful scenario consists of many independent tasks
> all competing for GPU time and waiting upon the results (e.g. realtime
> transcoding of many, many streams). One bottleneck in particular is that
> each client waits on its own results, but every client is woken up after
> every batchbuffer - hence the thunder of hooves as then every client must
> do its heavyweight dance to read a coherent seqno to see if it is the
> lucky one.
>
> Ideally, we only want one client to wake up after the interrupt and
> check its request for completion. Since the requests must retire in
> order, we can select the first client on the oldest request to be woken.
> Once that client has completed his wait, we can then wake up the
> next client and so on.
>
> v2: Convert from a kworker per engine into a dedicated kthread for the
> bottom-half.
> v3: Rename request members and tweak comments.
> v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
> v5: Fix race in locklessly checking waiter status and kicking the task on
> adding a new waiter.
> v6: Fix deciding when to force the timer to hide missing interrupts.
> v7: Move the bottom-half from the kthread to the first client process.

I like the idea a lot, so I'll make some comments even before you post v8 
or later. :)

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin at intel.com>
> Cc: "Gong, Zhipeng" <zhipeng.gong at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> Cc: Dave Gordon <david.s.gordon at intel.com>
> ---
>   drivers/gpu/drm/i915/Makefile            |   1 +
>   drivers/gpu/drm/i915/i915_debugfs.c      |   2 +-
>   drivers/gpu/drm/i915/i915_drv.h          |   3 +-
>   drivers/gpu/drm/i915/i915_gem.c          | 108 ++++++-----------
>   drivers/gpu/drm/i915/i915_gpu_error.c    |   2 +-
>   drivers/gpu/drm/i915/i915_irq.c          |  17 +--
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 197 +++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.c         |   5 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.c  |   5 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h  |  47 +++++++-
>   10 files changed, 297 insertions(+), 90 deletions(-)
>   create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 0851de07bd13..d3b9d3618719 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \
>   	  i915_gem_userptr.o \
>   	  i915_gpu_error.o \
>   	  i915_trace_points.o \
> +	  intel_breadcrumbs.o \
>   	  intel_lrc.o \
>   	  intel_mocs.o \
>   	  intel_ringbuffer.o \
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 0583eda642d7..68fbe4d1f91d 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2323,7 +2323,7 @@ static int count_irq_waiters(struct drm_i915_private *i915)
>   	int i;
>
>   	for_each_ring(ring, i915, i)
> -		count += ring->irq_refcount;
> +		count += intel_engine_has_waiter(ring);

So we no longer care how many waiters there are? Probably okay in debugfs, 
but is it also okay in error capture?
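
If the count is still interesting, a helper along these lines could 
recover it by walking the new rbtree (untested sketch against this patch, 
the name is made up):

static int intel_engine_count_waiters(struct intel_engine_cs *engine)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	struct rb_node *node;
	int count = 0;

	spin_lock(&b->lock);
	for (node = rb_first(&b->requests); node; node = rb_next(node))
		count++;
	spin_unlock(&b->lock);

	return count;
}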

>   	return count;
>   }
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2acf2cf0b836..30d470934874 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1383,7 +1383,7 @@ struct i915_gpu_error {
>   #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
>
>   	/* For missed irq/seqno simulation. */
> -	unsigned int test_irq_rings;
> +	unsigned long test_irq_rings;
>   };
>
>   enum modeset_restore {
> @@ -2800,7 +2800,6 @@ ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
>   	ibx_display_interrupt_update(dev_priv, bits, 0);
>   }
>
> -
>   /* i915_gem.c */
>   int i915_gem_create_ioctl(struct drm_device *dev, void *data,
>   			  struct drm_file *file_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 6e051402e340..8dcbac584e84 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1125,17 +1125,6 @@ i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
>   	return 0;
>   }
>
> -static void fake_irq(unsigned long data)
> -{
> -	wake_up_process((struct task_struct *)data);
> -}
> -
> -static bool missed_irq(struct drm_i915_private *dev_priv,
> -		       struct intel_engine_cs *ring)
> -{
> -	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> -}
> -
>   static unsigned long local_clock_us(unsigned *cpu)
>   {
>   	unsigned long t;
> @@ -1183,9 +1172,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   	 * takes to sleep on a request, on the order of a microsecond.
>   	 */
>
> -	if (req->ring->irq_refcount)
> -		return -EBUSY;
> -
>   	/* Only spin if we know the GPU is processing this request */
>   	if (!i915_gem_request_started(req, false))
>   		return -EAGAIN;
> @@ -1204,9 +1190,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   		cpu_relax_lowlatency();
>   	}
>
> -	if (i915_gem_request_completed(req, false))
> -		return 0;
> -
>   	return -EAGAIN;
>   }
>
> @@ -1231,26 +1214,19 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			s64 *timeout,
>   			struct intel_rps_client *rps)
>   {
> -	struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	const bool irq_test_in_progress =
> -		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
>   	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
> -	DEFINE_WAIT(wait);
> -	unsigned long timeout_expire;
> +	struct intel_breadcrumb wait;
> +	unsigned long timeout_remain;
>   	s64 before, now;
>   	int ret;
>
> -	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
> -
>   	if (list_empty(&req->list))
>   		return 0;
>
>   	if (i915_gem_request_completed(req, true))
>   		return 0;
>
> -	timeout_expire = 0;
> +	timeout_remain = MAX_SCHEDULE_TIMEOUT;
>   	if (timeout) {
>   		if (WARN_ON(*timeout < 0))
>   			return -EINVAL;
> @@ -1258,34 +1234,43 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   		if (*timeout == 0)
>   			return -ETIME;
>
> -		timeout_expire = jiffies + nsecs_to_jiffies_timeout(*timeout);
> +		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
>   	}
>
> -	if (INTEL_INFO(dev_priv)->gen >= 6)
> -		gen6_rps_boost(dev_priv, rps, req->emitted_jiffies);
> -
>   	/* Record current time in case interrupted by signal, or wedged */
>   	trace_i915_gem_request_wait_begin(req);
>   	before = ktime_get_raw_ns();
>
> -	/* Optimistic spin for the next jiffie before touching IRQs */
> -	ret = __i915_spin_request(req, state);
> -	if (ret == 0)
> -		goto out;
> +	if (INTEL_INFO(req->i915)->gen >= 6)
> +		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
>
> -	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> -		ret = -ENODEV;
> -		goto out;
> +	/* Optimistic spin for the next jiffie before touching IRQs */

Exactly the opposite now! :) intel_breadcrumbs_add_waiter below kicks off 
the irq worker first, so the optimistic spin happens after we have started 
touching IRQs.

> +	if (intel_breadcrumbs_add_waiter(req, &wait)) {
> +		ret = __i915_spin_request(req, state);
> +		if (ret == 0)
> +			goto out;
>   	}
>
>   	for (;;) {
> -		struct timer_list timer;
> -
> -		prepare_to_wait(&ring->irq_queue, &wait, state);
> +		set_task_state(wait.task, state);
> +
> +		/* Ensure our read of the seqno is coherent so that we
> +		 * do not "miss an interrupt" (i.e. if this is the last
> +		 * request and the seqno write from the GPU is not visible
> +		 * by the time the interrupt fires, we will see that the
> +		 * request is incomplete and go back to sleep awaiting
> +		 * another interrupt that will never come.)
> +		 *
> +		 * Strictly, we only need to do this once after an interrupt,
> +		 * but it is easier and safer to do it every time the waiter
> +		 * is woken.
> +		 */
> +		if (i915_gem_request_completed(req, false)) {
> +			ret = 0;
> +			break;
> +		}
>
> -		/* We need to check whether any gpu reset happened in between
> -		 * the caller grabbing the seqno and now ... */
> -		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> +		if (unlikely(req->reset_counter != i915_reset_counter(&req->i915->gpu_error))) {
>   			/* As we do not requeue the request over a GPU reset,
>   			 * if one does occur we know that the request is
>   			 * effectively complete.
> @@ -1294,46 +1279,21 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			break;
>   		}
>
> -		if (i915_gem_request_completed(req, false)) {
> -			ret = 0;
> -			break;
> -		}
> -
> -		if (signal_pending_state(state, current)) {
> +		if (signal_pending_state(state, wait.task)) {
>   			ret = -ERESTARTSYS;
>   			break;
>   		}
>
> -		if (timeout && time_after_eq(jiffies, timeout_expire)) {
> +		timeout_remain = io_schedule_timeout(timeout_remain);
> +		if (timeout_remain == 0) {
>   			ret = -ETIME;
>   			break;
>   		}
> -
> -		i915_queue_hangcheck(dev_priv);
> -
> -		timer.function = NULL;
> -		if (timeout || missed_irq(dev_priv, ring)) {
> -			unsigned long expire;
> -
> -			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> -			expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
> -			mod_timer(&timer, expire);
> -		}
> -
> -		io_schedule();
> -
> -		if (timer.function) {
> -			del_singleshot_timer_sync(&timer);
> -			destroy_timer_on_stack(&timer);
> -		}
>   	}
> -	if (!irq_test_in_progress)
> -		ring->irq_put(ring);
> -
> -	finish_wait(&ring->irq_queue, &wait);
> -
> +	__set_task_state(wait.task, TASK_RUNNING);
>   out:
>   	now = ktime_get_raw_ns();
> +	intel_breadcrumbs_remove_waiter(req, &wait);
>   	trace_i915_gem_request_wait_end(req);
>
>   	if (timeout) {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 06ca4082735b..f805d117f3d1 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -900,7 +900,7 @@ static void i915_record_ring_state(struct drm_device *dev,
>   		ering->instdone = I915_READ(GEN2_INSTDONE);
>   	}
>
> -	ering->waiting = waitqueue_active(&ring->irq_queue);
> +	ering->waiting = intel_engine_has_waiter(ring);
>   	ering->instpm = I915_READ(RING_INSTPM(ring->mmio_base));
>   	ering->seqno = ring->get_seqno(ring, false);
>   	ering->acthd = intel_ring_get_active_head(ring);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 28c0329f8281..57b113132fd7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -996,12 +996,11 @@ static void ironlake_rps_change_irq_handler(struct drm_device *dev)
>
>   static void notify_ring(struct intel_engine_cs *ring)
>   {
> -	if (!intel_ring_initialized(ring))
> +	if (ring->i915 == NULL)

I glanced at something about this in some other thread; maybe best to keep 
it consolidated in one patch?

>   		return;
>
>   	trace_i915_gem_request_notify(ring);
> -
> -	wake_up_all(&ring->irq_queue);
> +	intel_engine_wakeup(ring);
>   }
>
>   static void vlv_c0_read(struct drm_i915_private *dev_priv,
> @@ -1083,7 +1082,7 @@ static bool any_waiters(struct drm_i915_private *dev_priv)
>   	int i;
>
>   	for_each_ring(ring, dev_priv, i)
> -		if (ring->irq_refcount)
> +		if (intel_engine_has_waiter(ring))
>   			return true;
>
>   	return false;
> @@ -2411,7 +2410,7 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>
>   	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
>   	for_each_ring(ring, dev_priv, i)
> -		wake_up_all(&ring->irq_queue);
> +		intel_engine_wakeup(ring);
>
>   	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
>   	wake_up_all(&dev_priv->pending_flip_queue);
> @@ -2986,16 +2985,18 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   			if (ring_idle(ring, seqno)) {
>   				ring->hangcheck.action = HANGCHECK_IDLE;
>
> -				if (waitqueue_active(&ring->irq_queue)) {
> +				if (intel_engine_has_waiter(ring)) {
>   					/* Issue a wake-up to catch stuck h/w. */
>   					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> -						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
> +						if (!test_bit(ring->id, &dev_priv->gpu_error.test_irq_rings))
>   							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
>   								  ring->name);
>   						else
>   							DRM_INFO("Fake missed irq on %s\n",
>   								 ring->name);
> -						wake_up_all(&ring->irq_queue);
> +						mod_delayed_work(system_highpri_wq,
> +								 &ring->breadcrumbs.irq,
> +								 0);
>   					}
>   					/* Safeguard against driver failure */
>   					ring->hangcheck.score += BUSY;
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> new file mode 100644
> index 000000000000..17d51269b774
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -0,0 +1,197 @@
> +/*
> + * Copyright © 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/kthread.h>

Not needed any more, now that the kthread is gone.

> +
> +#include "i915_drv.h"
> +
> +static bool irq_get(struct intel_engine_cs *engine)
> +{
> +	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
> +		return false;

I don't understand why knowledge of this debugfs interface needs to be 
sprinkled around. I am not even 100% sure what the interface is for, but 
it looks to be about software-masking user interrupts on a ring. Why not 
just mask the interrupt delivery based on this mask at the source?
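
For example, checked once at the delivery point (rough sketch of what I 
mean, not tested):

static void notify_ring(struct intel_engine_cs *ring)
{
	if (ring->i915 == NULL)
		return;

	/* Simulate a missed interrupt by dropping the wakeup here, so
	 * that no other code needs to know about test_irq_rings.
	 */
	if (test_bit(ring->id, &ring->i915->gpu_error.test_irq_rings))
		return;

	trace_i915_gem_request_notify(ring);
	intel_engine_wakeup(ring);
}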

> +
> +	if (!intel_irqs_enabled(engine->i915))
> +		return false;
> +
> +	return engine->irq_get(engine);
> +}
> +
> +static void intel_breadcrumbs_irq(struct work_struct *work)
> +{
> +	struct intel_breadcrumbs *b =
> +		container_of(work, struct intel_breadcrumbs, irq.work);
> +	struct intel_engine_cs *engine =
> +		container_of(b, struct intel_engine_cs, breadcrumbs);
> +
> +	/* We unmask the user-interrupt in the background so that the
> +	 * first-waiter can do a little busywaiting before sleeping and
> +	 * not incur additional latency from setting up the IRQ.
> +	 *
> +	 * The worker also persists in case we cannot enable interrupts,
> +	 * or if we have previously seen seqno/interrupt incoherency
> +	 * ("missed interrupt" syndrome). Here the worker will wake up
> +	 * every jiffie in order to kick the original waiter to do the
> +	 * coherent seqno check. (Not as efficient as having a timer
> +	 * source kick the waiter directly, but this should only have to
> +	 * be used in rare cases where we try to workaround some
> +	 * hardware/driver failures.)
> +	 */
> +
> +	spin_lock(&b->lock);
> +	if (b->first_waiter) {
> +		if (!b->rpm_wakelock) {
> +			intel_runtime_pm_get(engine->i915);
> +			b->rpm_wakelock = true;
> +		}
> +
> +		if (!b->irq_enabled) {
> +			b->irq_enabled = irq_get(engine);
> +			/* If we were delayed, the original waiter may have
> +			 * gone to sleep already and we may have missed the
> +			 * interrupt. So after enabling the interrupt,
> +			 * kick the waiter.
> +			 */
> +			wake_up_process(b->first_waiter);
> +		}
> +
> +		/* No interrupts? Kick the waiter every jiffie! */
> +		if (!b->irq_enabled ||
> +		    test_bit(engine->id,
> +			     &engine->i915->gpu_error.missed_irq_rings)) {
> +			wake_up_process(b->first_waiter);
> +			queue_delayed_work(system_highpri_wq, &b->irq, 1);
> +		}
> +	} else {
> +		if (b->irq_enabled) {
> +			engine->irq_put(engine);
> +			b->irq_enabled = false;
> +		}
> +		if (b->rpm_wakelock) {
> +			intel_runtime_pm_put(engine->i915);
> +			b->rpm_wakelock = false;
> +		}
> +	}
> +	spin_unlock(&b->lock);
> +}
> +
> +inline struct intel_breadcrumb *to_wait(struct rb_node *node)
> +{
> +	if (node == NULL)
> +		return NULL;
> +
> +	return container_of(node, struct intel_breadcrumb, node);
> +}
> +
> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
> +				  struct intel_breadcrumb *wait)
> +{
> +	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
> +	struct rb_node **p, *parent;
> +	bool first = false;
> +
> +	wait->task = current;
> +	wait->seqno = request->seqno;
> +
> +	spin_lock(&b->lock);
> +
> +	/* First? Unmask the user interrupt */
> +	if (b->first_waiter == NULL)
> +		queue_delayed_work(system_highpri_wq, &b->irq, 0);

Who enables interrupts if the worker gets scheduled and completes before 
b->first_waiter is set below?
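
Or does the worker taking b->lock save us? Even so, queueing the work only 
after b->first_waiter has been assigned would make the ordering obvious, 
e.g. (sketch):

	spin_lock(&b->lock);

	/* ... rbtree insertion as before ... */

	if (first) {
		b->first_waiter = wait->task;
		/* The worker takes b->lock too, so it cannot observe a
		 * NULL first_waiter once it is queued from here.
		 */
		queue_delayed_work(system_highpri_wq, &b->irq, 0);
	}

	spin_unlock(&b->lock);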

> +
> +	/* Insert the request into the retirement ordered list
> +	 * of waiters by walking the rbtree. If we are the oldest
> +	 * seqno in the tree (the first to be retired), then
> +	 * set ourselves as the bottom-half.
> +	 */
> +	first = true;
> +	parent = NULL;
> +	p = &b->requests.rb_node;
> +	while (*p) {
> +		parent = *p;
> +		if (i915_seqno_passed(wait->seqno, to_wait(parent)->seqno)) {
> +			p = &parent->rb_right;
> +			first = false;
> +		} else
> +			p = &parent->rb_left;
> +	}
> +	rb_link_node(&wait->node, parent, p);
> +	rb_insert_color(&wait->node, &b->requests);

WARN_ON(!first && !b->first_waiter) maybe?

> +	if (first)
> +		b->first_waiter = wait->task;
> +
> +	spin_unlock(&b->lock);
> +
> +	return first;
> +}
> +
> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
> +				     struct intel_breadcrumb *wait)
> +{
> +	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
> +
> +	spin_lock(&b->lock);
> +
> +	if (b->first_waiter == wait->task) {
> +		struct intel_breadcrumb *next;
> +		struct task_struct *task;
> +
> +		/* We are the current bottom-half. Find the next candidate,
> +		 * the first waiter in the queue on the remaining oldest
> +		 * request. As multiple seqnos may complete in the time it
> +		 * takes us to wake up and find the next waiter, we have to
> +		 * wake up that waiter for it to perform its own coherent
> +		 * completion check.
> +		 */

Equally, why wouldn't we wake up all waiters whose requests have already 
been completed?

It would be a cheap check here and it would avoid the cascading latency 
that builds up as one task wakes the next in turn.

There is even more benefit when multiple waiters wait on the same request.
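
Hand-wavy sketch of what I mean (assuming a single seqno read here is 
acceptable):

		u32 seqno = request->ring->get_seqno(request->ring, false);
		struct rb_node *node = rb_next(&wait->node);

		/* Wake every waiter whose request has already completed;
		 * each will drop out via its own coherent seqno check.
		 */
		while (node && i915_seqno_passed(seqno, to_wait(node)->seqno)) {
			wake_up_process(to_wait(node)->task);
			node = rb_next(node);
		}

		/* First still-pending waiter becomes the new bottom-half. */
		b->first_waiter = node ? to_wait(node)->task : NULL;
		if (b->first_waiter)
			wake_up_process(b->first_waiter);
		else
			mod_delayed_work(system_highpri_wq, &b->irq, 1);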

> +		task = NULL;
> +		next = to_wait(rb_next(&wait->node));
> +		if (next)
> +			task = next->task;
> +
> +		b->first_waiter = task;
> +		if (task)
> +			wake_up_process(task);
> +		else
> +			/* Disable the user interrupts */
> +			mod_delayed_work(system_highpri_wq, &b->irq, 1);
> +	}
> +
> +	rb_erase(&wait->node, &b->requests);
> +	spin_unlock(&b->lock);
> +}
> +
> +void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
> +{
> +	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> +
> +	spin_lock_init(&b->lock);
> +	INIT_DELAYED_WORK(&b->irq, intel_breadcrumbs_irq);
> +}
> +
> +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
> +{
> +	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> +
> +	flush_delayed_work(&b->irq);
> +}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 08b982a6ae81..ce5119e2bcef 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1904,6 +1904,8 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
>   	i915_cmd_parser_fini_ring(ring);
>   	i915_gem_batch_pool_fini(&ring->batch_pool);
>
> +	intel_engine_fini_breadcrumbs(ring);
> +
>   	if (ring->status_page.obj) {
>   		kunmap(sg_page(ring->status_page.obj->pages->sgl));
>   		ring->status_page.obj = NULL;
> @@ -1920,10 +1922,11 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>   	ring->buffer = NULL;
>
>   	ring->dev = dev;
> +	ring->i915 = to_i915(dev);
>   	INIT_LIST_HEAD(&ring->active_list);
>   	INIT_LIST_HEAD(&ring->request_list);
>   	i915_gem_batch_pool_init(dev, &ring->batch_pool);
> -	init_waitqueue_head(&ring->irq_queue);
> +	intel_engine_init_breadcrumbs(ring);
>
>   	INIT_LIST_HEAD(&ring->buffers);
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 511efe556d73..8a0f02e73a4e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2157,6 +2157,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	WARN_ON(ring->buffer);
>
>   	ring->dev = dev;
> +	ring->i915 = to_i915(dev);
>   	INIT_LIST_HEAD(&ring->active_list);
>   	INIT_LIST_HEAD(&ring->request_list);
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> @@ -2164,7 +2165,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	i915_gem_batch_pool_init(dev, &ring->batch_pool);
>   	memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
>
> -	init_waitqueue_head(&ring->irq_queue);
> +	intel_engine_init_breadcrumbs(ring);
>
>   	ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
>   	if (IS_ERR(ringbuf))
> @@ -2225,6 +2226,8 @@ void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
>
>   	i915_cmd_parser_fini_ring(ring);
>   	i915_gem_batch_pool_fini(&ring->batch_pool);
> +
> +	intel_engine_fini_breadcrumbs(ring);
>   }
>
>   static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb206151d..7dde9310df1b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -157,9 +157,31 @@ struct  intel_engine_cs {
>   #define LAST_USER_RING (VECS + 1)
>   	u32		mmio_base;
>   	struct		drm_device *dev;
> +	struct drm_i915_private *i915;
>   	struct intel_ringbuffer *buffer;
>   	struct list_head buffers;
>
> +	/* Rather than have every client wait upon all user interrupts,
> +	 * with the herd waking after every interrupt and each doing the
> +	 * heavyweight seqno dance, we delegate the task to the first client.
> +	 * After the interrupt, we wake up one client, who does the heavyweight
> +	 * coherent seqno read and either goes back to sleep (if incomplete),
> +	 * or wakes up the next client in the queue and so forth.
> +	 *
> +	 * With respect to walking the entire list of waiters in a single

s/With respect to/In contrast to/ or "Compared to"?

> +	 * dedicated bottom-half, we reduce the latency of the first waiter
> +	 * by avoiding a context switch, but incur additional coherent seqno
> +	 * reads when following the chain of request breadcrumbs.
> +	 */
> +	struct intel_breadcrumbs {
> +		spinlock_t lock; /* protects the lists of requests */
> +		struct rb_root requests; /* sorted by retirement */
> +		struct task_struct *first_waiter; /* bh for user interrupts */
> +		struct delayed_work irq; /* used to enable or fake interrupts */
> +		bool irq_enabled : 1;
> +		bool rpm_wakelock : 1;

Are the bitfields worth it? There aren't that many engines, so maybe it 
loses more in instructions generated than it saves in space.

> +	} breadcrumbs;
> +
>   	/*
>   	 * A pool of objects to use as shadow copies of client batch buffers
>   	 * when the command parser is enabled. Prevents the client from
> @@ -303,8 +325,6 @@ struct  intel_engine_cs {
>
>   	bool gpu_caches_dirty;
>
> -	wait_queue_head_t irq_queue;
> -
>   	struct intel_context *default_context;
>   	struct intel_context *last_context;
>
> @@ -506,4 +526,27 @@ void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf);
>   /* Legacy ringbuffer specific portion of reservation code: */
>   int intel_ring_reserve_space(struct drm_i915_gem_request *request);
>
> +/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
> +struct intel_breadcrumb {
> +	struct rb_node node;
> +	struct task_struct *task;
> +	u32 seqno;
> +};
> +void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
> +				  struct intel_breadcrumb *wait);
> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
> +				     struct intel_breadcrumb *wait);
> +static inline bool intel_engine_has_waiter(struct intel_engine_cs *engine)
> +{
> +	return READ_ONCE(engine->breadcrumbs.first_waiter);
> +}
> +static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
> +{
> +	struct task_struct *task = READ_ONCE(engine->breadcrumbs.first_waiter);
> +	if (task)
> +		wake_up_process(task);

What guarantees that task is still a valid task at this point?

I know it is extremely unlikely, but we can theoretically sample 
first_waiter for a task which then exits and disappears before the 
wake_up_process call.
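
One option, I think, since task_structs are freed via RCU (sketch, not 
tested):

static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
{
	struct task_struct *task;

	/* The RCU read lock keeps the sampled task_struct alive even if
	 * the waiter exits between the load and the wakeup.
	 */
	rcu_read_lock();
	task = READ_ONCE(engine->breadcrumbs.first_waiter);
	if (task)
		wake_up_process(task);
	rcu_read_unlock();
}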

> +}
> +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
> +
>   #endif /* _INTEL_RINGBUFFER_H_ */
>

Regards,

Tvrtko

