[Intel-gfx] [PATCH] RFC drm/i915: Slaughter the thundering i915_wait_request herd
Gong, Zhipeng
zhipeng.gong at intel.com
Sun Nov 1 21:39:54 PST 2015
Chris-
The patch cannot be applied to the latest drm-intel-nightly directly,
so I modified it slightly to make it apply.
The patch helps a lot on HSW, but only a little on BDW.
The test transcodes 26 streams, which creates 244 threads.
CPU util    | w/o patch | w/ patch
------------+-----------+---------
HSW async 1 |      102% |      61%
HSW async 5 |      114% |      46%
BDW async 1 |      116% |     116%
BDW async 5 |      111% |     107%
-Zhipeng
> -----Original Message-----
> From: Chris Wilson [mailto:chris at chris-wilson.co.uk]
> Sent: Saturday, October 31, 2015 6:35 PM
> To: intel-gfx at lists.freedesktop.org
> Cc: Chris Wilson; Rogozhkin, Dmitry V; Gong, Zhipeng
> Subject: [PATCH] RFC drm/i915: Slaughter the thundering i915_wait_request
> herd
>
> One particularly stressful scenario consists of many independent tasks all
> competing for GPU time and waiting upon the results (e.g. realtime
> transcoding of many, many streams). One bottleneck in particular is that each
> client waits on its own results, but every client is woken up after every
> batchbuffer - hence the thunder of hooves as every client must then do its
> heavyweight dance to read a coherent seqno to see if it is the lucky one.
> Alternatively, we can have one worker responsible for waking up after an
> interrupt, checking the seqno and waking up only the clients whose requests
> are complete. The disadvantage is that in the uncontended scenario (i.e. only
> one waiter) we incur an extra context switch in the wakeup path - though that
> should be mitigated somewhat by the busywait we do first before sleeping.
>
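> As a rough userspace analogue of that scheme (purely illustrative - the
> pthread-based sketch below is invented for this mail and is not the i915
> code), a single "interrupt" thread advances the completed seqno and
> signals only the clients whose request has passed, instead of kicking
> every sleeper on every advance:
>
> 	#include <pthread.h>
> 	#include <stdint.h>
>
> 	struct waiter {
> 		uint32_t seqno;		/* request this client waits upon */
> 		pthread_cond_t cond;	/* signalled when seqno completes */
> 		struct waiter *next;	/* list kept sorted by seqno */
> 	};
>
> 	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
> 	static struct waiter *waiters;	/* sorted, lowest seqno first */
> 	static uint32_t completed;	/* last seqno seen to complete */
>
> 	static void add_waiter(struct waiter *w)
> 	{
> 		struct waiter **p = &waiters;
>
> 		pthread_mutex_lock(&lock);
> 		while (*p && (int32_t)(w->seqno - (*p)->seqno) >= 0)
> 			p = &(*p)->next;
> 		w->next = *p;
> 		*p = w;
> 		pthread_mutex_unlock(&lock);
> 	}
>
> 	/* Client side: sleep until our request is complete. */
> 	static void wait_request(uint32_t seqno)
> 	{
> 		struct waiter w = { .seqno = seqno };
>
> 		pthread_cond_init(&w.cond, NULL);
> 		add_waiter(&w);
>
> 		pthread_mutex_lock(&lock);
> 		/* i915_seqno_passed()-style wraparound compare */
> 		while ((int32_t)(completed - w.seqno) < 0)
> 			pthread_cond_wait(&w.cond, &lock);
> 		pthread_mutex_unlock(&lock);
> 		pthread_cond_destroy(&w.cond);
> 	}
>
> 	/* "Interrupt" side: the one waiter per engine runs this. */
> 	static void advance_to(uint32_t seqno)
> 	{
> 		pthread_mutex_lock(&lock);
> 		completed = seqno;
> 		while (waiters && (int32_t)(completed - waiters->seqno) >= 0) {
> 			struct waiter *w = waiters;
>
> 			waiters = w->next;		/* dequeue first waiter */
> 			pthread_cond_signal(&w->cond);	/* wake only him */
> 		}
> 		pthread_mutex_unlock(&lock);
> 	}
>
> 	static void *client(void *arg)
> 	{
> 		wait_request((uint32_t)(uintptr_t)arg);
> 		return NULL;
> 	}
>
> 	int main(void)
> 	{
> 		pthread_t t;
>
> 		pthread_create(&t, NULL, client, (void *)1);
> 		advance_to(1);	/* may fire before the client sleeps */
> 		pthread_join(t, NULL);
> 		return 0;
> 	}
>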
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin at intel.com>
> Cc: "Gong, Zhipeng" <zhipeng.gong at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 2 +-
> drivers/gpu/drm/i915/i915_gem.c | 92 ++++--------------
> drivers/gpu/drm/i915/i915_gem_request.h | 6 ++
> drivers/gpu/drm/i915/intel_lrc.c | 3 +
> drivers/gpu/drm/i915/intel_ringbuffer.c | 159 +++++++++++++++++++++++++++++++-
> drivers/gpu/drm/i915/intel_ringbuffer.h | 9 ++
> 6 files changed, 196 insertions(+), 75 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3d4c422b3587..fe0d5ddad49d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1442,7 +1442,7 @@ struct i915_gpu_error {
> #define I915_STOP_RING_ALLOW_WARN (1 << 30)
>
> /* For missed irq/seqno simulation. */
> - unsigned int test_irq_rings;
> + unsigned long test_irq_rings;
> };
>
> enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 29bd5238b824..1a89e7cc76d1 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1144,17 +1144,6 @@ i915_gem_check_wedge(unsigned reset_counter,
> return 0;
> }
>
> -static void fake_irq(unsigned long data)
> -{
> - wake_up_process((struct task_struct *)data);
> -}
> -
> -static bool missed_irq(struct drm_i915_private *dev_priv,
> - struct intel_engine_cs *ring)
> -{
> - return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> -}
> -
> static int __i915_spin_request(struct drm_i915_gem_request *req)
> {
> unsigned long timeout;
> @@ -1199,27 +1188,17 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
> s64 *timeout,
> struct intel_rps_client *rps)
> {
> - struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
> - struct drm_i915_private *dev_priv = req->i915;
> - const bool irq_test_in_progress =
> -		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
> DEFINE_WAIT(wait);
> - unsigned long timeout_expire;
> + unsigned long timeout_remain;
> s64 before, now;
> int ret;
>
> - WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
> -
> - if (list_empty(&req->list))
> - return 0;
> -
> if (i915_gem_request_completed(req, true))
> return 0;
>
> - timeout_expire = timeout ?
> - jiffies + nsecs_to_jiffies_timeout((u64)*timeout) : 0;
> +	timeout_remain = timeout ? nsecs_to_jiffies_timeout((u64)*timeout) : 0;
>
> - intel_rps_boost(dev_priv, rps, req->emitted_jiffies);
> + intel_rps_boost(req->i915, rps, req->emitted_jiffies);
>
> /* Record current time in case interrupted by signal, or wedged */
> trace_i915_gem_request_wait_begin(req);
> @@ -1230,67 +1209,34 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
> if (ret == 0)
> goto out;
>
> - if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> - ret = -ENODEV;
> - goto out;
> - }
> -
> + intel_engine_add_wakeup(req);
> for (;;) {
> - struct timer_list timer;
> -
> - prepare_to_wait(&ring->irq_queue, &wait,
> -				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
> +		int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
>
> - /* We need to check whether any gpu reset happened in between
> - * the caller grabbing the seqno and now ... */
> -		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> - /* As we do not requeue the request over a GPU reset,
> - * if one does occur we know that the request is
> - * effectively complete.
> - */
> - ret = 0;
> - break;
> - }
> + prepare_to_wait(&req->wait, &wait, state);
>
> - if (i915_gem_request_completed(req, false)) {
> + if (i915_gem_request_completed(req, true) ||
> +		    req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) {
> ret = 0;
> break;
> }
>
> - if (interruptible && signal_pending(current)) {
> + if (signal_pending_state(state, current)) {
> ret = -ERESTARTSYS;
> break;
> }
>
> - if (timeout && time_after_eq(jiffies, timeout_expire)) {
> - ret = -ETIME;
> - break;
> - }
> -
> - i915_queue_hangcheck(dev_priv);
> -
> - trace_i915_gem_request_wait_sleep(req);
> -
> - timer.function = NULL;
> - if (timeout || missed_irq(dev_priv, ring)) {
> - unsigned long expire;
> -
> -			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> - expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
> - mod_timer(&timer, expire);
> - }
> -
> - io_schedule();
> -
> - if (timer.function) {
> - del_singleshot_timer_sync(&timer);
> - destroy_timer_on_stack(&timer);
> - }
> + if (timeout) {
> + timeout_remain = io_schedule_timeout(timeout_remain);
> + if (timeout_remain == 0) {
> + ret = -ETIME;
> + break;
> + }
> + } else
> + io_schedule();
> }
> - if (!irq_test_in_progress)
> - ring->irq_put(ring);
> -
> - finish_wait(&ring->irq_queue, &wait);
> + finish_wait(&req->wait, &wait);
> + intel_engine_remove_wakeup(req);
>
> out:
> now = ktime_get_raw_ns();
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index a5e27b7de93a..6fc295d5ba0f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -27,6 +27,7 @@
>
> #include <linux/list.h>
> #include <linux/kref.h>
> +#include <linux/rbtree.h>
>
> struct drm_i915_file_private;
> struct drm_i915_gem_object;
> @@ -60,6 +61,11 @@ struct drm_i915_gem_request {
> /** GEM sequence number associated with this request. */
> uint32_t seqno;
>
> + /** List of clients waiting for completion of this request */
> + wait_queue_head_t wait;
> + struct rb_node irq_node;
> + unsigned irq_count;
> +
> /** Position in the ringbuffer of the request */
> u32 head, tail, wa_tail;
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 70ca20ecbff4..4436616c00b8 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2024,6 +2024,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
> ring->buffer = NULL;
>
> ring->dev = dev;
> + ring->i915 = to_i915(dev);
> INIT_LIST_HEAD(&ring->request_list);
> i915_gem_batch_pool_init(ring, &ring->batch_pool);
> init_waitqueue_head(&ring->irq_queue);
> @@ -2032,6 +2033,8 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
> INIT_LIST_HEAD(&ring->execlist_completed);
> spin_lock_init(&ring->execlist_lock);
>
> + intel_engine_init_wakeup(ring);
> +
> ret = i915_cmd_parser_init_ring(ring);
> if (ret)
> goto error;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index f3fea688d2e5..6cb9a0aee833 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -33,6 +33,162 @@
> #include "i915_trace.h"
> #include "intel_drv.h"
>
> +static bool missed_irq(struct intel_engine_cs *engine)
> +{
> +	return test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
> +}
> +
> +static bool __irq_enable(struct intel_engine_cs *engine)
> +{
> + if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
> + return false;
> +
> + if (!intel_irqs_enabled(engine->i915))
> + return false;
> +
> + return engine->irq_get(engine);
> +}
> +
> +static struct drm_i915_gem_request *irq_first(struct intel_engine_cs *engine)
> +{
> + if (engine->irq_first == NULL) {
> + struct rb_node *rb;
> +
> + if (RB_EMPTY_ROOT(&engine->irq_requests))
> + return NULL;
> +
> + rb = rb_first(&engine->irq_requests);
> +		engine->irq_first = rb_entry(rb, struct drm_i915_gem_request, irq_node);
> + }
> +
> + return engine->irq_first;
> +}
> +
> +static void intel_engine_irq_wakeup(struct work_struct *work)
> +{
> + struct intel_engine_cs *engine =
> + container_of(work, struct intel_engine_cs, irq_work);
> + const bool fake_irq = !__irq_enable(engine);
> + DEFINE_WAIT(wait);
> +
> + for (;;) {
> + struct timer_list timer;
> + struct drm_i915_gem_request *request;
> +
> +		prepare_to_wait(&engine->irq_queue, &wait, TASK_INTERRUPTIBLE);
> +
> + spin_lock(&engine->irq_lock);
> + request = irq_first(engine);
> + while (request) {
> + struct rb_node *rb;
> +
> +			if (request->reset_counter == i915_reset_counter(&engine->i915->gpu_error) &&
> + !i915_gem_request_completed(request, false))
> + break;
> +
> + rb = rb_next(&request->irq_node);
> + rb_erase(&request->irq_node, &engine->irq_requests);
> + RB_CLEAR_NODE(&request->irq_node);
> +
> + wake_up_all(&request->wait);
> +
> + request =
> + rb ?
> + rb_entry(rb, typeof(*request), irq_node) :
> + NULL;
> + }
> + engine->irq_first = request;
> + spin_unlock(&engine->irq_lock);
> + if (request == NULL)
> + break;
> +
> + i915_queue_hangcheck(engine->i915);
> +
> + timer.function = NULL;
> + if (fake_irq || missed_irq(engine)) {
> +			/* fake_irq here is the bool above, not a callback;
> +			 * wake the worker directly as the deleted fake_irq()
> +			 * helper used to do.
> +			 */
> +			setup_timer_on_stack(&timer,
> +					     (void (*)(unsigned long))wake_up_process,
> +					     (unsigned long)current);
> + mod_timer(&timer, jiffies + 1);
> + }
> +
> + /* Unlike the individual clients, we do not want this
> + * background thread to contribute to the system load,
> + * i.e. we do not want to use io_schedule() here.
> + */
> + schedule();
> +
> + if (timer.function) {
> + del_singleshot_timer_sync(&timer);
> + destroy_timer_on_stack(&timer);
> + }
> + }
> + finish_wait(&engine->irq_queue, &wait);
> + if (!fake_irq)
> + engine->irq_put(engine);
> +}
> +
> +void intel_engine_init_wakeup(struct intel_engine_cs *engine)
> +{
> + init_waitqueue_head(&engine->irq_queue);
> + spin_lock_init(&engine->irq_lock);
> +	INIT_WORK(&engine->irq_work, intel_engine_irq_wakeup);
> +}
> +
> +void intel_engine_add_wakeup(struct drm_i915_gem_request *request)
> +{
> + struct intel_engine_cs *engine = i915_gem_request_get_ring(request);
> +
> + spin_lock(&engine->irq_lock);
> + if (request->irq_count++ == 0) {
> + struct rb_node **p, *parent;
> + bool first;
> +
> + if (RB_EMPTY_ROOT(&engine->irq_requests))
> + schedule_work(&engine->irq_work);
> +
> + init_waitqueue_head(&request->wait);
> +
> + first = true;
> + parent = NULL;
> + p = &engine->irq_requests.rb_node;
> + while (*p) {
> + struct drm_i915_gem_request *__req;
> +
> + parent = *p;
> + __req = rb_entry(parent, typeof(*__req), irq_node);
> +
> + if (i915_seqno_passed(request->seqno, __req->seqno)) {
> + p = &parent->rb_right;
> + first = false;
> + } else
> + p = &parent->rb_left;
> + }
> + if (first)
> + engine->irq_first = request;
> +
> + rb_link_node(&request->irq_node, parent, p);
> + rb_insert_color(&request->irq_node, &engine->irq_requests);
> + }
> + spin_unlock(&engine->irq_lock);
> +}
> +
> +void intel_engine_remove_wakeup(struct drm_i915_gem_request *request)
> +{
> + struct intel_engine_cs *engine = i915_gem_request_get_ring(request);
> +
> + if (RB_EMPTY_NODE(&request->irq_node))
> + return;
> +
> + spin_lock(&engine->irq_lock);
> +	if (--request->irq_count == 0 && !RB_EMPTY_NODE(&request->irq_node)) {
> + if (engine->irq_first == request)
> + engine->irq_first = NULL;
> + rb_erase(&request->irq_node, &engine->irq_requests);
> + }
> + spin_unlock(&engine->irq_lock);
> +}
> +
> int __intel_ring_space(int head, int tail, int size)
> {
> int space = head - tail;
> @@ -2087,6 +2243,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
> ring->buffer = ringbuf;
>
> ring->dev = dev;
> + ring->i915 = to_i915(dev);
> INIT_LIST_HEAD(&ring->request_list);
> INIT_LIST_HEAD(&ring->execlist_queue);
> 	i915_gem_batch_pool_init(ring, &ring->batch_pool);
> @@ -2095,7 +2252,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
> ringbuf->ring = ring;
> memset(ring->semaphore.sync_seqno, 0,
> sizeof(ring->semaphore.sync_seqno));
>
> - init_waitqueue_head(&ring->irq_queue);
> + intel_engine_init_wakeup(ring);
>
> if (I915_NEED_GFX_HWS(dev)) {
> ret = init_status_page(ring);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 66b7f32fd293..9a98268a55f5 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -160,6 +160,7 @@ struct intel_engine_cs {
> #define LAST_USER_RING (VECS + 1)
> u32 mmio_base;
> struct drm_device *dev;
> + struct drm_i915_private *i915;
> struct intel_ringbuffer *buffer;
>
> /*
> @@ -295,7 +296,11 @@ struct intel_engine_cs {
>
> bool gpu_caches_dirty;
>
> + spinlock_t irq_lock;
> + struct rb_root irq_requests;
> + struct drm_i915_gem_request *irq_first;
> wait_queue_head_t irq_queue;
> + struct work_struct irq_work;
>
> struct intel_context *default_context;
> struct intel_context *last_context;
> @@ -499,4 +504,8 @@ void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf);
> /* Legacy ringbuffer specific portion of reservation code: */
> int intel_ring_reserve_space(struct drm_i915_gem_request *request);
>
> +void intel_engine_init_wakeup(struct intel_engine_cs *engine);
> +void intel_engine_add_wakeup(struct drm_i915_gem_request *request);
> +void intel_engine_remove_wakeup(struct drm_i915_gem_request *request);
> +
> #endif /* _INTEL_RINGBUFFER_H_ */
> --
> 2.6.2