[Intel-gfx] [PATCH 088/190] drm/i915: Move execlists interrupt based submission to a bottom-half

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Feb 19 12:08:14 UTC 2016


Hi,

On 11/01/16 10:44, Chris Wilson wrote:
> [  196.988204] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> [  196.988512] clocksource:                       'refined-jiffies' wd_now: ffff9b48 wd_last: ffff9acb mask: ffffffff
> [  196.988559] clocksource:                       'tsc' cs_now: 4fcfa84354 cs_last: 4f95425e98 mask: ffffffffffffffff
> [  196.992115] clocksource: Switched to clocksource refined-jiffies
>
> Followed by a hard lockup.

What does the above mean? That the irq handler taking too long
interferes with timekeeping?

I like it BTW. Just the commit message needs more work. :)

What is the performance impact of just this patch in isolation? Would
it be worth progressing on its own?

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
>   drivers/gpu/drm/i915/i915_gem.c         |  15 +--
>   drivers/gpu/drm/i915/i915_irq.c         |   2 +-
>   drivers/gpu/drm/i915/intel_lrc.c        | 164 +++++++++++++++++---------------
>   drivers/gpu/drm/i915/intel_lrc.h        |   3 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h |   1 +
>   6 files changed, 98 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 378bc73296aa..15a6fddfb79b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2094,7 +2094,6 @@ static int i915_execlists(struct seq_file *m, void *data)
>   	for_each_ring(ring, dev_priv, ring_id) {
>   		struct drm_i915_gem_request *head_req = NULL;
>   		int count = 0;
> -		unsigned long flags;
>
>   		seq_printf(m, "%s\n", ring->name);
>
> @@ -2121,12 +2120,12 @@ static int i915_execlists(struct seq_file *m, void *data)
>   				   i, status, ctx_id);
>   		}
>
> -		spin_lock_irqsave(&ring->execlist_lock, flags);
> +		spin_lock(&ring->execlist_lock);
>   		list_for_each(cursor, &ring->execlist_queue)
>   			count++;
>   		head_req = list_first_entry_or_null(&ring->execlist_queue,
>   				struct drm_i915_gem_request, execlist_link);
> -		spin_unlock_irqrestore(&ring->execlist_lock, flags);
> +		spin_unlock(&ring->execlist_lock);
>
>   		seq_printf(m, "\t%d requests in queue\n", count);
>   		if (head_req) {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 391f840d29b7..eb875ecd7907 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2192,13 +2192,13 @@ static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine)
>   	 */
>
>   	if (i915.enable_execlists) {
> -		spin_lock_irq(&engine->execlist_lock);
> +		spin_lock(&engine->execlist_lock);
>
>   		/* list_splice_tail_init checks for empty lists */
>   		list_splice_tail_init(&engine->execlist_queue,
>   				      &engine->execlist_retired_req_list);
>
> -		spin_unlock_irq(&engine->execlist_lock);
> +		spin_unlock(&engine->execlist_lock);
>   		intel_execlists_retire_requests(engine);
>   	}
>
> @@ -2290,15 +2290,8 @@ i915_gem_retire_requests(struct drm_device *dev)
>   	for_each_ring(ring, dev_priv, i) {
>   		i915_gem_retire_requests_ring(ring);
>   		idle &= list_empty(&ring->request_list);
> -		if (i915.enable_execlists) {
> -			unsigned long flags;
> -
> -			spin_lock_irqsave(&ring->execlist_lock, flags);
> -			idle &= list_empty(&ring->execlist_queue);
> -			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> -
> -			intel_execlists_retire_requests(ring);
> -		}
> +		if (i915.enable_execlists)
> +			idle &= intel_execlists_retire_requests(ring);
>   	}
>
>   	if (idle)
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ce047ac84f5f..b2ef2d0c211b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1316,7 +1316,7 @@ gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir, int test_shift)
>   	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
>   		notify_ring(ring);
>   	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
> -		intel_lrc_irq_handler(ring);
> +		wake_up_process(ring->execlists_submit);
>   }
>
>   static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index b5f62b5f4913..de5889e95d6d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -132,6 +132,8 @@
>    *
>    */
>
> +#include <linux/kthread.h>
> +
>   #include <drm/drmP.h>
>   #include <drm/i915_drm.h>
>   #include "i915_drv.h"
> @@ -341,7 +343,7 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>   	rq0->elsp_submitted++;
>
>   	/* You must always write both descriptors in the order below. */
> -	spin_lock(&dev_priv->uncore.lock);
> +	spin_lock_irq(&dev_priv->uncore.lock);
>   	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
>   	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
>   	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> @@ -353,7 +355,7 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>   	/* ELSP is a wo register, use another nearby reg for posting */
>   	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
>   	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
> -	spin_unlock(&dev_priv->uncore.lock);
> +	spin_unlock_irq(&dev_priv->uncore.lock);
>   }
>
>   static int execlists_update_context(struct drm_i915_gem_request *rq)
> @@ -492,89 +494,84 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>   	return false;
>   }
>
> -static void get_context_status(struct intel_engine_cs *ring,
> -			       u8 read_pointer,
> -			       u32 *status, u32 *context_id)
> +static void set_rtpriority(void)
>   {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -
> -	if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
> -		return;
> -
> -	*status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
> -	*context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
> +	 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2-1 };
> +	 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
>   }
>
> -/**
> - * intel_lrc_irq_handler() - handle Context Switch interrupts
> - * @ring: Engine Command Streamer to handle.
> - *
> - * Check the unread Context Status Buffers and manage the submission of new
> - * contexts to the ELSP accordingly.
> - */
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring)
> +static int intel_execlists_submit(void *arg)
>   {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -	u32 status_pointer;
> -	u8 read_pointer;
> -	u8 write_pointer;
> -	u32 status = 0;
> -	u32 status_id;
> -	u32 submit_contexts = 0;
> +	struct intel_engine_cs *ring = arg;
> +	struct drm_i915_private *dev_priv = ring->i915;
>
> -	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
> +	set_rtpriority();
>
> -	read_pointer = ring->next_context_status_buffer;
> -	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
> -	if (read_pointer > write_pointer)
> -		write_pointer += GEN8_CSB_ENTRIES;
> +	do {
> +		u32 status;
> +		u32 status_id;
> +		u32 submit_contexts;
> +		u8 head, tail;
>
> -	spin_lock(&ring->execlist_lock);
> +		set_current_state(TASK_INTERRUPTIBLE);

Hm, what is the effect of setting TASK_INTERRUPTIBLE at this stage 
rather than just before the call to schedule?

And why interruptible?
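
(If the intent is the classic lost-wake-up avoidance, then I would read
it as roughly the below - csb_work_pending() being a made-up stand-in
for the head == tail test, not a real helper - and maybe a comment in
the code would help:)

	/*
	 * Illustrative only: setting the task state before re-checking the
	 * condition means a wake_up_process() from the interrupt handler,
	 * arriving between the check and schedule(), flips us back to
	 * TASK_RUNNING, so the wake-up is not lost.
	 */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (csb_work_pending(ring) || kthread_should_stop())
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);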

> +		head = ring->next_context_status_buffer;
> +		tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
> +		if (head == tail) {
> +			if (kthread_should_stop())
> +				return 0;
>
> -	while (read_pointer < write_pointer) {
> +			schedule();
> +			continue;
> +		}
> +		__set_current_state(TASK_RUNNING);
>
> -		get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
> -				   &status, &status_id);
> +		if (head > tail)
> +			tail += GEN8_CSB_ENTRIES;
>
> -		if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
> -			continue;
> +		status = 0;
> +		submit_contexts = 0;
>
> -		if (status & GEN8_CTX_STATUS_PREEMPTED) {
> -			if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
> -				if (execlists_check_remove_request(ring, status_id))
> -					WARN(1, "Lite Restored request removed from queue\n");
> -			} else
> -				WARN(1, "Preemption without Lite Restore\n");
> -		}
> +		spin_lock(&ring->execlist_lock);
>
> -		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
> -		    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
> -			if (execlists_check_remove_request(ring, status_id))
> -				submit_contexts++;
> -		}
> -	}
> +		while (head++ < tail) {
> +			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
> +			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));

One potentially cheap improvement I've been thinking of is to move the
CSB reading outside the execlist_lock, to avoid the slow MMIO reads
contending on the lock.

We could fetch all valid entries into a temporary local buffer, then 
grab the execlist_lock and process completions and submission from there.
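
Something along these lines, completely untested, just to illustrate
the shape:

	u32 csb[GEN8_CSB_ENTRIES][2];
	unsigned int count = 0, i;

	/* Drain the CSB with plain MMIO reads, no lock held. */
	while (head++ < tail) {
		csb[count][0] = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
		csb[count][1] = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
		count++;
	}

	spin_lock(&ring->execlist_lock);
	for (i = 0; i < count; i++) {
		status = csb[i][0];
		status_id = csb[i][1];
		/* ...same completion handling as in the patch... */
	}
	/* ...unqueue, unlock and update the read pointer as before... */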

> -	if (disable_lite_restore_wa(ring)) {
> -		/* Prevent a ctx to preempt itself */
> -		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
> -		    (submit_contexts != 0))
> -			execlists_context_unqueue(ring);
> -	} else if (submit_contexts != 0) {
> -		execlists_context_unqueue(ring);
> -	}
> +			if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
> +				continue;
>
> -	spin_unlock(&ring->execlist_lock);
> +			if (status & GEN8_CTX_STATUS_PREEMPTED) {
> +				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
> +					if (execlists_check_remove_request(ring, status_id))
> +						WARN(1, "Lite Restored request removed from queue\n");
> +				} else
> +					WARN(1, "Preemption without Lite Restore\n");
> +			}
>
> -	if (unlikely(submit_contexts > 2))
> -		DRM_ERROR("More than two context complete events?\n");
> +			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
> +			    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
> +				if (execlists_check_remove_request(ring, status_id))
> +					submit_contexts++;
> +			}
> +		}
> +
> +		if (disable_lite_restore_wa(ring)) {
> +			/* Prevent a ctx to preempt itself */
> +			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
> +					(submit_contexts != 0))
> +				execlists_context_unqueue(ring);
> +		} else if (submit_contexts != 0) {
> +			execlists_context_unqueue(ring);
> +		}
>
> -	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
> +		spin_unlock(&ring->execlist_lock);

Would it be worth trying to grab the mutex and unpin the LRCs at this
point? It would slow down the thread a bit but would get rid of the
retired work queue. With the vma->iomap thingy it could be quite cheap?
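
Very hand-wavy sketch of what I mean, with no claim that taking
struct_mutex from the thread here is actually safe:

	/* Opportunistically retire/unpin from the thread instead of
	 * deferring everything to execlist_retired_req_list. */
	if (mutex_trylock(&ring->dev->struct_mutex)) {
		intel_execlists_retire_requests(ring);
		mutex_unlock(&ring->dev->struct_mutex);
	}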

> -	/* Update the read pointer to the old write pointer. Manual ringbuffer
> -	 * management ftw </sarcasm> */
> -	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> -		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> -				 ring->next_context_status_buffer << 8));
> +		WARN(submit_contexts > 2, "More than two context complete events?\n");
> +		ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
> +		I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> +			   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
> +					 ring->next_context_status_buffer<<8));
> +	} while (1);
>   }
>
>   static int execlists_context_queue(struct drm_i915_gem_request *request)
> @@ -585,7 +582,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>
>   	i915_gem_request_get(request);
>
> -	spin_lock_irq(&engine->execlist_lock);
> +	spin_lock(&engine->execlist_lock);
>
>   	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
>   		if (++num_elements > 2)
> @@ -611,7 +608,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>   	if (num_elements == 0)
>   		execlists_context_unqueue(engine);
>
> -	spin_unlock_irq(&engine->execlist_lock);
> +	spin_unlock(&engine->execlist_lock);

Hm, another thing which could be quite cool: what if we just woke the
thread here and let it submit the request, instead of doing the
submission directly from third-party context?

That would serialize all ELSP accesses for free, since they would only
ever happen from a single thread, and it could potentially reduce the
scope of the lock even more.
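
In execlists_context_queue() that would look roughly like the below
(assuming the thread's wait condition also learns to check for a
non-empty execlist_queue, which it does not do in this patch):

	/* Kick the submission thread rather than unqueueing directly. */
	if (num_elements == 0)
		wake_up_process(engine->execlists_submit);

	spin_unlock(&engine->execlist_lock);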

But it would mean extra latency when transitioning the idle engine to 
busy. Maybe it wouldn't matter for such workloads.

Regards,

Tvrtko

