[Intel-gfx] [PATCH 4/5] drm/i915/execlists: Split spinlock from its irq disabling side-effect

Fri Mar 2 15:50:53 UTC 2018

Chris Wilson <chris at chris-wilson.co.uk> writes:

> During reset/wedging, we have to clean up the requests on the timeline
> and flush the pending interrupt state. Currently, we are abusing the irq
> disabling of the timeline spinlock to protect the irq state in
> conjunction to the engine's timeline requests, but this is accidental
> and conflates the spinlock with the irq state. A baffling state of
> affairs for the reader.
>
> Instead, explicitly disable irqs over the critical section, and separate
> modifying the irq state from the timeline's requests.
>
> Suggested-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 21 +++++++++++++++------
>  1 file changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0482e54c94f0..7d1109aceabb 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -689,11 +689,13 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  
>  	GEM_TRACE("%s\n", engine->name);
>  
> -	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	local_irq_save(flags);

Chris explained in irc that this is for lockdep only. It was a bit
confusing as we already are guaranteed exclusive access to
state by tasklet being killed and dead at this point.

I think this warrants a comment that this is to soothe lockdep.
>  
>  	/* Cancel the requests on the HW and clear the ELSP tracker. */
>  	execlists_cancel_port_requests(execlists);
>  
> +	spin_lock(&engine->timeline->lock);
> +
>  	/* Mark all executing requests as skipped. */
>  	list_for_each_entry(rq, &engine->timeline->requests, link) {
>  		GEM_BUG_ON(!rq->global_seqno);
> @@ -727,6 +729,8 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	execlists->first = NULL;
>  	GEM_BUG_ON(port_isset(execlists->port));
>  
> +	spin_unlock(&engine->timeline->lock);
> +
>  	/*
>  	 * The port is checked prior to scheduling a tasklet, but
>  	 * just in case we have suspended the tasklet to do the
> @@ -738,7 +742,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	/* Mark all CS interrupts as complete */
>  	execlists->active = 0;
>  
> -	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +	local_irq_restore(flags);
>  }
>  
>  /*
> @@ -1618,10 +1622,11 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	GEM_TRACE("%s seqno=%x\n",
>  		  engine->name, request ? request->global_seqno : 0);
>  
> -	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	local_irq_save(flags);
>  
>  	reset_irq(engine);
>  
> +

Unintentional extra line?

Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>

>  	/*
>  	 * Catch up with any missed context-switch interrupts.
>  	 *
> @@ -1634,14 +1639,17 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	execlists_cancel_port_requests(execlists);
>  
>  	/* Push back any incomplete requests for replay after the reset. */
> +	spin_lock(&engine->timeline->lock);
>  	__unwind_incomplete_requests(engine);
> +	spin_unlock(&engine->timeline->lock);
>  
>  	/* Mark all CS interrupts as complete */
>  	execlists->active = 0;
>  
> -	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +	local_irq_restore(flags);
>  
> -	/* If the request was innocent, we leave the request in the ELSP
> +	/*
> +	 * If the request was innocent, we leave the request in the ELSP
>  	 * and will try to replay it on restarting. The context image may
>  	 * have been corrupted by the reset, in which case we may have
>  	 * to service a new GPU hang, but more likely we can continue on
> @@ -1654,7 +1662,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	if (!request || request->fence.error != -EIO)
>  		return;
>  
> -	/* We want a simple context + ring to execute the breadcrumb update.
> +	/*
> +	 * We want a simple context + ring to execute the breadcrumb update.
>  	 * We cannot rely on the context being intact across the GPU hang,
>  	 * so clear it and rebuild just what we need for the breadcrumb.
>  	 * All pending requests for this context will be zapped, and any
> -- 
> 2.16.2