[Intel-gfx] [PATCH v4] drm/i915/lrc: Scrub the GPU state of the guilty hanging request

Mon Apr 30 15:49:53 UTC 2018

On 04/28/2018 04:15 AM, Chris Wilson wrote:
> Previously, we just reset the ring register in the context image such
> that we could skip over the broken batch and emit the closing
> breadcrumb. However, on resume the context image and GPU state would be
> reloaded, which may have been left in an inconsistent state by the
> reset. The presumption was that at worst it would just cause another
> reset and skip again until it recovered, however it seems just as likely
> to cause an unrecoverable hang. Instead of risking loading an incomplete
> context image, restore it back to the default state.
> 
> v2: Fix up off-by-one from including the ppHSWP in with the register
> state.
> v3: Use a ring local to compact a few lines.
> v4: Beware setting the ring local before checking for a NULL request.

Didn't you want to set the ring local after this check?
	if (!request || request->fence.error != -EIO)

This is identical to v2.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Reviewed-by: Michel Thierry <michel.thierry at intel.com> #v2
> ---
>   drivers/gpu/drm/i915/intel_lrc.c | 24 +++++++++++++++++-------
>   1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ce23d5116482..513aee6b3634 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   			      struct i915_request *request)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	struct intel_context *ce;
>   	unsigned long flags;
> +	u32 *regs;
>   
>   	GEM_TRACE("%s request global=%x, current=%d\n",
>   		  engine->name, request ? request->global_seqno : 0,
> @@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   	 * future request will be after userspace has had the opportunity
>   	 * to recreate its own state.
>   	 */
> -	ce = &request->ctx->engine[engine->id];
> -	execlists_init_reg_state(ce->lrc_reg_state,
> -				 request->ctx, engine, ce->ring);
> +	regs = request->ctx->engine[engine->id].lrc_reg_state;
> +	if (engine->default_state) {
> +		void *defaults;
> +
> +		defaults = i915_gem_object_pin_map(engine->default_state,
> +						   I915_MAP_WB);
> +		if (!IS_ERR(defaults)) {
> +			memcpy(regs, /* skip restoring the vanilla PPHWSP */
> +			       defaults + LRC_STATE_PN * PAGE_SIZE,
> +			       engine->context_size - PAGE_SIZE);
> +			i915_gem_object_unpin_map(engine->default_state);
> +		}
> +	}
> +	execlists_init_reg_state(regs, request->ctx, engine, request->ring);
>   
>   	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> -	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> -		i915_ggtt_offset(ce->ring->vma);
> -	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
> +	regs[CTX_RING_HEAD + 1] = request->postfix;
>   
>   	request->ring->head = request->postfix;
>   	intel_ring_update_space(request->ring);
>