[PATCH v1] drm/i915/gt: Retry RING_HEAD reset until it sticks

Andi Shyti andi.shyti at linux.intel.com
Wed Oct 9 22:48:28 UTC 2024


Hi Nitin,

On Thu, Oct 03, 2024 at 07:40:44PM +0530, Nitin Gote wrote:
> From: Chris Wilson <chris at chris-wilson.co.uk>
> 
> On Haswell, in particular, we see an issue where resets fails because
> the engine resumes from an incorrect RING_HEAD. Since the RING_HEAD
> doesn't point to the remaining requests to re-run, but may instead point
> into the uninitialised portion of the ring, the GPU may be then fed
> invalid instructions from a privileged context, oft pushing the GPU into
> an unrecoverable hang.
> 
> If at first the write doesn't succeed, try, try again.
> 
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/5432
> Testcase: igt/i915_selftest/hangcheck
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

I believe this is now Chris Wilson <chris.p.wilson at linux.intel.com>

> Signed-off-by: Nitin Gote <nitin.r.gote at intel.com>
> ---
>  .../gpu/drm/i915/gt/intel_ring_submission.c   | 44 +++++++++++++------
>  drivers/gpu/drm/i915/i915_utils.h             |  9 ++++
>  2 files changed, 39 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 72277bc8322e..6427f07ed23e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -192,6 +192,7 @@ static bool stop_ring(struct intel_engine_cs *engine)
>  static int xcs_resume(struct intel_engine_cs *engine)
>  {
>  	struct intel_ring *ring = engine->legacy.ring;
> +	ktime_t kt;
>  
>  	ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
>  		     ring->head, ring->tail);
> @@ -230,9 +231,20 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	set_pp_dir(engine);
>  
>  	/* First wake the ring up to an empty/idle ring */
> -	ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
> +	until_timeout_ns(kt, 2 * NSEC_PER_MSEC) {

Can you please use the for loop directly here?

And we need some commenting here to understand. Besides, if the
change is for haswell, why don't we limit it to haswell?

> +		ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
> +		if (ENGINE_READ_FW(engine, RING_HEAD) == ring->head)
> +			break;
> +	}
> +

...

> @@ -259,16 +275,16 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	return 0;
>  
>  err:
> -	drm_err(&engine->i915->drm,
> -		"%s initialization failed; "
> -		"ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
> -		engine->name,
> -		ENGINE_READ(engine, RING_CTL),
> -		ENGINE_READ(engine, RING_CTL) & RING_VALID,
> -		ENGINE_READ(engine, RING_HEAD), ring->head,
> -		ENGINE_READ(engine, RING_TAIL), ring->tail,
> -		ENGINE_READ(engine, RING_START),
> -		i915_ggtt_offset(ring->vma));
> +	ENGINE_TRACE(engine,
> +		     "initialization failed; "
> +		     "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
> +		     ENGINE_READ(engine, RING_CTL),
> +		     ENGINE_READ(engine, RING_CTL) & RING_VALID,
> +		     ENGINE_READ(engine, RING_HEAD), ring->head,
> +		     ENGINE_READ(engine, RING_TAIL), ring->tail,
> +		     ENGINE_READ(engine, RING_START),
> +		     i915_ggtt_offset(ring->vma));
> +	GEM_TRACE_DUMP();

I believe this is out of the scope of this patch and should go in
another patch.

>  	return -EIO;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index 609214231ffc..538dfb8fa1d8 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -232,6 +232,15 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
>  	}
>  }
>  
> +/*
> + * until_timeout_ns - Keep retrying (busy spin) until the duration has passed
> + */
> +
> +#define until_timeout_ns(end, timeout_ns) \
> +	for ((end) = ktime_get() + (timeout_ns); \
> +			ktime_before(ktime_get(), (end)); \
> +			cpu_relax())
> +

If you use this directly above then you don't need for this
define.

Andi

>  /*
>   * __wait_for - magic wait macro
>   *
> -- 
> 2.25.1


More information about the dri-devel mailing list