[Intel-gfx] [PATCH] drm/i915: Stop engines when declaring the machine wedged

Mika Kuoppala mika.kuoppala at linux.intel.com
Thu Mar 15 15:44:29 UTC 2018


Chris Wilson <chris at chris-wilson.co.uk> writes:

> If we fail to reset the GPU, we declare the machine wedged. However, the
> GPU may well still be running in the background with an in-flight
> request. So despite our efforts in cleaning up the request queue and
> faking the breadcrumb in the HWSP, the GPU may eventually write the
> in-flght seqno there breaking all of our assumptions and throwing the
> driver into a deep turmoil, wedging beyond wedged.
>
> To avoid this we ideally want to reset the GPU. Since that has already
> failed, make sure the rings have the stop bit set instead. This is part
> of the normal GPU reset sequence, but that is actually disabled by
> igt/gem_eio to force the wedged state. If we assume the worst, we must
> poke at the bit again before we give up.

I am pondering if you would save so much trouble by just
adding

int intel_get_gpu_reset(struct drm_i915_private *dev_priv,
bool ignore_modparam)

and then forcing a stop engine and the last reset
straight in wedging

-Mika

>
> Testcase: igt/gem_eio
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: MichaƂ Winiarski <michal.winiarski at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
> ---
> git add fail
> We want to stop the tasklets before hitting stop-engines or else we
> may trigger an early CS completion and more asserts.
> -Chris
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  2 ++
>  drivers/gpu/drm/i915/intel_engine_cs.c  | 43 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
>  drivers/gpu/drm/i915/intel_uncore.c     | 46 +--------------------------------
>  4 files changed, 48 insertions(+), 45 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2fbd622bba30..208619981ea1 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3246,6 +3246,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	}
>  	i915->caps.scheduler = 0;
>  
> +	intel_engines_stop(i915, ALL_ENGINES);
> +
>  	/*
>  	 * Make sure no one is running the old callback before we proceed with
>  	 * cancelling requests and resetting the completion tracking. Otherwise
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 337dfa56a738..44eddf10f4d1 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1727,6 +1727,49 @@ unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915)
>  	return which;
>  }
>  
> +static void gen3_stop_engine(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	const u32 base = engine->mmio_base;
> +	const i915_reg_t mode = RING_MI_MODE(base);
> +
> +	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
> +	if (intel_wait_for_register_fw(dev_priv,
> +				       mode,
> +				       MODE_IDLE,
> +				       MODE_IDLE,
> +				       500))
> +		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
> +				 engine->name);
> +
> +	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> +	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> +
> +	I915_WRITE_FW(RING_HEAD(base), 0);
> +	I915_WRITE_FW(RING_TAIL(base), 0);
> +	POSTING_READ_FW(RING_TAIL(base));
> +
> +	/* The ring must be empty before it is disabled */
> +	I915_WRITE_FW(RING_CTL(base), 0);
> +
> +	/* Check acts as a post */
> +	if (I915_READ_FW(RING_HEAD(base)) != 0)
> +		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> +				 engine->name);
> +}
> +
> +void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	if (INTEL_GEN(i915) < 3)
> +		return;
> +
> +	for_each_engine_masked(engine, i915, engine_mask, id)
> +		gen3_stop_engine(engine);
> +}
> +
>  static void print_request(struct drm_printer *m,
>  			  struct i915_request *rq,
>  			  const char *prefix)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 1f50727a5ddb..1369f7c5b57e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -1018,6 +1018,8 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
>  void intel_engines_park(struct drm_i915_private *i915);
>  void intel_engines_unpark(struct drm_i915_private *i915);
>  
> +void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask);
> +
>  void intel_engines_reset_default_submission(struct drm_i915_private *i915);
>  unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915);
>  
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 4df7c2ef8576..cda4c06acf16 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1639,50 +1639,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>  	return ret;
>  }
>  
> -static void gen3_stop_engine(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	const u32 base = engine->mmio_base;
> -	const i915_reg_t mode = RING_MI_MODE(base);
> -
> -	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
> -	if (intel_wait_for_register_fw(dev_priv,
> -				       mode,
> -				       MODE_IDLE,
> -				       MODE_IDLE,
> -				       500))
> -		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
> -				 engine->name);
> -
> -	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> -	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> -
> -	I915_WRITE_FW(RING_HEAD(base), 0);
> -	I915_WRITE_FW(RING_TAIL(base), 0);
> -	POSTING_READ_FW(RING_TAIL(base));
> -
> -	/* The ring must be empty before it is disabled */
> -	I915_WRITE_FW(RING_CTL(base), 0);
> -
> -	/* Check acts as a post */
> -	if (I915_READ_FW(RING_HEAD(base)) != 0)
> -		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> -				 engine->name);
> -}
> -
> -static void i915_stop_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	if (INTEL_GEN(dev_priv) < 3)
> -		return;
> -
> -	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> -		gen3_stop_engine(engine);
> -}
> -
>  static bool i915_in_reset(struct pci_dev *pdev)
>  {
>  	u8 gdrst;
> @@ -2057,7 +2013,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>  		 *
>  		 * FIXME: Wa for more modern gens needs to be validated
>  		 */
> -		i915_stop_engines(dev_priv, engine_mask);
> +		intel_engines_stop(dev_priv, engine_mask);
>  
>  		ret = -ENODEV;
>  		if (reset)
> -- 
> 2.16.2


More information about the Intel-gfx mailing list