[Intel-gfx] [PATCH 04/20] drm/i915: TDR / per-engine hang detection

Wed Jan 13 12:37:27 PST 2016

On Wed, Jan 13, 2016 at 05:28:16PM +0000, Arun Siluvery wrote:
> From: Tomas Elf <tomas.elf at intel.com>
> 
> With the per-engine hang recovery path already in place this patch adds
> per-engine hang detection by letting the periodic hang checker detect hangs on
> individual engines and communicate this to the error handler. During hang
> checking every engine is checked and the hang detection status for each engine
> is aggregated into a single 32-bit engine flag mask that contains all the
> engine flags (1 << ring->id) of all the hung engines or'ed together. The
> per-engine path in the error handler then sets up the hangcheck state for each
> invidual, hung engine based on the engine flag mask before potentially calling
> the per-engine hang recovery path.
> 
> This allows the hang detection to happen in lock-step for all engines in
> parallel and lets the driver process all hung engines in turn in the error
> handler.
> 
> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h     |  4 ++--
>  drivers/gpu/drm/i915/i915_irq.c     | 41 +++++++++++++++++++++++--------------
>  3 files changed, 29 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index e3377ab..6d1b6c3 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val)
>  
>  	intel_runtime_pm_get(dev_priv);
>  
> -	i915_handle_error(dev, val,
> +	i915_handle_error(dev, 0x0, val,
>  			  "Manually setting wedged to %llu", val);
>  
>  	intel_runtime_pm_put(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e866f14..85cf692 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2765,8 +2765,8 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
>  
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
> -__printf(3, 4)
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +__printf(4, 5)
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
>  		       const char *fmt, ...);
>  
>  extern void intel_irq_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6a0ec37..fef74cf 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2712,15 +2712,29 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>  
>  /**
>   * i915_handle_error - handle a gpu error
> - * @dev: drm device
> + * @dev: 		drm device
> + * @engine_mask: 	Bit mask containing the engine flags of all engines
> + *			associated with one or more detected errors.
> + *			May be 0x0.
> + *			If wedged is set to true this implies that one or more
> + *			engine hangs were detected. In this case we will
> + *			attempt to reset all engines that have been detected
> + *			as hung.
> + *			If a previous engine reset was attempted too recently
> + *			or if one of the current engine resets fails we fall
> + *			back to legacy full GPU reset.
> + * @wedged: 		true = Hang detected, invoke hang recovery.

These two look to be tautological. If wedged == 0, we expect engine_mask
to be 0 zero as well since we will not be resetting the engines, just
capturing error state. It's not though. Conversely if wedged == 1, we
expect engine_mask to be set. Again, it's not and I think there the
caller is wrong.

> + * @fmt, ...: 		Error message describing reason for error.
>   *
>   * Do some basic checking of register state at error time and
>   * dump it to the syslog.  Also call i915_capture_error_state() to make
>   * sure we get a record and make it available in debugfs.  Fire a uevent
>   * so userspace knows something bad happened (should trigger collection
> - * of a ring dump etc.).
> + * of a ring dump etc.). If a hang was detected (wedged = true) try to
> + * reset the associated engine. Failing that, try to fall back to legacy
> + * full GPU reset recovery mode.
>   */
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
>  		       const char *fmt, ...)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -2729,12 +2743,6 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>  
>  	struct intel_engine_cs *engine;
>  
> -	/*
> -	 * NB: Placeholder until the hang checker supports
> -	 * per-engine hang detection.
> -	 */
> -	u32 engine_mask = 0;
> -
>  	va_start(args, fmt);
>  	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
>  	va_end(args);
> @@ -3162,7 +3170,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>  	 */
>  	tmp = I915_READ_CTL(ring);
>  	if (tmp & RING_WAIT) {
> -		i915_handle_error(dev, false,
> +		i915_handle_error(dev, intel_ring_flag(ring), false,
>  				  "Kicking stuck wait on %s",
>  				  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
> @@ -3174,7 +3182,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>  		default:
>  			return HANGCHECK_HUNG;
>  		case 1:
> -			i915_handle_error(dev, false,
> +			i915_handle_error(dev, intel_ring_flag(ring), false,
>  					  "Kicking stuck semaphore on %s",
>  					  ring->name);
>  			I915_WRITE_CTL(ring, tmp);
> @@ -3203,7 +3211,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  	struct drm_device *dev = dev_priv->dev;
>  	struct intel_engine_cs *ring;
>  	int i;
> -	int busy_count = 0, rings_hung = 0;
> +	u32 engine_mask = 0;
> +	int busy_count = 0;
>  	bool stuck[I915_NUM_RINGS] = { 0 };
>  #define BUSY 1
>  #define KICK 5
> @@ -3316,12 +3325,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> -			rings_hung++;
> +
> +			engine_mask |= intel_ring_flag(ring);
> +			ring->hangcheck.tdr_count++;

tdr_count was introduced unused in the previous patch and here it has
nothing to do with tdr, but just hangcheck pure and simple. It would
actually be a useful statistic for hangcheck/error-state...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre