[Intel-gfx] [PATCH 04/20] drm/i915: TDR / per-engine hang detection
Chris Wilson
chris at chris-wilson.co.uk
Wed Jan 13 12:37:27 PST 2016
On Wed, Jan 13, 2016 at 05:28:16PM +0000, Arun Siluvery wrote:
> From: Tomas Elf <tomas.elf at intel.com>
>
> With the per-engine hang recovery path already in place this patch adds
> per-engine hang detection by letting the periodic hang checker detect hangs on
> individual engines and communicate this to the error handler. During hang
> checking every engine is checked and the hang detection status for each engine
> is aggregated into a single 32-bit engine flag mask that contains all the
> engine flags (1 << ring->id) of all the hung engines or'ed together. The
> per-engine path in the error handler then sets up the hangcheck state for each
> invidual, hung engine based on the engine flag mask before potentially calling
> the per-engine hang recovery path.
>
> This allows the hang detection to happen in lock-step for all engines in
> parallel and lets the driver process all hung engines in turn in the error
> handler.
>
> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 2 +-
> drivers/gpu/drm/i915/i915_drv.h | 4 ++--
> drivers/gpu/drm/i915/i915_irq.c | 41 +++++++++++++++++++++++--------------
> 3 files changed, 29 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index e3377ab..6d1b6c3 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val)
>
> intel_runtime_pm_get(dev_priv);
>
> - i915_handle_error(dev, val,
> + i915_handle_error(dev, 0x0, val,
> "Manually setting wedged to %llu", val);
>
> intel_runtime_pm_put(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e866f14..85cf692 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2765,8 +2765,8 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
>
> /* i915_irq.c */
> void i915_queue_hangcheck(struct drm_device *dev);
> -__printf(3, 4)
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +__printf(4, 5)
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
> const char *fmt, ...);
>
> extern void intel_irq_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6a0ec37..fef74cf 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2712,15 +2712,29 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>
> /**
> * i915_handle_error - handle a gpu error
> - * @dev: drm device
> + * @dev: drm device
> + * @engine_mask: Bit mask containing the engine flags of all engines
> + * associated with one or more detected errors.
> + * May be 0x0.
> + * If wedged is set to true this implies that one or more
> + * engine hangs were detected. In this case we will
> + * attempt to reset all engines that have been detected
> + * as hung.
> + * If a previous engine reset was attempted too recently
> + * or if one of the current engine resets fails we fall
> + * back to legacy full GPU reset.
> + * @wedged: true = Hang detected, invoke hang recovery.
These two look to be tautological. If wedged == 0, we expect engine_mask
to be 0 zero as well since we will not be resetting the engines, just
capturing error state. It's not though. Conversely if wedged == 1, we
expect engine_mask to be set. Again, it's not and I think there the
caller is wrong.
> + * @fmt, ...: Error message describing reason for error.
> *
> * Do some basic checking of register state at error time and
> * dump it to the syslog. Also call i915_capture_error_state() to make
> * sure we get a record and make it available in debugfs. Fire a uevent
> * so userspace knows something bad happened (should trigger collection
> - * of a ring dump etc.).
> + * of a ring dump etc.). If a hang was detected (wedged = true) try to
> + * reset the associated engine. Failing that, try to fall back to legacy
> + * full GPU reset recovery mode.
> */
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
> const char *fmt, ...)
> {
> struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -2729,12 +2743,6 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>
> struct intel_engine_cs *engine;
>
> - /*
> - * NB: Placeholder until the hang checker supports
> - * per-engine hang detection.
> - */
> - u32 engine_mask = 0;
> -
> va_start(args, fmt);
> vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> va_end(args);
> @@ -3162,7 +3170,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> */
> tmp = I915_READ_CTL(ring);
> if (tmp & RING_WAIT) {
> - i915_handle_error(dev, false,
> + i915_handle_error(dev, intel_ring_flag(ring), false,
> "Kicking stuck wait on %s",
> ring->name);
> I915_WRITE_CTL(ring, tmp);
> @@ -3174,7 +3182,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> default:
> return HANGCHECK_HUNG;
> case 1:
> - i915_handle_error(dev, false,
> + i915_handle_error(dev, intel_ring_flag(ring), false,
> "Kicking stuck semaphore on %s",
> ring->name);
> I915_WRITE_CTL(ring, tmp);
> @@ -3203,7 +3211,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> struct drm_device *dev = dev_priv->dev;
> struct intel_engine_cs *ring;
> int i;
> - int busy_count = 0, rings_hung = 0;
> + u32 engine_mask = 0;
> + int busy_count = 0;
> bool stuck[I915_NUM_RINGS] = { 0 };
> #define BUSY 1
> #define KICK 5
> @@ -3316,12 +3325,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> DRM_INFO("%s on %s\n",
> stuck[i] ? "stuck" : "no progress",
> ring->name);
> - rings_hung++;
> +
> + engine_mask |= intel_ring_flag(ring);
> + ring->hangcheck.tdr_count++;
tdr_count was introduced unused in the previous patch and here it has
nothing to do with tdr, but just hangcheck pure and simple. It would
actually be a useful statistic for hangcheck/error-state...
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
More information about the Intel-gfx
mailing list