[Intel-gfx] [PATCH 2/3] drm/i915: Add control flags to i915_handle_error()
Michel Thierry
michel.thierry at intel.com
Mon Mar 19 16:48:01 UTC 2018
On 16/03/18 14:50, Chris Wilson wrote:
> Not all callers want the GPU error to handled in the same way, so expose
> a control parameter. In the first instance, some callers do not want the
> heavyweight error capture so add a bit to request the state to be
> captured and saved.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Jeff McGee <jeff.mcgee at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 4 ++--
> drivers/gpu/drm/i915/i915_drv.h | 4 +++-
> drivers/gpu/drm/i915/i915_irq.c | 22 ++++++++++++++--------
> drivers/gpu/drm/i915/intel_hangcheck.c | 6 +++---
> drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +-
> 5 files changed, 23 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 5378863e3238..03b74a92caed 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3952,8 +3952,8 @@ i915_wedged_set(void *data, u64 val)
> engine->hangcheck.stalled = true;
> }
>
> - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx",
> - val);
> + i915_handle_error(i915, val, I915_ERROR_CAPTURE,
> + "Manually set wedged engine mask = %llx", val);
>
> wait_on_bit(&i915->gpu_error.flags,
> I915_RESET_HANDOFF,
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e27ba8fb64e6..53009ba50640 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2751,10 +2751,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
> &dev_priv->gpu_error.hangcheck_work, delay);
> }
>
> -__printf(3, 4)
> +__printf(4, 5)
> void i915_handle_error(struct drm_i915_private *dev_priv,
> u32 engine_mask,
> + unsigned long flags,
> const char *fmt, ...);
> +#define I915_ERROR_CAPTURE BIT(0)
>
Should this be in the new i915_gpu_error.h?
> extern void intel_irq_init(struct drm_i915_private *dev_priv);
> extern void intel_irq_fini(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 44eef355e12c..b3a4dc7cb26c 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2955,6 +2955,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> * i915_handle_error - handle a gpu error
> * @dev_priv: i915 device private
> * @engine_mask: mask representing engines that are hung
> + * @flags: control flags
> * @fmt: Error message format string
> *
> * Do some basic checking of register state at error time and
> @@ -2965,16 +2966,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> */
> void i915_handle_error(struct drm_i915_private *dev_priv,
> u32 engine_mask,
> + unsigned long flags,
> const char *fmt, ...)
> {
> struct intel_engine_cs *engine;
> unsigned int tmp;
> - va_list args;
> - char error_msg[80];
> -
> - va_start(args, fmt);
> - vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> - va_end(args);
>
> /*
> * In most cases it's guaranteed that we get here with an RPM
> @@ -2986,8 +2982,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
> intel_runtime_pm_get(dev_priv);
>
> engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
> - i915_capture_error_state(dev_priv, engine_mask, error_msg);
> - i915_clear_error_registers(dev_priv);
> +
> + if (flags & I915_ERROR_CAPTURE) {
> + char error_msg[80];
> + va_list args;
> +
> + va_start(args, fmt);
> + vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> + va_end(args);
> +
> + i915_capture_error_state(dev_priv, engine_mask, error_msg);
> + i915_clear_error_registers(dev_priv);
> + }
else
DRM_INFO or DRM_NOTE the error_msg ?
Otherwise the 'kicking wait/semaphore' text from below will never be
printed.
>
> /*
> * Try engine reset when available. We fall back to full reset if
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 42e45ae87393..13d1a269c771 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> */
> tmp = I915_READ_CTL(engine);
> if (tmp & RING_WAIT) {
> - i915_handle_error(dev_priv, 0,
> + i915_handle_error(dev_priv, 0, 0,
> "Kicking stuck wait on %s",
> engine->name);
> I915_WRITE_CTL(engine, tmp);
> @@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> default:
> return ENGINE_DEAD;
> case 1:
> - i915_handle_error(dev_priv, 0,
> + i915_handle_error(dev_priv, 0, 0,
> "Kicking stuck semaphore on %s",
> engine->name);
> I915_WRITE_CTL(engine, tmp);
> @@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
> "%s, ", engine->name);
> msg[len-2] = '\0';
>
> - return i915_handle_error(i915, hung, "%s", msg);
> + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index df7898c8edcb..84aad2b4825d 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -1084,7 +1084,7 @@ static int igt_handle_error(void *arg)
> engine->hangcheck.stalled = true;
> engine->hangcheck.seqno = intel_engine_get_seqno(engine);
>
> - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
> + i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__);
>
> xchg(&i915->gpu_error.first_error, error);
>
>
More information about the Intel-gfx
mailing list