[Intel-gfx] [PATCH 2/2] drm/i915: Add control flags to i915_handle_error()
Michel Thierry
michel.thierry at intel.com
Tue Mar 20 14:11:03 UTC 2018
On 3/20/2018 3:04 AM, Chris Wilson wrote:
> Not all callers want the GPU error to handled in the same way, so expose
> a control parameter. In the first instance, some callers do not want the
> heavyweight error capture so add a bit to request the state to be
> captured and saved.
>
> v2: Pass msg down to i915_reset/i915_reset_engine so that we include the
> reason for the reset in the dev_notice(), superseding the earlier option
> to not print that notice.
> v3: Stash the reason inside the i915->gpu_error to handover to the direct
> reset from the blocking waiter.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Jeff McGee <jeff.mcgee at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
Reviewed-by: Michel Thierry <michel.thierry at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 4 +-
> drivers/gpu/drm/i915/i915_drv.c | 17 ++++----
> drivers/gpu/drm/i915/i915_drv.h | 10 ++---
> drivers/gpu/drm/i915/i915_gpu_error.h | 3 ++
> drivers/gpu/drm/i915/i915_irq.c | 55 ++++++++++++++----------
> drivers/gpu/drm/i915/i915_request.c | 2 +-
> drivers/gpu/drm/i915/intel_hangcheck.c | 13 +++---
> drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 13 +++---
> 8 files changed, 62 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 964ea1a12357..7816cd53100a 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4011,8 +4011,8 @@ i915_wedged_set(void *data, u64 val)
> engine->hangcheck.stalled = true;
> }
>
> - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx",
> - val);
> + i915_handle_error(i915, val, I915_ERROR_CAPTURE,
> + "Manually set wedged engine mask = %llx", val);
>
> wait_on_bit(&i915->gpu_error.flags,
> I915_RESET_HANDOFF,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 1021bf40e236..6b04cc3be513 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1869,7 +1869,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
> /**
> * i915_reset - reset chip after a hang
> * @i915: #drm_i915_private to reset
> - * @flags: Instructions
> *
> * Reset the chip. Useful if a hang is detected. Marks the device as wedged
> * on failure.
> @@ -1884,7 +1883,7 @@ static int i915_resume_switcheroo(struct drm_device *dev)
> * - re-init interrupt state
> * - re-init display
> */
> -void i915_reset(struct drm_i915_private *i915, unsigned int flags)
> +void i915_reset(struct drm_i915_private *i915)
> {
> struct i915_gpu_error *error = &i915->gpu_error;
> int ret;
> @@ -1901,8 +1900,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
> if (!i915_gem_unset_wedged(i915))
> goto wakeup;
>
> - if (!(flags & I915_RESET_QUIET))
> - dev_notice(i915->drm.dev, "Resetting chip after gpu hang\n");
> + if (error->reason)
> + dev_notice(i915->drm.dev,
> + "Resetting chip for %s\n", error->reason);
> error->reset_count++;
>
> disable_irq(i915->drm.irq);
> @@ -2003,7 +2003,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
> /**
> * i915_reset_engine - reset GPU engine to recover from a hang
> * @engine: engine to reset
> - * @flags: options
> + * @msg: reason for GPU reset; or NULL for no dev_notice()
> *
> * Reset a specific GPU engine. Useful if a hang is detected.
> * Returns zero on successful reset or otherwise an error code.
> @@ -2013,7 +2013,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
> * - reset engine (which will force the engine to idle)
> * - re-init/configure engine
> */
> -int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)
> +int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> {
> struct i915_gpu_error *error = &engine->i915->gpu_error;
> struct i915_request *active_request;
> @@ -2028,10 +2028,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)
> goto out;
> }
>
> - if (!(flags & I915_RESET_QUIET)) {
> + if (msg)
> dev_notice(engine->i915->drm.dev,
> - "Resetting %s after gpu hang\n", engine->name);
> - }
> + "Resetting %s for %s\n", engine->name, msg);
> error->reset_engine_count[engine->id]++;
>
> if (!engine->i915->guc.execbuf_client)
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e27ba8fb64e6..c9c3b2ba6a86 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2700,10 +2700,8 @@ extern void i915_driver_unload(struct drm_device *dev);
> extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
> extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
>
> -#define I915_RESET_QUIET BIT(0)
> -extern void i915_reset(struct drm_i915_private *i915, unsigned int flags);
> -extern int i915_reset_engine(struct intel_engine_cs *engine,
> - unsigned int flags);
> +extern void i915_reset(struct drm_i915_private *i915);
> +extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg);
>
> extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
> extern int intel_reset_guc(struct drm_i915_private *dev_priv);
> @@ -2751,10 +2749,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
> &dev_priv->gpu_error.hangcheck_work, delay);
> }
>
> -__printf(3, 4)
> +__printf(4, 5)
> void i915_handle_error(struct drm_i915_private *dev_priv,
> u32 engine_mask,
> + unsigned long flags,
> const char *fmt, ...);
> +#define I915_ERROR_CAPTURE BIT(0)
>
> extern void intel_irq_init(struct drm_i915_private *dev_priv);
> extern void intel_irq_fini(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index ebbdf37e2879..ac5760673cc9 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -269,6 +269,9 @@ struct i915_gpu_error {
> /** Number of times an engine has been reset */
> u32 reset_engine_count[I915_NUM_ENGINES];
>
> + /** Reason for the current *global* reset */
> + const char *reason;
> +
> /**
> * Waitqueue to signal when a hang is detected. Used to for waiters
> * to release the struct_mutex for the reset to procede.
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 44eef355e12c..fa7310766217 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2877,15 +2877,10 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
> return IRQ_HANDLED;
> }
>
> -/**
> - * i915_reset_device - do process context error handling work
> - * @dev_priv: i915 device private
> - *
> - * Fire an error uevent so userspace can see that a hang or error
> - * was detected.
> - */
> -static void i915_reset_device(struct drm_i915_private *dev_priv)
> +static void i915_reset_device(struct drm_i915_private *dev_priv,
> + const char *msg)
> {
> + struct i915_gpu_error *error = &dev_priv->gpu_error;
> struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
> char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> @@ -2901,29 +2896,32 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
> i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
> intel_prepare_reset(dev_priv);
>
> + error->reason = msg;
> +
> /* Signal that locked waiters should reset the GPU */
> - set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> - wake_up_all(&dev_priv->gpu_error.wait_queue);
> + set_bit(I915_RESET_HANDOFF, &error->flags);
> + wake_up_all(&error->wait_queue);
>
> /* Wait for anyone holding the lock to wakeup, without
> * blocking indefinitely on struct_mutex.
> */
> do {
> if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
> - i915_reset(dev_priv, 0);
> + i915_reset(dev_priv);
> mutex_unlock(&dev_priv->drm.struct_mutex);
> }
> - } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
> + } while (wait_on_bit_timeout(&error->flags,
> I915_RESET_HANDOFF,
> TASK_UNINTERRUPTIBLE,
> 1));
>
> + error->reason = NULL;
> +
> intel_finish_reset(dev_priv);
> }
>
> - if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
> - kobject_uevent_env(kobj,
> - KOBJ_CHANGE, reset_done_event);
> + if (!test_bit(I915_WEDGED, &error->flags))
> + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
> }
>
> static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> @@ -2955,6 +2953,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> * i915_handle_error - handle a gpu error
> * @dev_priv: i915 device private
> * @engine_mask: mask representing engines that are hung
> + * @flags: control flags
> * @fmt: Error message format string
> *
> * Do some basic checking of register state at error time and
> @@ -2965,16 +2964,23 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> */
> void i915_handle_error(struct drm_i915_private *dev_priv,
> u32 engine_mask,
> + unsigned long flags,
> const char *fmt, ...)
> {
> struct intel_engine_cs *engine;
> unsigned int tmp;
> - va_list args;
> char error_msg[80];
> + char *msg = NULL;
>
> - va_start(args, fmt);
> - vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> - va_end(args);
> + if (fmt) {
> + va_list args;
> +
> + va_start(args, fmt);
> + vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> + va_end(args);
> +
> + msg = error_msg;
> + }
>
> /*
> * In most cases it's guaranteed that we get here with an RPM
> @@ -2986,8 +2992,11 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
> intel_runtime_pm_get(dev_priv);
>
> engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
> - i915_capture_error_state(dev_priv, engine_mask, error_msg);
> - i915_clear_error_registers(dev_priv);
> +
> + if (flags & I915_ERROR_CAPTURE) {
> + i915_capture_error_state(dev_priv, engine_mask, msg);
> + i915_clear_error_registers(dev_priv);
> + }
>
> /*
> * Try engine reset when available. We fall back to full reset if
> @@ -3000,7 +3009,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
> &dev_priv->gpu_error.flags))
> continue;
>
> - if (i915_reset_engine(engine, 0) == 0)
> + if (i915_reset_engine(engine, msg) == 0)
> engine_mask &= ~intel_engine_flag(engine);
>
> clear_bit(I915_RESET_ENGINE + engine->id,
> @@ -3030,7 +3039,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
> TASK_UNINTERRUPTIBLE);
> }
>
> - i915_reset_device(dev_priv);
> + i915_reset_device(dev_priv, msg);
>
> for_each_engine(engine, dev_priv, tmp) {
> clear_bit(I915_RESET_ENGINE + engine->id,
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 43c7134a9b93..2325886d1d55 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1229,7 +1229,7 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
> return false;
>
> __set_current_state(TASK_RUNNING);
> - i915_reset(request->i915, 0);
> + i915_reset(request->i915);
> return true;
> }
>
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index c8ea510629fa..fd0ffb8328d0 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -246,9 +246,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> */
> tmp = I915_READ_CTL(engine);
> if (tmp & RING_WAIT) {
> - i915_handle_error(dev_priv, BIT(engine->id),
> - "Kicking stuck wait on %s",
> - engine->name);
> + i915_handle_error(dev_priv, BIT(engine->id), 0,
> + "stuck wait on %s", engine->name);
> I915_WRITE_CTL(engine, tmp);
> return ENGINE_WAIT_KICK;
> }
> @@ -258,8 +257,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> default:
> return ENGINE_DEAD;
> case 1:
> - i915_handle_error(dev_priv, ALL_ENGINES,
> - "Kicking stuck semaphore on %s",
> + i915_handle_error(dev_priv, ALL_ENGINES, 0,
> + "stuck semaphore on %s",
> engine->name);
> I915_WRITE_CTL(engine, tmp);
> return ENGINE_WAIT_KICK;
> @@ -386,13 +385,13 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
> if (stuck != hung)
> hung &= ~stuck;
> len = scnprintf(msg, sizeof(msg),
> - "%s on ", stuck == hung ? "No progress" : "Hang");
> + "%s on ", stuck == hung ? "no progress" : "hang");
> for_each_engine_masked(engine, i915, hung, tmp)
> len += scnprintf(msg + len, sizeof(msg) - len,
> "%s, ", engine->name);
> msg[len-2] = '\0';
>
> - return i915_handle_error(i915, hung, "%s", msg);
> + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index df7898c8edcb..4372826998aa 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -433,7 +433,7 @@ static int igt_global_reset(void *arg)
> mutex_lock(&i915->drm.struct_mutex);
> reset_count = i915_reset_count(&i915->gpu_error);
>
> - i915_reset(i915, I915_RESET_QUIET);
> + i915_reset(i915);
>
> if (i915_reset_count(&i915->gpu_error) == reset_count) {
> pr_err("No GPU reset recorded!\n");
> @@ -518,7 +518,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
> engine->hangcheck.seqno =
> intel_engine_get_seqno(engine);
>
> - err = i915_reset_engine(engine, I915_RESET_QUIET);
> + err = i915_reset_engine(engine, NULL);
> if (err) {
> pr_err("i915_reset_engine failed\n");
> break;
> @@ -725,7 +725,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
> engine->hangcheck.seqno =
> intel_engine_get_seqno(engine);
>
> - err = i915_reset_engine(engine, I915_RESET_QUIET);
> + err = i915_reset_engine(engine, NULL);
> if (err) {
> pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
> engine->name, active ? "active" : "idle", err);
> @@ -865,7 +865,6 @@ static int igt_wait_reset(void *arg)
> __func__, rq->fence.seqno, hws_seqno(&h, rq));
> intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
>
> - i915_reset(i915, 0);
> i915_gem_set_wedged(i915);
>
> err = -EIO;
> @@ -962,7 +961,6 @@ static int igt_reset_queue(void *arg)
> i915_request_put(rq);
> i915_request_put(prev);
>
> - i915_reset(i915, 0);
> i915_gem_set_wedged(i915);
>
> err = -EIO;
> @@ -971,7 +969,7 @@ static int igt_reset_queue(void *arg)
>
> reset_count = fake_hangcheck(prev);
>
> - i915_reset(i915, I915_RESET_QUIET);
> + i915_reset(i915);
>
> GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
> &i915->gpu_error.flags));
> @@ -1069,7 +1067,6 @@ static int igt_handle_error(void *arg)
> __func__, rq->fence.seqno, hws_seqno(&h, rq));
> intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
>
> - i915_reset(i915, 0);
> i915_gem_set_wedged(i915);
>
> err = -EIO;
> @@ -1084,7 +1081,7 @@ static int igt_handle_error(void *arg)
> engine->hangcheck.stalled = true;
> engine->hangcheck.seqno = intel_engine_get_seqno(engine);
>
> - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
> + i915_handle_error(i915, intel_engine_flag(engine), 0, NULL);
>
> xchg(&i915->gpu_error.first_error, error);
>
>
More information about the Intel-gfx
mailing list