[Intel-gfx] [PATCH 2/2] drm/i915: Pass the set of guilty engines to i915_reset()
Michel Thierry
michel.thierry at intel.com
Fri Apr 6 22:35:43 UTC 2018
On 4/6/2018 3:03 PM, Chris Wilson wrote:
> Currently, we rely on inspecting the hangcheck state from within the
> i915_reset() routines to determine which engines were guilty of the
> hang. This is problematic for cases where we want to run
> i915_handle_error() and call i915_reset() independently of hangcheck.
> Instead of relying on the indirect parameter passing, turn it into an
> explicit parameter providing the set of stalled engines which then are
> treated as guilty until proven innocent.
>
> While we are removing the implicit stalled parameter, also make the
> reason into an explicit paramter to i915_reset(). We still need a
parameter
> back-channel for i915_handle_error() to hand over the task to the locked
> waiter, but let's keep that its own channel rather than incriminate
> another.
>
> This leaves stalled/seqno as being private to hangcheck, with no more
> nefarious snooping by reset, be it whole-device or per-engine. \o/
>
> The only real issue now is that this makes it crystal clear that we
> don't actually do any testing of hangcheck per se in
> drv_selftest/live_hangcheck, merely of resets!
Don't tell anyone
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Jeff McGee <jeff.mcgee at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.c | 13 ++++++----
> drivers/gpu/drm/i915/i915_drv.h | 10 +++++---
> drivers/gpu/drm/i915/i915_gem.c | 5 ++--
> drivers/gpu/drm/i915/i915_gpu_error.h | 3 +++
> drivers/gpu/drm/i915/i915_irq.c | 12 ++++++---
> drivers/gpu/drm/i915/i915_request.c | 6 +++--
> .../gpu/drm/i915/selftests/intel_hangcheck.c | 25 ++++++++-----------
> 7 files changed, 44 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 7ce229c6f424..f770be18b2d7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1866,6 +1866,8 @@ static int i915_resume_switcheroo(struct drm_device *dev)
> /**
> * i915_reset - reset chip after a hang
> * @i915: #drm_i915_private to reset
> + * @stalled_mask: mask of the stalled engines with the guilty requests
> + * @reason: user error message for why we are resetting
> *
> * Reset the chip. Useful if a hang is detected. Marks the device as wedged
> * on failure.
> @@ -1880,7 +1882,9 @@ static int i915_resume_switcheroo(struct drm_device *dev)
> * - re-init interrupt state
> * - re-init display
> */
> -void i915_reset(struct drm_i915_private *i915)
> +void i915_reset(struct drm_i915_private *i915,
> + unsigned int stalled_mask,
> + const char *reason)
> {
> struct i915_gpu_error *error = &i915->gpu_error;
> int ret;
> @@ -1899,9 +1903,8 @@ void i915_reset(struct drm_i915_private *i915)
> if (!i915_gem_unset_wedged(i915))
> goto wakeup;
>
> - if (error->reason)
> - dev_notice(i915->drm.dev,
> - "Resetting chip for %s\n", error->reason);
> + if (reason)
> + dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> error->reset_count++;
>
> disable_irq(i915->drm.irq);
> @@ -1944,7 +1947,7 @@ void i915_reset(struct drm_i915_private *i915)
> goto error;
> }
>
> - i915_gem_reset(i915);
> + i915_gem_reset(i915, stalled_mask);
> intel_overlay_reset(i915);
>
> /*
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 6b3f2f651def..9bca104c409e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2701,8 +2701,11 @@ extern void i915_driver_unload(struct drm_device *dev);
> extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
> extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
>
> -extern void i915_reset(struct drm_i915_private *i915);
> -extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg);
> +extern void i915_reset(struct drm_i915_private *i915,
> + unsigned int stalled_mask,
> + const char *reason);
> +extern int i915_reset_engine(struct intel_engine_cs *engine,
> + const char *reason);
>
> extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
> extern int intel_reset_guc(struct drm_i915_private *dev_priv);
> @@ -3126,7 +3129,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
> struct i915_request *
> i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
> int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
> -void i915_gem_reset(struct drm_i915_private *dev_priv);
> +void i915_gem_reset(struct drm_i915_private *dev_priv,
> + unsigned int stalled_mask);
> void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
> void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
> void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 306d7a805eb7..eacb6893d892 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3213,7 +3213,8 @@ void i915_gem_reset_engine(struct intel_engine_cs *engine,
> engine->reset_hw(engine, request);
> }
>
> -void i915_gem_reset(struct drm_i915_private *dev_priv)
> +void i915_gem_reset(struct drm_i915_private *dev_priv,
> + unsigned int stalled_mask)
> {
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> @@ -3227,7 +3228,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
>
> i915_gem_reset_engine(engine,
> engine->hangcheck.active_request,
> - engine->hangcheck.stalled);
> + stalled_mask & BIT(id));
ENGINE_MASK(id) since we have it defined?
(I know they are the same)
> ctx = fetch_and_zero(&engine->last_retired_context);
> if (ctx)
> engine->context_unpin(engine, ctx);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index ac5760673cc9..c05b6034d718 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -269,6 +269,9 @@ struct i915_gpu_error {
> /** Number of times an engine has been reset */
> u32 reset_engine_count[I915_NUM_ENGINES];
>
> + /** Set of stalled engines with guilty requests, in the current reset */
> + u32 stalled_mask;
> +
> /** Reason for the current *global* reset */
> const char *reason;
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c2f878ace0ea..b03d18561b55 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2961,7 +2961,8 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
> }
>
> static void i915_reset_device(struct drm_i915_private *dev_priv,
> - const char *msg)
> + u32 engine_mask,
> + const char *reason)
> {
> struct i915_gpu_error *error = &dev_priv->gpu_error;
> struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
> @@ -2979,9 +2980,11 @@ static void i915_reset_device(struct drm_i915_private *dev_priv,
> i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
> intel_prepare_reset(dev_priv);
>
> - error->reason = msg;
> + error->reason = reason;
> + error->stalled_mask = engine_mask;
>
> /* Signal that locked waiters should reset the GPU */
> + smp_mb__before_atomic();
> set_bit(I915_RESET_HANDOFF, &error->flags);
> wake_up_all(&error->wait_queue);
>
> @@ -2990,7 +2993,7 @@ static void i915_reset_device(struct drm_i915_private *dev_priv,
> */
> do {
> if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
> - i915_reset(dev_priv);
> + i915_reset(dev_priv, engine_mask, reason);
> mutex_unlock(&dev_priv->drm.struct_mutex);
> }
> } while (wait_on_bit_timeout(&error->flags,
> @@ -2998,6 +3001,7 @@ static void i915_reset_device(struct drm_i915_private *dev_priv,
> TASK_UNINTERRUPTIBLE,
> 1));
>
> + error->stalled_mask = 0;
> error->reason = NULL;
>
> intel_finish_reset(dev_priv);
> @@ -3122,7 +3126,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
> TASK_UNINTERRUPTIBLE);
> }
>
> - i915_reset_device(dev_priv, msg);
> + i915_reset_device(dev_priv, engine_mask, msg);
>
> for_each_engine(engine, dev_priv, tmp) {
> clear_bit(I915_RESET_ENGINE + engine->id,
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index a9d0bde16443..629f3e860592 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1185,11 +1185,13 @@ static bool __i915_spin_request(const struct i915_request *rq,
>
> static bool __i915_wait_request_check_and_reset(struct i915_request *request)
> {
> - if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
> + struct i915_gpu_error *error = &request->i915->gpu_error;
> +
> + if (likely(!i915_reset_handoff(error)))
> return false;
>
> __set_current_state(TASK_RUNNING);
> - i915_reset(request->i915);
> + i915_reset(request->i915, error->stalled_mask, error->reason);
> return true;
> }
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index acfb4dcc9fb5..60508927b8e4 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -437,7 +437,7 @@ static int igt_global_reset(void *arg)
> mutex_lock(&i915->drm.struct_mutex);
> reset_count = i915_reset_count(&i915->gpu_error);
>
> - i915_reset(i915);
> + i915_reset(i915, ALL_ENGINES, NULL);
>
> if (i915_reset_count(&i915->gpu_error) == reset_count) {
> pr_err("No GPU reset recorded!\n");
> @@ -881,17 +881,17 @@ static int igt_reset_engines(void *arg)
> return 0;
> }
>
> -static u32 fake_hangcheck(struct i915_request *rq)
> +static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
> {
> - u32 reset_count;
> + struct i915_gpu_error *error = &rq->i915->gpu_error;
> + u32 reset_count = i915_reset_count(error);
>
> - rq->engine->hangcheck.stalled = true;
> - rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);
> + error->stalled_mask = mask;
>
> - reset_count = i915_reset_count(&rq->i915->gpu_error);
> + smp_mb__before_atomic();
checkpatch is going to complain about the lack of comment
> + set_bit(I915_RESET_HANDOFF, &error->flags);
>
> - set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
> - wake_up_all(&rq->i915->gpu_error.wait_queue);
> + wake_up_all(&error->wait_queue);
>
> return reset_count;
> }
> @@ -939,7 +939,7 @@ static int igt_wait_reset(void *arg)
> goto out_rq;
> }
>
> - reset_count = fake_hangcheck(rq);
> + reset_count = fake_hangcheck(rq, ALL_ENGINES);
>
> timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
> if (timeout < 0) {
> @@ -1075,9 +1075,9 @@ static int igt_reset_queue(void *arg)
> goto fini;
> }
>
> - reset_count = fake_hangcheck(prev);
> + reset_count = fake_hangcheck(prev, BIT(id));
>
> - i915_reset(i915);
> + i915_reset(i915, BIT(id), NULL);
>
> GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
> &i915->gpu_error.flags));
> @@ -1186,9 +1186,6 @@ static int igt_handle_error(void *arg)
> /* Temporarily disable error capture */
> error = xchg(&i915->gpu_error.first_error, (void *)-1);
>
> - engine->hangcheck.stalled = true;
> - engine->hangcheck.seqno = intel_engine_get_seqno(engine);
> -
> i915_handle_error(i915, intel_engine_flag(engine), 0, NULL);
>
> xchg(&i915->gpu_error.first_error, error);
>
I would s/BIT()/ENGINE_MASK()/g, but it's not like we enforce it
(engine_struck has BIT(engine->id)).
With the missing comment in fake_hangcheck's barrier,
Reviewed-by: Michel Thierry <michel.thierry at intel.com>
More information about the Intel-gfx
mailing list