[Intel-gfx] [PATCH v7 03/20] drm/i915: Add support for per engine reset recovery
Chris Wilson
chris at chris-wilson.co.uk
Thu Apr 27 23:50:17 UTC 2017
On Thu, Apr 27, 2017 at 04:12:43PM -0700, Michel Thierry wrote:
> From: Arun Siluvery <arun.siluvery at linux.intel.com>
>
> This change implements support for per-engine reset as an initial, less
> intrusive hang recovery option to be attempted before falling back to the
> legacy full GPU reset recovery mode if necessary. This is only supported
> from Gen8 onwards.
>
> Hangchecker determines which engines are hung and invokes error handler to
> recover from it. Error handler schedules recovery for each of those engines
> that are hung. The recovery procedure is as follows,
> - identifies the request that caused the hang and it is dropped
> - force engine to idle: this is done by issuing a reset request
> - reset and re-init engine
> - restart submissions to the engine
>
> If engine reset fails then we fall back to heavy weight full gpu reset
> which resets all engines and reinitiazes complete state of HW and SW.
>
> v2: Rebase.
> v3: s/*engine_reset*/*reset_engine*/; freeze engine and irqs before
> calling i915_gem_reset_engine (Chris).
> v4: Rebase, modify i915_gem_reset_prepare to use a ring mask and
> reuse the function for reset_engine.
> v5: intel_reset_engine_start/cancel instead of request/unrequest_reset.
> v6: Clean up reset_engine function to not require mutex, i.e. no need to call
> revoke/restore_fences and _retire_requests (Chris).
> v7: Remove leftovers from v5, i.e. no need to disable irq, hold
> forcewake or wakeup the handoff bit (Chris).
>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.c | 60 ++++++++++++++++++--
> drivers/gpu/drm/i915/i915_drv.h | 12 +++-
> drivers/gpu/drm/i915/i915_gem.c | 97 +++++++++++++++++++--------------
> drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
> drivers/gpu/drm/i915/intel_uncore.c | 20 +++++++
> 5 files changed, 142 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 48c8b69d9bde..ae891529dedd 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1810,7 +1810,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
>
> pr_notice("drm/i915: Resetting chip after gpu hang\n");
> disable_irq(dev_priv->drm.irq);
> - ret = i915_gem_reset_prepare(dev_priv);
> + ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
> if (ret) {
> DRM_ERROR("GPU recovery failed\n");
> intel_gpu_reset(dev_priv, ALL_ENGINES);
> @@ -1852,7 +1852,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
> i915_queue_hangcheck(dev_priv);
>
> finish:
> - i915_gem_reset_finish(dev_priv);
> + i915_gem_reset_finish(dev_priv, ALL_ENGINES);
> enable_irq(dev_priv->drm.irq);
>
> wakeup:
> @@ -1871,11 +1871,63 @@ void i915_reset(struct drm_i915_private *dev_priv)
> *
> * Reset a specific GPU engine. Useful if a hang is detected.
> * Returns zero on successful reset or otherwise an error code.
> + *
> + * Procedure is:
> + * - identifies the request that caused the hang and it is dropped
> + * - force engine to idle: this is done by issuing a reset request
> + * - reset engine
> + * - restart submissions to the engine
Why does the prospective caller need to know this?
> */
> int i915_reset_engine(struct intel_engine_cs *engine)
> {
> - /* FIXME: replace me with engine reset sequence */
> - return -ENODEV;
> + int ret;
> + struct drm_i915_private *dev_priv = engine->i915;
> + struct i915_gpu_error *error = &dev_priv->gpu_error;
> +
> + GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> +
> + DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
> +
> + ret = i915_gem_reset_prepare_engine(engine);
> + if (ret) {
> + DRM_ERROR("Previous reset failed - promote to full reset\n");
> + goto out;
> + }
> +
> + /*
> + * the request that caused the hang is stuck on elsp, identify the
> + * active request and drop it, adjust head to skip the offending
> + * request to resume executing remaining requests in the queue.
> + */
> + i915_gem_reset_engine(engine);
> +
> + /* forcing engine to idle */
> + ret = intel_reset_engine_start(engine);
> + if (ret) {
> + DRM_ERROR("Failed to disable %s\n", engine->name);
> + goto out;
> + }
> +
> + /* finally, reset engine */
> + ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
> + if (ret) {
> + DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
> + intel_reset_engine_cancel(engine);
> + goto out;
> + }
> +
> + /* be sure the request reset bit gets cleared */
> + intel_reset_engine_cancel(engine);
> +
> + i915_gem_reset_finish_engine(engine);
> +
> + /* replay remaining requests in the queue */
> + ret = engine->init_hw(engine);
> + if (ret)
> + goto out; //XXX: ignore this line for now
Please give the comments here some tlc. Focus on the why, you are
telling me what the code does.
> +
> +out:
> + return ret;
> }
>
> static int i915_pm_suspend(struct device *kdev)
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ab7e68626c49..efbf34318893 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3022,6 +3022,8 @@ extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
> extern void i915_reset(struct drm_i915_private *dev_priv);
> extern int i915_reset_engine(struct intel_engine_cs *engine);
> extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
> +extern int intel_reset_engine_start(struct intel_engine_cs *engine);
> +extern void intel_reset_engine_cancel(struct intel_engine_cs *engine);
> extern int intel_guc_reset(struct drm_i915_private *dev_priv);
> extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
> extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
> @@ -3410,7 +3412,6 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
>
> struct drm_i915_gem_request *
> i915_gem_find_active_request(struct intel_engine_cs *engine);
> -
Nope. (find_active_request is not in the same group of operations as
retire_requests.)
> void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
>
> static inline bool i915_reset_backoff(struct i915_gpu_error *error)
> @@ -3438,11 +3439,16 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
> return READ_ONCE(error->reset_count);
> }
>
> -int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
> +int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
> +int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
> + unsigned int engine_mask);
> void i915_gem_reset(struct drm_i915_private *dev_priv);
> -void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
> +void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
> +void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
> + unsigned int engine_mask);
> void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
> bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
> +void i915_gem_reset_engine(struct intel_engine_cs *engine);
>
> void i915_gem_init_mmio(struct drm_i915_private *i915);
> int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 33fb11cc5acc..bce38062f94e 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2793,48 +2793,57 @@ static bool engine_stalled(struct intel_engine_cs *engine)
> return true;
> }
>
> -int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
> +/* Ensure irq handler finishes, and not run again. */
> +int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> {
> - struct intel_engine_cs *engine;
> - enum intel_engine_id id;
> + struct drm_i915_gem_request *request;
> int err = 0;
>
> - /* Ensure irq handler finishes, and not run again. */
> - for_each_engine(engine, dev_priv, id) {
> - struct drm_i915_gem_request *request;
> -
> - /* Prevent the signaler thread from updating the request
> - * state (by calling dma_fence_signal) as we are processing
> - * the reset. The write from the GPU of the seqno is
> - * asynchronous and the signaler thread may see a different
> - * value to us and declare the request complete, even though
> - * the reset routine have picked that request as the active
> - * (incomplete) request. This conflict is not handled
> - * gracefully!
> - */
> - kthread_park(engine->breadcrumbs.signaler);
> -
> - /* Prevent request submission to the hardware until we have
> - * completed the reset in i915_gem_reset_finish(). If a request
> - * is completed by one engine, it may then queue a request
> - * to a second via its engine->irq_tasklet *just* as we are
> - * calling engine->init_hw() and also writing the ELSP.
> - * Turning off the engine->irq_tasklet until the reset is over
> - * prevents the race.
> - */
> - tasklet_kill(&engine->irq_tasklet);
> - tasklet_disable(&engine->irq_tasklet);
>
> - if (engine->irq_seqno_barrier)
> - engine->irq_seqno_barrier(engine);
> + /* Prevent the signaler thread from updating the request
> + * state (by calling dma_fence_signal) as we are processing
> + * the reset. The write from the GPU of the seqno is
> + * asynchronous and the signaler thread may see a different
> + * value to us and declare the request complete, even though
> + * the reset routine have picked that request as the active
> + * (incomplete) request. This conflict is not handled
> + * gracefully!
> + */
> + kthread_park(engine->breadcrumbs.signaler);
> +
> + /* Prevent request submission to the hardware until we have
> + * completed the reset in i915_gem_reset_finish(). If a request
> + * is completed by one engine, it may then queue a request
> + * to a second via its engine->irq_tasklet *just* as we are
> + * calling engine->init_hw() and also writing the ELSP.
> + * Turning off the engine->irq_tasklet until the reset is over
> + * prevents the race.
> + */
> + tasklet_kill(&engine->irq_tasklet);
> + tasklet_disable(&engine->irq_tasklet);
>
> - if (engine_stalled(engine)) {
> - request = i915_gem_find_active_request(engine);
> - if (request && request->fence.error == -EIO)
> - err = -EIO; /* Previous reset failed! */
> - }
> + if (engine->irq_seqno_barrier)
> + engine->irq_seqno_barrier(engine);
> +
> + if (engine_stalled(engine)) {
> + request = i915_gem_find_active_request(engine);
> + if (request && request->fence.error == -EIO)
> + err = -EIO; /* Previous reset failed! */
> }
>
> + return err;
> +}
> +
> +int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
> + unsigned int engine_mask)
> +{
> + struct intel_engine_cs *engine;
> + unsigned int tmp;
> + int err = 0;
> +
> + for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> + err = i915_gem_reset_prepare_engine(engine);
You are losing any earlier err.
> +
> i915_gem_revoke_fences(dev_priv);
>
> return err;
> @@ -2920,7 +2929,7 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
> return guilty;
> }
>
> -static void i915_gem_reset_engine(struct intel_engine_cs *engine)
> +void i915_gem_reset_engine(struct intel_engine_cs *engine)
> {
> struct drm_i915_gem_request *request;
>
> @@ -2966,16 +2975,22 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
> }
> }
>
> -void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> +void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> +{
> + tasklet_enable(&engine->irq_tasklet);
> + kthread_unpark(engine->breadcrumbs.signaler);
> +}
> +
> +void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
> + unsigned int engine_mask)
> {
> struct intel_engine_cs *engine;
> - enum intel_engine_id id;
> + unsigned int tmp;
>
> lockdep_assert_held(&dev_priv->drm.struct_mutex);
>
> - for_each_engine(engine, dev_priv, id) {
> - tasklet_enable(&engine->irq_tasklet);
> - kthread_unpark(engine->breadcrumbs.signaler);
> + for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> + i915_gem_reset_finish_engine(engine);
> }
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 6198f6997d05..f69a8c535d5f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -1216,7 +1216,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
> return timeout;
> }
>
> -static void engine_retire_requests(struct intel_engine_cs *engine)
> +void engine_retire_requests(struct intel_engine_cs *engine)
Fortunately stray chunk. I was about to scream.
> {
> struct drm_i915_gem_request *request, *next;
> u32 seqno = intel_engine_get_seqno(engine);
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index ab5bdd110ac3..3ebba6b2dd74 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1801,6 +1801,26 @@ int intel_guc_reset(struct drm_i915_private *dev_priv)
> return ret;
> }
>
> +/*
> + * On gen8+ a reset request has to be issued via the reset control register
> + * before a GPU engine can be reset in order to stop the command streamer
> + * and idle the engine. This replaces the legacy way of stopping an engine
> + * by writing to the stop ring bit in the MI_MODE register.
> + */
> +int intel_reset_engine_start(struct intel_engine_cs *engine)
> +{
> + return gen8_reset_engine_start(engine);
> +}
> +
> +/*
> + * It is possible to back off from a previously issued reset request by simply
> + * clearing the reset request bit in the reset control register.
> + */
> +void intel_reset_engine_cancel(struct intel_engine_cs *engine)
> +{
> + gen8_reset_engine_cancel(engine);
> +}
> +
> bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
> {
> return check_for_unclaimed_mmio(dev_priv);
> --
> 2.11.0
>
--
Chris Wilson, Intel Open Source Technology Centre
More information about the Intel-gfx
mailing list