[Intel-gfx] [PATCH v2] drm/i915: Separate out reset flags from the reset counter
Mika Kuoppala
mika.kuoppala at linux.intel.com
Tue Aug 30 14:40:49 UTC 2016
Chris Wilson <chris at chris-wilson.co.uk> writes:
> In preparation for introducing a per-engine reset, we can first separate
> the mixing of the reset state from the global reset counter.
>
> The loss of atomicity in updating the reset state poses a small problem
> for handling the waiters. For requests, this is solved by advancing the
> seqno so that a waiter waking up after the reset knows the request is
> complete. For pending flips, we still rely on the increment of the
> global reset epoch (as well as the reset-in-progress flag) to signify
> when the hardware was reset.
>
> The advantage, now that we do not inspect the reset state during reset
> itself i.e. we no longer emit requests during reset, is that we can use
> the atomic updates of the state flags to ensure that only one reset
> worker is active.
>
> v2: Mika spotted that I transformed the i915_gem_wait_for_error() wakeup
> into a waiter wakeup.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
The wait_for_request docs are still infested with references to the
reset counter, which no longer applies to them. But that is not the fault of this patch.
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Arun Siluvery <arun.siluvery at linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-6-git-send-email-arun.siluvery@linux.intel.com
> ---
> drivers/gpu/drm/i915/i915_drv.c | 16 ++---
> drivers/gpu/drm/i915/i915_drv.h | 46 +++++---------
> drivers/gpu/drm/i915/i915_gem.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_request.c | 13 ++--
> drivers/gpu/drm/i915/i915_irq.c | 103 ++++++++++++++------------------
> drivers/gpu/drm/i915/intel_display.c | 25 +++++---
> drivers/gpu/drm/i915/intel_drv.h | 4 +-
> 7 files changed, 92 insertions(+), 117 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 47fe07283d88..01b518dcbd7a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1602,7 +1602,7 @@ static int i915_drm_resume(struct drm_device *dev)
> mutex_lock(&dev->struct_mutex);
> if (i915_gem_init_hw(dev)) {
> DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
> - atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
> + set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> }
> mutex_unlock(&dev->struct_mutex);
>
> @@ -1764,20 +1764,13 @@ int i915_reset(struct drm_i915_private *dev_priv)
> {
> struct drm_device *dev = &dev_priv->drm;
> struct i915_gpu_error *error = &dev_priv->gpu_error;
> - unsigned reset_counter;
> int ret;
>
> mutex_lock(&dev->struct_mutex);
>
> /* Clear any previous failed attempts at recovery. Time to try again. */
> - atomic_andnot(I915_WEDGED, &error->reset_counter);
> -
> - /* Clear the reset-in-progress flag and increment the reset epoch. */
> - reset_counter = atomic_inc_return(&error->reset_counter);
> - if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
> - ret = -EIO;
> - goto error;
> - }
> + __clear_bit(I915_WEDGED, &error->flags);
> + error->reset_count++;
>
> pr_notice("drm/i915: Resetting chip after gpu hang\n");
>
> @@ -1814,6 +1807,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
> goto error;
> }
>
> + clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
> mutex_unlock(&dev->struct_mutex);
>
> /*
> @@ -1828,7 +1822,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
> return 0;
>
> error:
> - atomic_or(I915_WEDGED, &error->reset_counter);
> + set_bit(I915_WEDGED, &error->flags);
> mutex_unlock(&dev->struct_mutex);
> return ret;
> }
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c413587895cf..e574eaa65c4d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1386,9 +1386,10 @@ struct i915_gpu_error {
> * State variable controlling the reset flow and count
> *
> * This is a counter which gets incremented when reset is triggered,
> - * and again when reset has been handled. So odd values (lowest bit set)
> - * means that reset is in progress and even values that
> - * (reset_counter >> 1):th reset was successfully completed.
> + *
> + * Before the reset commences, the I915_RESET_IN_PROGRESS bit is set
> + * meaning that any waiters holding onto the struct_mutex should
> + * relinquish the lock immediately in order for the reset to start.
> *
> * If reset is not completed succesfully, the I915_WEDGE bit is
> * set meaning that hardware is terminally sour and there is no
> @@ -1403,10 +1404,11 @@ struct i915_gpu_error {
> * naturally enforces the correct ordering between the bail-out of the
> * waiter and the gpu reset work code.
> */
> - atomic_t reset_counter;
> + unsigned long reset_count;
>
> -#define I915_RESET_IN_PROGRESS_FLAG 1
> -#define I915_WEDGED (1 << 31)
> + unsigned long flags;
> +#define I915_RESET_IN_PROGRESS 0
> +#define I915_WEDGED (BITS_PER_LONG - 1)
>
> /**
> * Waitqueue to signal when a hang is detected. Used to for waiters
> @@ -3234,44 +3236,24 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
>
> void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
>
> -static inline u32 i915_reset_counter(struct i915_gpu_error *error)
> -{
> - return atomic_read(&error->reset_counter);
> -}
> -
> -static inline bool __i915_reset_in_progress(u32 reset)
> -{
> - return unlikely(reset & I915_RESET_IN_PROGRESS_FLAG);
> -}
> -
> -static inline bool __i915_reset_in_progress_or_wedged(u32 reset)
> -{
> - return unlikely(reset & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
> -}
> -
> -static inline bool __i915_terminally_wedged(u32 reset)
> -{
> - return unlikely(reset & I915_WEDGED);
> -}
> -
> static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
> {
> - return __i915_reset_in_progress(i915_reset_counter(error));
> + return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
> }
>
> -static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
> +static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> {
> - return __i915_reset_in_progress_or_wedged(i915_reset_counter(error));
> + return unlikely(test_bit(I915_WEDGED, &error->flags));
> }
>
> -static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> +static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
> {
> - return __i915_terminally_wedged(i915_reset_counter(error));
> + return i915_reset_in_progress(error) | i915_terminally_wedged(error);
> }
>
> static inline u32 i915_reset_count(struct i915_gpu_error *error)
> {
> - return ((i915_reset_counter(error) & ~I915_WEDGED) + 1) / 2;
> + return READ_ONCE(error->reset_count);
> }
>
> void i915_gem_reset(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 838a275e7fac..c06dacdae87f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4525,7 +4525,7 @@ int i915_gem_init(struct drm_device *dev)
> * for all other failure, such as an allocation failure, bail.
> */
> DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
> - atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
> + set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> ret = 0;
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index ec613fd5e01c..24eb4b1b7540 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -233,16 +233,18 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req)
> } while (tmp != req);
> }
>
> -static int i915_gem_check_wedge(unsigned int reset_counter, bool interruptible)
> +static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
> {
> - if (__i915_terminally_wedged(reset_counter))
> + struct i915_gpu_error *error = &dev_priv->gpu_error;
> +
> + if (i915_terminally_wedged(error))
> return -EIO;
>
> - if (__i915_reset_in_progress(reset_counter)) {
> + if (i915_reset_in_progress(error)) {
> /* Non-interruptible callers can't handle -EAGAIN, hence return
> * -EIO unconditionally for these.
> */
> - if (!interruptible)
> + if (!dev_priv->mm.interruptible)
> return -EIO;
>
> return -EAGAIN;
> @@ -331,7 +333,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> struct i915_gem_context *ctx)
> {
> struct drm_i915_private *dev_priv = engine->i915;
> - unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> struct drm_i915_gem_request *req;
> u32 seqno;
> int ret;
> @@ -340,7 +341,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
> * and restart.
> */
> - ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
> + ret = i915_gem_check_wedge(dev_priv);
> if (ret)
> return ERR_PTR(ret);
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 82358d4e0cc2..ed172d7beecb 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2501,53 +2501,41 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>
> kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>
> + DRM_DEBUG_DRIVER("resetting chip\n");
> + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> +
> /*
> - * Note that there's only one work item which does gpu resets, so we
> - * need not worry about concurrent gpu resets potentially incrementing
> - * error->reset_counter twice. We only need to take care of another
> - * racing irq/hangcheck declaring the gpu dead for a second time. A
> - * quick check for that is good enough: schedule_work ensures the
> - * correct ordering between hang detection and this work item, and since
> - * the reset in-progress bit is only ever set by code outside of this
> - * work we don't need to worry about any other races.
> + * In most cases it's guaranteed that we get here with an RPM
> + * reference held, for example because there is a pending GPU
> + * request that won't finish until the reset is done. This
> + * isn't the case at least when we get here by doing a
> + * simulated reset via debugfs, so get an RPM reference.
> */
> - if (i915_reset_in_progress(&dev_priv->gpu_error)) {
> - DRM_DEBUG_DRIVER("resetting chip\n");
> - kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> -
> - /*
> - * In most cases it's guaranteed that we get here with an RPM
> - * reference held, for example because there is a pending GPU
> - * request that won't finish until the reset is done. This
> - * isn't the case at least when we get here by doing a
> - * simulated reset via debugs, so get an RPM reference.
> - */
> - intel_runtime_pm_get(dev_priv);
> + intel_runtime_pm_get(dev_priv);
>
> - intel_prepare_reset(dev_priv);
> + intel_prepare_reset(dev_priv);
>
> - /*
> - * All state reset _must_ be completed before we update the
> - * reset counter, for otherwise waiters might miss the reset
> - * pending state and not properly drop locks, resulting in
> - * deadlocks with the reset work.
> - */
> - ret = i915_reset(dev_priv);
> + /*
> + * All state reset _must_ be completed before we update the
> + * reset counter, for otherwise waiters might miss the reset
> + * pending state and not properly drop locks, resulting in
> + * deadlocks with the reset work.
> + */
> + ret = i915_reset(dev_priv);
>
> - intel_finish_reset(dev_priv);
> + intel_finish_reset(dev_priv);
>
> - intel_runtime_pm_put(dev_priv);
> + intel_runtime_pm_put(dev_priv);
>
> - if (ret == 0)
> - kobject_uevent_env(kobj,
> - KOBJ_CHANGE, reset_done_event);
> + if (ret == 0)
> + kobject_uevent_env(kobj,
> + KOBJ_CHANGE, reset_done_event);
>
> - /*
> - * Note: The wake_up also serves as a memory barrier so that
> - * waiters see the update value of the reset counter atomic_t.
> - */
> - wake_up_all(&dev_priv->gpu_error.reset_queue);
> - }
> + /*
> + * Note: The wake_up also serves as a memory barrier so that
> + * waiters see the updated value of the dev_priv->gpu_error.
> + */
> + wake_up_all(&dev_priv->gpu_error.reset_queue);
> }
>
> static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
> @@ -2666,25 +2654,26 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
> i915_capture_error_state(dev_priv, engine_mask, error_msg);
> i915_report_and_clear_eir(dev_priv);
>
> - if (engine_mask) {
> - atomic_or(I915_RESET_IN_PROGRESS_FLAG,
> - &dev_priv->gpu_error.reset_counter);
> + if (!engine_mask)
> + return;
>
> - /*
> - * Wakeup waiting processes so that the reset function
> - * i915_reset_and_wakeup doesn't deadlock trying to grab
> - * various locks. By bumping the reset counter first, the woken
> - * processes will see a reset in progress and back off,
> - * releasing their locks and then wait for the reset completion.
> - * We must do this for _all_ gpu waiters that might hold locks
> - * that the reset work needs to acquire.
> - *
> - * Note: The wake_up serves as the required memory barrier to
> - * ensure that the waiters see the updated value of the reset
> - * counter atomic_t.
> - */
> - i915_error_wake_up(dev_priv);
> - }
> + if (test_and_set_bit(I915_RESET_IN_PROGRESS,
> + &dev_priv->gpu_error.flags))
> + return;
> +
> + /*
> + * Wakeup waiting processes so that the reset function
> + * i915_reset_and_wakeup doesn't deadlock trying to grab
> + * various locks. By bumping the reset counter first, the woken
> + * processes will see a reset in progress and back off,
> + * releasing their locks and then wait for the reset completion.
> + * We must do this for _all_ gpu waiters that might hold locks
> + * that the reset work needs to acquire.
> + *
> + * Note: The wake_up also provides a memory barrier to ensure that the
> + * waiters see the updated value of the reset flags.
> + */
> + i915_error_wake_up(dev_priv);
>
> i915_reset_and_wakeup(dev_priv);
> }
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 19ffd024ddec..2e63e5cfa98d 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -3643,15 +3643,26 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
> mutex_unlock(&dev->mode_config.mutex);
> }
>
> +static bool abort_flip_on_reset(struct intel_crtc *crtc)
> +{
> + struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
> +
> + if (i915_reset_in_progress(error))
> + return true;
> +
> + if (crtc->reset_count != i915_reset_count(error))
> + return true;
> +
> + return false;
> +}
> +
> static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
> {
> struct drm_device *dev = crtc->dev;
> struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
> - unsigned reset_counter;
> bool pending;
>
> - reset_counter = i915_reset_counter(&to_i915(dev)->gpu_error);
> - if (intel_crtc->reset_counter != reset_counter)
> + if (abort_flip_on_reset(intel_crtc))
> return false;
>
> spin_lock_irq(&dev->event_lock);
> @@ -11549,10 +11560,8 @@ static bool __pageflip_finished_cs(struct intel_crtc *crtc,
> {
> struct drm_device *dev = crtc->base.dev;
> struct drm_i915_private *dev_priv = to_i915(dev);
> - unsigned reset_counter;
>
> - reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> - if (crtc->reset_counter != reset_counter)
> + if (abort_flip_on_reset(crtc))
> return true;
>
> /*
> @@ -12218,8 +12227,8 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> if (ret)
> goto cleanup;
>
> - intel_crtc->reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> - if (__i915_reset_in_progress_or_wedged(intel_crtc->reset_counter)) {
> + intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
> + if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
> ret = -EIO;
> goto cleanup;
> }
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index 570a7ca7983f..60e1cd915b85 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -712,8 +712,8 @@ struct intel_crtc {
>
> struct intel_crtc_state *config;
>
> - /* reset counter value when the last flip was submitted */
> - unsigned int reset_counter;
> + /* global reset count when the last flip was submitted */
> + unsigned int reset_count;
>
> /* Access to these should be protected by dev_priv->irq_lock. */
> bool cpu_fifo_underrun_disabled;
> --
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list