[Intel-gfx] [PATCH 3/3] drm/i915: More surgically unbreak the modeset vs reset deadlock

Mon Aug 14 14:13:59 UTC 2017

On 8/8/2017 1:08 AM, Daniel Vetter wrote:
> There's no reason to entirely wedge the gpu, for the minimal deadlock
> bugfix we only need to unbreak/decouple the atomic commit from the gpu
> reset. The simplest way to fix that is by replacing the
> unconditional fence wait at the top of commit_tail by a wait which
> completes either when the fences are done (normal case, or when a
> reset doesn't need to touch the display state), or when the gpu reset
> needs to force-unblock all pending modeset states.
> 
> The lesser source of deadlocks is when we try to pin a new framebuffer
> and run into a stall. There's a bunch of places this can happen, like
> eviction, changing the caching mode, acquiring a fence on older
> platforms. And we can't just break the dependency loop and keep going,
> the only way would be to break out and restart. But the problem with
> that approach is that we must stall for the reset to complete before
> we grab any locks, and with the atomic infrastructure that's a bit
> tricky. The only place is the ioctl code, and we don't want to insert
> code into e.g. the BUSY ioctl. Hence for that problem just create a
> critical section, and if any code is in there, wedge the GPU. For the
> steady-state this should never be a problem.
> 
> Note that in both cases TDR itself keeps working, so from a userspace
> pov this trickery isn't observable. Users themselves might spot a short
> glitch while the rendering is catching up again, but that's still
> better than pre-TDR where we've thrown away all the rendering,
> including innocent batches. Also, this fixes the regression TDR
> introduced of making gpu resets deadlock-prone when we do need to
> touch the display.
> 
> One thing I noticed is that gpu_error.flags seems to use both our own
> wait-queue in gpu_error.wait_queue, and the generic wait_on_bit
> facilities. Not entirely sure why this inconsistency exists, I just
> picked one style.
> 
> A possible future avenue could be to insert the gpu reset in-between
> ongoing modeset changes, which would avoid the momentary glitch. But
> that's a lot more work to implement in the atomic commit machinery,
> and given that we only need this for pre-g4x hw, of questionable
> utility just for the sake of polishing gpu reset even more on those
> old boxes. It might be useful for other features though.
> 
> v2: Rebase onto 4.13 with a s/wait_queue_t/struct wait_queue_entry/.
> 
> v3: Really embarrassing fixup, I checked the wrong bit and broke the
> unbreak/wakeup logic.
> 
> v4: Also handle deadlocks in pin_to_display.
> 
> v5: Review from Michel:
> - Fixup the BUILD_BUG_ON
> - Don't forget about the overlay
> 

Reviewed-by: Michel Thierry <michel.thierry at intel.com>

> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com> (v2)
> Cc: Michel Thierry <michel.thierry at intel.com>
> Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h      |  3 +++
>   drivers/gpu/drm/i915/i915_irq.c      |  2 +-
>   drivers/gpu/drm/i915/intel_display.c | 46 ++++++++++++++++++++++++++++++------
>   drivers/gpu/drm/i915/intel_overlay.c | 11 +++++++--
>   4 files changed, 52 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 907603cba447..4e669b7738d9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1528,6 +1528,8 @@ struct i915_gpu_error {
>   	/* Protected by the above dev->gpu_error.lock. */
>   	struct i915_gpu_state *first_error;
>   
> +	atomic_t pending_fb_pin;
> +
>   	unsigned long missed_irq_rings;
>   
>   	/**
> @@ -1587,6 +1589,7 @@ struct i915_gpu_error {
>   	unsigned long flags;
>   #define I915_RESET_BACKOFF	0
>   #define I915_RESET_HANDOFF	1
> +#define I915_RESET_MODESET	2
>   #define I915_WEDGED		(BITS_PER_LONG - 1)
>   #define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
>   
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index cb9f98555c71..f181f19e8436 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2731,7 +2731,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
>   	 */
>   	if (intel_has_reset_engine(dev_priv)) {
>   		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -			BUILD_BUG_ON(I915_RESET_HANDOFF >= I915_RESET_ENGINE);
> +			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
>   			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
>   					     &dev_priv->gpu_error.flags))
>   				continue;
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index da8d0d3b2bc2..b5631d97a317 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -2157,6 +2157,8 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation)
>   	 */
>   	intel_runtime_pm_get(dev_priv);
>   
> +	atomic_inc(&dev_priv->gpu_error.pending_fb_pin);
> +
>   	vma = i915_gem_object_pin_to_display_plane(obj, alignment, &view);
>   	if (IS_ERR(vma))
>   		goto err;
> @@ -2184,6 +2186,8 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation)
>   
>   	i915_vma_get(vma);
>   err:
> +	atomic_dec(&dev_priv->gpu_error.pending_fb_pin);
> +
>   	intel_runtime_pm_put(dev_priv);
>   	return vma;
>   }
> @@ -3458,12 +3462,14 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv)
>   	    !gpu_reset_clobbers_display(dev_priv))
>   		return;
>   
> -	/* We have a modeset vs reset deadlock, defensively unbreak it.
> -	 *
> -	 * FIXME: We can do a _lot_ better, this is just a first iteration.
> -	 */
> -	i915_gem_set_wedged(dev_priv);
> -	DRM_DEBUG_DRIVER("Wedging GPU to avoid deadlocks with pending modeset updates\n");
> +	/* We have a modeset vs reset deadlock, defensively unbreak it. */
> +	set_bit(I915_RESET_MODESET, &dev_priv->gpu_error.flags);
> +	wake_up_all(&dev_priv->gpu_error.wait_queue);
> +
> +	if (atomic_read(&dev_priv->gpu_error.pending_fb_pin)) {
> +		DRM_DEBUG_KMS("Modeset potentially stuck, unbreaking through wedging\n");
> +		i915_gem_set_wedged(dev_priv);
> +	}
>   
>   	/*
>   	 * Need mode_config.mutex so that we don't
> @@ -3551,6 +3557,8 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
>   	drm_modeset_drop_locks(ctx);
>   	drm_modeset_acquire_fini(ctx);
>   	mutex_unlock(&dev->mode_config.mutex);
> +
> +	clear_bit(I915_RESET_MODESET, &dev_priv->gpu_error.flags);
>   }
>   
>   static void intel_update_pipe_config(struct intel_crtc *crtc,
> @@ -12069,6 +12077,30 @@ static void intel_atomic_helper_free_state_worker(struct work_struct *work)
>   	intel_atomic_helper_free_state(dev_priv);
>   }
>   
> +static void intel_atomic_commit_fence_wait(struct intel_atomic_state *intel_state)
> +{
> +	struct wait_queue_entry wait_fence, wait_reset;
> +	struct drm_i915_private *dev_priv = to_i915(intel_state->base.dev);
> +
> +	init_wait_entry(&wait_fence, 0);
> +	init_wait_entry(&wait_reset, 0);
> +	for (;;) {
> +		prepare_to_wait(&intel_state->commit_ready.wait,
> +				&wait_fence, TASK_UNINTERRUPTIBLE);
> +		prepare_to_wait(&dev_priv->gpu_error.wait_queue,
> +				&wait_reset, TASK_UNINTERRUPTIBLE);
> +
> +
> +		if (i915_sw_fence_done(&intel_state->commit_ready)
> +		    || test_bit(I915_RESET_MODESET, &dev_priv->gpu_error.flags))
> +			break;
> +
> +		schedule();
> +	}
> +	finish_wait(&intel_state->commit_ready.wait, &wait_fence);
> +	finish_wait(&dev_priv->gpu_error.wait_queue, &wait_reset);
> +}
> +
>   static void intel_atomic_commit_tail(struct drm_atomic_state *state)
>   {
>   	struct drm_device *dev = state->dev;
> @@ -12082,7 +12114,7 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
>   	unsigned crtc_vblank_mask = 0;
>   	int i;
>   
> -	i915_sw_fence_wait(&intel_state->commit_ready);
> +	intel_atomic_commit_fence_wait(intel_state);
>   
>   	drm_atomic_helper_wait_for_dependencies(state);
>   
> diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
> index b96aed941b97..aace22e7ccac 100644
> --- a/drivers/gpu/drm/i915/intel_overlay.c
> +++ b/drivers/gpu/drm/i915/intel_overlay.c
> @@ -799,9 +799,13 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay,
>   	if (ret != 0)
>   		return ret;
>   
> +	atomic_inc(&dev_priv->gpu_error.pending_fb_pin);
> +
>   	vma = i915_gem_object_pin_to_display_plane(new_bo, 0, NULL);
> -	if (IS_ERR(vma))
> -		return PTR_ERR(vma);
> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto out_pin_section;
> +	}
>   
>   	ret = i915_vma_put_fence(vma);
>   	if (ret)
> @@ -886,6 +890,9 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay,
>   
>   out_unpin:
>   	i915_gem_object_unpin_from_display_plane(vma);
> +out_pin_section:
> +	atomic_dec(&dev_priv->gpu_error.pending_fb_pin);
> +
>   	return ret;
>   }
>   
>