[PATCH 5/5] drm/i915: Solve the GPU reset vs. modeset deadlocks with an rw_semaphore

Thu Jun 29 13:49:48 UTC 2017

From: Ville Syrjälä <ville.syrjala at linux.intel.com>

Introduce an rw_semaphore to protect the display commits. All normal
commits use down_read() and hence can proceed in parallel, but GPU reset
will use down_write() making sure no other commits are in progress when
we have to pull the plug on the display engine on pre-g4x platforms.
There are no modeset/gem locks taken inside __intel_atomic_commit_tail()
itself, and we wait for all dependencies before the down_read(), and
thus there is no chance of deadlocks with this scheme.

During reset we should be recommiting the state that was committed last.
But for now we'll settle for recommiting the last state for each object.
Hence we may commit things a bit too soon when a GPU reset occurs. The
rw_semaphore should guarantee that whatever state we observe in
obj->state during reset sticks around while we do the commit. The
obj->state pointer might change for some objects if another swap_state()
occurs while we do our thing, so there migth be some theoretical
mismatch which I tink we could avoid by grabbing the rw_semaphore also
around the swap_state(), but for now I didn't do that.

Another source of mismatch can happen because we sometimes use the
intel_crtc->config during the actual commit, and that only gets updated
when we do the commuit. Hence we may get some state via ->state, some
via the duplicated ->state, and some via ->config. We'll want to
fix this all by tracking the commited state properly, but that will
some bigger refactroing so for now we'll just choose to accept these
potential mismatches.

I left out the state readout from the post-reset display
reinitialization as that still likes to clobber crtc->state etc.
If we make it use a free standing atomic state and mke sure it doesn't
need any locks we could reintroduce it. For now I just mark the
post-reset display state as dirty as possible to make sure we
reinitialize everything.

One other potential issue remains in the form of display detection.
Either we need to protect that with the same rw_semaphore as well, or
perhaps it would be enough to force gmbus into bitbanging mode while
the reset is happening and we don't have interrupts, and just across
the actual hardware GPU reset we could hold the gmbus mutex.

v2: Keep intel_prepare/finish_reset() outside struct_mutex (Chris)
v3: Drop all the committed_state refactoring to make this less
    obnoxious to backport (Daniel)

Cc: stable at vger.kernel.org # for the brave
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101597
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99093
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101600
Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion")
Fixes: 221fe7994554 ("drm/i915: Perform a direct reset of the GPU from the waiter")
Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |   2 +
 drivers/gpu/drm/i915/i915_irq.c      |  44 +-------
 drivers/gpu/drm/i915/intel_display.c | 199 ++++++++++++++++++++++++-----------
 3 files changed, 139 insertions(+), 106 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index effbe4f72a64..88ddd27f61b0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2237,6 +2237,8 @@ struct drm_i915_private {
 	struct drm_atomic_state *modeset_restore_state;
 	struct drm_modeset_acquire_ctx reset_ctx;
 
+	struct rw_semaphore commit_sem;
+
 	struct list_head vm_list; /* Global list of all address spaces */
 	struct i915_ggtt ggtt; /* VM representing the global address space */
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index eb4f1dca2077..9d591f17fda3 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2587,46 +2587,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
-struct wedge_me {
-	struct delayed_work work;
-	struct drm_i915_private *i915;
-	const char *name;
-};
-
-static void wedge_me(struct work_struct *work)
-{
-	struct wedge_me *w = container_of(work, typeof(*w), work.work);
-
-	dev_err(w->i915->drm.dev,
-		"%s timed out, cancelling all in-flight rendering.\n",
-		w->name);
-	i915_gem_set_wedged(w->i915);
-}
-
-static void __init_wedge(struct wedge_me *w,
-			 struct drm_i915_private *i915,
-			 long timeout,
-			 const char *name)
-{
-	w->i915 = i915;
-	w->name = name;
-
-	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
-	schedule_delayed_work(&w->work, timeout);
-}
-
-static void __fini_wedge(struct wedge_me *w)
-{
-	cancel_delayed_work_sync(&w->work);
-	destroy_delayed_work_on_stack(&w->work);
-	w->i915 = NULL;
-}
-
-#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
-	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
-	     (W)->i915;							\
-	     __fini_wedge((W)))
-
 /**
  * i915_reset_device - do process context error handling work
  * @dev_priv: i915 device private
@@ -2640,15 +2600,13 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
-	struct wedge_me w;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
-	/* Use a watchdog to ensure that our reset completes */
-	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
+	{
 		intel_prepare_reset(dev_priv);
 
 		/* Signal that locked waiters should reset the GPU */
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 7cdd6ec97f80..f13c7d81d4a9 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -123,6 +123,7 @@ static void ironlake_pfit_enable(struct intel_crtc *crtc);
 static void intel_modeset_setup_hw_state(struct drm_device *dev,
 					 struct drm_modeset_acquire_ctx *ctx);
 static void intel_pre_disable_primary_noatomic(struct drm_crtc *crtc);
+static void __intel_atomic_commit_tail(struct drm_atomic_state *state, bool is_reset);
 
 struct intel_limit {
 	struct {
@@ -3491,27 +3492,85 @@ static bool gpu_reset_clobbers_display(struct drm_i915_private *dev_priv)
 		INTEL_GEN(dev_priv) < 5 && !IS_G4X(dev_priv);
 }
 
-void intel_prepare_reset(struct drm_i915_private *dev_priv)
+static void init_intel_state(struct intel_atomic_state *state)
+{
+	struct drm_crtc *crtc;
+	struct drm_crtc_state *old_crtc_state, *new_crtc_state;
+	int i;
+
+	state->modeset = true;
+
+	for_each_oldnew_crtc_in_state(&state->base, crtc,
+				      old_crtc_state, new_crtc_state, i) {
+		if (new_crtc_state->active)
+			state->active_crtcs |= 1 << i;
+		else
+			state->active_crtcs &= ~(1 << i);
+
+		if (old_crtc_state->active != new_crtc_state->active)
+			state->active_pipe_changes |= drm_crtc_mask(crtc);
+	}
+}
+
+static struct drm_atomic_state *
+intel_duplicate_committed_state(struct drm_i915_private *dev_priv)
 {
-	struct drm_device *dev = &dev_priv->drm;
-	struct drm_modeset_acquire_ctx *ctx = &dev_priv->reset_ctx;
 	struct drm_atomic_state *state;
-	int ret;
+	struct drm_crtc *crtc;
+	struct drm_crtc_state *crtc_state;
+	int i;
 
-	/*
-	 * Need mode_config.mutex so that we don't
-	 * trample ongoing ->detect() and whatnot.
-	 */
-	mutex_lock(&dev->mode_config.mutex);
-	drm_modeset_acquire_init(ctx, 0);
-	while (1) {
-		ret = drm_modeset_lock_all_ctx(dev, ctx);
-		if (ret != -EDEADLK)
-			break;
+	state = drm_atomic_helper_duplicate_committed_state(&dev_priv->drm);
+	if (IS_ERR(state)) {
+		DRM_ERROR("Duplicating state failed with %ld\n",
+			  PTR_ERR(state));
+		return NULL;
+	}
+
+	to_intel_atomic_state(state)->active_crtcs = 0;
+	to_intel_atomic_state(state)->cdclk.logical = dev_priv->cdclk.hw;
+	to_intel_atomic_state(state)->cdclk.actual = dev_priv->cdclk.hw;
+
+	init_intel_state(to_intel_atomic_state(state));
+
+	/* force a full update */
+	for_each_new_crtc_in_state(state, crtc, crtc_state, i) {
+		struct intel_crtc_state *intel_crtc_state =
+			to_intel_crtc_state(crtc_state);
+
+		if (!crtc_state->active)
+			continue;
 
-		drm_modeset_backoff(ctx);
+		crtc_state->mode_changed = true;
+		crtc_state->active_changed = true;
+		crtc_state->planes_changed = true;
+		crtc_state->connectors_changed = true;
+		crtc_state->color_mgmt_changed = true;
+		crtc_state->zpos_changed = true;
+
+		intel_crtc_state->update_pipe = true;
+		intel_crtc_state->disable_lp_wm = true;
+		intel_crtc_state->disable_cxsr = true;
+		intel_crtc_state->update_wm_post = true;
+		intel_crtc_state->fb_changed = true;
+		intel_crtc_state->fifo_changed = true;
+		intel_crtc_state->wm.need_postvbl_update = true;
 	}
 
+	return state;
+}
+
+void intel_prepare_reset(struct drm_i915_private *dev_priv)
+{
+	struct drm_atomic_state *disable_state, *restore_state = NULL;
+	struct drm_crtc *crtc;
+	struct drm_crtc_state *crtc_state;
+	struct drm_plane *plane;
+	struct drm_plane_state *plane_state;
+	int i;
+
+	down_write(&dev_priv->commit_sem);
+
 	/* reset doesn't touch the display, but flips might get nuked anyway, */
 	if (!i915.force_reset_modeset_test &&
 	    !gpu_reset_clobbers_display(dev_priv))
@@ -3521,30 +3580,40 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv)
 	 * Disabling the crtcs gracefully seems nicer. Also the
 	 * g33 docs say we should at least disable all the planes.
 	 */
-	state = drm_atomic_helper_duplicate_state(dev, ctx);
-	if (IS_ERR(state)) {
-		ret = PTR_ERR(state);
-		DRM_ERROR("Duplicating state failed with %i\n", ret);
-		return;
-	}
+	disable_state = intel_duplicate_committed_state(dev_priv);
+	if (IS_ERR(disable_state))
+		goto out;
 
-	ret = drm_atomic_helper_disable_all(dev, ctx);
-	if (ret) {
-		DRM_ERROR("Suspending crtc's failed with %i\n", ret);
-		drm_atomic_state_put(state);
-		return;
-	}
+	to_intel_atomic_state(disable_state)->active_crtcs = 0;
 
-	dev_priv->modeset_restore_state = state;
-	state->acquire_ctx = ctx;
+	for_each_new_crtc_in_state(disable_state, crtc, crtc_state, i)
+		crtc_state->active = false;
+	for_each_new_plane_in_state(disable_state, plane, plane_state, i)
+		plane_state->visible = false;
+
+	__intel_atomic_commit_tail(disable_state, true);
+
+	drm_atomic_helper_clean_committed_state(disable_state);
+	drm_atomic_state_put(disable_state);
+
+	restore_state = intel_duplicate_committed_state(dev_priv);
+	if (IS_ERR(restore_state))
+		restore_state = NULL;
+
+	for_each_old_crtc_in_state(restore_state, crtc, crtc_state, i)
+		crtc_state->active = false;
+	for_each_old_plane_in_state(restore_state, plane, plane_state, i)
+		plane_state->visible = false;
+
+out:
+	dev_priv->modeset_restore_state = restore_state;
 }
 
 void intel_finish_reset(struct drm_i915_private *dev_priv)
 {
 	struct drm_device *dev = &dev_priv->drm;
-	struct drm_modeset_acquire_ctx *ctx = &dev_priv->reset_ctx;
-	struct drm_atomic_state *state = dev_priv->modeset_restore_state;
-	int ret;
+	struct drm_atomic_state *restore_state =
+		dev_priv->modeset_restore_state;
 
 	/*
 	 * Flips in the rings will be nuked by the reset,
@@ -3557,7 +3626,7 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
 
 	/* reset doesn't touch the display */
 	if (!gpu_reset_clobbers_display(dev_priv)) {
-		if (!state) {
+		if (!restore_state) {
 			/*
 			 * Flips in the rings have been nuked by the reset,
 			 * so update the base address of all primary
@@ -3569,11 +3638,11 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
 			 */
 			intel_update_primary_planes(dev);
 		} else {
-			ret = __intel_display_resume(dev, state, ctx);
-			if (ret)
-				DRM_ERROR("Restoring old state failed with %i\n", ret);
+			__intel_atomic_commit_tail(restore_state, true);
 		}
 	} else {
+		i915_redisable_vga(dev_priv);
+
 		/*
 		 * The display has been reset as well,
 		 * so need a full re-initialization.
@@ -3589,18 +3658,17 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
 			dev_priv->display.hpd_irq_setup(dev_priv);
 		spin_unlock_irq(&dev_priv->irq_lock);
 
-		ret = __intel_display_resume(dev, state, ctx);
-		if (ret)
-			DRM_ERROR("Restoring old state failed with %i\n", ret);
+		__intel_atomic_commit_tail(restore_state, true);
 
 		intel_hpd_init(dev_priv);
 	}
 
-	if (state)
-		drm_atomic_state_put(state);
-	drm_modeset_drop_locks(ctx);
-	drm_modeset_acquire_fini(ctx);
-	mutex_unlock(&dev->mode_config.mutex);
+	if (restore_state) {
+		drm_atomic_helper_clean_committed_state(restore_state);
+		drm_atomic_state_put(restore_state);
+	}
+
+	up_write(&dev_priv->commit_sem);
 }
 
 static bool abort_flip_on_reset(struct intel_crtc *crtc)
@@ -12592,29 +12660,18 @@ static int intel_modeset_checks(struct drm_atomic_state *state)
 {
 	struct intel_atomic_state *intel_state = to_intel_atomic_state(state);
 	struct drm_i915_private *dev_priv = to_i915(state->dev);
-	struct drm_crtc *crtc;
-	struct drm_crtc_state *old_crtc_state, *new_crtc_state;
-	int ret = 0, i;
+	int ret = 0;
 
 	if (!check_digital_port_conflicts(state)) {
 		DRM_DEBUG_KMS("rejecting conflicting digital port configuration\n");
 		return -EINVAL;
 	}
 
-	intel_state->modeset = true;
 	intel_state->active_crtcs = dev_priv->active_crtcs;
 	intel_state->cdclk.logical = dev_priv->cdclk.logical;
 	intel_state->cdclk.actual = dev_priv->cdclk.actual;
 
-	for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) {
-		if (new_crtc_state->active)
-			intel_state->active_crtcs |= 1 << i;
-		else
-			intel_state->active_crtcs &= ~(1 << i);
-
-		if (old_crtc_state->active != new_crtc_state->active)
-			intel_state->active_pipe_changes |= drm_crtc_mask(crtc);
-	}
+	init_intel_state(intel_state);
 
 	/*
 	 * See if the config requires any additional preparation, e.g.
@@ -13004,7 +13061,7 @@ static void intel_atomic_helper_free_state_worker(struct work_struct *work)
 	intel_atomic_helper_free_state(dev_priv);
 }
 
-static void __intel_atomic_commit_tail(struct drm_atomic_state *state)
+static void __intel_atomic_commit_tail(struct drm_atomic_state *state, bool is_reset)
 {
 	struct drm_device *dev = state->dev;
 	struct intel_atomic_state *intel_state = to_intel_atomic_state(state);
@@ -13068,10 +13125,18 @@ static void __intel_atomic_commit_tail(struct drm_atomic_state *state)
 
 	/* Only after disabling all output pipelines that will be changed can we
 	 * update the the output configuration. */
-	intel_modeset_update_crtc_state(state);
+	if (!is_reset)
+		intel_modeset_update_crtc_state(state);
 
 	if (intel_state->modeset) {
-		drm_atomic_helper_update_legacy_modeset_state(state->dev, state);
+		if (!is_reset) {
+			drm_atomic_helper_update_legacy_modeset_state(state->dev, state);
+		} else {
+			for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) {
+				if (new_crtc_state->enable)
+					drm_calc_timestamping_constants(crtc, &new_crtc_state->adjusted_mode);
+			}
+		}
 
 		intel_set_cdclk(dev_priv, &dev_priv->cdclk.actual);
 
@@ -13082,7 +13147,8 @@ static void __intel_atomic_commit_tail(struct drm_atomic_state *state)
 		if (!intel_can_enable_sagv(state))
 			intel_disable_sagv(dev_priv);
 
-		intel_modeset_verify_disabled(dev, state);
+		if (!is_reset)
+			intel_modeset_verify_disabled(dev, state);
 	}
 
 	/* Complete the events for pipes that have now been disabled */
@@ -13135,7 +13201,8 @@ static void __intel_atomic_commit_tail(struct drm_atomic_state *state)
 		if (put_domains[i])
 			modeset_put_power_domains(dev_priv, put_domains[i]);
 
-		intel_modeset_verify_crtc(crtc, state, old_crtc_state, new_crtc_state);
+		if (!is_reset)
+			intel_modeset_verify_crtc(crtc, state, old_crtc_state, new_crtc_state);
 	}
 
 	if (intel_state->modeset && intel_can_enable_sagv(state))
@@ -13160,10 +13227,14 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
 
 	drm_atomic_helper_wait_for_dependencies(state);
 
-	__intel_atomic_commit_tail(state);
+	down_read(&dev_priv->commit_sem);
+
+	__intel_atomic_commit_tail(state, false);
 
 	drm_atomic_helper_commit_hw_done(state);
 
+	up_read(&dev_priv->commit_sem);
+
 	mutex_lock(&dev->struct_mutex);
 	drm_atomic_helper_cleanup_planes(dev, state);
 	mutex_unlock(&dev->struct_mutex);
@@ -15022,6 +15093,8 @@ int intel_modeset_init(struct drm_device *dev)
 	INIT_WORK(&dev_priv->atomic_helper.free_work,
 		  intel_atomic_helper_free_state_worker);
 
+	init_rwsem(&dev_priv->commit_sem);
+
 	intel_init_quirks(dev);
 
 	intel_init_pm(dev_priv);
-- 
2.13.0