[PATCH 13/14] drm/i915: Perform device reset under stop-machine

Chris Wilson chris at chris-wilson.co.uk
Thu Jul 26 13:16:45 UTC 2018


If we do a device-level reset, we lose vital registers that may be in
concurrent use by userspace (i.e. the GGTT and its fencing). To be
paranoid and prevent those memory accesses from being corrupted, we want
to pause all other processes/threads, so that the device reset is the
only thing running on the system.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 94 ++++++++++++++++---------------
 kernel/stop_machine.c             |  1 +
 2 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 06217399c581..411cc9477d3d 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/sched/mm.h>
+#include <linux/stop_machine.h>
 
 #include "i915_drv.h"
 #include "i915_gpu_error.h"
@@ -404,22 +405,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 	int retry;
 	int ret;
 
-	/*
-	 * We want to perform per-engine reset from atomic context (e.g.
-	 * softirq), which imposes the constraint that we cannot sleep.
-	 * However, experience suggests that spending a bit of time waiting
-	 * for a reset helps in various cases, so for a full-device reset
-	 * we apply the opposite rule and wait if we want to. As we should
-	 * always follow up a failed per-engine reset with a full device reset,
-	 * being a little faster, stricter and more error prone for the
-	 * atomic case seems an acceptable compromise.
-	 *
-	 * Unfortunately this leads to a bimodal routine, when the goal was
-	 * to have a single reset function that worked for resetting any
-	 * number of engines simultaneously.
-	 */
-	might_sleep_if(engine_mask == ALL_ENGINES);
-
 	/*
 	 * If the power well sleeps during the reset, the reset
 	 * request may be dropped and never completes (causing -EIO).
@@ -451,8 +436,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 		}
 		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
 			break;
-
-		cond_resched();
 	}
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 
@@ -620,12 +603,19 @@ static void reset_engine(struct intel_engine_cs *engine,
 	engine->reset.reset(engine, rq);
 }
 
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
+	int err;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
+	/*
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.
+	 */
+	err = i915_ggtt_enable_hw(i915);
+	if (err)
+		return err;
 
 	i915_retire_requests(i915);
 
@@ -659,6 +649,7 @@ static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 	}
 
 	i915_gem_restore_fences(i915);
+	return 0;
 }
 
 static void reset_finish_engine(struct intel_engine_cs *engine)
@@ -875,6 +866,42 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return true;
 }
 
+struct __i915_reset { /* argument bundle for __i915_reset__BKL() */
+	struct drm_i915_private *i915; /* device to reset */
+	unsigned int stalled_mask; /* forwarded to gt_reset() */
+};
+
+static int __i915_reset__BKL(void *data) /* callback run via do_reset() */
+{
+	struct __i915_reset *arg = data; /* the struct built by do_reset() */
+	int err;
+
+	err = intel_gpu_reset(arg->i915, ALL_ENGINES); /* always all engines */
+	if (err)
+		return err; /* reset failed: gt_reset() is skipped */
+
+	return gt_reset(arg->i915, arg->stalled_mask); /* propagate its result */
+}
+
+static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+	struct __i915_reset arg = { i915, stalled_mask }; /* positional init */
+	int err, i; /* err: result of the most recent attempt */
+
+	cpus_read_lock(); /* held across the stop_machine_cpuslocked() calls */
+	lock_map_acquire(&i915->gpu_error.lockdep); /* moved from i915_reset() */
+
+	err = stop_machine_cpuslocked(__i915_reset__BKL, &arg, NULL); /* 1st try */
+	for (i = 0; err && i < 3; i++) { /* up to 3 retries: 4 attempts total */
+		msleep(100); /* sleep between attempts, outside the callback */
+		err = stop_machine_cpuslocked(__i915_reset__BKL, &arg, NULL);
+	}
+
+	lock_map_release(&i915->gpu_error.lockdep);
+	cpus_read_unlock();
+
+	return err; /* result of the final attempt; loop stops once err is 0 */
+}
 /**
  * i915_reset - reset chip after a hang
  * @i915: #drm_i915_private to reset
@@ -900,7 +927,6 @@ void i915_reset(struct drm_i915_private *i915,
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	int ret;
-	int i;
 
 	GEM_TRACE("flags=%lx\n", error->flags);
 
@@ -911,8 +937,6 @@ void i915_reset(struct drm_i915_private *i915,
 	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
 		return;
 
-	lock_map_acquire(&error->lockdep);
-
 	/* Clear any previous failed attempts at recovery. Time to try again. */
 	if (!i915_gem_unset_wedged(i915))
 		goto wakeup;
@@ -935,32 +959,11 @@ void i915_reset(struct drm_i915_private *i915,
 		goto error;
 	}
 
-	for (i = 0; i < 3; i++) {
-		ret = intel_gpu_reset(i915, ALL_ENGINES);
-		if (ret == 0)
-			break;
-
-		msleep(100);
-	}
-	if (ret) {
+	if (do_reset(i915, stalled_mask)) {
 		dev_err(i915->drm.dev, "Failed to reset chip\n");
 		goto taint;
 	}
 
-	/* Ok, now get things going again... */
-
-	/*
-	 * Everything depends on having the GTT running, so we need to start
-	 * there.
-	 */
-	ret = i915_ggtt_enable_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	gt_reset(i915, stalled_mask);
 	intel_overlay_reset(i915);
 
 	/*
@@ -983,7 +986,6 @@ void i915_reset(struct drm_i915_private *i915,
 finish:
 	reset_finish(i915);
 wakeup:
-	lock_map_release(&error->lockdep);
 	clear_bit(I915_RESET_HANDOFF, &error->flags);
 	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
 	return;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1ff523dae6e2..17e1f9bd663d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -600,6 +600,7 @@ int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
 	set_state(&msdata, MULTI_STOP_PREPARE);
 	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
 }
+EXPORT_SYMBOL_GPL(stop_machine_cpuslocked);
 
 int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 {
-- 
2.18.0



More information about the Intel-gfx-trybot mailing list