[Intel-gfx] [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged

Chris Wilson chris at chris-wilson.co.uk
Thu Mar 15 13:14:51 UTC 2018


If we fail to reset the GPU, we declare the machine wedged. However, the
GPU may well still be running in the background with an in-flight
request. So despite our efforts in cleaning up the request queue and
faking the breadcrumb in the HWSP, the GPU may eventually write the
in-flght seqno there breaking all of our assumptions and throwing the
driver into a deep turmoil, wedging beyond wedged.

To avoid this we ideally want to reset the GPU. Since that has already
failed, make sure the rings have the stop bit set instead. This is part
of the normal GPU reset sequence, but that is actually disabled by
igt/gem_eio to force the wedged state. If we assume the worst, we must
poke at the bit again before we give up.

Testcase: igt/gem_eio
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Cc: MichaƂ Winiarski <michal.winiarski at intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
Cc: Michel Thierry <michel.thierry at intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |  1 +
 drivers/gpu/drm/i915/intel_engine_cs.c  | 43 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
 drivers/gpu/drm/i915/intel_uncore.c     | 46 +--------------------------------
 4 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2fbd622bba30..0f69fc6d34b4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3238,6 +3238,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	 * rolling the global seqno forward (since this would complete requests
 	 * for which we haven't set the fence error to EIO yet).
 	 */
+	intel_engines_stop(i915, ALL_ENGINES);
 	for_each_engine(engine, i915, id) {
 		i915_gem_reset_prepare_engine(engine);
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 337dfa56a738..44eddf10f4d1 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1727,6 +1727,49 @@ unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915)
 	return which;
 }
 
+static void gen3_stop_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	const u32 base = engine->mmio_base;
+	const i915_reg_t mode = RING_MI_MODE(base);
+
+	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
+	if (intel_wait_for_register_fw(dev_priv,
+				       mode,
+				       MODE_IDLE,
+				       MODE_IDLE,
+				       500))
+		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
+				 engine->name);
+
+	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
+	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
+
+	I915_WRITE_FW(RING_HEAD(base), 0);
+	I915_WRITE_FW(RING_TAIL(base), 0);
+	POSTING_READ_FW(RING_TAIL(base));
+
+	/* The ring must be empty before it is disabled */
+	I915_WRITE_FW(RING_CTL(base), 0);
+
+	/* Check acts as a post */
+	if (I915_READ_FW(RING_HEAD(base)) != 0)
+		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
+				 engine->name);
+}
+
+void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (INTEL_GEN(i915) < 3)
+		return;
+
+	for_each_engine_masked(engine, i915, engine_mask, id)
+		gen3_stop_engine(engine);
+}
+
 static void print_request(struct drm_printer *m,
 			  struct i915_request *rq,
 			  const char *prefix)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 1f50727a5ddb..1369f7c5b57e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -1018,6 +1018,8 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
 void intel_engines_park(struct drm_i915_private *i915);
 void intel_engines_unpark(struct drm_i915_private *i915);
 
+void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask);
+
 void intel_engines_reset_default_submission(struct drm_i915_private *i915);
 unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 4df7c2ef8576..cda4c06acf16 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1639,50 +1639,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	const u32 base = engine->mmio_base;
-	const i915_reg_t mode = RING_MI_MODE(base);
-
-	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
-	if (intel_wait_for_register_fw(dev_priv,
-				       mode,
-				       MODE_IDLE,
-				       MODE_IDLE,
-				       500))
-		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
-				 engine->name);
-
-	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
-	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
-
-	I915_WRITE_FW(RING_HEAD(base), 0);
-	I915_WRITE_FW(RING_TAIL(base), 0);
-	POSTING_READ_FW(RING_TAIL(base));
-
-	/* The ring must be empty before it is disabled */
-	I915_WRITE_FW(RING_CTL(base), 0);
-
-	/* Check acts as a post */
-	if (I915_READ_FW(RING_HEAD(base)) != 0)
-		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
-				 engine->name);
-}
-
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (INTEL_GEN(dev_priv) < 3)
-		return;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
-}
-
 static bool i915_in_reset(struct pci_dev *pdev)
 {
 	u8 gdrst;
@@ -2057,7 +2013,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 		 *
 		 * FIXME: Wa for more modern gens needs to be validated
 		 */
-		i915_stop_engines(dev_priv, engine_mask);
+		intel_engines_stop(dev_priv, engine_mask);
 
 		ret = -ENODEV;
 		if (reset)
-- 
2.16.2



More information about the Intel-gfx mailing list