[Intel-gfx] [PATCH v7 05/20] drm/i915: Cancel reset-engine if we couldn't find an active request

Michel Thierry michel.thierry at intel.com
Thu Apr 27 23:12:45 UTC 2017


Before reseting an engine, check if there is an active request, and if
the _hung_ request has completed. In these two cases, the seqno has moved
after hang declaration and we can skip the reset.

Also store the active request so that we only search for it once.

Suggested-by: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Michel Thierry <michel.thierry at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c | 37 +++++++++++++++++++++++++++++--------
 drivers/gpu/drm/i915/i915_drv.h |  6 ++++--
 drivers/gpu/drm/i915/i915_gem.c | 37 ++++++++++++++++++++++++-------------
 3 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ae891529dedd..a64e9b63cdbc 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1811,7 +1811,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
 	pr_notice("drm/i915: Resetting chip after gpu hang\n");
 	disable_irq(dev_priv->drm.irq);
 	ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
-	if (ret) {
+	if (ret == -EIO) {
 		DRM_ERROR("GPU recovery failed\n");
 		intel_gpu_reset(dev_priv, ALL_ENGINES);
 		goto error;
@@ -1883,23 +1883,40 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 	int ret;
 	struct drm_i915_private *dev_priv = engine->i915;
 	struct i915_gpu_error *error = &dev_priv->gpu_error;
+	struct drm_i915_gem_request *active_request;
 
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
 
-	ret = i915_gem_reset_prepare_engine(engine);
-	if (ret) {
-		DRM_ERROR("Previous reset failed - promote to full reset\n");
-		goto out;
+	active_request = i915_gem_reset_prepare_engine(engine);
+	if (!active_request) {
+		DRM_DEBUG_DRIVER("seqno moved after hang declaration, pardoned\n");
+		goto canceled;
+	}
+	if (IS_ERR(active_request)) {
+		ret = PTR_ERR(active_request);
+		if (ret == -ECANCELED) {
+			DRM_DEBUG_DRIVER("no active request found, skip reset\n");
+			goto canceled;
+		} else if (ret) {
+			DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
+			goto out;
+		}
 	}
 
+	if (__i915_gem_request_completed(active_request, engine->hangcheck.seqno)) {
+		DRM_DEBUG_DRIVER("request completed, skip the reset\n");
+		goto canceled;
+	}
+
+
 	/*
-	 * the request that caused the hang is stuck on elsp, identify the
-	 * active request and drop it, adjust head to skip the offending
+	 * the request that caused the hang is stuck on elsp, we know the
+	 * active request and can drop it, adjust head to skip the offending
 	 * request to resume executing remaining requests in the queue.
 	 */
-	i915_gem_reset_engine(engine);
+	i915_gem_reset_engine(engine, active_request);
 
 	/* forcing engine to idle */
 	ret = intel_reset_engine_start(engine);
@@ -1928,6 +1945,10 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 
 out:
 	return ret;
+
+canceled:
+	i915_gem_reset_finish_engine(engine);
+	return 0;
 }
 
 static int i915_pm_suspend(struct device *kdev)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index efbf34318893..8e93189c2104 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3439,7 +3439,8 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
 	return READ_ONCE(error->reset_count);
 }
 
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
 			   unsigned int engine_mask);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
@@ -3448,7 +3449,8 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
 			   unsigned int engine_mask);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
-void i915_gem_reset_engine(struct intel_engine_cs *engine);
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+			   struct drm_i915_gem_request *request);
 
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bce38062f94e..4e357d333cc2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2793,12 +2793,15 @@ static bool engine_stalled(struct intel_engine_cs *engine)
 	return true;
 }
 
-/* Ensure irq handler finishes, and not run again. */
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+/*
+ * Ensure irq handler finishes, and not run again.
+ * For reset-engine we also store the active request so that we only search
+ * for it once.
+ */
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *request;
-	int err = 0;
-
+	struct drm_i915_gem_request *request = NULL;
 
 	/* Prevent the signaler thread from updating the request
 	 * state (by calling dma_fence_signal) as we are processing
@@ -2827,22 +2830,29 @@ int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 
 	if (engine_stalled(engine)) {
 		request = i915_gem_find_active_request(engine);
+		if (!request)
+			return ERR_PTR(-ECANCELED); /* Can't find a request, abort! */
+
 		if (request && request->fence.error == -EIO)
-			err = -EIO; /* Previous reset failed! */
+			return ERR_PTR(-EIO); /* Previous reset failed! */
 	}
 
-	return err;
+	return request;
 }
 
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
 			   unsigned int engine_mask)
 {
 	struct intel_engine_cs *engine;
+	struct drm_i915_gem_request *request;
 	unsigned int tmp;
 	int err = 0;
 
-	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-		err = i915_gem_reset_prepare_engine(engine);
+	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
+		request = i915_gem_reset_prepare_engine(engine);
+		if (request && IS_ERR(request))
+			err = PTR_ERR(request);
+	}
 
 	i915_gem_revoke_fences(dev_priv);
 
@@ -2929,11 +2939,12 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
 	return guilty;
 }
 
-void i915_gem_reset_engine(struct intel_engine_cs *engine)
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+			   struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request;
+	if (!request)
+		request = i915_gem_find_active_request(engine);
 
-	request = i915_gem_find_active_request(engine);
 	if (request && i915_gem_reset_request(request)) {
 		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
 				 engine->name, request->global_seqno);
@@ -2959,7 +2970,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
 	for_each_engine(engine, dev_priv, id) {
 		struct i915_gem_context *ctx;
 
-		i915_gem_reset_engine(engine);
+		i915_gem_reset_engine(engine, NULL);
 		ctx = fetch_and_zero(&engine->last_retired_context);
 		if (ctx)
 			engine->context_unpin(engine, ctx);
-- 
2.11.0



More information about the Intel-gfx mailing list