[PATCH 14/20] drm/i915/guc: Add support for reset engine using GuC commands

Michel Thierry michel.thierry at intel.com
Fri May 19 15:43:31 UTC 2017


This patch adds per engine reset and recovery (TDR) support when GuC is
used to submit workloads to GPU.

In the case of i915 directly submission to ELSP, driver manages hang
detection, recovery and resubmission. With GuC submission these tasks
are shared between driver and GuC. i915 is still responsible for detecting
a hang, and when it does it only requests GuC to reset that Engine. GuC
internally manages acquiring forcewake and idling the engine before actually
resetting it.

Once the reset is successful, i915 takes over again and handles resubmission.
The scheduler in i915 knows which requests are pending so after resetting
a engine, pending workloads/requests are resubmitted again.

v2: s/i915_guc_request_engine_reset/i915_guc_reset_engine/ to match the
non-guc funtion names.

v3: Removed debug message about engine restarting from which request,
since the new baseline do it regardless of submission mode. (Chris)

Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
Signed-off-by: Michel Thierry <michel.thierry at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            | 42 +++++++++++++++++---------
 drivers/gpu/drm/i915/i915_drv.h            |  1 +
 drivers/gpu/drm/i915/i915_guc_submission.c | 48 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_guc_fwif.h      |  6 ++++
 drivers/gpu/drm/i915/intel_uc.h            |  1 +
 drivers/gpu/drm/i915/intel_uncore.c        |  5 ----
 6 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index fd1bda67ad21..c48e7e44b697 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1915,24 +1915,34 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 	 */
 	i915_gem_reset_engine(engine, active_request);
 
-	/* forcing engine to idle */
-	ret = intel_reset_engine_start(engine);
-	if (ret) {
-		DRM_ERROR("Failed to disable %s\n", engine->name);
-		goto out;
-	}
+	if (!dev_priv->guc.execbuf_client) {
+		/* forcing engine to idle */
+		ret = intel_reset_engine_start(engine);
+		if (ret) {
+			DRM_ERROR("Failed to disable %s\n", engine->name);
+			goto out;
+		}
 
-	/* finally, reset engine */
-	ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
-	if (ret) {
-		DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+		/* finally, reset engine */
+		ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+		if (ret) {
+			DRM_ERROR("Failed to reset %s, ret=%d\n",
+				  engine->name, ret);
+			intel_reset_engine_cancel(engine);
+			goto out;
+		}
+
+		/* be sure the request reset bit gets cleared */
 		intel_reset_engine_cancel(engine);
-		goto out;
+	} else {
+		ret = i915_guc_reset_engine(engine);
+		if (ret) {
+			DRM_ERROR("GuC failed to reset %s, ret=%d\n",
+				  engine->name, ret);
+			goto out;
+		}
 	}
 
-	/* be sure the request reset bit gets cleared */
-	intel_reset_engine_cancel(engine);
-
 	i915_gem_reset_finish_engine(engine);
 
 	/*
@@ -1944,6 +1954,10 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 	if (ret)
 		goto out;
 
+	/* for guc too */
+	if (dev_priv->guc.execbuf_client)
+		i915_guc_submission_reenable_engine(engine);
+
 	error->reset_engine_count[engine->id]++;
 out:
 	return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8c32b035cb90..e53c9bfb74e6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3056,6 +3056,7 @@ extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_reset_engine_start(struct intel_engine_cs *engine);
 extern void intel_reset_engine_cancel(struct intel_engine_cs *engine);
 extern int intel_reset_guc(struct drm_i915_private *dev_priv);
+extern int i915_guc_reset_engine(struct intel_engine_cs *engine);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index e21cbd9753e3..22ab1feda17d 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1367,6 +1367,25 @@ void i915_guc_submission_disable(struct drm_i915_private *dev_priv)
 	guc->execbuf_client = NULL;
 }
 
+void i915_guc_submission_reenable_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct intel_guc *guc = &dev_priv->guc;
+	struct i915_guc_client *client = guc->execbuf_client;
+	const int wqi_size = sizeof(struct guc_wq_item);
+	struct drm_i915_gem_request *rq;
+
+	GEM_BUG_ON(!client);
+	intel_guc_sample_forcewake(guc);
+
+	spin_lock_irq(&engine->timeline->lock);
+	list_for_each_entry(rq, &engine->timeline->requests, link) {
+		guc_client_update_wq_rsvd(client, wqi_size);
+		__i915_guc_submit(rq);
+	}
+	spin_unlock_irq(&engine->timeline->lock);
+}
+
 /**
  * intel_guc_suspend() - notify GuC entering suspend state
  * @dev_priv:	i915 device private
@@ -1418,3 +1437,32 @@ int intel_guc_resume(struct drm_i915_private *dev_priv)
 
 	return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
+
+int i915_guc_reset_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct intel_guc *guc = &dev_priv->guc;
+	struct i915_gem_context *ctx;
+	u32 data[7];
+
+	if (!i915.enable_guc_submission)
+		return 0;
+
+	ctx = dev_priv->kernel_context;
+
+	/*
+	 * The affected context report is populated by GuC and is provided
+	 * to the driver using the shared page. We request for it but don't
+	 * use it as scheduler has all of these details.
+	 */
+	data[0] = INTEL_GUC_ACTION_REQUEST_ENGINE_RESET;
+	data[1] = engine->guc_id;
+	data[2] = INTEL_GUC_RESET_OPTION_REPORT_AFFECTED_CONTEXTS;
+	data[3] = 0;
+	data[4] = 0;
+	data[5] = guc->execbuf_client->stage_id;
+	/* first page is shared data with GuC */
+	data[6] = guc_ggtt_offset(ctx->engine[RCS].state);
+
+	return intel_guc_send(guc, data, ARRAY_SIZE(data));
+}
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h
index e6f8079df94a..081f2cf614e6 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -505,6 +505,7 @@ union guc_log_control {
 /* This Action will be programmed in C180 - SOFT_SCRATCH_O_REG */
 enum intel_guc_action {
 	INTEL_GUC_ACTION_DEFAULT = 0x0,
+	INTEL_GUC_ACTION_REQUEST_ENGINE_RESET = 0x3,
 	INTEL_GUC_ACTION_SAMPLE_FORCEWAKE = 0x6,
 	INTEL_GUC_ACTION_ALLOCATE_DOORBELL = 0x10,
 	INTEL_GUC_ACTION_DEALLOCATE_DOORBELL = 0x20,
@@ -518,6 +519,11 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_LIMIT
 };
 
+/* Reset engine options */
+enum action_engine_reset_options {
+	INTEL_GUC_RESET_OPTION_REPORT_AFFECTED_CONTEXTS = 0x10,
+};
+
 /*
  * The GuC sends its response to a command by overwriting the
  * command in SS0. The response is distinguishable from a command
diff --git a/drivers/gpu/drm/i915/intel_uc.h b/drivers/gpu/drm/i915/intel_uc.h
index 7618b7100175..97a87ea6e42d 100644
--- a/drivers/gpu/drm/i915/intel_uc.h
+++ b/drivers/gpu/drm/i915/intel_uc.h
@@ -264,6 +264,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *rq);
 void i915_guc_wq_unreserve(struct drm_i915_gem_request *request);
 void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
 void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
+void i915_guc_submission_reenable_engine(struct intel_engine_cs *engine);
 struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size);
 
 /* intel_guc_log.c */
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 2d2b87797e1a..e189e97a58cb 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1803,14 +1803,9 @@ bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
 	return intel_get_gpu_reset(dev_priv) != NULL;
 }
 
-/*
- * When GuC submission is enabled, GuC manages ELSP and can initiate the
- * engine reset too. For now, fall back to full GPU reset if it is enabled.
- */
 bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
 {
 	return (dev_priv->info.has_reset_engine &&
-		!dev_priv->guc.execbuf_client &&
 		i915.reset >= 2);
 }
 
-- 
2.11.0



More information about the Intel-gfx-trybot mailing list