[PATCH] drm/i915/gt: Stop poisoning the idle kernel context alone when waking up

Krzysztof Niemiec krzysztof.niemiec at intel.com
Wed Jul 24 14:17:47 UTC 2024


From: Chris Wilson <chris.p.wilson at linux.intel.com>

The kernel context was poisoned on wakeup to simulate how the driver
would cope with bad HW that caused corruption of any context that was
still resident during power loss, see commit 1d0e2c9359fe ("drm/i915/gt:
Always poison the kernel_context image before unparking"). However,
clearing the entire context image also poisons the ppHWSP, which may
contain pertinent information (such as the breadcrumb and context
switches) that we want to preserve. We could restrict the poisoning to
exclude the ppHWSP or, more simply, recognise that we have poisoned the
HW enough and can leave the verification to after suspend/resume.

References: 1d0e2c9359fe ("drm/i915/gt: Always poison the kernel_context image before unparking")
Signed-off-by: Chris Wilson <chris.p.wilson at linux.intel.com>
Signed-off-by: Krzysztof Niemiec <krzysztof.niemiec at intel.com>
---
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  |  4 ++++
 drivers/gpu/drm/i915/gt/intel_engine_pm.c     | 24 -------------------
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 12 ++++++----
 3 files changed, 12 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 8d4bb95f8424..7d69bc496283 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -127,6 +127,10 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
 		 */
 		intel_guc_find_hung_context(engine);
 
+	/* If the heartbeat failed to resume after reset, declare an emergency. */
+	if (xchg(&rq->fence.error, -ENODEV) == -ENODEV)
+		intel_gt_set_wedged(engine->gt);
+
 	intel_gt_handle_error(engine->gt, engine->mask,
 			      I915_ERROR_CAPTURE,
 			      "stopped heartbeat on %s",
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index fb7bff27b45a..a5c42b784168 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -59,35 +59,11 @@ static int __engine_unpark(struct intel_wakeref *wf)
 {
 	struct intel_engine_cs *engine =
 		container_of(wf, typeof(*engine), wakeref);
-	struct intel_context *ce;
 
 	ENGINE_TRACE(engine, "\n");
 
 	engine->wakeref_track = intel_gt_pm_get(engine->gt);
 
-	/* Discard stale context state from across idling */
-	ce = engine->kernel_context;
-	if (ce) {
-		GEM_BUG_ON(test_bit(CONTEXT_VALID_BIT, &ce->flags));
-
-		/* Flush all pending HW writes before we touch the context */
-		while (unlikely(intel_context_inflight(ce)))
-			intel_engine_flush_submission(engine);
-
-		/* First poison the image to verify we never fully trust it */
-		dbg_poison_ce(ce);
-
-		/* Scrub the context image after our loss of control */
-		ce->ops->reset(ce);
-
-		CE_TRACE(ce, "reset { seqno:%x, *hwsp:%x, ring:%x }\n",
-			 ce->timeline->seqno,
-			 READ_ONCE(*ce->timeline->hwsp_seqno),
-			 ce->ring->emit);
-		GEM_BUG_ON(ce->timeline->seqno !=
-			   READ_ONCE(*ce->timeline->hwsp_seqno));
-	}
-
 	if (engine->unpark)
 		engine->unpark(engine);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 7bd5d2c29056..f742707b703e 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1017,15 +1017,19 @@ void lrc_init_state(struct intel_context *ce,
 
 	set_redzone(state, engine);
 
+	/* Clear the ppHWSP (inc. per-context counters) */
+	if (!test_bit(CONTEXT_VALID_BIT, &ce->flags))
+		memset(state, 0, LRC_STATE_OFFSET);
+
 	if (ce->default_state) {
-		shmem_read(ce->default_state, 0, state, engine->context_size);
+		shmem_read(ce->default_state, /* exclude ppHWSP */
+			   LRC_STATE_OFFSET,
+			   state + LRC_STATE_OFFSET,
+			   engine->context_size - LRC_STATE_OFFSET);
 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
 		inhibit = false;
 	}
 
-	/* Clear the ppHWSP (inc. per-context counters) */
-	memset(state, 0, PAGE_SIZE);
-
 	/* Clear the indirect wa and storage */
 	if (ce->wa_bb_page)
 		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
-- 
2.43.0



More information about the Intel-gfx mailing list