[Intel-gfx] [PATCH v3 1/6] drm/i915: Fix request locking during error capture & debugfs dump

John.C.Harrison at Intel.com John.C.Harrison at Intel.com
Thu Jan 19 06:49:55 UTC 2023


From: John Harrison <John.C.Harrison at Intel.com>

When GuC support was added to error capture, the locking around the
request object was broken. Fix it up.

The context based search manages the spinlocking around the search
internally. So it needs to grab the reference count internally as
well. The execlist only request based search relies on external
locking, so it needs an external reference count but within the
spinlock not outside it.

The only other caller of the context based search is the code for
dumping engine state to debugfs. That code wasn't previously getting
an explicit reference at all as it does everything while holding the
execlist specific spinlock. So, that needs updaing as well as that
spinlock doesn't help when using GuC submission. Rather than trying to
conditionally get/put depending on submission model, just change it to
always do the get/put.

In addition, intel_guc_find_hung_context() was not acquiring the
correct spinlock before searching the request list. So fix that up
too. While at it, add some extra whitespace padding for readability.

v2: Explicitly document adding an extra blank line in some dense code
(Andy Shevchenko). Fix multiple potential null pointer derefs in case
of no request found (some spotted by Tvrtko, but there was more!).
Also fix a leaked request in case of !started and another in
__guc_reset_context now that intel_context_find_active_request is
actually reference counting the returned request.

Fixes: dc0dad365c5e ("drm/i915/guc: Fix for error capture after full GPU reset with GuC")
Fixes: 573ba126aef3 ("drm/i915/guc: Capture error state on context reset")
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: John Harrison <John.C.Harrison at Intel.com>
Cc: Jani Nikula <jani.nikula at linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Cc: Andrzej Hajda <andrzej.hajda at intel.com>
Cc: Matthew Auld <matthew.auld at intel.com>
Cc: Matt Roper <matthew.d.roper at intel.com>
Cc: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
Cc: Michael Cheng <michael.cheng at intel.com>
Cc: Lucas De Marchi <lucas.demarchi at intel.com>
Cc: Tejas Upadhyay <tejaskumarx.surendrakumar.upadhyay at intel.com>
Cc: Andy Shevchenko <andriy.shevchenko at linux.intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
Cc: Alan Previn <alan.previn.teres.alexis at intel.com>
Cc: Bruce Chang <yu.bruce.chang at intel.com>
Cc: intel-gfx at lists.freedesktop.org
Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c           |  2 ++
 drivers/gpu/drm/i915/gt/intel_engine_cs.c         |  8 +++++++-
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 13 +++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c             | 12 ++++++------
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index e94365b08f1ef..e7c5509c48ef1 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -552,6 +552,8 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce)
 
 		active = rq;
 	}
+	if (active)
+		active = i915_request_get_rcu(active);
 	spin_unlock_irqrestore(&parent->guc_state.lock, flags);
 
 	return active;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 922f1bb22dc68..6a082658d0082 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -2236,10 +2236,14 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
 	guc = intel_uc_uses_guc_submission(&engine->gt->uc);
 	if (guc) {
 		ce = intel_engine_get_hung_context(engine);
-		if (ce)
+		if (ce) {
+			/* This will reference count the request (if found) */
 			hung_rq = intel_context_find_active_request(ce);
+		}
 	} else {
 		hung_rq = intel_engine_execlist_find_hung_request(engine);
+		if (hung_rq)
+			hung_rq = i915_request_get_rcu(hung_rq);
 	}
 
 	if (hung_rq)
@@ -2250,6 +2254,8 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
 	else
 		intel_engine_dump_active_requests(&engine->sched_engine->requests,
 						  hung_rq, m);
+	if (hung_rq)
+		i915_request_put(hung_rq);
 }
 
 void intel_engine_dump(struct intel_engine_cs *engine,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index b436dd7f12e42..d123cbd90a919 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1702,6 +1702,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
 			goto next_context;
 
 		guilty = false;
+		/* NB: This gets a reference to the request */
 		rq = intel_context_find_active_request(ce);
 		if (!rq) {
 			head = ce->ring->tail;
@@ -1715,6 +1716,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
 		head = intel_ring_wrap(ce->ring, rq->head);
 
 		__i915_request_reset(rq, guilty);
+		i915_request_put(rq);
 out_replay:
 		guc_reset_state(ce, head, guilty);
 next_context:
@@ -4820,6 +4822,8 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine)
 
 	xa_lock_irqsave(&guc->context_lookup, flags);
 	xa_for_each(&guc->context_lookup, index, ce) {
+		bool found;
+
 		if (!kref_get_unless_zero(&ce->ref))
 			continue;
 
@@ -4836,10 +4840,18 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine)
 				goto next;
 		}
 
+		found = false;
+		spin_lock(&ce->guc_state.lock);
 		list_for_each_entry(rq, &ce->guc_state.requests, sched.link) {
 			if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE)
 				continue;
 
+			found = true;
+			break;
+		}
+		spin_unlock(&ce->guc_state.lock);
+
+		if (found) {
 			intel_engine_set_hung_context(engine, ce);
 
 			/* Can only cope with one hang at a time... */
@@ -4847,6 +4859,7 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine)
 			xa_lock(&guc->context_lookup);
 			goto done;
 		}
+
 next:
 		intel_context_put(ce);
 		xa_lock(&guc->context_lookup);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 9d5d5a397b64e..7ea36478ee52d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1607,6 +1607,7 @@ capture_engine(struct intel_engine_cs *engine,
 	ce = intel_engine_get_hung_context(engine);
 	if (ce) {
 		intel_engine_clear_hung_context(engine);
+		/* This will reference count the request (if found) */
 		rq = intel_context_find_active_request(ce);
 		if (!rq || !i915_request_started(rq))
 			goto no_request_capture;
@@ -1618,21 +1619,18 @@ capture_engine(struct intel_engine_cs *engine,
 		if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
 			spin_lock_irqsave(&engine->sched_engine->lock, flags);
 			rq = intel_engine_execlist_find_hung_request(engine);
+			if (rq)
+				rq = i915_request_get_rcu(rq);
 			spin_unlock_irqrestore(&engine->sched_engine->lock,
 					       flags);
 		}
 	}
-	if (rq)
-		rq = i915_request_get_rcu(rq);
-
 	if (!rq)
 		goto no_request_capture;
 
 	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
-	if (!capture) {
-		i915_request_put(rq);
+	if (!capture)
 		goto no_request_capture;
-	}
 	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
 		intel_guc_capture_get_matching_node(engine->gt, ee, ce);
 
@@ -1642,6 +1640,8 @@ capture_engine(struct intel_engine_cs *engine,
 	return ee;
 
 no_request_capture:
+	if (rq)
+		i915_request_put(rq);
 	kfree(ee);
 	return NULL;
 }
-- 
2.39.0



More information about the Intel-gfx mailing list