[PATCH 2/4] drm/i915/guc: Fix race between guc_request_alloc and guc_context_close

Alan Previn <alan.previn.teres.alexis@intel.com>
Fri Sep 9 04:52:18 UTC 2022


From: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable. If
context_close wins, it will immediately disable the context, but
request_alloc might in the meantime have assumed that no delayed
disable was pending and managed to get past the context status check,
therefore assuming that the context is still enabled. The consequence
is that the context gets disabled while there is a pending submission,
which will never complete.

To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 48 ++++++++++++++++---
 1 file changed, 42 insertions(+), 6 deletions(-)
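
For reference, below is a minimal sketch of the close/alloc handshake this
patch establishes, written against a simplified stand-in context. All
demo_* names, the single CLOSED bit and the empty helpers are illustrative
placeholders rather than i915 code, and wait_for() is assumed to be i915's
polling macro from i915_utils.h, as used in the diff itself:

#include <linux/bits.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#define DEMO_STATE_CLOSED       BIT(0)

struct demo_context {
        spinlock_t lock;                        /* protects sched_state */
        unsigned long sched_state;
        bool closed;                            /* set once the context is closed */
        struct delayed_work sched_disable_delay;
};

/* Stand-ins for the real sched-disable helpers. */
static void demo_sched_disable(struct demo_context *ce) { }
static void demo_sched_disable_unpin(struct demo_context *ce) { }

static bool demo_close_done(struct demo_context *ce)
{
        return ce->sched_state & DEMO_STATE_CLOSED;
}

/*
 * Close path: if the delayed disable was still pending, run it
 * immediately, then publish "close has finished" under the lock.
 */
static void demo_context_close(struct demo_context *ce)
{
        unsigned long flags;

        if (cancel_delayed_work(&ce->sched_disable_delay))
                demo_sched_disable(ce);

        spin_lock_irqsave(&ce->lock, flags);
        ce->sched_state |= DEMO_STATE_CLOSED;
        spin_unlock_irqrestore(&ce->lock, flags);
}

/*
 * Submission path: if the close path already cancelled the delayed
 * work, poll until it has finished before trusting any context state.
 */
static void demo_request_alloc(struct demo_context *ce)
{
        if (cancel_delayed_work_sync(&ce->sched_disable_delay))
                demo_sched_disable_unpin(ce);
        else if (READ_ONCE(ce->closed))
                wait_for(demo_close_done(ce), 1);       /* i915's wait_for(cond, timeout_ms) */

        /* Only now is it safe to look at ce->sched_state. */
}

The key ordering is that the close path sets the CLOSED bit only after it
has issued the immediate sched disable, so once the submission path
observes the bit it knows the disable has already run and the scheduling
state it reads afterwards is up to date.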

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 4a25406704c9..a3724b7238d5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -169,7 +169,8 @@ guc_create_parallel(struct intel_engine_cs **engines,
 #define SCHED_STATE_PENDING_ENABLE			BIT(5)
 #define SCHED_STATE_REGISTERED				BIT(6)
 #define SCHED_STATE_POLICY_REQUIRED			BIT(7)
-#define SCHED_STATE_BLOCKED_SHIFT			8
+#define SCHED_STATE_CLOSED				BIT(8)
+#define SCHED_STATE_BLOCKED_SHIFT			9
 #define SCHED_STATE_BLOCKED		BIT(SCHED_STATE_BLOCKED_SHIFT)
 #define SCHED_STATE_BLOCKED_MASK	(0xfff << SCHED_STATE_BLOCKED_SHIFT)
 
@@ -179,12 +180,20 @@ static inline void init_sched_state(struct intel_context *ce)
 	ce->guc_state.sched_state &= SCHED_STATE_BLOCKED_MASK;
 }
 
+/*
+ * Kernel contexts can have SCHED_STATE_REGISTERED after suspend.
+ * A context close can race with the submission path, so SCHED_STATE_CLOSED
+ * can be set immediately before we try to register.
+ */
+#define SCHED_STATE_VALID_INIT \
+	(SCHED_STATE_BLOCKED_MASK | \
+	 SCHED_STATE_CLOSED | \
+	 SCHED_STATE_REGISTERED)
+
 __maybe_unused
 static bool sched_state_is_init(struct intel_context *ce)
 {
-	/* Kernel contexts can have SCHED_STATE_REGISTERED after suspend. */
-	return !(ce->guc_state.sched_state &
-		 ~(SCHED_STATE_BLOCKED_MASK | SCHED_STATE_REGISTERED));
+	return !(ce->guc_state.sched_state & ~SCHED_STATE_VALID_INIT);
 }
 
 static inline bool
@@ -325,6 +334,17 @@ static inline void clr_context_policy_required(struct intel_context *ce)
 	ce->guc_state.sched_state &= ~SCHED_STATE_POLICY_REQUIRED;
 }
 
+static inline bool context_close_done(struct intel_context *ce)
+{
+	return ce->guc_state.sched_state & SCHED_STATE_CLOSED;
+}
+
+static inline void set_context_close_done(struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->guc_state.lock);
+	ce->guc_state.sched_state |= SCHED_STATE_CLOSED;
+}
+
 static inline u32 context_blocked(struct intel_context *ce)
 {
 	return (ce->guc_state.sched_state & SCHED_STATE_BLOCKED_MASK) >>
@@ -3095,9 +3115,15 @@ static void guc_context_sched_disable(struct intel_context *ce)
 
 static void guc_context_close(struct intel_context *ce)
 {
+	unsigned long flags;
+
 	if (test_bit(CONTEXT_GUC_INIT, &ce->flags) &&
 	    cancel_delayed_work(&ce->guc_state.sched_disable_delay))
 		__delay_sched_disable(&ce->guc_state.sched_disable_delay.work);
+
+	spin_lock_irqsave(&ce->guc_state.lock, flags);
+	set_context_close_done(ce);
+	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
 }
 
 static inline void guc_lrc_desc_unpin(struct intel_context *ce)
@@ -3541,9 +3567,19 @@ static int guc_request_alloc(struct i915_request *rq)
 	if (unlikely(!test_bit(CONTEXT_GUC_INIT, &ce->flags)))
 		guc_context_init(ce);
 
-	if (cancel_delayed_work(&ce->guc_state.sched_disable_delay))
+	/*
+	 * If the context gets closed while the execbuf is ongoing, the context
+	 * close code will race with the below code to cancel the delayed work.
+	 * If the context close wins the race and cancels the work, it will
+	 * immediately call the sched disable (see guc_context_close), so there
+	 * is a chance we can get past this check while the sched_disable code
+	 * is being executed. To make sure that code completes before we check
+	 * the status further down, we wait for the close process to complete.
+	 */
+	if (cancel_delayed_work_sync(&ce->guc_state.sched_disable_delay))
 		intel_context_sched_disable_unpin(ce);
-
+	else if (intel_context_is_closed(ce))
+		wait_for(context_close_done(ce), 1);
 	/*
 	 * Call pin_guc_id here rather than in the pinning step as with
 	 * dma_resv, contexts can be repeatedly pinned / unpinned trashing the
-- 
2.25.1


