[PATCH v2 2/3] drm/i915/guc: Close deregister-context race against CT-loss

Rodrigo Vivi rodrigo.vivi at intel.com
Tue Aug 15 13:56:04 UTC 2023


On Mon, Aug 14, 2023 at 06:12:09PM -0700, Alan Previn wrote:
> If we are at the end of suspend or very early in resume,
> it's possible that an async fence signal could lead us to
> the execution of the context-destruction worker (after the
> prior worker flush).
> 
> Even if we check that the CT is enabled before calling
> destroyed_worker_func, guc_lrc_desc_unpin may still fail
> because, in corner cases, the CT could get disabled in the
> middle of our traversal of the GuC's context-destroy list,
> right before we call the GuC's CT send function.
> 
> We've witnessed this race condition once every ~6000-8000
> suspend-resume cycles while ensuring that a workload which
> renders something onscreen is continuously started just
> before we suspend (and the workload is small enough to
> complete and trigger the queued engine/context free-up
> either very late in suspend or very early in resume).
> 
> In such a case, we need to unroll the unpin process because
> guc_lrc_desc_unpin takes a gt wakeref which only gets
> released by the G2H IRQ reply that, in this corner case,
> never comes through. That leaked wakeref then cascades into
> a kernel hang later, at the tail end of suspend, in this
> function:
> 
>    intel_wakeref_wait_for_idle(&gt->wakeref)
>    (called by) - intel_gt_pm_wait_for_idle
>    (called by) - wait_for_suspend
> 
> Doing this unroll and keeping the context in the GuC's
> destroy-list allows the context to get picked up on the
> next destroy-worker invocation, or purged as part of a
> major GuC sanitization or reset flow.
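
(Side note for readers following the thread: the window described
above is roughly the interleaving below. This is a sketch assembled
from the commit message and the patch context, not verbatim i915
code.)

	/*
	 * CPU A (suspend path)            CPU B (destroyed worker)
	 * --------------------            ------------------------
	 *                                 deregister_destroyed_contexts()
	 *                                   ce = list_first_entry(...)
	 *                                   guc_lrc_desc_unpin(ce)
	 * CT link goes down
	 *                                     deregister_context()
	 *                                       H2G send fails; the G2H
	 *                                       reply that would release
	 *                                       the gt wakeref never comes
	 * wait_for_suspend()
	 *   intel_gt_pm_wait_for_idle() <-- hangs waiting on the wakeref
	 */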
> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis at intel.com>
> ---
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 40 +++++++++++++++++--
>  1 file changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 050572bb8dbe..ddb4ee4c4e51 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -235,6 +235,13 @@ set_context_destroyed(struct intel_context *ce)
>  	ce->guc_state.sched_state |= SCHED_STATE_DESTROYED;
>  }
>  
> +static inline void
> +clr_context_destroyed(struct intel_context *ce)
> +{
> +	lockdep_assert_held(&ce->guc_state.lock);
> +	ce->guc_state.sched_state &= ~SCHED_STATE_DESTROYED;
> +}
> +
>  static inline bool context_pending_disable(struct intel_context *ce)
>  {
>  	return ce->guc_state.sched_state & SCHED_STATE_PENDING_DISABLE;
> @@ -3175,7 +3182,7 @@ static void guc_context_close(struct intel_context *ce)
>  	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>  }
>  
> -static inline void guc_lrc_desc_unpin(struct intel_context *ce)
> +static inline int guc_lrc_desc_unpin(struct intel_context *ce)
>  {
>  	struct intel_guc *guc = ce_to_guc(ce);
>  	struct intel_gt *gt = guc_to_gt(guc);
> @@ -3199,10 +3206,20 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce)
>  	if (unlikely(disabled)) {
>  		release_guc_id(guc, ce);
>  		__guc_context_destroy(ce);
> -		return;
> +		return 0;
> +	}
> +
> +	if (deregister_context(ce, ce->guc_id.id)) {
> +		/* Seal race with concurrent suspend by unrolling */
> +		spin_lock_irqsave(&ce->guc_state.lock, flags);
> +		set_context_registered(ce);
> +		clr_context_destroyed(ce);
> +		intel_gt_pm_put(gt);

How sure are we that this is not an unbalanced put?
Could we wrap this in some existing function to make that clear?
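
For the archives, the sort of wrapper being asked about could look
something like this (a sketch only: __guc_context_unroll_deregister
is an invented name, not an existing i915 helper, and it simply
mirrors the unwind steps in the patch so the put is visibly paired
with the state it unwinds):

	/*
	 * Hypothetical helper (name invented for illustration): undo
	 * exactly what guc_lrc_desc_unpin() set up before the failed
	 * H2G, so the gt wakeref put is auditable in one place.
	 */
	static void __guc_context_unroll_deregister(struct intel_context *ce)
	{
		struct intel_gt *gt = guc_to_gt(ce_to_guc(ce));

		lockdep_assert_held(&ce->guc_state.lock);

		set_context_registered(ce);
		clr_context_destroyed(ce);
		/* drops the ref taken for the G2H reply that won't arrive */
		intel_gt_pm_put(gt);
	}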

> +		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +		return -ENODEV;
>  	}
>  
> -	deregister_context(ce, ce->guc_id.id);
> +	return 0;
>  }
>  
>  static void __guc_context_destroy(struct intel_context *ce)
> @@ -3270,7 +3287,22 @@ static void deregister_destroyed_contexts(struct intel_guc *guc)
>  		if (!ce)
>  			break;
>  
> -		guc_lrc_desc_unpin(ce);
> +		if (guc_lrc_desc_unpin(ce)) {
> +			/*
> +			 * This means the GuC's CT link was severed mid-way, which could happen
> +			 * in suspend-resume corner cases. In this case, put the
> +			 * context back into the destroyed_contexts list which will
> +			 * get picked up on the next context deregistration event or
> +			 * purged in a GuC sanitization event (reset/unload/wedged/...).
> +			 */
> +			spin_lock_irqsave(&guc->submission_state.lock, flags);
> +			list_add_tail(&ce->destroyed_link,
> +				      &guc->submission_state.destroyed_contexts);
> +			spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> +			/* Bail now since the list might never be emptied if H2Gs fail */
> +			break;
> +		}
> +
>  	}
>  }
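
To spell out the retry contract this creates (a sketch of the caller
side, simplified from the existing destroyed_worker_func in this
file, which in reality also wraps the call in a gt wakeref):

	static void destroyed_worker_func(struct work_struct *w)
	{
		struct intel_guc *guc = container_of(w, struct intel_guc,
						     submission_state.destroyed_worker);

		/*
		 * Because a failed context was re-queued on
		 * destroyed_contexts, re-running this on the next worker
		 * invocation (or in a GuC reset/sanitization flow) retries
		 * the deregistration once the CT link is back up.
		 */
		deregister_destroyed_contexts(guc);
	}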
>  
> -- 
> 2.39.0
> 

