[PATCH 1/4] drm/xe: Introduce a simple wedged state

Fri Apr 5 01:57:13 UTC 2024

On Wed, Apr 03, 2024 at 03:28:10PM -0400, Rodrigo Vivi wrote:
> On Wed, Apr 03, 2024 at 11:07:29AM -0400, Rodrigo Vivi wrote:
> > Introduce a very simple 'wedged' state where any attempt
> > to access the GPU is entirely blocked.
> > 
> > On some critical cases, like on gt_reset failure, we need to
> > block any other attempt to use the GPU. Otherwise we are at
> > a risk of reaching cases that would force us to reboot the machine.
> > 
> > So, when this cases are identified we corner and block any GPU
> > access. No IOCTL and not even another GT reset should be attempted.
> > 
> > The 'wedged' state in Xe is an end state with no way back.
> > Only a device "re-probe" (unbind + bind) can restore the GPU access.
> > 
> > v2: - s/wedged/busted (Lucas)
> >     - use unbind+bind instead of module reload (Lucas)
> >     - added more info on unbind operations and instruction on bug report
> >     - only print the message once.
> > 
> > v3: - s/busted/wedged (Ashutosh, Tvrtko, Thomas)
> >     - don't assume user has sudo and tee available (Lucas)
> >     - do inner protections against command submission instead
> >       of avoiding only the migration for a more reliable protection.
> > 
> > Cc: Ashutosh Dixit <ashutosh.dixit at intel.com>
> > Cc: Tvrtko Ursulin <tursulin at ursulin.net>
> > Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> > Cc: Anshuman Gupta <anshuman.gupta at intel.com>
> > Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > Reviewed-by: Lucas De Marchi <lucas.demarchi at intel.com>
> 
> I forgot to state that the reviews were for the first version.
> The new version here brought some changes to where I was blocking
> the execution. and I found out some bad performance impact.
> I will probably need to get back to the previous version for this
> regular wedged mode approach and get the deeper one only with a
> specific debug config enabled.
> 
> Matt, would you perhaps have a better idea on how to blocker the
> scheduler of running anything on wedged without any impact on
> the performance?
> 

Hmm, I'll play around with this a bit but off the of my head probably
use existing mechanisms to disable CTs and stop exec queue from
submiting by disabling the DRM schedulers. This is probably only
required in specific debug modes too.

Also we likely don't need the wedged to an atomic.

Lastly, even as writen I'm surprised a noticeable perf difference is
seen.

Matt

> > Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_device.c              |  6 ++++++
> >  drivers/gpu/drm/xe/xe_device.h              | 20 ++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_device_types.h        |  3 +++
> >  drivers/gpu/drm/xe/xe_gt.c                  |  5 ++++-
> >  drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c |  3 +++
> >  drivers/gpu/drm/xe/xe_guc_ct.c              |  8 ++++++++
> >  drivers/gpu/drm/xe/xe_guc_pc.c              |  3 +++
> >  drivers/gpu/drm/xe/xe_guc_submit.c          |  3 +++
> >  8 files changed, 50 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index 01bd5ccf05ca..7015ef9b00a0 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -142,6 +142,9 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >  	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
> >  	long ret;
> >  
> > +	if (xe_device_wedged(xe))
> > +		return -ECANCELED;
> > +
> >  	ret = xe_pm_runtime_get_ioctl(xe);
> >  	if (ret >= 0)
> >  		ret = drm_ioctl(file, cmd, arg);
> > @@ -157,6 +160,9 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
> >  	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
> >  	long ret;
> >  
> > +	if (xe_device_wedged(xe))
> > +		return -ECANCELED;
> > +
> >  	ret = xe_pm_runtime_get_ioctl(xe);
> >  	if (ret >= 0)
> >  		ret = drm_compat_ioctl(file, cmd, arg);
> > diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> > index d413bc2c6be5..c532209c5bbd 100644
> > --- a/drivers/gpu/drm/xe/xe_device.h
> > +++ b/drivers/gpu/drm/xe/xe_device.h
> > @@ -176,4 +176,24 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
> >  u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
> >  u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
> >  
> > +static inline bool xe_device_wedged(struct xe_device *xe)
> > +{
> > +	return atomic_read(&xe->wedged);
> > +}
> > +
> > +static inline void xe_device_declare_wedged(struct xe_device *xe)
> > +{
> > +	if (!atomic_xchg(&xe->wedged, 1)) {
> > +		xe->needs_flr_on_fini = true;
> > +		drm_err(&xe->drm,
> > +			"CRITICAL: Xe has declared device %s as wedged.\n"
> > +			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
> > +			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
> > +			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
> > +			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
> > +			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
> > +			dev_name(xe->drm.dev));
> > +	}
> > +}
> > +
> >  #endif
> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > index 1df3dcc17d75..0430bd51a5dd 100644
> > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > @@ -455,6 +455,9 @@ struct xe_device {
> >  	/** @needs_flr_on_fini: requests function-reset on fini */
> >  	bool needs_flr_on_fini;
> >  
> > +	/** @wedged: Xe device faced a critical error and is now blocked. */
> > +	atomic_t wedged;
> > +
> >  	/* private: */
> >  
> >  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
> > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > index cfa5da900461..0844081b88ef 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.c
> > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > @@ -633,6 +633,9 @@ static int gt_reset(struct xe_gt *gt)
> >  {
> >  	int err;
> >  
> > +	if (xe_device_wedged(gt_to_xe(gt)))
> > +		return -ECANCELED;
> > +
> >  	/* We only support GT resets with GuC submission */
> >  	if (!xe_device_uc_enabled(gt_to_xe(gt)))
> >  		return -ENODEV;
> > @@ -685,7 +688,7 @@ static int gt_reset(struct xe_gt *gt)
> >  err_fail:
> >  	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
> >  
> > -	gt_to_xe(gt)->needs_flr_on_fini = true;
> > +	xe_device_declare_wedged(gt_to_xe(gt));
> >  
> >  	return err;
> >  }
> > diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > index 93df2d7969b3..3a6075021542 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > @@ -236,6 +236,9 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
> >  {
> >  	struct xe_device *xe = gt_to_xe(gt);
> >  
> > +	if (xe_device_wedged(xe))
> > +		return 0;
> > +
> >  	if (xe_guc_ct_enabled(&gt->uc.guc.ct) &&
> >  	    gt->uc.guc.submission_state.enabled) {
> >  		int seqno;
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > index 6c37f4f9bddd..2047a49c71b7 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > @@ -638,6 +638,9 @@ static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
> >  	unsigned int sleep_period_ms = 1;
> >  	int ret;
> >  
> > +	if (xe_device_wedged(ct_to_xe(ct)))
> > +		return -ECANCELED;
> > +
> >  	xe_assert(ct_to_xe(ct), !g2h_len || !g2h_fence);
> >  	lockdep_assert_held(&ct->lock);
> >  	xe_device_assert_mem_access(ct_to_xe(ct));
> > @@ -1016,6 +1019,11 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
> >  	u32 *payload;
> >  	int ret = 0;
> >  
> > +	if (xe_device_wedged(xe)) {
> > +		ct->g2h_outstanding = 0;
> > +		return -ECANCELED;
> > +	}
> > +
> >  	if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
> >  		return 0;
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
> > index 521ae24f2314..f4663f1b0a80 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_pc.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_pc.c
> > @@ -902,6 +902,9 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
> >  		return;
> >  	}
> >  
> > +	if (xe_device_wedged(xe))
> > +		return;
> > +
> >  	XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
> >  	XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
> >  	XE_WARN_ON(xe_guc_pc_stop(pc));
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 13b7e195c7b5..0a2a54f69f50 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -705,6 +705,9 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
> >  	struct xe_device *xe = guc_to_xe(guc);
> >  	bool lr = xe_exec_queue_is_lr(q);
> >  
> > +	if (xe_device_wedged(xe))
> > +		return ERR_PTR(-ECANCELED);
> > +
> >  	xe_assert(xe, !(exec_queue_destroyed(q) || exec_queue_pending_disable(q)) ||
> >  		  exec_queue_banned(q) || exec_queue_suspended(q));
> >  
> > -- 
> > 2.44.0
> >