[PATCH 1/3] drm/xe: Introduce a simple wedged state

Rodrigo Vivi rodrigo.vivi at intel.com
Thu Mar 14 18:53:28 UTC 2024


On Thu, Mar 14, 2024 at 09:03:10AM +0530, Aravind Iddamsetty wrote:
> 
> On 14/03/24 07:10, Aravind Iddamsetty wrote:
> > On 14/03/24 01:24, Rodrigo Vivi wrote:
> >
> > Hi Rodrigo,
> >
> >> Introduce a very simple 'wedged' state where any attempt
> >> to access the GPU is entirely blocked.
> >>
> >> On some critical cases, like on gt_reset failure, we need to
> >> block any other attempt to use the GPU. Otherwise we are at
> >> a risk of reaching cases that would force us to reboot the machine.
> >>
> >> So, when these cases are identified we corner and block any GPU
> >> access. No IOCTL and not even another GT reset should be attempted.
> >>
> >> The 'wedged' state in Xe is an end state with no way back.
> >> Only a module reload can restore the GPU access.
> > I believe we should also expose this wedged state to userspace so that
> > any admin can take action; typically sysman is interested in knowing that.
> >
> > A sysfs at the pci device level?
> >
> > Thanks,
> > Aravind.
> Also, I feel at this point we can reintroduce the RESET_REQUIRED uevent
> when GT reset fails, which we dropped for realignment purpose.

Hi Aravind,

Thanks for the feedback and ideas.
Yes, I believe it is a good idea, but please let's do this as a follow-up
so we don't risk delaying the addition of this protection.

Then we can also work together and prepare the sysfs and uevent user space
usages at the same time so we don't face any blockages there.

Thanks,
Rodrigo.

> 
> Thanks,
> 
> Aravind.
> >> Cc: Anshuman Gupta <anshuman.gupta at intel.com>
> >> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> >> ---
> >>  drivers/gpu/drm/xe/xe_device.c       |  6 ++++++
> >>  drivers/gpu/drm/xe/xe_device.h       | 11 +++++++++++
> >>  drivers/gpu/drm/xe/xe_device_types.h |  6 ++++++
> >>  drivers/gpu/drm/xe/xe_gt.c           |  4 ++++
> >>  drivers/gpu/drm/xe/xe_migrate.c      |  6 ++++++
> >>  5 files changed, 33 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> >> index 919ad88f0495..5f0a2bdb7c24 100644
> >> --- a/drivers/gpu/drm/xe/xe_device.c
> >> +++ b/drivers/gpu/drm/xe/xe_device.c
> >> @@ -142,6 +142,9 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >>  	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
> >>  	long ret;
> >>  
> >> +	if (xe_device_wedged(xe))
> >> +		return -ECANCELED;
> >> +
> >>  	ret = xe_pm_runtime_get_ioctl(xe);
> >>  	if (ret >= 0)
> >>  		ret = drm_ioctl(file, cmd, arg);
> >> @@ -157,6 +160,9 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
> >>  	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
> >>  	long ret;
> >>  
> >> +	if (xe_device_wedged(xe))
> >> +		return -ECANCELED;
> >> +
> >>  	ret = xe_pm_runtime_get_ioctl(xe);
> >>  	if (ret >= 0)
> >>  		ret = drm_compat_ioctl(file, cmd, arg);
> >> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> >> index 14be34d9f543..d10664d32f7f 100644
> >> --- a/drivers/gpu/drm/xe/xe_device.h
> >> +++ b/drivers/gpu/drm/xe/xe_device.h
> >> @@ -176,4 +176,15 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
> >>  u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
> >>  u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
> >>  
> >> +static inline bool xe_device_wedged(struct xe_device *xe)
> >> +{
> >> +	return atomic_read(&xe->wedged);
> >> +}
> >> +
> >> +static inline void xe_device_declare_wedged(struct xe_device *xe)
> >> +{
> >> +	atomic_set(&xe->wedged, 1);
> >> +	drm_err(&xe->drm, "CRITICAL: Xe has been declared wedged. A module reload is required.\n");
> >> +}
> >> +
> >>  #endif
> >> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> >> index 9785eef2e5a4..13971eb2334f 100644
> >> --- a/drivers/gpu/drm/xe/xe_device_types.h
> >> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> >> @@ -455,6 +455,12 @@ struct xe_device {
> >>  	/** @needs_flr_on_fini: requests function-reset on fini */
> >>  	bool needs_flr_on_fini;
> >>  
> >> +	/**
> >> +	 * @wedged: Xe device faced a critical error and is now blocked.
> >> +	 * It cannot return to life without a module reload.
> >> +	 */
> >> +	atomic_t wedged;
> >> +
> >>  	/* private: */
> >>  
> >>  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
> >> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> >> index 85408e7a932b..972c0c6d0608 100644
> >> --- a/drivers/gpu/drm/xe/xe_gt.c
> >> +++ b/drivers/gpu/drm/xe/xe_gt.c
> >> @@ -633,6 +633,9 @@ static int gt_reset(struct xe_gt *gt)
> >>  {
> >>  	int err;
> >>  
> >> +	if (xe_device_wedged(gt_to_xe(gt)))
> >> +	    return -ECANCELED;
> >> +
> >>  	/* We only support GT resets with GuC submission */
> >>  	if (!xe_device_uc_enabled(gt_to_xe(gt)))
> >>  		return -ENODEV;
> >> @@ -686,6 +689,7 @@ static int gt_reset(struct xe_gt *gt)
> >>  	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
> >>  
> >>  	gt_to_xe(gt)->needs_flr_on_fini = true;
> >> +	xe_device_declare_wedged(gt_to_xe(gt));
> >>  
> >>  	return err;
> >>  }
> >> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> >> index ee1bb938c493..5b2eeb2048b5 100644
> >> --- a/drivers/gpu/drm/xe/xe_migrate.c
> >> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> >> @@ -713,6 +713,9 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> >>  		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
> >>  	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
> >>  
> >> +	if (xe_device_wedged(xe))
> >> +	    return ERR_PTR(-ECANCELED);
> >> +
> >>  	/* Copying CCS between two different BOs is not supported yet. */
> >>  	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
> >>  		return ERR_PTR(-EINVAL);
> >> @@ -986,6 +989,9 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> >>  	int err;
> >>  	int pass = 0;
> >>  
> >> +	if (xe_device_wedged(xe))
> >> +	    return ERR_PTR(-ECANCELED);
> >> +
> >>  	if (!clear_vram)
> >>  		xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
> >>  	else


More information about the Intel-xe mailing list