[Intel-xe] [RFC 1/1] drm/xe: Notify Userspace when engine/gt reset fails.

Rodrigo Vivi rodrigo.vivi at kernel.org
Tue May 9 17:25:36 UTC 2023


On Tue, May 09, 2023 at 06:28:07AM +0000, Ghimiray, Himal Prasad wrote:
> 
> 
> > -----Original Message-----
> > From: Iddamsetty, Aravind <aravind.iddamsetty at intel.com>
> > Sent: 09 May 2023 10:42
> > To: Ghimiray, Himal Prasad <himal.prasad.ghimiray at intel.com>; intel-
> > xe at lists.freedesktop.org; Vivi, Rodrigo <rodrigo.vivi at intel.com>
> > Subject: Re: [RFC 1/1] drm/xe: Notify Userspace when engine/gt reset fails.
> > 
> > 
> > 
> > On 08-05-2023 19:18, Himal Prasad Ghimiray wrote:
> > > Send uevent in case of engine reset or gt reset failure.
> > > L0 sysman would like to generate device reset needed event when gt
> > > reset fails
> > > https://one-api.gitlab-pages.devtools.intel.com/level_zero/sysman/api.
> > > html#_CPPv441ZES_EVENT_TYPE_FLAG_DEVICE_RESET_REQUIRED
> > > This could rather be an intimation to a userspace monitoring tool to do the
> > device level reset/reboot when GT reset fails. udevadm can be used to
> > monitor the uevents.
> Hmm. Makes sense. 
> > 
> > @rodrigo, your thoughts?

It would be awesome if this is an indication that user space needs to call
FLR/SBR... so we would do this when we are unable to reset ("wedged")

but let's get some confirmation on the true meaning before moving on...

> > 
> > 
> > >
> > > Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> > > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_gt.c         | 18 ++++++++++++++++++
> > >  drivers/gpu/drm/xe/xe_guc.h        | 18 ++++++++++++++++++
> > >  drivers/gpu/drm/xe/xe_guc_submit.c | 23 +++++++++++++++++++++++
> > >  include/uapi/drm/xe_drm.h          |  8 ++++++++
> > >  4 files changed, 67 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > > index 3afca3dd9657..2c3ffa1db74e 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt.c
> > > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > > @@ -8,6 +8,7 @@
> > >  #include <linux/minmax.h>
> > >
> > >  #include <drm/drm_managed.h>
> > > +#include <drm/xe_drm.h>
> > >
> > >  #include "regs/xe_gt_regs.h"
> > >  #include "xe_bb.h"
> > > @@ -590,6 +591,20 @@ static int do_gt_restart(struct xe_gt *gt)
> > >  	return 0;
> > >  }
> > >
> > > +static void xe_uevent_gt_reset_failure(struct xe_device *xe, u8 id) {
> > > +	char *reset_event[5];
> > > +
> > > +	reset_event[0] = XE_RESET_FAILED_UEVENT "=1";
> > > +	reset_event[1] = "RESET_ENABLED=1";
> > > +	reset_event[2] = "RESET_UNIT=gt";
> > > +	reset_event[3] = kasprintf(GFP_KERNEL, "RESET_ID=%d", id);
> > > +	reset_event[4] = NULL;
> > > +	kobject_uevent_env(&xe->drm.primary->kdev->kobj,
> > KOBJ_CHANGE,
> > > +reset_event);
> > > +
> > > +	kfree(reset_event[3]);
> > > +}
> > > +
> > >  static int gt_reset(struct xe_gt *gt)  {
> > >  	struct xe_device *xe = gt_to_xe(gt); @@ -639,6 +654,9 @@ static int
> > > gt_reset(struct xe_gt *gt)
> > >  	xe_device_mem_access_put(gt_to_xe(gt));
> > >  	drm_err(&xe->drm, "GT reset failed, err=%d\n", err);
> > >
> > > +	/* Notify userspace about gt reset failure */
> > > +	xe_uevent_gt_reset_failure(xe, gt->info.id);
> > > +
> > >  	return err;
> > >  }
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> > > index 74a74051f354..845c57b95562 100644
> > > --- a/drivers/gpu/drm/xe/xe_guc.h
> > > +++ b/drivers/gpu/drm/xe/xe_guc.h
> > > @@ -56,4 +56,22 @@ static inline u16 xe_engine_class_to_guc_class(enum
> > xe_engine_class class)
> > >  	}
> > >  }
> > >
> > > +static inline u16 xe_guc_class_to_engine_class(u8 guc_class) {
> > > +	switch (guc_class) {
> > > +	case GUC_RENDER_CLASS:
> > > +		return XE_ENGINE_CLASS_RENDER;
> > > +	case GUC_VIDEO_CLASS:
> > > +		return XE_ENGINE_CLASS_VIDEO_DECODE;
> > > +	case GUC_VIDEOENHANCE_CLASS:
> > > +		return XE_ENGINE_CLASS_VIDEO_ENHANCE;
> > > +	case GUC_BLITTER_CLASS:
> > > +		return XE_ENGINE_CLASS_COPY;
> > > +	case GUC_COMPUTE_CLASS:
> > > +		return XE_ENGINE_CLASS_COMPUTE;
> > > +	default:
> > > +		XE_WARN_ON(guc_class);
> > > +		return -1;
> > > +	}
> > > +}
> > >  #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> > > b/drivers/gpu/drm/xe/xe_guc_submit.c
> > > index e857013070b9..663908573d5c 100644
> > > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > > @@ -12,6 +12,7 @@
> > >  #include <linux/dma-fence-array.h>
> > >
> > >  #include <drm/drm_managed.h>
> > > +#include <drm/xe_drm.h>
> > >
> > >  #include "regs/xe_lrc_layout.h"
> > >  #include "xe_device.h"
> > > @@ -1589,9 +1590,25 @@ int
> > xe_guc_engine_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
> > >  	return 0;
> > >  }
> > >
> > > +static void xe_uevent_engine_reset_failure(struct xe_device *xe,
> > > +const char *name) {
> > > +	char *reset_event[5];
> > > +
> > > +	reset_event[0] = XE_RESET_FAILED_UEVENT "=1";
> > > +	reset_event[1] = "RESET_ENABLED=1";
> > > +	reset_event[2] = "RESET_UNIT=engine";
> > > +	reset_event[3] = kasprintf(GFP_KERNEL, "RESET_ID=%s", name);
> > > +	reset_event[4] = NULL;
> > > +	kobject_uevent_env(&xe->drm.primary->kdev->kobj,
> > KOBJ_CHANGE,
> > > +reset_event);
> > > +
> > > +	kfree(reset_event[3]);
> > > +}
> > > +
> > >  int xe_guc_engine_reset_failure_handler(struct xe_guc *guc, u32 *msg,
> > > u32 len)  {
> > >  	struct xe_device *xe = guc_to_xe(guc);
> > > +	struct xe_hw_engine *hwe;
> > > +	u16 engine_class;
> > >  	u8 guc_class, instance;
> > >  	u32 reason;
> > >
> > > @@ -1608,6 +1625,12 @@ int xe_guc_engine_reset_failure_handler(struct
> > xe_guc *guc, u32 *msg, u32 len)
> > >  	drm_err(&xe->drm, "GuC engine reset request failed on %d:%d
> > because 0x%08X",
> > >  		guc_class, instance, reason);
> > >
> > > +	engine_class = xe_guc_class_to_engine_class(guc_class);
> > > +	hwe = xe_gt_hw_engine(guc_to_gt(guc), engine_class, instance,
> > > +false);
> > 
> > this can return NULL, so a check is needed here.
> Will address in next patch.
> > 
> > Thanks,
> > Aravind.
> > > +
> > > +	/* Notify userspace about engine reset failure */
> > > +	xe_uevent_engine_reset_failure(xe, hwe->name);
> > > +
> > >  	xe_gt_reset_async(guc_to_gt(guc));
> > >
> > >  	return 0;
> > > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> > > index b0b80aae3ee8..79ef5947c172 100644
> > > --- a/include/uapi/drm/xe_drm.h
> > > +++ b/include/uapi/drm/xe_drm.h
> > > @@ -36,6 +36,14 @@ extern "C" {
> > >   * subject to backwards-compatibility constraints.
> > >   */
> > >
> > > +/*
> > > > + * Uevents generated by xe on its device node.
> > > + *
> > > + * XE_RESET_FAILED_UEVENT - Event is generated when attempt to reset
> > engine
> > > + *	or gt fails. The value supplied with the event is always 1.
> > > + */
> > > +#define XE_RESET_FAILED_UEVENT "RESET_FAILED"
> > > +
> > >  /**
> > >   * struct xe_user_extension - Base class for defining a chain of extensions
> > >   *


More information about the Intel-xe mailing list