[Intel-xe] [PATCH v2 1/1] drm/xe: Notify Userspace when engine/gt reset fails.
Nilawar, Badal
badal.nilawar at intel.com
Thu May 11 04:36:48 UTC 2023
Hi Himal,
On 09-05-2023 13:55, Himal Prasad Ghimiray wrote:
> Send a uevent in case of an engine reset or GT reset failure.
> This notification can be used by a userspace monitoring tool to perform a
> device-level reset/reboot when a GT reset fails. udevadm can be used to
> monitor the uevents.
>
> v2:
> -Add NULL check for xe_gt_hw_engine return(Aravind)
> -Arrange variables in Christmas tree order(Tejas)
> -Check GUC_GSC_OTHER_CLASS(Tejas)
>
> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> Cc: Tejas Upadhyay <tejas.upadhyay at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 18 ++++++++++++++++++
> drivers/gpu/drm/xe/xe_guc.h | 19 +++++++++++++++++++
> drivers/gpu/drm/xe/xe_guc_submit.c | 23 +++++++++++++++++++++++
> include/uapi/drm/xe_drm.h | 8 ++++++++
> 4 files changed, 68 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 3afca3dd9657..2c3ffa1db74e 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -8,6 +8,7 @@
> #include <linux/minmax.h>
>
> #include <drm/drm_managed.h>
> +#include <drm/xe_drm.h>
>
> #include "regs/xe_gt_regs.h"
> #include "xe_bb.h"
> @@ -590,6 +591,20 @@ static int do_gt_restart(struct xe_gt *gt)
> return 0;
> }
>
> +static void xe_uevent_gt_reset_failure(struct xe_device *xe, u8 id)
> +{
> + char *reset_event[5];
> +
> + reset_event[0] = XE_RESET_FAILED_UEVENT "=1";
> + reset_event[1] = "RESET_ENABLED=1";
> + reset_event[2] = "RESET_UNIT=gt";
> + reset_event[3] = kasprintf(GFP_KERNEL, "RESET_ID=%d", id);
> + reset_event[4] = NULL;
> + kobject_uevent_env(&xe->drm.primary->kdev->kobj, KOBJ_CHANGE, reset_event);
> +
> + kfree(reset_event[3]);
> +}
> +
> static int gt_reset(struct xe_gt *gt)
> {
> struct xe_device *xe = gt_to_xe(gt);
> @@ -639,6 +654,9 @@ static int gt_reset(struct xe_gt *gt)
> xe_device_mem_access_put(gt_to_xe(gt));
> drm_err(&xe->drm, "GT reset failed, err=%d\n", err);
>
> + /* Notify userspace about gt reset failure */
> + xe_uevent_gt_reset_failure(xe, gt->info.id);
> +
> return err;
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> index 74a74051f354..a3637eeaaa37 100644
> --- a/drivers/gpu/drm/xe/xe_guc.h
> +++ b/drivers/gpu/drm/xe/xe_guc.h
> @@ -56,4 +56,23 @@ static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
> }
> }
>
> +static inline u16 xe_guc_class_to_engine_class(u8 guc_class)
> +{
> + switch (guc_class) {
> + case GUC_RENDER_CLASS:
> + return XE_ENGINE_CLASS_RENDER;
> + case GUC_VIDEO_CLASS:
> + return XE_ENGINE_CLASS_VIDEO_DECODE;
> + case GUC_VIDEOENHANCE_CLASS:
> + return XE_ENGINE_CLASS_VIDEO_ENHANCE;
> + case GUC_BLITTER_CLASS:
> + return XE_ENGINE_CLASS_COPY;
> + case GUC_COMPUTE_CLASS:
> + return XE_ENGINE_CLASS_COMPUTE;
> + case GUC_GSC_OTHER_CLASS:
> + default:
> + XE_WARN_ON(guc_class);
> + return -1;
> + }
> +}
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index e857013070b9..d068af0ca7df 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -12,6 +12,7 @@
> #include <linux/dma-fence-array.h>
>
> #include <drm/drm_managed.h>
> +#include <drm/xe_drm.h>
>
> #include "regs/xe_lrc_layout.h"
> #include "xe_device.h"
> @@ -1589,10 +1590,26 @@ int xe_guc_engine_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
> return 0;
> }
>
> +static void xe_uevent_engine_reset_failure(struct xe_device *xe, const char *name)
> +{
> + char *reset_event[5];
> +
> + reset_event[0] = XE_RESET_FAILED_UEVENT "=1";
> + reset_event[1] = "RESET_ENABLED=1";
> + reset_event[2] = "RESET_UNIT=engine";
> + reset_event[3] = kasprintf(GFP_KERNEL, "RESET_ID=%s", name);
> + reset_event[4] = NULL;
> + kobject_uevent_env(&xe->drm.primary->kdev->kobj, KOBJ_CHANGE, reset_event);
> +
> + kfree(reset_event[3]);
> +}
> +
> int xe_guc_engine_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len)
> {
> struct xe_device *xe = guc_to_xe(guc);
> + struct xe_hw_engine *hwe;
> u8 guc_class, instance;
> + u16 engine_class;
> u32 reason;
>
> if (unlikely(len != 3)) {
> @@ -1608,6 +1625,12 @@ int xe_guc_engine_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len)
> drm_err(&xe->drm, "GuC engine reset request failed on %d:%d because 0x%08X",
> guc_class, instance, reason);
>
> + /* Notify userspace about engine reset failure */
> + engine_class = xe_guc_class_to_engine_class(guc_class);
> + hwe = xe_gt_hw_engine(guc_to_gt(guc), engine_class, instance, false);
The instance here is treated as the physical instance of the hwe, but when
submitting contexts the KMD passes the logical instance to GuC. Given that, I
assume GuC might be passing the logical instance here as well. Or is it
passing the physical instance for this notification?
Regards,
Badal
> + if (hwe)
> + xe_uevent_engine_reset_failure(xe, hwe->name);
> +
> xe_gt_reset_async(guc_to_gt(guc));
>
> return 0;
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index b0b80aae3ee8..79ef5947c172 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -36,6 +36,14 @@ extern "C" {
> * subject to backwards-compatibility constraints.
> */
>
> +/*
> + * Uevents generated by xe on its device node.
> + *
> + * XE_RESET_FAILED_UEVENT - Event is generated when attempt to reset engine
> + * or gt fails. The value supplied with the event is always 1.
> + */
> +#define XE_RESET_FAILED_UEVENT "RESET_FAILED"
> +
> /**
> * struct xe_user_extension - Base class for defining a chain of extensions
> *
More information about the Intel-xe
mailing list