[Intel-xe] [RFC, 1/1] drm/xe: Notify Userspace when engine/gt reset fails - review

Upadhyay, Tejas tejas.upadhyay at intel.com
Tue May 9 06:04:35 UTC 2023


diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 3afca3dd9657..2c3ffa1db74e 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -8,6 +8,7 @@
 #include <linux/minmax.h>
 #include <drm/drm_managed.h>
+#include <drm/xe_drm.h>
 #include "regs/xe_gt_regs.h"
#include "xe_bb.h"
@@ -590,6 +591,20 @@  static int do_gt_restart(struct xe_gt *gt)
       return 0;
}
+static void xe_uevent_gt_reset_failure(struct xe_device *xe, u8 id)
+{
+       char *reset_event[5];
+
+       reset_event[0] = XE_RESET_FAILED_UEVENT "=1";
+       reset_event[1] = "RESET_ENABLED=1";
+       reset_event[2] = "RESET_UNIT=gt";
+       reset_event[3] = kasprintf(GFP_KERNEL, "RESET_ID=%d", id);
+       reset_event[4] = NULL;
+       kobject_uevent_env(&xe->drm.primary->kdev->kobj, KOBJ_CHANGE, reset_event);
+
+       kfree(reset_event[3]);
+}
+
static int gt_reset(struct xe_gt *gt)
{
       struct xe_device *xe = gt_to_xe(gt);
@@ -639,6 +654,9 @@  static int gt_reset(struct xe_gt *gt)
       xe_device_mem_access_put(gt_to_xe(gt));
       drm_err(&xe->drm, "GT reset failed, err=%d\n", err);
The codebase seems to have moved on here; in the current code there is no drm_err logged at this point, it seems.
That said, shouldn't this be under if (err)?

+       /* Notify userspace about gt reset failure */
+       xe_uevent_gt_reset_failure(xe, gt->info.id);
+
       return err;
}
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index 74a74051f354..845c57b95562 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -56,4 +56,22 @@  static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
       }
}
+static inline u16 xe_guc_class_to_engine_class(u8 guc_class)
+{
+       switch (guc_class) {
+       case GUC_RENDER_CLASS:
+                return XE_ENGINE_CLASS_RENDER;
+       case GUC_VIDEO_CLASS:
+                return XE_ENGINE_CLASS_VIDEO_DECODE;
+       case GUC_VIDEOENHANCE_CLASS:
+                return XE_ENGINE_CLASS_VIDEO_ENHANCE;
+       case GUC_BLITTER_CLASS:
+                return XE_ENGINE_CLASS_COPY;
+       case GUC_COMPUTE_CLASS:
+                return XE_ENGINE_CLASS_COMPUTE;

Should you also check for GUC_GSC_OTHER_CLASS here, so that it returns the default?

+       default:
+                XE_WARN_ON(guc_class);
+                return -1;
+       }
+}
#endif
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index e857013070b9..663908573d5c 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -12,6 +12,7 @@
 #include <linux/dma-fence-array.h>
 #include <drm/drm_managed.h>
+#include <drm/xe_drm.h>
 #include "regs/xe_lrc_layout.h"
#include "xe_device.h"
@@ -1589,9 +1590,25 @@  int xe_guc_engine_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
       return 0;
}
+static void xe_uevent_engine_reset_failure(struct xe_device *xe, const char *name)
+{
+       char *reset_event[5];
+
+       reset_event[0] = XE_RESET_FAILED_UEVENT "=1";
+       reset_event[1] = "RESET_ENABLED=1";
+       reset_event[2] = "RESET_UNIT=engine";
+       reset_event[3] = kasprintf(GFP_KERNEL, "RESET_ID=%s", name);
+       reset_event[4] = NULL;
+       kobject_uevent_env(&xe->drm.primary->kdev->kobj, KOBJ_CHANGE, reset_event);
+
+       kfree(reset_event[3]);
+}
+
int xe_guc_engine_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len)
{
       struct xe_device *xe = guc_to_xe(guc);
+       struct xe_hw_engine *hwe;
+       u16 engine_class;
       u8 guc_class, instance;
       u32 reason;

The last time I posted upstream, I got a review comment from Andi saying this should be in Christmas tree order; that is something you might want to consider here.
@@ -1608,6 +1625,12 @@  int xe_guc_engine_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len)
       drm_err(&xe->drm, "GuC engine reset request failed on %d:%d because 0x%08X",
                guc_class, instance, reason);
+       engine_class = xe_guc_class_to_engine_class(guc_class);
+       hwe = xe_gt_hw_engine(guc_to_gt(guc), engine_class, instance, false);
+
+       /* Notify userspace about engine reset failure */
+       xe_uevent_engine_reset_failure(xe, hwe->name);
+
       xe_gt_reset_async(guc_to_gt(guc));
        return 0;
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index b0b80aae3ee8..79ef5947c172 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -36,6 +36,14 @@  extern "C" {
  * subject to backwards-compatibility constraints.
  */
+/*
+ * Uevents generated by xe on it's device node.
+ *
+ * XE_RESET_FAILED_UEVENT - Event is generated when attempt to reset engine
+ *     or gt fails. The value supplied with the event is always 1.
+ */
+#define XE_RESET_FAILED_UEVENT "RESET_FAILED"
+
/**
  * struct xe_user_extension - Base class for defining a chain of extensions
  *
Could xe_uevent_engine_reset_failure and xe_uevent_gt_reset_failure be combined into one function? It is up to you, but most of the code looks the same, so I thought of this. You can ignore this.

Please include me in Cc next time; somehow I am not getting emails for xe patches.

Thanks,
Tejas

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/intel-xe/attachments/20230509/b4dcf263/attachment-0001.htm>


More information about the Intel-xe mailing list