[PATCH 2/2] drm/xe: Enable configfs support for survivability mode
Lucas De Marchi
lucas.demarchi at intel.com
Tue Apr 1 03:25:58 UTC 2025
On Thu, Mar 27, 2025 at 12:12:02PM +0530, Riana Tauro wrote:
>Enable survivability mode if supported and configfs attribute is set.
>Enabling survivability mode manually is useful in cases where pcode does
>not detect failure, for validation, and for IFR (in-field-repair).
>
>To set configfs survivability mode attribute for a device
>
>mkdir /config/xe/0000:03:00.0
>echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/unbind
>echo 1 > /config/xe/0000:03:00.0/survivability_mode
>echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/bind
>
>The card enters survivability mode if supported
This part is repeated in commits 1 and 2. You could leave the generic
part with survivability_mode as the example in commit 1. Here you could
just mention setting survivability_mode to 1.
>
>Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>---
> drivers/gpu/drm/xe/xe_configfs.c | 62 ++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_configfs.h | 7 ++-
> drivers/gpu/drm/xe/xe_device.c | 2 +-
> drivers/gpu/drm/xe/xe_pci.c | 19 ++++---
> drivers/gpu/drm/xe/xe_survivability_mode.c | 39 ++++++++++----
> drivers/gpu/drm/xe/xe_survivability_mode.h | 2 +-
> 6 files changed, 109 insertions(+), 22 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>index 59e1bc4c5f76..57234904c061 100644
>--- a/drivers/gpu/drm/xe/xe_configfs.c
>+++ b/drivers/gpu/drm/xe/xe_configfs.c
>@@ -151,6 +151,68 @@ static struct configfs_subsystem xe_configfs = {
> },
> };
>
>+static struct xe_config_device *configfs_find_group(struct pci_dev *pdev)
>+{
>+ struct config_item *item;
>+ char name[64];
>+
>+ snprintf(name, sizeof(name), "%04x:%02x:%02x.%x", pci_domain_nr(pdev->bus),
>+ pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>+
>+ mutex_lock(&xe_configfs.su_mutex);
>+ item = config_group_find_item(&xe_configfs.su_group, name);
>+ mutex_unlock(&xe_configfs.su_mutex);
>+
>+ if (!item)
>+ return NULL;
>+
>+ return to_xe_config_device(item);
>+}
>+
>+/**
>+ * xe_configfs_get_survivability_mode - get configfs survivability mode attribute
>+ * @pdev: pci device
>+ *
>+ * find the configfs group that belongs to the pci device and return
>+ * the survivability mode attribute
>+ *
>+ * Return: survivability mode if config group is found, false otherwise
>+ */
>+bool xe_configfs_get_survivability_mode(struct pci_dev *pdev)
>+{
>+ struct xe_config_device *dev = configfs_find_group(pdev);
>+ bool mode;
>+
>+ if (!dev)
>+ return false;
>+
>+ mode = dev->survivability_mode;
>+ config_item_put(&dev->group.cg_item);
>+
>+ return mode;
>+}
>+
>+/**
>+ * xe_configfs_clear_survivability_mode - clear configfs survivability mode attribute
>+ * @pdev: pci device
>+ *
>+ * find the configfs group that belongs to the pci device and clear survivability
>+ * mode attribute
>+ */
>+void xe_configfs_clear_survivability_mode(struct pci_dev *pdev)
>+{
>+ struct xe_config_device *dev = configfs_find_group(pdev);
>+
>+ if (!dev)
>+ return;
>+
>+ mutex_lock(&dev->lock);
>+ dev->survivability_mode = 0;
>+ mutex_unlock(&dev->lock);
>+
>+ config_item_put(&dev->group.cg_item);
>+}
>+
> int __init xe_configfs_init(void)
> {
> struct config_group *root = &xe_configfs.su_group;
>diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>index 2c30be9a2c7e..6e8c5ccaf202 100644
>--- a/drivers/gpu/drm/xe/xe_configfs.h
>+++ b/drivers/gpu/drm/xe/xe_configfs.h
>@@ -5,7 +5,12 @@
> #ifndef _XE_CONFIGFS_H_
> #define _XE_CONFIGFS_H_
>
>+#include <linux/types.h>
>+
>+struct pci_dev;
>+
> int xe_configfs_init(void);
> void xe_configfs_exit(void);
>-
>+bool xe_configfs_get_survivability_mode(struct pci_dev *pdev);
>+void xe_configfs_clear_survivability_mode(struct pci_dev *pdev);
> #endif
>diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>index 1ffb7d1f6be6..f4e59577ddc0 100644
>--- a/drivers/gpu/drm/xe/xe_device.c
>+++ b/drivers/gpu/drm/xe/xe_device.c
>@@ -712,7 +712,7 @@ int xe_device_probe_early(struct xe_device *xe)
> sriov_update_device_info(xe);
>
> err = xe_pcode_probe_early(xe);
>- if (err) {
>+ if (err || xe_survivability_mode_requested(xe)) {
> int save_err = err;
>
> /*
>diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>index da384acf731f..bcacfa78afd1 100644
>--- a/drivers/gpu/drm/xe/xe_pci.c
>+++ b/drivers/gpu/drm/xe/xe_pci.c
>@@ -812,18 +812,17 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> return err;
>
> err = xe_device_probe_early(xe);
>- if (err) {
>- /*
>- * In Boot Survivability mode, no drm card is exposed and driver
>- * is loaded with bare minimum to allow for firmware to be
>- * flashed through mei. If early probe failed, but it managed to
>- * enable survivability mode, return success.
>- */
>- if (xe_survivability_mode_is_enabled(xe))
>- return 0;
>+ /*
>+ * In Boot Survivability mode, no drm card is exposed and driver
>+ * is loaded with bare minimum to allow for firmware to be
>+ * flashed through mei. Return success, if survivability mode
>+ * is enabled due to pcode failure or configfs being set
>+ */
>+ if (xe_survivability_mode_is_enabled(xe))
>+ return 0;
>
>+ if (err)
> return err;
>- }
>
> err = xe_info_init(xe, desc);
> if (err)
>diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
>index cb813b337fd3..ed6599c5b85d 100644
>--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
>+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>@@ -10,6 +10,7 @@
> #include <linux/pci.h>
> #include <linux/sysfs.h>
>
>+#include "xe_configfs.h"
> #include "xe_device.h"
> #include "xe_gt.h"
> #include "xe_heci_gsc.h"
>@@ -28,8 +29,12 @@
> * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
> * to be flashed through mei and collect telemetry. The driver's probe flow is modified
> * such that it enters survivability mode when pcode initialization is incomplete and boot status
>- * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
>- * survivability mode and provides additional information required for debug
>+ * denotes a failure. Survivability mode can also be enabled manually by setting the
I think this should start a new paragraph.
>+ * survivability_mode attribute of the device in xe configfs. This is useful in cases where pcode
>+ * does not detect failure, IFR (in-field-repair) use cases where the repair can be performed for
>+ * a single GPU card without impacting the usage of other cards in the same node and for validation.
I'm missing a verb in this phrase
>+ * The driver then populates the survivability_mode PCI sysfs indicating survivability mode and
>+ * provides additional information required for debug
> *
> * KMD exposes below admin-only readable sysfs in survivability mode
> *
>@@ -42,6 +47,15 @@
> * Overflow Information - Provides history of previous failures
> * Auxiliary Information - Certain failures may have information in
> * addition to postcode information
>+ *
>+ * Enable survivability mode through configfs
>+ *
>+ * Create device directory : mkdir /config/xe/0000:03:00.0
>+ * Unbind : echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/unbind
>+ * Enable survivability mode : echo 1 > /config/xe/0000:03:00.0/survivability_mode
>+ * Bind : echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/bind
ditto about alignment and unbind.
I didn't check how the html gets rendered though.
>+ *
>+ * The card enters survivability mode if supported
> */
>
> static u32 aux_history_offset(u32 reg_value)
>@@ -133,6 +147,7 @@ static void xe_survivability_mode_fini(void *arg)
> struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> struct device *dev = &pdev->dev;
>
>+ xe_configfs_clear_survivability_mode(pdev);
> sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
> }
>
>@@ -186,24 +201,30 @@ bool xe_survivability_mode_is_enabled(struct xe_device *xe)
> return xe->survivability.mode;
> }
>
>-/*
>- * survivability_mode_requested - check if it's possible to enable
>- * survivability mode and that was requested by firmware
>+/**
>+ * xe_survivability_mode_requested - check if it's possible to enable survivability
>+ * mode that was requested by firmware or userspace
>+ * @xe: xe device instance
> *
>- * This function reads the boot status from Pcode.
>+ * This function reads configfs and boot status from Pcode.
> *
> * Return: true if platform support is available and boot status indicates
>- * failure, false otherwise.
>+ * failure or if survivability mode is requested, false otherwise.
> */
>-static bool survivability_mode_requested(struct xe_device *xe)
>+bool xe_survivability_mode_requested(struct xe_device *xe)
> {
> struct xe_survivability *survivability = &xe->survivability;
> struct xe_mmio *mmio = xe_root_tile_mmio(xe);
>+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> u32 data;
>
> if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
> return false;
>
>+ /* Enable survivability mode if set via configfs */
>+ if (xe_configfs_get_survivability_mode(pdev))
>+ return true;
>+
> data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
> survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
>
>@@ -226,7 +247,7 @@ int xe_survivability_mode_enable(struct xe_device *xe)
> struct xe_survivability_info *info;
> struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>
>- if (!survivability_mode_requested(xe))
>+ if (!xe_survivability_mode_requested(xe))
> return 0;
>
> survivability->size = MAX_SCRATCH_MMIO;
>diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h
>index d7e64885570d..70eaf56fb3bb 100644
>--- a/drivers/gpu/drm/xe/xe_survivability_mode.h
>+++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
>@@ -12,5 +12,5 @@ struct xe_device;
>
> int xe_survivability_mode_enable(struct xe_device *xe);
> bool xe_survivability_mode_is_enabled(struct xe_device *xe);
>-
>+bool xe_survivability_mode_requested(struct xe_device *xe);
Keep the empty line here, and probably use _is_requested to be similar to
the enabled case.
Just a few nits. From my tests today it appears to be working fine on BMG.
Reviewed-by: Lucas De Marchi <lucas.demarchi at intel.com>
thanks
Lucas De Marchi
> #endif /* _XE_SURVIVABILITY_MODE_H_ */
>--
>2.47.1
>
More information about the Intel-xe
mailing list