[PATCH 2/2] drm/xe: Enable configfs support for survivability mode

Lucas De Marchi lucas.demarchi at intel.com
Tue Apr 1 03:25:58 UTC 2025


On Thu, Mar 27, 2025 at 12:12:02PM +0530, Riana Tauro wrote:
>Enable survivability mode if supported and configfs attribute is set.
>Enabling survivability mode manually is useful in cases where pcode does
>not detect failure, validation and for IFR (in-field-repair).
>
>To set configfs survivability mode attribute for a device
>
>mkdir /config/xe/0000:03:00.0
>echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/unbind
>echo 1 > /config/xe/0000:03:00.0/survivability_mode
>echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/bind
>
>The card enters survivability mode if supported

This part is repeated in commits 1 and 2. You could leave the generic
part, with survivability_mode as the example, in commit 1. Here you would
just mention setting survivability_mode to 1.

>
>Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>---
> drivers/gpu/drm/xe/xe_configfs.c           | 62 ++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_configfs.h           |  7 ++-
> drivers/gpu/drm/xe/xe_device.c             |  2 +-
> drivers/gpu/drm/xe/xe_pci.c                | 19 ++++---
> drivers/gpu/drm/xe/xe_survivability_mode.c | 39 ++++++++++----
> drivers/gpu/drm/xe/xe_survivability_mode.h |  2 +-
> 6 files changed, 109 insertions(+), 22 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>index 59e1bc4c5f76..57234904c061 100644
>--- a/drivers/gpu/drm/xe/xe_configfs.c
>+++ b/drivers/gpu/drm/xe/xe_configfs.c
>@@ -151,6 +151,68 @@ static struct configfs_subsystem xe_configfs = {
> 	},
> };
>
>+static struct xe_config_device *configfs_find_group(struct pci_dev *pdev)
>+{
>+	struct config_item *item;
>+	char name[64];
>+
>+	snprintf(name, sizeof(name), "%04x:%02x:%02x.%x", pci_domain_nr(pdev->bus),
>+		 pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>+
>+	mutex_lock(&xe_configfs.su_mutex);
>+	item = config_group_find_item(&xe_configfs.su_group, name);
>+	mutex_unlock(&xe_configfs.su_mutex);
>+
>+	if (!item)
>+		return NULL;
>+
>+	return to_xe_config_device(item);
>+}
>+
>+/**
>+ * xe_configfs_get_survivability_mode - get configfs survivability mode attribute
>+ * @pdev: pci device
>+ *
>+ * find the configfs group that belongs to the pci device and return
>+ * the survivability mode attribute
>+ *
>+ * Return: survivability mode if config group is found, false otherwise
>+ */
>+bool xe_configfs_get_survivability_mode(struct pci_dev *pdev)
>+{
>+	struct xe_config_device *dev = configfs_find_group(pdev);
>+	bool mode;
>+
>+	if (!dev)
>+		return false;
>+
>+	mode = dev->survivability_mode;
>+	config_item_put(&dev->group.cg_item);
>+
>+	return mode;
>+}
>+
>+/**
>+ * xe_configfs_clear_survivability_mode - clear configfs survivability mode attribute
>+ * @pdev: pci device
>+ *
>+ * find the configfs group that belongs to the pci device and clear survivability
>+ * mode attribute
>+ */
>+void xe_configfs_clear_survivability_mode(struct pci_dev *pdev)
>+{
>+	struct xe_config_device *dev = configfs_find_group(pdev);
>+
>+	if (!dev)
>+		return;
>+
>+	mutex_lock(&dev->lock);
>+	dev->survivability_mode = 0;
>+	mutex_unlock(&dev->lock);
>+
>+	config_item_put(&dev->group.cg_item);
>+}
>+
> int __init xe_configfs_init(void)
> {
> 	struct config_group *root = &xe_configfs.su_group;
>diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>index 2c30be9a2c7e..6e8c5ccaf202 100644
>--- a/drivers/gpu/drm/xe/xe_configfs.h
>+++ b/drivers/gpu/drm/xe/xe_configfs.h
>@@ -5,7 +5,12 @@
> #ifndef _XE_CONFIGFS_H_
> #define _XE_CONFIGFS_H_
>
>+#include <linux/types.h>
>+
>+struct pci_dev;
>+
> int xe_configfs_init(void);
> void xe_configfs_exit(void);
>-
>+bool xe_configfs_get_survivability_mode(struct pci_dev *pdev);
>+void xe_configfs_clear_survivability_mode(struct pci_dev *pdev);
> #endif
>diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>index 1ffb7d1f6be6..f4e59577ddc0 100644
>--- a/drivers/gpu/drm/xe/xe_device.c
>+++ b/drivers/gpu/drm/xe/xe_device.c
>@@ -712,7 +712,7 @@ int xe_device_probe_early(struct xe_device *xe)
> 	sriov_update_device_info(xe);
>
> 	err = xe_pcode_probe_early(xe);
>-	if (err) {
>+	if (err || xe_survivability_mode_requested(xe)) {
> 		int save_err = err;
>
> 		/*
>diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>index da384acf731f..bcacfa78afd1 100644
>--- a/drivers/gpu/drm/xe/xe_pci.c
>+++ b/drivers/gpu/drm/xe/xe_pci.c
>@@ -812,18 +812,17 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> 		return err;
>
> 	err = xe_device_probe_early(xe);
>-	if (err) {
>-		/*
>-		 * In Boot Survivability mode, no drm card is exposed and driver
>-		 * is loaded with bare minimum to allow for firmware to be
>-		 * flashed through mei. If early probe failed, but it managed to
>-		 * enable survivability mode, return success.
>-		 */
>-		if (xe_survivability_mode_is_enabled(xe))
>-			return 0;
>+	/*
>+	 * In Boot Survivability mode, no drm card is exposed and driver
>+	 * is loaded with bare minimum to allow for firmware to be
>+	 * flashed through mei. Return success, if survivability mode
>+	 * is enabled due to pcode failure or configfs being set
>+	 */
>+	if (xe_survivability_mode_is_enabled(xe))
>+		return 0;
>
>+	if (err)
> 		return err;
>-	}
>
> 	err = xe_info_init(xe, desc);
> 	if (err)
>diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
>index cb813b337fd3..ed6599c5b85d 100644
>--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
>+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>@@ -10,6 +10,7 @@
> #include <linux/pci.h>
> #include <linux/sysfs.h>
>
>+#include "xe_configfs.h"
> #include "xe_device.h"
> #include "xe_gt.h"
> #include "xe_heci_gsc.h"
>@@ -28,8 +29,12 @@
>  * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
>  * to be flashed through mei and collect telemetry. The driver's probe flow is modified
>  * such that it enters survivability mode when pcode initialization is incomplete and boot status
>- * denotes a failure. The driver then  populates the survivability_mode PCI sysfs indicating
>- * survivability mode and provides additional information required for debug
>+ * denotes a failure. Survivability mode can also be enabled manually by setting the

I think this should start a new paragraph.

>+ * survivability_mode attribute of the device in xe configfs. This is useful in cases where pcode
>+ * does not detect failure, IFR (in-field-repair) use cases where the repair can be performed for
>+ * a single GPU card without impacting the usage of other cards in the same node and for validation.

I'm missing a verb in this phrase

>+ * The driver then populates the survivability_mode PCI sysfs indicating survivability mode and
>+ * provides additional information required for debug
>  *
>  * KMD exposes below admin-only readable sysfs in survivability mode
>  *
>@@ -42,6 +47,15 @@
>  *			      Overflow Information   - Provides history of previous failures
>  *			      Auxiliary Information  - Certain failures may have information in
>  *						       addition to postcode information
>+ *
>+ * Enable survivability mode through configfs
>+ *
>+ * Create device directory	: mkdir /config/xe/0000:03:00.0
>+ * Unbind			: echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/unbind
>+ * Enable survivability mode	: echo 1 > /config/xe/0000:03:00.0/survivability_mode
>+ * Bind				: echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/bind

ditto about alignment and unbind.

I didn't check how the html gets rendered though.

>+ *
>+ * The card enters survivability mode if supported
>  */
>
> static u32 aux_history_offset(u32 reg_value)
>@@ -133,6 +147,7 @@ static void xe_survivability_mode_fini(void *arg)
> 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> 	struct device *dev = &pdev->dev;
>
>+	xe_configfs_clear_survivability_mode(pdev);
> 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
> }
>
>@@ -186,24 +201,30 @@ bool xe_survivability_mode_is_enabled(struct xe_device *xe)
> 	return xe->survivability.mode;
> }
>
>-/*
>- * survivability_mode_requested - check if it's possible to enable
>- * survivability mode and that was requested by firmware
>+/**
>+ * xe_survivability_mode_requested - check if it's possible to enable survivability
>+ *				     mode that was requested by firmware or userspace
>+ * @xe: xe device instance
>  *
>- * This function reads the boot status from Pcode.
>+ * This function reads configfs and  boot status from Pcode.
>  *
>  * Return: true if platform support is available and boot status indicates
>- * failure, false otherwise.
>+ * failure or if survivability mode is requested, false otherwise.
>  */
>-static bool survivability_mode_requested(struct xe_device *xe)
>+bool xe_survivability_mode_requested(struct xe_device *xe)
> {
> 	struct xe_survivability *survivability = &xe->survivability;
> 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
>+	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> 	u32 data;
>
> 	if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
> 		return false;
>
>+	/* Enable survivability mode if set via configfs */
>+	if (xe_configfs_get_survivability_mode(pdev))
>+		return true;
>+
> 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
> 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
>
>@@ -226,7 +247,7 @@ int xe_survivability_mode_enable(struct xe_device *xe)
> 	struct xe_survivability_info *info;
> 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>
>-	if (!survivability_mode_requested(xe))
>+	if (!xe_survivability_mode_requested(xe))
> 		return 0;
>
> 	survivability->size = MAX_SCRATCH_MMIO;
>diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h
>index d7e64885570d..70eaf56fb3bb 100644
>--- a/drivers/gpu/drm/xe/xe_survivability_mode.h
>+++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
>@@ -12,5 +12,5 @@ struct xe_device;
>
> int xe_survivability_mode_enable(struct xe_device *xe);
> bool xe_survivability_mode_is_enabled(struct xe_device *xe);
>-
>+bool xe_survivability_mode_requested(struct xe_device *xe);

keep empty line here and probably use _is_requested to be similar to the
enabled case.

Just a few nits.. From my tests today it appears to be working fine on
BMG. Reviewed-by: Lucas De Marchi <lucas.demarchi at intel.com>

thanks
Lucas De Marchi

> #endif /* _XE_SURVIVABILITY_MODE_H_ */
>-- 
>2.47.1
>


More information about the Intel-xe mailing list