[PATCH 2/2] RFC drm/xe: Enable configfs support for survivability mode

Riana Tauro riana.tauro at intel.com
Mon Mar 10 05:36:11 UTC 2025


Hi Rodrigo

On 3/7/2025 8:31 PM, Rodrigo Vivi wrote:
> On Fri, Mar 07, 2025 at 07:54:45PM +0530, Riana Tauro wrote:
>> Register a configfs subsystem called 'xe' to userspace that allows
>> users to modify the exposed attributes. Expose survivability mode as
>> an attribute that can be modified manually. This is useful if
>> pcode fails to detect survivability mode and for validation
>>
>> To enable survivability mode for a card,
>>
>> echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
>> echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
>> echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
> 
> This is very great! Perhaps at some point we will still need
> the module parameter in this specific survivability case.
> But this configfs is more than needed and wanted for this and
> many other cases.
> 
>>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_configfs.c           | 22 ++++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_configfs.h           |  6 +++++-
>>   drivers/gpu/drm/xe/xe_pci.c                |  8 ++++----
>>   drivers/gpu/drm/xe/xe_survivability_mode.c | 21 +++++++++++++++++++--
>>   4 files changed, 50 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>> index 8c5f248e466d..ce9f3757100f 100644
>> --- a/drivers/gpu/drm/xe/xe_configfs.c
>> +++ b/drivers/gpu/drm/xe/xe_configfs.c
>> @@ -6,6 +6,7 @@
>>   #include <linux/configfs.h>
>>   #include <linux/init.h>
>>   #include <linux/module.h>
>> +#include <linux/pci.h>
>>   
>>   #include "xe_configfs.h"
>>   #include "xe_module.h"
>> @@ -28,6 +29,27 @@ void xe_configfs_clear_survivability_mode(void)
>>   	xe_modparam.survivability_mode = NULL;
>>   }
>>   
>> +bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev)
>> +{
>> +	unsigned int domain, bus, slot, function;
>> +	int ret = 0;
>> +
>> +	if (!xe_modparam.survivability_mode)
>> +		goto err;
>> +
>> +	ret = sscanf(xe_modparam.survivability_mode, "%04x:%02x:%02x.%x", &domain, &bus,
>> +		     &slot, &function);
>> +	if (ret != 4)
>> +		goto err;
>> +
>> +	if ((pci_domain_nr(pdev->bus) == domain) && (pdev->bus->number == bus) &&
>> +	    (PCI_SLOT(pdev->devfn) == slot) && (PCI_FUNC(pdev->devfn) == function))
>> +		return true;
>> +
>> +err:
> 
> if nothing else is here, please just return false everywhere instead of the goto
sure will fix this
> 
>> +	return false;
>> +}
>> +
>>   static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
>>   {
>>   	char *survivability_mode;
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>> index 491629a2ca53..b03f4c7d0f54 100644
>> --- a/drivers/gpu/drm/xe/xe_configfs.h
>> +++ b/drivers/gpu/drm/xe/xe_configfs.h
>> @@ -5,8 +5,12 @@
>>   #ifndef _XE_CONFIGFS_H_
>>   #define _XE_CONFIGFS_H_
>>   
>> +#include <linux/types.h>
>> +
>> +struct pci_dev;
>> +
>>   int xe_configfs_init(void);
>>   void xe_configfs_exit(void);
>>   void xe_configfs_clear_survivability_mode(void);
>> -
>> +bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev);
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index 4d982a5a4ffd..d0f66cc08aa5 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -17,6 +17,7 @@
>>   
>>   #include "display/xe_display.h"
>>   #include "regs/xe_gt_regs.h"
>> +#include "xe_configfs.h"
>>   #include "xe_device.h"
>>   #include "xe_drv.h"
>>   #include "xe_gt.h"
>> @@ -815,10 +816,9 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>>   	 * mei. If early probe fails, check if survivability mode is flagged by
>>   	 * HW to be enabled. In that case enable it and return success.
>>   	 */
>> -	if (err) {
>> -		if (xe_survivability_mode_required(xe) &&
>> -		    xe_survivability_mode_enable(xe))
>> -			return 0;
>> +	if (xe_configfs_survivability_mode_enabled(pdev) || err) {
> 
> no strong feelings here, but:
> 
> I believe it is likely better to keep the current code and add
> the 2 new lines, instead of mixing err and new condition.
> 
> 
> 	if (err) {
> 		if (xe_survivability_mode_required(xe) &&
> 		    xe_survivability_mode_enable(xe))
> 			return 0;
> 	}
> 	if (xe_configfs_survivability_mode_enabled(pdev) || err)
> 	   		return 0;
> 
> 
> or perhaps encapsulate all of that in a static function and then here
> just:
> 
>         if(check_for_survivability(err))
> 		return 0;
Will try this.
> 
> the function can be better to extend to the module parameter
> to be used in boot if/when needed.
> 
> but, really up to you... as I said, no strong feelings...
> 
>> +		if (xe_survivability_mode_required(xe))
>> +			return xe_survivability_mode_enable(xe);
>>   
>>   		return err;
>>   	}
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> index d939ce70e6fa..5b60cbf8f7cb 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode.c
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> @@ -10,6 +10,7 @@
>>   #include <linux/pci.h>
>>   #include <linux/sysfs.h>
>>   
>> +#include "xe_configfs.h"
>>   #include "xe_device.h"
>>   #include "xe_gt.h"
>>   #include "xe_heci_gsc.h"
>> @@ -28,8 +29,10 @@
>>    * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
>>    * to be flashed through mei and collect telemetry. The driver's probe flow is modified
>>    * such that it enters survivability mode when pcode initialization is incomplete and boot status
>> - * denotes a failure. The driver then  populates the survivability_mode PCI sysfs indicating
>> - * survivability mode and provides additional information required for debug
>> + * denotes a failure. Survivability mode can also be enabled manually by writing the pci address of
>> + * the card to the xe configfs attribute. This is useful in cases where pcode does not detect
>> + * failure and for validation.
> 
> or even for IFR (in-field-repair) use cases, where the repair or flash can be
> performed in a single GPU card without impacting the usage of other GPUs
> in the same node.
> 
> or something like that...
Will add the IFR case
> 
>> The driver then  populates the survivability_mode PCI sysfs
>> + * indicating survivability mo
> de and provides additional information required for debug
>>    *
>>    * KMD exposes below admin-only readable sysfs in survivability mode
>>    *
>> @@ -42,6 +45,15 @@
>>    *			      Overflow Information   - Provides history of previous failures
>>    *			      Auxiliary Information  - Certain failures may have information in
>>    *						       addition to postcode information
>> + * Enable survivability mode through configfs
>> + *
>> + * survivability mode is exposed as an attribute under the xe configfs subsystem. User can specify
>> + * the card that needs to enter survivability mode.
>> + *
>> + * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
>> + * echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
>> + * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
> 
> should we add an example in the doc so folks don't get so confused
> with the details of the terminology?
> 
> Example:
> 
> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
> echo "0000:03:00.0" > sys/kernel/config/xe/survivability_mode
> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> 
> ?

Sure. will add an example instead
> 
>> + *
>>    */
>>   
>>   static u32 aux_history_offset(u32 reg_value)
>> @@ -133,6 +145,7 @@ static void xe_survivability_mode_fini(void *arg)
>>   	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>>   	struct device *dev = &pdev->dev;
>>   
>> +	xe_configfs_clear_survivability_mode();
>>   	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
>>   }
>>   
>> @@ -190,11 +203,15 @@ bool xe_survivability_mode_required(struct xe_device *xe)
>>   {
>>   	struct xe_survivability *survivability = &xe->survivability;
>>   	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
>> +	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>>   	u32 data;
>>   
>>   	if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
>>   		return false;
>>   
>> +	if (xe_configfs_survivability_mode_enabled(pdev))
>> +		return true;
> 
> this part is not needed right? as the check outside won't be evaluated
> or bypassed...
If its changed to single check , i'll remove this.

Thanks
Riana
> 
>> +
>>   	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
>>   	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
>>   
>> -- 
>> 2.47.1
>>



More information about the Intel-xe mailing list