[PATCH v3 3/7] drm/xe/xe_survivability: Add support for Runtime survivability mode
Riana Tauro
riana.tauro at intel.com
Thu Jul 10 05:27:54 UTC 2025
Hi Stuart
On 7/9/2025 11:34 PM, Summers, Stuart wrote:
> On Wed, 2025-07-02 at 19:41 +0530, Riana Tauro wrote:
>> Certain runtime firmware errors can cause the device to be wedged
>> requiring a firmware flash to restore normal operation.
>> Runtime Survivability Mode indicates that a firmware flash is
>> necessary to
>> recover the device.
>
> I'm not understanding why we need to overload survivability mode here
> in the case of a CSC (or other hardware error) failure. I see there is
> some vsec initialization that happens there and GSC initialization
> (need to look further, but presumably this puts GSC in a survivability
> state also?). But we already have the vendor specific wedge. Do we
> really need the extra hook to survivability mode which was really built
> as a boot time config?
A vendor-specific wedge without a stated reason is vague and could be
reused for a different action in the future. There needs to be an
indication that this wedged uevent indicates firmware flash. So the
survivability mode sysfs provides that indication.
This patch will further be extended to handle d3cold resume pcode
failures which will send a similar wedged event and survivability mode
sysfs.
Thanks
Riana
>
> Thanks,
> Stuart
>
>>
>> The below sysfs is an indication that device is in survivability mode
>>
>> /sys/bus/pci/devices/<device>/survivability_mode
>>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_device.c | 2 +-
>> drivers/gpu/drm/xe/xe_survivability_mode.c | 26 ++++++++++++++++-
>> --
>> drivers/gpu/drm/xe/xe_survivability_mode.h | 4 ++-
>> .../gpu/drm/xe/xe_survivability_mode_types.h | 8 ++++++
>> 4 files changed, 35 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index 4a38486dccc8..5defa54ccd26 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -716,7 +716,7 @@ int xe_device_probe_early(struct xe_device *xe)
>> * possible, but still return the previous error for
>> error
>> * propagation
>> */
>> - err = xe_survivability_mode_enable(xe);
>> + err = xe_survivability_mode_enable(xe,
>> XE_SURVIVABILITY_TYPE_BOOT);
>> if (err)
>> return err;
>>
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c
>> b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> index 1f710b3fc599..e1adcb33c9b0 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode.c
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> @@ -129,7 +129,10 @@ static ssize_t survivability_mode_show(struct
>> device *dev,
>> struct xe_survivability_info *info = survivability->info;
>> int index = 0, count = 0;
>>
>> - for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
>> + count += sysfs_emit_at(buff, count, "Survivability mode:
>> %s\n",
>> + survivability->type ? "Runtime" :
>> "Boot");
>> +
>> + for (index = 0; survivability->boot_status && index <
>> MAX_SCRATCH_MMIO; index++) {
>> if (info[index].reg)
>> count += sysfs_emit_at(buff, count, "%s: 0x%x
>> - 0x%x\n", info[index].name,
>> info[index].reg,
>> info[index].value);
>> @@ -169,6 +172,10 @@ static int enable_survivability_mode(struct
>> pci_dev *pdev)
>> if (ret)
>> return ret;
>>
>> + /* Only create sysfs for runtime survivability mode */
>> + if (xe_survivability_mode_is_runtime(xe))
>> + return 0;
>> +
>> /* Make sure xe_heci_gsc_init() knows about survivability
>> mode */
>> survivability->mode = true;
>>
>> @@ -189,6 +196,17 @@ static int enable_survivability_mode(struct
>> pci_dev *pdev)
>> return 0;
>> }
>>
>> +/**
>> + * xe_survivability_mode_is_runtime - check if survivability mode is
>> runtime
>> + * @xe: xe device instance
>> + *
>> + * Returns true if in runtime survivability mode, false otherwise
>> + */
>> +bool xe_survivability_mode_is_runtime(struct xe_device *xe)
>> +{
>> + return xe->survivability.type ==
>> XE_SURVIVABILITY_TYPE_RUNTIME;
>> +}
>> +
>> /**
>> * xe_survivability_mode_is_enabled - check if survivability mode is
>> enabled
>> * @xe: xe device instance
>> @@ -251,16 +269,18 @@ bool xe_survivability_mode_is_requested(struct
>> xe_device *xe)
>> * Return: 0 if survivability mode is enabled or not requested;
>> negative error
>> * code otherwise.
>> */
>> -int xe_survivability_mode_enable(struct xe_device *xe)
>> +int xe_survivability_mode_enable(struct xe_device *xe, const enum
>> xe_survivability_type type)
>> {
>> struct xe_survivability *survivability = &xe->survivability;
>> struct xe_survivability_info *info;
>> struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>>
>> - if (!xe_survivability_mode_is_requested(xe))
>> + if (!xe_survivability_mode_is_requested(xe) &&
>> + type != XE_SURVIVABILITY_TYPE_RUNTIME)
>> return 0;
>>
>> survivability->size = MAX_SCRATCH_MMIO;
>> + survivability->type = type;
>>
>> info = devm_kcalloc(xe->drm.dev, survivability->size,
>> sizeof(*info),
>> GFP_KERNEL);
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h
>> b/drivers/gpu/drm/xe/xe_survivability_mode.h
>> index 02231c2bf008..559d1e99b03a 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode.h
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
>> @@ -9,9 +9,11 @@
>> #include <linux/types.h>
>>
>> struct xe_device;
>> +enum xe_survivability_type;
>>
>> -int xe_survivability_mode_enable(struct xe_device *xe);
>> +int xe_survivability_mode_enable(struct xe_device *xe, const enum
>> xe_survivability_type);
>> bool xe_survivability_mode_is_enabled(struct xe_device *xe);
>> +bool xe_survivability_mode_is_runtime(struct xe_device *xe);
>> bool xe_survivability_mode_is_requested(struct xe_device *xe);
>>
>> #endif /* _XE_SURVIVABILITY_MODE_H_ */
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> index 19d433e253df..01f07d9c4124 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
>> @@ -9,6 +9,11 @@
>> #include <linux/limits.h>
>> #include <linux/types.h>
>>
>> +enum xe_survivability_type {
>> + XE_SURVIVABILITY_TYPE_BOOT,
>> + XE_SURVIVABILITY_TYPE_RUNTIME,
>> +};
>> +
>> struct xe_survivability_info {
>> char name[NAME_MAX];
>> u32 reg;
>> @@ -30,6 +35,9 @@ struct xe_survivability {
>>
>> /** @mode: boolean to indicate survivability mode */
>> bool mode;
>> +
>> + /** @type: survivability mode type (boot or runtime) */
>> + enum xe_survivability_type type;
>> };
>>
>> #endif /* _XE_SURVIVABILITY_MODE_TYPES_H_ */
>
More information about the Intel-xe
mailing list