[PATCH v3 3/7] drm/xe/xe_survivability: Add support for Runtime survivability mode

Summers, Stuart stuart.summers at intel.com
Wed Jul 9 18:04:36 UTC 2025


On Wed, 2025-07-02 at 19:41 +0530, Riana Tauro wrote:
> Certain runtime firmware errors can cause the device to be wedged
> requiring a firmware flash to restore normal operation.
> Runtime Survivability Mode indicates that a firmware flash is
> necessary to
> recover the device.

I don't understand why we need to overload survivability mode here
in the case of a CSC (or other hardware error) failure. I see there is
some vsec initialization that happens there, as well as GSC initialization
(I need to look further, but presumably this puts GSC in a survivability
state also?). But we already have the vendor-specific wedge. Do we
really need the extra hook into survivability mode, which was really built
as a boot-time config?

Thanks,
Stuart

> 
> The sysfs entry below indicates that the device is in survivability mode
> 
> /sys/bus/pci/devices/<device>/survivability_mode
> 
> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c                |  2 +-
>  drivers/gpu/drm/xe/xe_survivability_mode.c    | 26 ++++++++++++++++-
> --
>  drivers/gpu/drm/xe/xe_survivability_mode.h    |  4 ++-
>  .../gpu/drm/xe/xe_survivability_mode_types.h  |  8 ++++++
>  4 files changed, 35 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c
> b/drivers/gpu/drm/xe/xe_device.c
> index 4a38486dccc8..5defa54ccd26 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -716,7 +716,7 @@ int xe_device_probe_early(struct xe_device *xe)
>                  * possible, but still return the previous error for
> error
>                  * propagation
>                  */
> -               err = xe_survivability_mode_enable(xe);
> +               err = xe_survivability_mode_enable(xe,
> XE_SURVIVABILITY_TYPE_BOOT);
>                 if (err)
>                         return err;
>  
> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c
> b/drivers/gpu/drm/xe/xe_survivability_mode.c
> index 1f710b3fc599..e1adcb33c9b0 100644
> --- a/drivers/gpu/drm/xe/xe_survivability_mode.c
> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
> @@ -129,7 +129,10 @@ static ssize_t survivability_mode_show(struct
> device *dev,
>         struct xe_survivability_info *info = survivability->info;
>         int index = 0, count = 0;
>  
> -       for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
> +       count += sysfs_emit_at(buff, count, "Survivability mode:
> %s\n",
> +                              survivability->type ? "Runtime" :
> "Boot");
> +
> +       for (index = 0; survivability->boot_status && index <
> MAX_SCRATCH_MMIO; index++) {
>                 if (info[index].reg)
>                         count += sysfs_emit_at(buff, count, "%s: 0x%x
> - 0x%x\n", info[index].name,
>                                                info[index].reg,
> info[index].value);
> @@ -169,6 +172,10 @@ static int enable_survivability_mode(struct
> pci_dev *pdev)
>         if (ret)
>                 return ret;
>  
> +       /* Only create sysfs for runtime survivability mode */
> +       if (xe_survivability_mode_is_runtime(xe))
> +               return 0;
> +
>         /* Make sure xe_heci_gsc_init() knows about survivability
> mode */
>         survivability->mode = true;
>  
> @@ -189,6 +196,17 @@ static int enable_survivability_mode(struct
> pci_dev *pdev)
>         return 0;
>  }
>  
> +/**
> + * xe_survivability_mode_is_runtime - check if survivability mode is
> runtime
> + * @xe: xe device instance
> + *
> + * Returns true if in runtime survivability mode, false otherwise
> + */
> +bool xe_survivability_mode_is_runtime(struct xe_device *xe)
> +{
> +       return xe->survivability.type ==
> XE_SURVIVABILITY_TYPE_RUNTIME;
> +}
> +
>  /**
>   * xe_survivability_mode_is_enabled - check if survivability mode is
> enabled
>   * @xe: xe device instance
> @@ -251,16 +269,18 @@ bool xe_survivability_mode_is_requested(struct
> xe_device *xe)
>   * Return: 0 if survivability mode is enabled or not requested;
> negative error
>   * code otherwise.
>   */
> -int xe_survivability_mode_enable(struct xe_device *xe)
> +int xe_survivability_mode_enable(struct xe_device *xe, const enum
> xe_survivability_type type)
>  {
>         struct xe_survivability *survivability = &xe->survivability;
>         struct xe_survivability_info *info;
>         struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>  
> -       if (!xe_survivability_mode_is_requested(xe))
> +       if (!xe_survivability_mode_is_requested(xe) &&
> +           type != XE_SURVIVABILITY_TYPE_RUNTIME)
>                 return 0;
>  
>         survivability->size = MAX_SCRATCH_MMIO;
> +       survivability->type = type;
>  
>         info = devm_kcalloc(xe->drm.dev, survivability->size,
> sizeof(*info),
>                             GFP_KERNEL);
> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h
> b/drivers/gpu/drm/xe/xe_survivability_mode.h
> index 02231c2bf008..559d1e99b03a 100644
> --- a/drivers/gpu/drm/xe/xe_survivability_mode.h
> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
> @@ -9,9 +9,11 @@
>  #include <linux/types.h>
>  
>  struct xe_device;
> +enum xe_survivability_type;
>  
> -int xe_survivability_mode_enable(struct xe_device *xe);
> +int xe_survivability_mode_enable(struct xe_device *xe, const enum
> xe_survivability_type);
>  bool xe_survivability_mode_is_enabled(struct xe_device *xe);
> +bool xe_survivability_mode_is_runtime(struct xe_device *xe);
>  bool xe_survivability_mode_is_requested(struct xe_device *xe);
>  
>  #endif /* _XE_SURVIVABILITY_MODE_H_ */
> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h
> b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
> index 19d433e253df..01f07d9c4124 100644
> --- a/drivers/gpu/drm/xe/xe_survivability_mode_types.h
> +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
> @@ -9,6 +9,11 @@
>  #include <linux/limits.h>
>  #include <linux/types.h>
>  
> +enum xe_survivability_type {
> +       XE_SURVIVABILITY_TYPE_BOOT,
> +       XE_SURVIVABILITY_TYPE_RUNTIME,
> +};
> +
>  struct xe_survivability_info {
>         char name[NAME_MAX];
>         u32 reg;
> @@ -30,6 +35,9 @@ struct xe_survivability {
>  
>         /** @mode: boolean to indicate survivability mode */
>         bool mode;
> +
> +       /** @type: survivability mode type (boot or runtime) */
> +       enum xe_survivability_type type;
>  };
>  
>  #endif /* _XE_SURVIVABILITY_MODE_TYPES_H_ */



More information about the Intel-xe mailing list