[PATCH 1/2] RFC drm/xe: Add functions and sysfs for boot survivability

Rodrigo Vivi rodrigo.vivi at intel.com
Mon Dec 16 17:48:21 UTC 2024


On Mon, Dec 16, 2024 at 01:33:14PM +0530, Riana Tauro wrote:
> 
> 
> On 12/14/2024 2:13 AM, Rodrigo Vivi wrote:
> > On Fri, Dec 13, 2024 at 01:34:23PM +0530, Riana Tauro wrote:
> > > Hi Rodrigo
> > > 
> > > Thank you for the review comments.
> > > 
> > > On 12/13/2024 4:27 AM, Rodrigo Vivi wrote:
> > > > On Thu, Dec 12, 2024 at 11:19:44AM +0530, Riana Tauro wrote:
> > > > > Boot Survivability is a software based workflow for recovering a system
> > > > > in a failed boot state. Here system recoverability is concerned with
> > > > > recovering the firmware responsible for boot.
> > > > > 
> > > > > This is implemented by loading the driver with bare minimum (no drm card)
> > > > > to allow the firmware to be flashed through mei/gsc and collect telemetry.
> > > > > The driver's probe flow is modified such that it enters survivability mode
> > > > > when pcode initialization is incomplete and boot status denotes a failure.
> > > > > In this mode, drm card is not exposed and PCI sysfs is used to indicate
> > > > > survivability mode and provide additional information required for debug
> > > > > 
> > > > > This patch adds initialization functions and exposes admin
> > > > > readable sysfs entries
> > > > > 
> > > > > The new sysfs will have the below layout
> > > > > 
> > > > > 	/sys/bus/.../bdf
> > > > > 		     ├── survivability_info
> > > > >                    ├── survivability_mode
> > > > 
> > > > Let's make only one file and get all the info inside the survivability_mode
> > > > one.
> > > Then any application using this will have to parse value?
> > > 
> > > Oh you meant, the presence of the file will indicate the mode and contents
> > > will give the required information. Okay will modify this
> > > > 
> > > > > 
> > > > > Signed-off-by: Riana Tauro <riana.tauro at intel.com>
> > > > > ---
> > > > >    drivers/gpu/drm/xe/Makefile                   |   1 +
> > > > >    drivers/gpu/drm/xe/xe_device_types.h          |   4 +
> > > > >    drivers/gpu/drm/xe/xe_pcode_api.h             |  14 ++
> > > > >    drivers/gpu/drm/xe/xe_survivability_mode.c    | 225 ++++++++++++++++++
> > > > >    drivers/gpu/drm/xe/xe_survivability_mode.h    |  17 ++
> > > > >    .../gpu/drm/xe/xe_survivability_mode_types.h  |  35 +++
> > > > >    6 files changed, 296 insertions(+)
> > > > >    create mode 100644 drivers/gpu/drm/xe/xe_survivability_mode.c
> > > > >    create mode 100644 drivers/gpu/drm/xe/xe_survivability_mode.h
> > > > >    create mode 100644 drivers/gpu/drm/xe/xe_survivability_mode_types.h
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> > > > > index 7730e0596299..dc60512a5c47 100644
> > > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > > @@ -95,6 +95,7 @@ xe-y += xe_bb.o \
> > > > >    	xe_sa.o \
> > > > >    	xe_sched_job.o \
> > > > >    	xe_step.o \
> > > > > +	xe_survivability_mode.o \
> > > > >    	xe_sync.o \
> > > > >    	xe_tile.o \
> > > > >    	xe_tile_sysfs.o \
> > > > > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > > > > index 1373a222f5a5..79bd0bd94e9c 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > > > > @@ -21,6 +21,7 @@
> > > > >    #include "xe_pt_types.h"
> > > > >    #include "xe_sriov_types.h"
> > > > >    #include "xe_step_types.h"
> > > > > +#include "xe_survivability_mode_types.h"
> > > > >    #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> > > > >    #define TEST_VM_OPS_ERROR
> > > > > @@ -341,6 +342,9 @@ struct xe_device {
> > > > >    		u8 skip_pcode:1;
> > > > >    	} info;
> > > > > +	/** @survivability: survivability information for device */
> > > > > +	struct xe_survivability survivability;
> > > > > +
> > > > >    	/** @irq: device interrupt state */
> > > > >    	struct {
> > > > >    		/** @irq.lock: lock for processing irq's on this device */
> > > > > diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h
> > > > > index f153ce96f69a..4e373b8199ca 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_pcode_api.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_pcode_api.h
> > > > > @@ -49,6 +49,20 @@
> > > > >    /* Domain IDs (param2) */
> > > > >    #define     PCODE_MBOX_DOMAIN_HBM		0x2
> > > > > +#define PCODE_SCRATCH_ADDR(x)		XE_REG(0x138320 + ((x) * 4))
> > > > > +/* PCODE_SCRATCH0 */
> > > > > +#define   AUXINFO_REG_OFFSET		REG_GENMASK(17, 15)
> > > > > +#define   OVERFLOW_REG_OFFSET		REG_GENMASK(14, 12)
> > > > > +#define   HISTORY_TRACKING		REG_BIT(11)
> > > > > +#define   OVERFLOW_SUPPORT		REG_BIT(10)
> > > > > +#define   AUXINFO_SUPPORT		REG_BIT(9)
> > > > > +#define   BOOT_STATUS			REG_GENMASK(3, 1)
> > > > > +#define      CRITICAL_FAILURE		4
> > > > > +#define      NON_CRITICAL_FAILURE	7
> > > > > +
> > > > > +/* Auxillary info bits */
> > > > > +#define   AUXINFO_HISTORY_OFFSET	REG_GENMASK(31, 29)
> > > > > +
> > > > >    struct pcode_err_decode {
> > > > >    	int errno;
> > > > >    	const char *str;
> > > > > diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
> > > > > new file mode 100644
> > > > > index 000000000000..7e36989efd68
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
> > > > > @@ -0,0 +1,225 @@
> > > > > +// SPDX-License-Identifier: MIT
> > > > > +/*
> > > > > + * Copyright © 2024 Intel Corporation
> > > > > + */
> > > > > +
> > > > > +#include <drm/drm_managed.h>
> > > > 
> > > > this include moves together the linux group below,
> > > > on top of it...
> > > > 
> > > > > +
> > > > > +#include "xe_survivability_mode_types.h"
> > > > > +#include "xe_survivability_mode.h"
> > > > > +
> > > > > +#include <linux/kobject.h>
> > > > > +#include <linux/pci.h>
> > > > > +#include <linux/sysfs.h>
> > > > > +
> > > > > +#include "xe_device.h"
> > > > > +#include "xe_gt.h"
> > > > > +#include "xe_mmio.h"
> > > > > +#include "xe_pcode_api.h"
> > > > > +
> > > > > +#define MAX_SCRATCH_MMIO 8
> > > > > +
> > > > > +/**
> > > > > + * DOC: Xe Boot Survivability
> > > > > + *
> > > > > + * Boot Survivability is a software based workflow for recovering a system in a failed boot state
> > > > > + * Here system recoverability is concerned with recovering the firmware responsible for boot.
> > > > > + *
> > > > > + * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
> > > > > + * to be flashed through mei and collect telemetry. The driver's probe flow is modified
> > > > > + * such that it enters survivability mode when pcode initialization is incomplete and boot status
> > > > > + * denotes a failure. In this mode, drm card is not exposed and PCI sysfs is used to indicate the
> > > > > + * survivability mode and provide additional information required for debug
> > > > > + *
> > > > > + * Xe KMD exposes below admin-only readable sysfs in survivability mode
> > > > > + *
> > > > > + * device/survivability_mode: Indicates driver is in survivability mode
> > > > 
> > > > We need to make in a way that the presence of the file itself is the indication
> > > > of the survivability_mode.  No file, no survivability_mode. No survivability_mode, no file.
> > > > 
> > > > Which I believe your code is already doing this below...
> > > > 
> > > > > + * device/survivability_info: Provides additional information on why the driver entered
> > > > > + *			      survivability mode.
> > > > > + *
> > > > > + *			      Capability Information - Provides boot status
> > > > > + *			      Postcode Information   - Provides information about the failure
> > > > > + *			      Overflow Information   - Provides history of previous failures
> > > > > + *			      Auxillary Information  - Certain failures may have information in
> > > > > + *						       addition to postcode information
> > > > 
> > > > then this move into the single file...
> > > > 
> > > > > + *
> > > > > + * TODO: Notify mei about survivability mode
> > > > > + */
> > > > > +
> > > > > +static void set_survivability_info(struct xe_device *xe, struct xe_survivability_info *info,
> > > > > +				   int id, char *name)
> > > > > +{
> > > > > +	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
> > > > > +
> > > > > +	strscpy(info[id].name, name, sizeof(info[id].name));
> > > > > +	info[id].reg = PCODE_SCRATCH_ADDR(id).raw;
> > > > > +	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH_ADDR(id));
> > > > > +
> > > > > +	drm_info(&xe->drm, "%s: 0x%x - 0x%x\n", info[id].name,
> > > > > +		 info[id].reg, info[id].value);
> > > > > +}
> > > > > +
> > > > > +static int fill_survivability_info(struct xe_device *xe)
> > > > > +{
> > > > > +	struct xe_survivability *survivability = &xe->survivability;
> > > > > +	struct xe_survivability_info *info = survivability->info;
> > > > > +	u32 capability_info;
> > > > > +	int id = 0;
> > > > > +
> > > > > +	drm_info(&xe->drm, "Survivability Mode Information\n");
> > > > 
> > > > no need for the drm_info here
> > > Added a prefix here to indicate the below information is related to
> > > Survivability
> > > 
> > > Otherwise it will only display as below in case of Critical failure.
> > > Critical failure currently doesn't enter into the survivability mode
> > > and will not have sysfs.
> > 
> > Indeed. for the critical error we print dmesg, do-not create the sysfs
> > and fail probe. Perhaps that deserves a separate function?
> > 
> The same is being done even now. The init function below returns after
> printing dmesg.
> 
> /* Only log debug information and exit if it is a critical failure */
> if (survivability->boot_status == CRITICAL_FAILURE)
> 	return;
> > > 
> > > 
> In the review comment you suggested to remove
> drm_info(&xe->drm, "Survivability Mode Information\n");
> Without this, in case of critical failure there won't be any indication that
> the below dmesg is related to survivability mode.

So, just print them all in a function for critical mode only.
The regular mode doesn't need dmesg, this information data
is already part of the sysfs file, no need to repeat.

> 
> Thanks
> Riana Tauro
> > > [ 4708.689214] xe <bdf>: [drm] Capability Info: <value>
> > > [ 4708.689221] xe <bdf>: [drm] Postcode Info: <value>
> > > [ 4708.689226] xe <bdf>: [drm] Overflow Info: <value>
> > > [ 4708.689230] xe <bdf>: [drm] Auxiliary Info 0: <value>
> > > 
> > > Will remove if not required or add the function name.
> > > 
> > > Thanks,
> > > Riana Tauro
> > > > 
> > > > > +	set_survivability_info(xe, info, id, "Capability Info");
> > > > > +	capability_info = info[id].value;
> > > > > +
> > > > > +	if (capability_info & HISTORY_TRACKING) {
> > > > > +		id++;
> > > > > +		set_survivability_info(xe, info, id, "Postcode Info");
> > > > > +
> > > > > +		if (capability_info & OVERFLOW_SUPPORT) {
> > > > > +			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, capability_info);
> > > > > +			/* ID should be within MAX_SCRATCH_MMIO */
> > > > > +			if (id >= MAX_SCRATCH_MMIO)
> > > > > +				return -EINVAL;
> > > > > +			set_survivability_info(xe, info, id, "Overflow Info");
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	if (capability_info & AUXINFO_SUPPORT) {
> > > > > +		u32 aux_info;
> > > > > +		int index = 0;
> > > > > +		char name[NAME_MAX];
> > > > > +
> > > > > +		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, capability_info);
> > > > > +		if (id >= MAX_SCRATCH_MMIO)
> > > > > +			return -EINVAL;
> > > > > +
> > > > > +		snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
> > > > > +		set_survivability_info(xe, info, id, name);
> > > > > +		aux_info = info[id].value;
> > > > > +
> > > > > +		while ((id = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, aux_info)) &&
> > > > > +		       (id < MAX_SCRATCH_MMIO)) {
> > > > > +			index++;
> > > > > +			snprintf(name, NAME_MAX, "Prev Auxiliary Info %d", index);
> > > > > +			set_survivability_info(xe, info, id, name);
> > > > > +			aux_info = info[id].value;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static ssize_t survivability_info_show(struct device *dev,
> > > > > +				       struct device_attribute *attr, char *buff)
> > > > > +{
> > > > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > > > +	struct xe_device *xe = pdev_to_xe_device(pdev);
> > > > > +	struct xe_survivability *survivability = &xe->survivability;
> > > > > +	struct xe_survivability_info *info = survivability->info;
> > > > > +	int index = 0, count = 0;
> > > > > +
> > > > > +	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
> > > > > +		if (info[index].reg)
> > > > > +			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
> > > > > +					       info[index].reg, info[index].value);
> > > > > +	}
> > > > > +
> > > > > +	return count;
> > > > > +}
> > > > > +
> > > > > +static DEVICE_ATTR_ADMIN_RO(survivability_info);
> > > > > +
> > > > > +static ssize_t survivability_mode_show(struct device *dev,
> > > > > +				       struct device_attribute *attr, char *buff)
> > > > > +{
> > > > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > > > +	struct xe_device *xe = pdev_to_xe_device(pdev);
> > > > > +	struct xe_survivability *survivability = &xe->survivability;
> > > > > +
> > > > > +	return sysfs_emit(buff, "%d\n", survivability->mode);
> > > > > +}
> > > > > +
> > > > > +static DEVICE_ATTR_ADMIN_RO(survivability_mode);
> > > > > +
> > > > > +static const struct attribute *survivability_attrs[] = {
> > > > > +	&dev_attr_survivability_mode.attr,
> > > > > +	&dev_attr_survivability_info.attr,
> > > > > +	NULL,
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * xe_survivability_mode_required- checks if survivability mode is required
> > > > > + * @xe: xe device instance
> > > > > + *
> > > > > + * This function reads the boot status of the capability register and
> > > > > + * checks if it is required to enter boot survivability mode.
> > > > > + *
> > > > > + * Return: true if survivability mode required, false otherwise
> > > > > + */
> > > > > +bool xe_survivability_mode_required(struct xe_device *xe)
> > > > > +{
> > > > > +	struct xe_survivability *survivability = &xe->survivability;
> > > > > +	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
> > > > > +	u32 data;
> > > > > +
> > > > > +	data = xe_mmio_read32(mmio, PCODE_SCRATCH_ADDR(0));
> > > > > +	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
> > > > > +
> > > > > +	return (survivability->boot_status == NON_CRITICAL_FAILURE ||
> > > > > +		survivability->boot_status == CRITICAL_FAILURE);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * xe_survivability_mode_remove - remove survivability mode
> > > > > + * @xe: xe device instance
> > > > > + *
> > > > > + * clean up sysfs entries of survivability mode
> > > > > + */
> > > > > +void xe_survivability_mode_remove(struct xe_device *xe)
> > > > > +{
> > > > > +	sysfs_remove_files(&xe->drm.dev->kobj, survivability_attrs);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * xe_survivability_mode_init - Initialize the survivability mode
> > > > > + * @xe: xe device instance
> > > > > + *
> > > > > + * Initializes the sysfs and required actions to enter survivability mode
> > > > > + */
> > > > > +void xe_survivability_mode_init(struct xe_device *xe)
> > > > > +{
> > > > > +	struct xe_survivability *survivability = &xe->survivability;
> > > > > +	struct xe_survivability_info *info;
> > > > > +	struct device *dev = xe->drm.dev;
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	survivability->size = MAX_SCRATCH_MMIO;
> > > > > +
> > > > > +	info = drmm_kcalloc(&xe->drm, survivability->size, sizeof(*info), GFP_KERNEL);
> > > > > +	if (!info) {
> > > > > +		drm_warn(&xe->drm, "%s failed, err: %d\n", __func__, -ENOMEM);
> > > > > +		return;
> > > > > +	}
> > > > > +
> > > > > +	survivability->info = info;
> > > > > +
> > > > > +	ret = fill_survivability_info(xe);
> > > > > +	if (ret)
> > > > > +		drm_warn(&xe->drm, "%s failed, err: %d\n", __func__, ret);
> > > > > +
> > > > > +	/* Only log debug information and exit if it is a critical failure */
> > > > > +	if (survivability->boot_status == CRITICAL_FAILURE)
> > > > > +		return;
> > > > > +
> > > > > +	/* set survivability mode */
> > > > > +	survivability->mode = true;
> > > > > +
> > > > > +	drm_info(&xe->drm, "In Survivability Mode\n");
> > > > 
> > > > this one is good!
> > > > 
> > > > > +
> > > > > +	ret = sysfs_create_files(&dev->kobj, survivability_attrs);
> > > > > +	if (ret) {
> > > > > +		drm_warn(&xe->drm, "Failed to create survivability sysfs files\n");
> > > > > +		return;
> > > > > +	}
> > > > > +
> > > > > +	/* TODO: Pass Survivability Mode notification to required child drivers */
> > > > > +}
> > > > > diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h
> > > > > new file mode 100644
> > > > > index 000000000000..0d5c325322a2
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
> > > > > @@ -0,0 +1,17 @@
> > > > > +/* SPDX-License-Identifier: MIT */
> > > > > +/*
> > > > > + * Copyright © 2024 Intel Corporation
> > > > > + */
> > > > > +
> > > > > +#ifndef _XE_SURVIVABILITY_MODE_H_
> > > > > +#define _XE_SURVIVABILITY_MODE_H_
> > > > > +
> > > > > +#include <linux/types.h>
> > > > > +
> > > > > +struct xe_device;
> > > > > +
> > > > > +void xe_survivability_mode_init(struct xe_device *xe);
> > > > > +void xe_survivability_mode_remove(struct xe_device *xe);
> > > > > +bool xe_survivability_mode_required(struct xe_device *xe);
> > > > > +
> > > > > +#endif /* _XE_SURVIVABILITY_MODE_H_ */
> > > > > diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
> > > > > new file mode 100644
> > > > > index 000000000000..f9dbb6d80692
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h
> > > > > @@ -0,0 +1,35 @@
> > > > > +/* SPDX-License-Identifier: MIT */
> > > > > +/*
> > > > > + * Copyright © 2024 Intel Corporation
> > > > > + */
> > > > > +
> > > > > +#ifndef _XE_SURVIVABILITY_MODE_TYPES_H_
> > > > > +#define _XE_SURVIVABILITY_MODE_TYPES_H_
> > > > > +
> > > > > +#include <linux/limits.h>
> > > > > +#include <linux/types.h>
> > > > > +
> > > > > +struct xe_survivability_info {
> > > > > +	char name[NAME_MAX];
> > > > > +	u32 reg;
> > > > > +	u32 value;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct xe_survivability: Contains survivability mode information
> > > > > + */
> > > > > +struct xe_survivability {
> > > > > +	/** @info: struct that holds survivability info from scratch registers */
> > > > > +	struct xe_survivability_info *info;
> > > > > +
> > > > > +	/** @size: number of scratch registers */
> > > > > +	u32 size;
> > > > > +
> > > > > +	/** @boot_status: indicates critical/non critical boot failure */
> > > > > +	u8 boot_status;
> > > > > +
> > > > > +	/** mode: boolean to indicate survivability mode */
> > > > > +	bool mode;
> > > > > +};
> > > > > +
> > > > > +#endif /* _XE_SURVIVABILITY_MODE_TYPES_H_ */
> > > > > -- 
> > > > > 2.47.1
> > > > > 
> > > 
> 


More information about the Intel-xe mailing list