[PATCH v3 4/4] drm/xe/FLR: Support PCIe FLR

Wed Apr 24 23:29:17 UTC 2024

On Wed, Apr 24, 2024 at 10:42:59AM +0530, Aravind Iddamsetty wrote:
> 
> On 24/04/24 05:19, Michał Winiarski wrote:
> > On Mon, Apr 22, 2024 at 12:27:56PM +0530, Aravind Iddamsetty wrote:
> >> PCI subsystem provides callbacks to inform the driver about a request to
> >> do function level reset by user, initiated by writing to sysfs entry
> >> /sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR
> >> without the need to do unbind and rebind as the driver needs to
> >> reinitialize the device afresh post FLR.
> >>
> >> v2:
> >> 1. separate out gt idle and pci save/restore to a separate patch (Lucas)
> >> 2. Fixed the warnings seen around xe_guc_submit_stop, xe_guc_puc_fini
> >>
> >> v3: declare xe_pci_err_handlers as static(Michal)
> >>
> >> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> >> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> >> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> >>
> >> Reviewed-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> >> Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
> >> ---
> >>  drivers/gpu/drm/xe/Makefile          |  1 +
> >>  drivers/gpu/drm/xe/xe_device_types.h |  3 +
> >>  drivers/gpu/drm/xe/xe_guc_pc.c       |  4 ++
> >>  drivers/gpu/drm/xe/xe_pci.c          |  9 ++-
> >>  drivers/gpu/drm/xe/xe_pci.h          |  2 +
> >>  drivers/gpu/drm/xe/xe_pci_err.c      | 88 ++++++++++++++++++++++++++++
> >>  drivers/gpu/drm/xe/xe_pci_err.h      | 13 ++++
> >>  7 files changed, 119 insertions(+), 1 deletion(-)
> >>  create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c
> >>  create mode 100644 drivers/gpu/drm/xe/xe_pci_err.h
> >>
> >> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> >> index 8bc62bfbc679..693971a1fac0 100644
> >> --- a/drivers/gpu/drm/xe/Makefile
> >> +++ b/drivers/gpu/drm/xe/Makefile
> >> @@ -117,6 +117,7 @@ xe-y += xe_bb.o \
> >>  	xe_module.o \
> >>  	xe_pat.o \
> >>  	xe_pci.o \
> >> +	xe_pci_err.o \
> >>  	xe_pcode.o \
> >>  	xe_pm.o \
> >>  	xe_preempt_fence.o \
> >> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> >> index 0a66555229e9..8c749b378a92 100644
> >> --- a/drivers/gpu/drm/xe/xe_device_types.h
> >> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> >> @@ -465,6 +465,9 @@ struct xe_device {
> >>  	/** @pci_state: PCI state of device */
> >>  	struct pci_saved_state *pci_state;
> >>  
> >> +	/** @pci_device_is_reset: device went through PCIe FLR */
> >> +	bool pci_device_is_reset;
> >> +
> >>  	/* private: */
> >>  
> >>  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
> >> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
> >> index 509649d0e65e..efba0fbe2f5c 100644
> >> --- a/drivers/gpu/drm/xe/xe_guc_pc.c
> >> +++ b/drivers/gpu/drm/xe/xe_guc_pc.c
> >> @@ -902,6 +902,10 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
> >>  		return;
> >>  	}
> >>  
> >> +	/* We already have done this before going through a reset, so skip here */
> >> +	if (xe->pci_device_is_reset)
> >> +		return;
> >> +
> >>  	XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
> >>  	XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
> >>  	XE_WARN_ON(xe_guc_pc_stop(pc));
> >> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> >> index a62300990e19..b5a582afc9e7 100644
> >> --- a/drivers/gpu/drm/xe/xe_pci.c
> >> +++ b/drivers/gpu/drm/xe/xe_pci.c
> >> @@ -23,6 +23,7 @@
> >>  #include "xe_macros.h"
> >>  #include "xe_mmio.h"
> >>  #include "xe_module.h"
> >> +#include "xe_pci_err.h"
> >>  #include "xe_pci_types.h"
> >>  #include "xe_pm.h"
> >>  #include "xe_sriov.h"
> >> @@ -738,7 +739,7 @@ static void xe_pci_remove(struct pci_dev *pdev)
> >>  	pci_set_drvdata(pdev, NULL);
> >>  }
> >>  
> >> -static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> >> +int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> >>  {
> >>  	const struct xe_device_desc *desc = (const void *)ent->driver_data;
> >>  	const struct xe_subplatform_desc *subplatform_desc;
> >> @@ -986,6 +987,11 @@ static const struct dev_pm_ops xe_pm_ops = {
> >>  };
> >>  #endif
> >>  
> >> +static const struct pci_error_handlers xe_pci_err_handlers = {
> >> +	.reset_prepare = xe_pci_reset_prepare,
> >> +	.reset_done = xe_pci_reset_done,
> >> +};
> >> +
> >>  static struct pci_driver xe_pci_driver = {
> >>  	.name = DRIVER_NAME,
> >>  	.id_table = pciidlist,
> >> @@ -995,6 +1001,7 @@ static struct pci_driver xe_pci_driver = {
> >>  #ifdef CONFIG_PM_SLEEP
> >>  	.driver.pm = &xe_pm_ops,
> >>  #endif
> >> +	.err_handler = &xe_pci_err_handlers,
> >>  };
> >>  
> >>  int xe_register_pci_driver(void)
> >> diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h
> >> index 73b90a430d1f..9faf5380a09e 100644
> >> --- a/drivers/gpu/drm/xe/xe_pci.h
> >> +++ b/drivers/gpu/drm/xe/xe_pci.h
> >> @@ -7,8 +7,10 @@
> >>  #define _XE_PCI_H_
> >>  
> >>  struct pci_dev;
> >> +struct pci_device_id;
> >>  
> >>  int xe_register_pci_driver(void);
> >>  void xe_unregister_pci_driver(void);
> >>  void xe_load_pci_state(struct pci_dev *pdev);
> >> +int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
> >>  #endif
> >> diff --git a/drivers/gpu/drm/xe/xe_pci_err.c b/drivers/gpu/drm/xe/xe_pci_err.c
> >> new file mode 100644
> >> index 000000000000..5306925ea2fa
> >> --- /dev/null
> >> +++ b/drivers/gpu/drm/xe/xe_pci_err.c
> >> @@ -0,0 +1,88 @@
> >> +// SPDX-License-Identifier: MIT
> >> +/*
> >> + * Copyright © 2024 Intel Corporation
> >> + */
> >> +
> >> +#include <linux/pci.h>
> >> +#include <drm/drm_drv.h>
> >> +
> >> +#include "xe_device.h"
> >> +#include "xe_gt.h"
> >> +#include "xe_gt_printk.h"
> >> +#include "xe_pci.h"
> >> +#include "xe_pci_err.h"
> >> +#include "xe_pm.h"
> >> +#include "xe_uc.h"
> >> +
> >> +/**
> >> + * xe_pci_reset_prepare - Called when user issued a PCIe reset
> >> + * via /sys/bus/pci/devices/.../reset.
> >> + * @pdev: PCI device struct
> >> + */
> >> +void xe_pci_reset_prepare(struct pci_dev *pdev)
> >> +{
> >> +	struct xe_device *xe = pci_get_drvdata(pdev);
> >> +	struct xe_gt *gt;
> >> +	int id, err;
> >> +
> >> +	pci_warn(pdev, "preparing for PCIe reset\n");
> >> +
> >> +	drm_warn(&xe->drm, "removing device access to userspace\n");
> > Warn? Shouldn't it be info?
> I believe warn is appropriate as this is not a usual path the users apps expect
> to hit and as they loose device connection.

It's an expected path after issuing a reset via /sys/bus/pci/devices/.../reset.
DRM device disappearing - yeah, that's probably not expected, fully
agree on that.

> >
> >> +	drm_dev_unplug(&xe->drm);
> >> +
> >> +	xe_pm_runtime_get(xe);
> >> +	/* idle the GTs */
> >> +	for_each_gt(gt, xe, id) {
> >> +		err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> >> +		if (err)
> >> +			goto reset;
> >> +		xe_uc_reset_prepare(&gt->uc);
> >> +		xe_gt_idle(gt);
> >> +		err = xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> >> +		XE_WARN_ON(err);
> >> +	}
> >> +	xe_pm_runtime_put(xe);
> >> +
> >> +reset:
> >> +	pci_disable_device(pdev);
> >> +}
> >> +
> >> +/**
> >> + * xe_pci_reset_done - Called when PCIe reset is done.
> >> + * @pdev: PCI device struct
> >> + */
> >> +void xe_pci_reset_done(struct pci_dev *pdev)
> >> +{
> >> +	const struct pci_device_id *ent = pci_match_id(pdev->driver->id_table, pdev);
> >> +	struct xe_device *xe = pci_get_drvdata(pdev);
> >> +
> >> +	dev_info(&pdev->dev,
> >> +		 "device went through PCIe reset, reenabling the device\n");
> >> +
> >> +	if (pci_enable_device(pdev)) {
> >> +		dev_err(&pdev->dev,
> >> +			"Cannot re-enable PCI device after reset\n");
> >> +		return;
> >> +	}
> >> +	pci_set_master(pdev);
> >> +	xe_load_pci_state(pdev);
> >> +
> >> +	xe->pci_device_is_reset = true;
> >> +	/*
> >> +	 * We want to completely clean the driver and even destroy
> >> +	 * the xe private data and reinitialize afresh similar to
> >> +	 * probe
> >> +	 */
> >> +	pdev->driver->remove(pdev);
> > Do we really want to do that?
> > First of all, that fairly unusual - none of the other PCI drivers makes
> > changes in the device/driver model during reset, which makes me wonder
> > if this is really what the user expects.
> > I would expect that the device is reset, but the driver is not unloaded
> > and previously created FDs are still valid.
> We cannot continue to work with previous instance of driver as post
> reset GuC , LMEM and most of the soc is reset, so  we need to reinitialize
> device afresh like in driver probe, hence we remove to clean the previous
> stale driver state and re-probe again.

GuC, LMEM and most of the soc is reset when we're doing S3 / S4, and
yet, we're not removing the driver in this scenario.

> 
> All applications are expected to reopen the device handles afresh.

Current UMDs are not handling that case.

> 
> >
> > Note that messing with the driver model at reset also makes it difficult
> > to do a proper cleanup if SR-IOV is enabled, as PCI-core expects drivers
> > to remove VF devices during driver->remove.
> > Removal of said devices depends on pci_cfg_access_lock, which is already
> > held for the duration of the reset (wh
> I haven't verified SRIOV use case, i believe we can take this up as next step.
> Also, since this is not an automatic reset but a user/admin initiated i believe
> the onus can be on admin to close all VFs before initiating a reset.

That's not the contract for reset. It's not a "user/admin" interface,
there doesn't necessarily have to be an "interactive" user interfacing
with it through shell.
It's just a regular stable sysfs uAPI - it could very well be called by
application as well.

> > ich includes calling reset_done),
> > which will cause a deadlock.
> >
> > In current form, it also causes WARN if there are open FDs to DRM
> > device during reset.
> I did verify using igt at device_reset@reset-bound which has a open connection haven't seen this error
> but will re verify again but again certain warns are expected but driver reload should be successful.
> similarly here as well we can expect the admin to close any applications using the device before
> initiating a reset.

Can we expect this?
If we can - why are you stating that "all applications are expected to
reopen the device handles afresh"?
I believe that the expectation for reset is that it behaves simiarly to
the PM flows, which is mostly transparent to the users.

Thanks,
-Michał

> 
> Thanks,
> 
> Aravind.
> >
> > [29357.277699] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:02.0/tile0'                                                                                                                            01:06:58 [142/47263]
> > [29357.286158] CPU: 7 PID: 3479 Comm: bash Not tainted 6.9.0-rc5-xe+ #78
> > [29357.305870] Call Trace:
> > [29357.308342]  <TASK>
> > [29357.310453]  dump_stack_lvl+0x139/0x1d0
> > [29357.314305]  ? __pfx_dump_stack_lvl+0x10/0x10
> > [29357.318680]  ? __pfx__printk+0x10/0x10
> > [29357.322450]  ? kmalloc_trace+0x1c1/0x3a0
> > [29357.326394]  ? sysfs_create_dir_ns+0x162/0x210
> > [29357.330861]  sysfs_create_dir_ns+0x195/0x210
> > [29357.335154]  ? __pfx_sysfs_create_dir_ns+0x10/0x10
> > [29357.339965]  ? strcmp+0x2f/0x60
> > [29357.343134]  kobject_add_internal+0x2af/0x600
> > [29357.347517]  kobject_add+0xfb/0x190
> > [29357.351028]  ? __link_object+0x1c7/0x280
> > [29357.354976]  ? __pfx_kobject_add+0x10/0x10
> > [29357.359093]  ? __create_object+0x62/0x140
> > [29357.363111]  ? rcu_is_watching+0x20/0x50
> > [29357.367055]  ? kmalloc_trace+0x1c1/0x3a0
> > [29357.370998]  ? xe_tile_sysfs_init+0x4b/0x100 [xe]
> > [29357.376016]  xe_tile_sysfs_init+0x91/0x100 [xe]
> > [29357.380868]  xe_tile_init_noalloc+0xaf/0xc0 [xe]
> > [29357.385936]  xe_device_probe+0x60c/0x750 [xe]
> > [29357.390674]  ? __asan_memcpy+0x40/0x70
> > [29357.394461]  ? __drmm_add_action+0x1ac/0x210 [drm]
> > [29357.399413]  xe_pci_probe+0x5dc/0x680 [xe]
> > [29357.403882]  pci_reset_function+0x108/0x140
> > [29357.408100]  reset_store+0xb5/0x120
> > [29357.411623]  ? __pfx_reset_store+0x10/0x10
> > [29357.415751]  ? __pfx_sysfs_kf_write+0x10/0x10
> > [29357.420149]  kernfs_fop_write_iter+0x1b8/0x260
> > [29357.424620]  vfs_write+0x5ce/0x660
> > [29357.428067]  ? __pfx_vfs_write+0x10/0x10
> > [29357.432028]  ? __rcu_read_unlock+0x60/0x90
> > [29357.436181]  ksys_write+0xe4/0x190
> > [29357.439612]  ? __pfx_ksys_write+0x10/0x10
> > [29357.443657]  do_syscall_64+0x7b/0x120
> > [29357.447348]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [29357.452423] RIP: 0033:0x7f4f1ff14887
> > [29357.456030] Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
> > [29357.474761] RSP: 002b:00007ffe54e95068 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> > [29357.482343] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f4f1ff14887
> > [29357.489487] RDX: 0000000000000002 RSI: 0000559eb4076e30 RDI: 0000000000000001
> > [29357.496630] RBP: 0000559eb4076e30 R08: 0000000000000000 R09: 0000559eb4076e30
> > [29357.503775] R10: 0000000000000077 R11: 0000000000000246 R12: 0000000000000002
> > [29357.510947] R13: 00007f4f2001b780 R14: 00007f4f20017600 R15: 00007f4f20016a00
> > [29357.518174]  </TASK>
> > [29357.520539] kobject: kobject_add_internal failed for tile0 with -EEXIST, don't try to register things with the same name in the same directory.
> >
> > -Michał
> >
> >> +	if (pci_dev_msi_enabled(pdev))
> >> +		pci_free_irq_vectors(pdev);
> >> +
> >> +	devm_drm_dev_release_action(&xe->drm);
> >> +	pci_disable_device(pdev);
> >> +
> >> +	/*
> >> +	 * if this fails the driver might be in a stale state, only option is
> >> +	 * to unbind and rebind
> >> +	 */
> >> +	xe_pci_probe(pdev, ent);
> >> +}
> >> diff --git a/drivers/gpu/drm/xe/xe_pci_err.h b/drivers/gpu/drm/xe/xe_pci_err.h
> >> new file mode 100644
> >> index 000000000000..95a4c8ce9cf1
> >> --- /dev/null
> >> +++ b/drivers/gpu/drm/xe/xe_pci_err.h
> >> @@ -0,0 +1,13 @@
> >> +/* SPDX-License-Identifier: MIT */
> >> +/*
> >> + * Copyright © 2024 Intel Corporation
> >> + */
> >> +
> >> +#ifndef _XE_PCI_ERR_H_
> >> +#define _XE_PCI_ERR_H_
> >> +
> >> +struct pci_dev;
> >> +
> >> +void xe_pci_reset_prepare(struct pci_dev *pdev);
> >> +void xe_pci_reset_done(struct pci_dev *pdev);
> >> +#endif
> >> -- 
> >> 2.25.1
> >>