[Intel-xe] [PATCH] drm/xe: implement driver initiated functional reset

Tue Oct 3 08:14:28 UTC 2023

On 9/27/2023 9:46 AM, Andrzej Hajda wrote:
> Driver initiated functional reset (FLR) is the highest level of reset

Just to be pedantic, FLR means "function-level" reset (as in PCI 
function), and not "functional" reset. Just making the distinction 
because the meaning is different.

> that we can trigger from within the driver. In contrast to PCI FLR it
> doesn't require re-enumeration of PCI BAR. It can be useful in case
> GT fails to reset. It is also the only way to trigger GSC reset from
> the driver and can be used in future addition of GSC support.
>
> Signed-off-by: Andrzej Hajda <andrzej.hajda at intel.com>
> ---
> Hi all,
>
> This is initial attempt to implement Driver-FLR in xe. Implementation
> is copied from i915. I've skipped comments, but I can re-add them if
> requested.
> In i915 Driver-FLR is performed on driver unload if GSC was used.
> In xe GSC is not yet supported, but I have added trigger to perform FLR
> in case GT fails to reset, I guess this could be proper use-case and
> of course this is prerequisite to GSC support.
>
> Since this is my 1st patch to xe please be merciful in comments :)
>
> Regards
> Andrzej
> ---
>   drivers/gpu/drm/xe/regs/xe_gt_regs.h |  6 +++++
>   drivers/gpu/drm/xe/xe_gt.c           |  2 ++
>   drivers/gpu/drm/xe/xe_gt_types.h     |  3 +++
>   drivers/gpu/drm/xe/xe_mmio.c         | 38 ++++++++++++++++++++++++++++
>   4 files changed, 49 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index e13fbbdf6929ea..769a8fd005931a 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -359,6 +359,12 @@
>   #define RCU_MODE				XE_REG(0x14800, XE_REG_OPTION_MASKED)
>   #define   RCU_MODE_CCS_ENABLE			REG_BIT(0)
>   
> +#define GU_CNTL					XE_REG(0x101010)
> +#define   LMEM_INIT				REG_BIT(7)

this is already defined in xe_regs.h

> +#define   DRIVERFLR				REG_BIT(31)
> +#define GU_DEBUG				XE_REG(0x101018)
> +#define   DRIVERFLR_STATUS			REG_BIT(31)
> +
>   #define FORCEWAKE_ACK_GT			XE_REG(0x130044)
>   #define   FORCEWAKE_KERNEL			BIT(0)
>   #define   FORCEWAKE_USER			BIT(1)
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 1aa44d4f9ac1c8..1e0b76000ed46b 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -604,6 +604,8 @@ static int gt_reset(struct xe_gt *gt)
>   	xe_uevent_gt_reset_failure(to_pci_dev(gt_to_xe(gt)->drm.dev),
>   				   gt_to_tile(gt)->id, gt->info.id);
>   
> +	gt->needs_flr_on_fini = true;
> +
>   	return err;
>   }
>   
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index d4310be3e1e7c8..dc194e9656abd5 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -347,6 +347,9 @@ struct xe_gt {
>   		/** @oob: bitmap with active OOB workaroudns */
>   		unsigned long *oob;
>   	} wa_active;
> +
> +	/** @needs_flr_on_fini: requests functional reset on fini */
> +	bool needs_flr_on_fini;

The FLR is a device-level reset, so it shouldn't be tracked at the 
GT-level (all GTs are reset when we do an FLR). In i915 we tracked the 
uncore at the GT level so we had to work with that, but in Xe IMO better 
to store the flag under xe->mmio.needs_flr_on_fini

>   };
>   
>   #endif
> diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
> index 3ccc0af4430be4..593ecc98cdfe8a 100644
> --- a/drivers/gpu/drm/xe/xe_mmio.c
> +++ b/drivers/gpu/drm/xe/xe_mmio.c
> @@ -4,6 +4,7 @@
>    */
>   
>   #include <linux/minmax.h>
> +#include <linux/units.h>
>   
>   #include "xe_mmio.h"
>   
> @@ -364,9 +365,46 @@ static void xe_mmio_probe_tiles(struct xe_device *xe)
>   	}
>   }
>   
> +static void driver_initiated_flr(struct xe_gt *gt)
> +{
> +	const unsigned int flr_timeout = 3 * MICRO; /* specs recommend a 3s wait */
> +	struct xe_device *xe = gt->tile->xe;
> +	int ret;
> +
> +	drm_dbg(&xe->drm, "Triggering Driver-FLR\n");
> +
> +	ret = xe_mmio_wait32(gt, GU_CNTL, DRIVERFLR, 0, flr_timeout, NULL, false);
> +	if (ret) {
> +		drm_err(&xe->drm, "Driver-FLR-prepare wait for ready failed! %d\n", ret);
> +		return;
> +	}
> +	xe_mmio_write32(gt, GU_DEBUG, DRIVERFLR_STATUS);
> +	xe_mmio_rmw32(gt, GU_CNTL, 0, DRIVERFLR);
> +	ret = xe_mmio_wait32(gt, GU_CNTL, DRIVERFLR, 0, flr_timeout, NULL, false);
> +	if (ret) {
> +		drm_err(&xe->drm, "Driver-FLR-teardown wait completion failed! %d\n", ret);
> +		return;
> +	}
> +	ret = xe_mmio_wait32(gt, GU_DEBUG, DRIVERFLR_STATUS, DRIVERFLR_STATUS,
> +			     flr_timeout, NULL, false);
> +	if (ret) {
> +		drm_err(&xe->drm, "Driver-FLR-reinit wait completion failed! %d\n", ret);
> +		return;
> +	}
> +
This block could do with some extra blank lines and some more comments 
to explain what each reg access does (you can copy those from i915).

> +	/* Clear sticky completion status */
> +	xe_mmio_write32(gt, GU_DEBUG, DRIVERFLR_STATUS);
> +}
> +
>   static void mmio_fini(struct drm_device *drm, void *arg)
>   {
>   	struct xe_device *xe = arg;
> +	struct xe_gt *gt;
> +	u8 id;
> +
> +	for_each_gt(gt, xe, id)
> +		if (gt->needs_flr_on_fini)
> +			driver_initiated_flr(gt);

As mentioned above, the FLR is a global reset so there is no need to do 
it multiple times if multiple GTs have failed and require it. Tracking 
with a global flag as suggested before should fix this as well.

Daniele

>   
>   	pci_iounmap(to_pci_dev(xe->drm.dev), xe->mmio.regs);
>   	if (xe->mem.vram.mapping)