[PATCH v4 1/6] drm/xe/pf: Prepare to stop SR-IOV support prior GT reset

Piotr Piórkowski piotr.piorkowski at intel.com
Tue Jul 15 06:56:17 UTC 2025


Michal Wajdeczko <michal.wajdeczko at intel.com> wrote on pią [2025-lip-11 21:33:11 +0200]:
> As part of the resume or GT reset, the PF driver schedules work
> which is then used to complete restarting of the SR-IOV support,
> including resending to the GuC configurations of provisioned VFs.
> 
> However, in case of a short delay between those two actions, which
> could be seen by triggering a GT reset on the suspended device:
> 
>  $ echo 1 > /sys/kernel/debug/dri/0000:00:02.0/gt0/force_reset
> 
> this PF worker might still be busy, which leads to errors due to
> just stopped or disabled GuC CTB communication:
> 
>  [ ] xe 0000:00:02.0: [drm:xe_gt_resume [xe]] GT0: resumed
>  [ ] xe 0000:00:02.0: [drm] GT0: trying reset from force_reset_show [xe]
>  [ ] xe 0000:00:02.0: [drm] GT0: reset queued
>  [ ] xe 0000:00:02.0: [drm] GT0: reset started
>  [ ] xe 0000:00:02.0: [drm:guc_ct_change_state [xe]] GT0: GuC CT communication channel stopped
>  [ ] xe 0000:00:02.0: [drm:guc_ct_send_recv [xe]] GT0: H2G request 0x5503 canceled!
>  [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF1 12 config KLVs (-ECANCELED)
>  [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF1 configuration (-ECANCELED)
>  [ ] xe 0000:00:02.0: [drm:guc_ct_change_state [xe]] GT0: GuC CT communication channel disabled
>  [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF2 12 config KLVs (-ENODEV)
>  [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF2 configuration (-ENODEV)
>  [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push 2 of 2 VFs configurations
>  [ ] xe 0000:00:02.0: [drm:pf_worker_restart_func [xe]] GT0: PF: restart completed
> 
> While this VF reprovisioning will be successful during the next spin
> of the worker, to avoid those errors, make sure to cancel the restart
> worker if we are about to trigger the next reset.
> 
> Fixes: 411220808cee ("drm/xe/pf: Restart VFs provisioning after GT reset")
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt.c          |  3 +++
>  drivers/gpu/drm/xe/xe_gt_sriov_pf.c | 19 +++++++++++++++++++
>  drivers/gpu/drm/xe/xe_gt_sriov_pf.h |  5 +++++
>  3 files changed, 27 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index af03e19ef9be..58fe2c4168b0 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -835,6 +835,9 @@ static int gt_reset(struct xe_gt *gt)
>  		goto err_out;
>  	}
>  
> +	if (IS_SRIOV_PF(gt_to_xe(gt)))
> +		xe_gt_sriov_pf_stop_prepare(gt);
> +
>  	xe_uc_gucrc_disable(&gt->uc);
>  	xe_uc_stop_prepare(&gt->uc);
>  	xe_gt_pagefault_reset(gt);
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
> index c08efca6420e..35489fa81825 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
> @@ -172,6 +172,25 @@ void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid)
>  	pf_clear_vf_scratch_regs(gt, vfid);
>  }
>  
> +static void pf_cancel_restart(struct xe_gt *gt)
> +{
> +	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
> +
> +	if (cancel_work_sync(&gt->sriov.pf.workers.restart))
> +		xe_gt_sriov_dbg_verbose(gt, "pending restart canceled!\n");
> +}
> +
> +/**
> + * xe_gt_sriov_pf_stop_prepare() - Prepare to stop SR-IOV support.
> + * @gt: the &xe_gt
> + *
> + * This function can only be called on the PF.
> + */
> +void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt)
> +{
> +	pf_cancel_restart(gt);
> +}
> +
>  static void pf_restart(struct xe_gt *gt)
>  {
>  	struct xe_device *xe = gt_to_xe(gt);
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h
> index f474509411c0..e2b2ff8132dc 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h
> @@ -13,6 +13,7 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt);
>  int xe_gt_sriov_pf_init(struct xe_gt *gt);
>  void xe_gt_sriov_pf_init_hw(struct xe_gt *gt);
>  void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid);
> +void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt);
>  void xe_gt_sriov_pf_restart(struct xe_gt *gt);
>  #else
>  static inline int xe_gt_sriov_pf_init_early(struct xe_gt *gt)
> @@ -29,6 +30,10 @@ static inline void xe_gt_sriov_pf_init_hw(struct xe_gt *gt)
>  {
>  }
>  
> +static inline void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt)
> +{
> +}
> +
>  static inline void xe_gt_sriov_pf_restart(struct xe_gt *gt)
>  {
>  }

LGTM:
Reviewed-by: Piotr Piórkowski <piotr.piorkowski at intel.com>

> -- 
> 2.47.1
> 

-- 


More information about the Intel-xe mailing list