[PATCH v4 2/8] drm/xe/vf: Finish RESFIX by reset if CTB not enabled

Mon Jun 9 11:00:58 UTC 2025

On 06.06.2025 02:18, Tomasz Lis wrote:
> The RESFIX state should be achievable only when CTB communication is

I'm not sure this is a valid statement: we can't make assumptions about
the VF driver state, and CTB is enabled only while the driver is actively
submitting workloads, which might not be 100% of its lifetime.

> enabled. If CTB was disabled and we still got it, then either we're
> dealing with unclean initial state, or the driver is not currently
> functional. In these cases, exit the RESFIX state by reset.
> 
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Michal Winiarski <michal.winiarski at intel.com>
> Reviewed-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 11 +++++++++++
>  drivers/gpu/drm/xe/xe_sriov_vf.c    | 16 ++++++++++++++++
>  drivers/gpu/drm/xe/xe_sriov_vf.h    |  1 +
>  3 files changed, 28 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index 792523cfa6e6..8fa210c0ef1a 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -23,6 +23,7 @@
>  #include "xe_gt_sriov_vf.h"
>  #include "xe_gt_sriov_vf_types.h"
>  #include "xe_guc.h"
> +#include "xe_guc_ct.h"
>  #include "xe_guc_hxg_helpers.h"
>  #include "xe_guc_relay.h"
>  #include "xe_mmio.h"
> @@ -721,6 +722,16 @@ void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt)
>  
>  	xe_gt_assert(gt, IS_SRIOV_VF(xe));
>  
> +	if (!xe_guc_ct_enabled(&gt->uc.guc.ct)) {
> +		/*
> +		 * If driver initialization is running in parallel to this handler,
> +		 * ignore the migration which happened before the driver was loaded.

CT could be temporarily disabled during reset or suspend.
Shouldn't we attempt to do fixups in those cases?

> +		 * Force GuC to take the VF out of RESFIX state without any fixups.
> +		 */
> +		xe_sriov_vf_post_migration_reset_guc_state(xe);
> +		return;

If we exit early here, wouldn't that impact the logic in
vf_ready_to_recovery_on_all_gts(), which looks for bits in gt_flags?

> +	}
> +
>  	set_bit(gt->info.id, &xe->sriov.vf.migration.gt_flags);
>  	/*
>  	 * We need to be certain that if all flags were set, at least one
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c
> index 6526fe450e55..eff6c7b96f25 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
> @@ -147,6 +147,22 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
>  		xe_sriov_info(xe, "migration not supported by this module version\n");
>  }
>  
> +/**
> + * xe_sriov_vf_post_migration_reset_guc_state - Reset VF state in all GuCs.
> + * @xe: the &xe_device struct instance
> + *
> + * This function sends VF state reset to GuC, as a way of exiting RESFIX

"sends VF state reset to GuC"

Well, this function does much more than that: it calls
xe_gt_reset_async() on every GT, so the kernel-doc should say so.

> + * state if a proper post-migration recovery procedure has failed.
> + */
> +void xe_sriov_vf_post_migration_reset_guc_state(struct xe_device *xe)
> +{
> +	struct xe_gt *gt;
> +	unsigned int id;
> +
> +	for_each_gt(gt, xe, id)
> +		xe_gt_reset_async(gt);
> +}
> +
>  /**
>   * vf_post_migration_requery_guc - Re-query GuC for current VF provisioning.
>   * @xe: the &xe_device struct instance
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.h b/drivers/gpu/drm/xe/xe_sriov_vf.h
> index 7b8622cff2b7..ba846af34a13 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf.h
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf.h
> @@ -10,5 +10,6 @@ struct xe_device;
>  
>  void xe_sriov_vf_init_early(struct xe_device *xe);
>  void xe_sriov_vf_start_migration_recovery(struct xe_device *xe);
> +void xe_sriov_vf_post_migration_reset_guc_state(struct xe_device *xe);
>  
>  #endif