[PATCH v3 3/7] drm/xe/vf: Pause submissions during RESFIX fixups

Tue May 27 13:10:25 UTC 2025

Hi
> -----Original Message-----
> From: Intel-xe <intel-xe-bounces at lists.freedesktop.org> On Behalf Of Tomasz
> Lis
> Sent: Tuesday, May 20, 2025 4:49 AM
> To: intel-xe at lists.freedesktop.org
> Cc: Winiarski, Michal <michal.winiarski at intel.com>; Wajdeczko, Michal
> <Michal.Wajdeczko at intel.com>; Piorkowski, Piotr
> <piotr.piorkowski at intel.com>; Brost, Matthew <matthew.brost at intel.com>;
> De Marchi, Lucas <lucas.demarchi at intel.com>
> Subject: [PATCH v3 3/7] drm/xe/vf: Pause submissions during RESFIX fixups
> 
> While applying post-migration fixups to VF, GuC will not respond
> to any commands. This means submissions have no way of finishing.
> 
> To avoid acquiring additional resources and then stalling
> on hardware access, pause the submission work. This will
> decrease the chance of depleting resources, and speed up
> the recovery.
> 
> v2: Commented xe_irq_resume() call
> v3: Typo fix
> 
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gpu_scheduler.c | 13 +++++++++
>  drivers/gpu/drm/xe/xe_gpu_scheduler.h |  1 +
>  drivers/gpu/drm/xe/xe_guc_submit.c    | 35 ++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_guc_submit.h    |  2 ++
>  drivers/gpu/drm/xe/xe_sriov_vf.c      | 42 +++++++++++++++++++++++++++
>  5 files changed, 93 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> index 869b43a4151d..455ccaf17314 100644
> --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> @@ -101,6 +101,19 @@ void xe_sched_submission_stop(struct
> xe_gpu_scheduler *sched)
>  	cancel_work_sync(&sched->work_process_msg);
>  }
> 
> +/**
> + * xe_sched_submission_stop_async - Stop further runs of submission tasks
> on a scheduler.
> + * @sched: the &xe_gpu_scheduler struct instance
> + *
> + * This call disables further runs of scheduling work queue. It does not wait
> + * for any in-progress runs to finish, only makes sure no further runs happen
> + * afterwards.
> + */
> +void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched)
> +{
> +	drm_sched_wqueue_stop(&sched->base);
> +}
> +
>  void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched)
>  {
>  	drm_sched_resume_timeout(&sched->base, sched->base.timeout);
> diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> index c250ea773491..d78b4e8203f9 100644
> --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> @@ -21,6 +21,7 @@ void xe_sched_fini(struct xe_gpu_scheduler *sched);
> 
>  void xe_sched_submission_start(struct xe_gpu_scheduler *sched);
>  void xe_sched_submission_stop(struct xe_gpu_scheduler *sched);
> +void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched);
> 
>  void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched);
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 80f748baad3f..6f280333de13 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1811,6 +1811,19 @@ void xe_guc_submit_stop(struct xe_guc *guc)
> 
>  }
> 
> +/**
> + * xe_guc_submit_pause - Stop further runs of submission tasks on given
> GuC.
> + * @guc: the &xe_guc struct instance whose scheduler is to be disabled
> + */
> +void xe_guc_submit_pause(struct xe_guc *guc)
> +{
> +	struct xe_exec_queue *q;
> +	unsigned long index;
> +
> +	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> +		xe_sched_submission_stop_async(&q->guc->sched);
> +}
> +
>  static void guc_exec_queue_start(struct xe_exec_queue *q)
>  {
>  	struct xe_gpu_scheduler *sched = &q->guc->sched;
> @@ -1851,6 +1864,28 @@ int xe_guc_submit_start(struct xe_guc *guc)
>  	return 0;
>  }
> 
> +static void guc_exec_queue_unpause(struct xe_exec_queue *q)
> +{
> +	struct xe_gpu_scheduler *sched = &q->guc->sched;
> +
> +	xe_sched_submission_start(sched);
> +}
> +
> +/**
> + * xe_guc_submit_unpause - Allow further runs of submission tasks on given
> GuC.
> + * @guc: the &xe_guc struct instance whose scheduler is to be enabled
> + */
> +void xe_guc_submit_unpause(struct xe_guc *guc)
> +{
> +	struct xe_exec_queue *q;
> +	unsigned long index;
> +
> +	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> +		guc_exec_queue_unpause(q);
> +
> +	wake_up_all(&guc->ct.wq);
> +}
> +
>  static struct xe_exec_queue *
>  g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h
> b/drivers/gpu/drm/xe/xe_guc_submit.h
> index 9b71a986c6ca..f1cf271492ae 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -18,6 +18,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
>  void xe_guc_submit_reset_wait(struct xe_guc *guc);
>  void xe_guc_submit_stop(struct xe_guc *guc);
>  int xe_guc_submit_start(struct xe_guc *guc);
> +void xe_guc_submit_pause(struct xe_guc *guc);
> +void xe_guc_submit_unpause(struct xe_guc *guc);
>  void xe_guc_submit_wedge(struct xe_guc *guc);
> 
>  int xe_guc_read_stopped(struct xe_guc *guc);
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c
> b/drivers/gpu/drm/xe/xe_sriov_vf.c
> index 099a395fbf59..fcd82a0fda48 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
> @@ -11,6 +11,8 @@
>  #include "xe_gt_sriov_printk.h"
>  #include "xe_gt_sriov_vf.h"
>  #include "xe_guc_ct.h"
> +#include "xe_guc_submit.h"
> +#include "xe_irq.h"
>  #include "xe_pm.h"
>  #include "xe_sriov.h"
>  #include "xe_sriov_printk.h"
> @@ -134,6 +136,44 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
>  	INIT_WORK(&xe->sriov.vf.migration.worker, migration_worker_func);
>  }
> 
> +/**
> + * vf_post_migration_shutdown - Stop the driver activities after VF migration.
> + * @xe: the &xe_device struct instance
> + *
> + * After this VM is migrated and assigned to a new VF, it is running on a new
> + * hardware, and therefore many hardware-dependent states and related
> structures
> + * require fixups. Without fixups, the hardware cannot do any work, and
> therefore
> + * all GPU pipelines are stalled.
> + * Stop some of kernel activities to make the fixup process faster.
> + */
> +static void vf_post_migration_shutdown(struct xe_device *xe)
> +{
> +	struct xe_gt *gt;
> +	unsigned int id;
> +
> +	for_each_gt(gt, xe, id)
> +		xe_guc_submit_pause(&gt->uc.guc);
> +}
> +
Since all GPU activities are stopped, no interrupts are expected from HW. So, is there an issue
If we suspend all interrupts from XE by calling xe_irq_suspend()?
I saw a comment from Michal Wajdeczko in rev-1 of this series, but do not see details here.
-Satya.
> +/**
> + * vf_post_migration_kickstart - Re-start the driver activities under new
> hardware.
> + * @xe: the &xe_device struct instance
> + *
> + * After we have finished with all post-migration fixups, restart the driver
> + * activities to continue feeding the GPU with workloads.
> + */
> +static void vf_post_migration_kickstart(struct xe_device *xe)
> +{
> +	struct xe_gt *gt;
> +	unsigned int id;
> +
> +	/* make sure interrupts on the new HW are properly set */
> +	xe_irq_resume(xe);
> +
> +	for_each_gt(gt, xe, id)
> +		xe_guc_submit_unpause(&gt->uc.guc);
> +}
> +
>  /**
>   * xe_sriov_vf_post_migration_reset_guc_state - Reset VF state in all GuCs.
>   * @xe: the &xe_device struct instance
> @@ -247,6 +287,7 @@ static void vf_post_migration_recovery(struct
> xe_device *xe)
> 
>  	drm_dbg(&xe->drm, "migration recovery in progress\n");
>  	xe_pm_runtime_get(xe);
> +	vf_post_migration_shutdown(xe);
>  	err = vf_post_migration_requery_guc(xe);
>  	if (vf_post_migration_imminent(xe))
>  		goto defer;
> @@ -258,6 +299,7 @@ static void vf_post_migration_recovery(struct
> xe_device *xe)
>  	if (need_fixups)
>  		vf_post_migration_fixup_ctb(xe);
> 
> +	vf_post_migration_kickstart(xe);
>  	vf_post_migration_notify_resfix_done(xe);
>  	xe_pm_runtime_put(xe);
>  	drm_notice(&xe->drm, "migration recovery ended\n");
> --
> 2.25.1