[PATCH v3 3/7] drm/xe/vf: Pause submissions during RESFIX fixups
K V P, Satyanarayana
satyanarayana.k.v.p at intel.com
Tue May 27 13:10:25 UTC 2025
Hi
> -----Original Message-----
> From: Intel-xe <intel-xe-bounces at lists.freedesktop.org> On Behalf Of Tomasz
> Lis
> Sent: Tuesday, May 20, 2025 4:49 AM
> To: intel-xe at lists.freedesktop.org
> Cc: Winiarski, Michal <michal.winiarski at intel.com>; Wajdeczko, Michal
> <Michal.Wajdeczko at intel.com>; Piorkowski, Piotr
> <piotr.piorkowski at intel.com>; Brost, Matthew <matthew.brost at intel.com>;
> De Marchi, Lucas <lucas.demarchi at intel.com>
> Subject: [PATCH v3 3/7] drm/xe/vf: Pause submissions during RESFIX fixups
>
> While applying post-migration fixups to VF, GuC will not respond
> to any commands. This means submissions have no way of finishing.
>
> To avoid acquiring additional resources and then stalling
> on hardware access, pause the submission work. This will
> decrease the chance of depleting resources, and speed up
> the recovery.
>
> v2: Commented xe_irq_resume() call
> v3: Typo fix
>
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gpu_scheduler.c | 13 +++++++++
> drivers/gpu/drm/xe/xe_gpu_scheduler.h | 1 +
> drivers/gpu/drm/xe/xe_guc_submit.c | 35 ++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_guc_submit.h | 2 ++
> drivers/gpu/drm/xe/xe_sriov_vf.c | 42 +++++++++++++++++++++++++++
> 5 files changed, 93 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> index 869b43a4151d..455ccaf17314 100644
> --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> @@ -101,6 +101,19 @@ void xe_sched_submission_stop(struct
> xe_gpu_scheduler *sched)
> cancel_work_sync(&sched->work_process_msg);
> }
>
> +/**
> + * xe_sched_submission_stop_async - Stop further runs of submission tasks
> on a scheduler.
> + * @sched: the &xe_gpu_scheduler struct instance
> + *
> + * This call disables further runs of scheduling work queue. It does not wait
> + * for any in-progress runs to finish, only makes sure no further runs happen
> + * afterwards.
> + */
> +void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched)
> +{
> + drm_sched_wqueue_stop(&sched->base);
> +}
> +
> void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched)
> {
> drm_sched_resume_timeout(&sched->base, sched->base.timeout);
> diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> index c250ea773491..d78b4e8203f9 100644
> --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> @@ -21,6 +21,7 @@ void xe_sched_fini(struct xe_gpu_scheduler *sched);
>
> void xe_sched_submission_start(struct xe_gpu_scheduler *sched);
> void xe_sched_submission_stop(struct xe_gpu_scheduler *sched);
> +void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched);
>
> void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched);
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 80f748baad3f..6f280333de13 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1811,6 +1811,19 @@ void xe_guc_submit_stop(struct xe_guc *guc)
>
> }
>
> +/**
> + * xe_guc_submit_pause - Stop further runs of submission tasks on given
> GuC.
> + * @guc: the &xe_guc struct instance whose scheduler is to be disabled
> + */
> +void xe_guc_submit_pause(struct xe_guc *guc)
> +{
> + struct xe_exec_queue *q;
> + unsigned long index;
> +
> + xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> + xe_sched_submission_stop_async(&q->guc->sched);
> +}
> +
> static void guc_exec_queue_start(struct xe_exec_queue *q)
> {
> struct xe_gpu_scheduler *sched = &q->guc->sched;
> @@ -1851,6 +1864,28 @@ int xe_guc_submit_start(struct xe_guc *guc)
> return 0;
> }
>
> +static void guc_exec_queue_unpause(struct xe_exec_queue *q)
> +{
> + struct xe_gpu_scheduler *sched = &q->guc->sched;
> +
> + xe_sched_submission_start(sched);
> +}
> +
> +/**
> + * xe_guc_submit_unpause - Allow further runs of submission tasks on given
> GuC.
> + * @guc: the &xe_guc struct instance whose scheduler is to be enabled
> + */
> +void xe_guc_submit_unpause(struct xe_guc *guc)
> +{
> + struct xe_exec_queue *q;
> + unsigned long index;
> +
> + xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> + guc_exec_queue_unpause(q);
> +
> + wake_up_all(&guc->ct.wq);
> +}
> +
> static struct xe_exec_queue *
> g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id)
> {
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h
> b/drivers/gpu/drm/xe/xe_guc_submit.h
> index 9b71a986c6ca..f1cf271492ae 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -18,6 +18,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
> void xe_guc_submit_reset_wait(struct xe_guc *guc);
> void xe_guc_submit_stop(struct xe_guc *guc);
> int xe_guc_submit_start(struct xe_guc *guc);
> +void xe_guc_submit_pause(struct xe_guc *guc);
> +void xe_guc_submit_unpause(struct xe_guc *guc);
> void xe_guc_submit_wedge(struct xe_guc *guc);
>
> int xe_guc_read_stopped(struct xe_guc *guc);
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c
> b/drivers/gpu/drm/xe/xe_sriov_vf.c
> index 099a395fbf59..fcd82a0fda48 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
> @@ -11,6 +11,8 @@
> #include "xe_gt_sriov_printk.h"
> #include "xe_gt_sriov_vf.h"
> #include "xe_guc_ct.h"
> +#include "xe_guc_submit.h"
> +#include "xe_irq.h"
> #include "xe_pm.h"
> #include "xe_sriov.h"
> #include "xe_sriov_printk.h"
> @@ -134,6 +136,44 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
> INIT_WORK(&xe->sriov.vf.migration.worker, migration_worker_func);
> }
>
> +/**
> + * vf_post_migration_shutdown - Stop the driver activities after VF migration.
> + * @xe: the &xe_device struct instance
> + *
> + * After this VM is migrated and assigned to a new VF, it is running on a new
> + * hardware, and therefore many hardware-dependent states and related
> structures
> + * require fixups. Without fixups, the hardware cannot do any work, and
> therefore
> + * all GPU pipelines are stalled.
> + * Stop some of kernel activities to make the fixup process faster.
> + */
> +static void vf_post_migration_shutdown(struct xe_device *xe)
> +{
> + struct xe_gt *gt;
> + unsigned int id;
> +
> + for_each_gt(gt, xe, id)
> + xe_guc_submit_pause(&gt->uc.guc);
> +}
> +
Since all GPU activities are stopped, no interrupts are expected from the HW. So, would there be
any issue if we suspended all interrupts from XE by calling xe_irq_suspend()?
I saw a comment from Michal Wajdeczko in rev-1 of this series, but do not see details here.
-Satya.
> +/**
> + * vf_post_migration_kickstart - Re-start the driver activities under new
> hardware.
> + * @xe: the &xe_device struct instance
> + *
> + * After we have finished with all post-migration fixups, restart the driver
> + * activities to continue feeding the GPU with workloads.
> + */
> +static void vf_post_migration_kickstart(struct xe_device *xe)
> +{
> + struct xe_gt *gt;
> + unsigned int id;
> +
> + /* make sure interrupts on the new HW are properly set */
> + xe_irq_resume(xe);
> +
> + for_each_gt(gt, xe, id)
> + xe_guc_submit_unpause(&gt->uc.guc);
> +}
> +
> /**
> * xe_sriov_vf_post_migration_reset_guc_state - Reset VF state in all GuCs.
> * @xe: the &xe_device struct instance
> @@ -247,6 +287,7 @@ static void vf_post_migration_recovery(struct
> xe_device *xe)
>
> drm_dbg(&xe->drm, "migration recovery in progress\n");
> xe_pm_runtime_get(xe);
> + vf_post_migration_shutdown(xe);
> err = vf_post_migration_requery_guc(xe);
> if (vf_post_migration_imminent(xe))
> goto defer;
> @@ -258,6 +299,7 @@ static void vf_post_migration_recovery(struct
> xe_device *xe)
> if (need_fixups)
> vf_post_migration_fixup_ctb(xe);
>
> + vf_post_migration_kickstart(xe);
> vf_post_migration_notify_resfix_done(xe);
> xe_pm_runtime_put(xe);
> drm_notice(&xe->drm, "migration recovery ended\n");
> --
> 2.25.1
More information about the Intel-xe
mailing list