[PATCH v6 4/9] drm/xe: Block reset while recovering from VF migration
Michał Winiarski
michal.winiarski at intel.com
Wed Jul 16 19:21:04 UTC 2025
On Fri, Jul 04, 2025 at 11:02:23PM +0200, Tomasz Lis wrote:
> Resetting GuC during recovery could interfere with the recovery
> process. Such reset might be also triggered without justification,
> due to migration taking time, rather than due to the workload not
> progressing.
>
> Doing GuC reset during the recovery would cause exit of RESFIX state,
> and therefore continuation of GuC work while fixups are still being
> applied. To avoid that, reset needs to be blocked during the recovery.
>
> This patch blocks the reset during recovery. Reset request in that
> time range will be stalled, and unblocked only after GuC goes out
> of RESFIX state.
>
> In case a reset procedure already started while the recovery is
> triggered, there isn't much we can do - we cannot wait for it to
> finish as it involves waiting for hardware, and we can't be sure
> at which exact point of the reset procedure the GPU got switched.
> Therefore, the rare cases where migration happens while reset is
> in progress, are still dangerous. Resets are not a part of the
> standard flow, and cause unfinished workloads - that will happen
> during the reset interrupted by migration as well, so it doesn't
> diverge that much from what normally happens during such resets.
>
> v2: Introduce a new atomic for reset blocking, as we cannot reuse
> `stopped` atomic (that could lead to losing a workload).
>
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Michal Winiarski <michal.winiarski at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 10 +++++++++
> drivers/gpu/drm/xe/xe_guc_submit.c | 36 ++++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_guc_submit.h | 3 +++
> drivers/gpu/drm/xe/xe_guc_types.h | 6 +++++
> drivers/gpu/drm/xe/xe_sriov_vf.c | 12 ++++++++--
> 5 files changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index d397df056e4c..f950f964d422 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -41,6 +41,7 @@
> #include "xe_gt_topology.h"
> #include "xe_guc_exec_queue_types.h"
> #include "xe_guc_pc.h"
> +#include "xe_guc_submit.h"
> #include "xe_hw_fence.h"
> #include "xe_hw_engine_class_sysfs.h"
> #include "xe_irq.h"
> @@ -806,6 +807,11 @@ static int do_gt_restart(struct xe_gt *gt)
> return 0;
> }
>
> +static int gt_wait_reset_unblock(struct xe_gt *gt)
> +{
> + return xe_guc_wait_reset_unblock(&gt->uc.guc);
> +}
> +
> static int gt_reset(struct xe_gt *gt)
> {
> unsigned int fw_ref;
> @@ -820,6 +826,10 @@ static int gt_reset(struct xe_gt *gt)
>
> xe_gt_info(gt, "reset started\n");
>
> + err = gt_wait_reset_unblock(gt);
> + if (!err)
> + xe_gt_warn(gt, "reset block failed to get lifted");
> +
> xe_pm_runtime_get(gt_to_xe(gt));
>
> if (xe_fault_inject_gt_reset()) {
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 87b1e114e2c6..566938a89c59 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1779,6 +1779,42 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> }
> }
>
> +/**
> + * xe_guc_submit_reset_block - Disallow reset calls on given GuC.
> + * @guc: the &xe_guc struct instance
> + */
> +int xe_guc_submit_reset_block(struct xe_guc *guc)
> +{
> + return atomic_fetch_or(1, &guc->submission_state.reset_blocked);
> +}
> +
> +/**
> + * xe_guc_submit_reset_unblock - Allow back reset calls on given GuC.
> + * @guc: the &xe_guc struct instance
> + */
> +void xe_guc_submit_reset_unblock(struct xe_guc *guc)
> +{
> + atomic_and(0, &guc->submission_state.reset_blocked);
> + /* we are going to wake workers, make sure they all see the value */
> + smp_wmb();
So, that's basically an atomic_set(&reset_blocked, 0)?
And the barrier usage looks suspicious (where's the read-side barrier),
so perhaps... atomic_set_release (and drop the smp_wmb)?
> + wake_up_all(&guc->ct.wq);
> +}
> +
> +static int guc_submit_reset_is_blocked(struct xe_guc *guc)
> +{
> + return atomic_read(&guc->submission_state.reset_blocked);
And atomic_read_acquire, to pair with the atomic_set_release? (and
fetch_or is fully ordered, so it also should be fine)
> +}
> +
> +/**
> + * xe_guc_wait_reset_unblock - Wait until reset blocking flag is lifted, or timeout.
> + * @guc: the &xe_guc struct instance
> + */
> +int xe_guc_wait_reset_unblock(struct xe_guc *guc)
> +{
> + return wait_event_timeout(guc->ct.wq,
> + !guc_submit_reset_is_blocked(guc), HZ * 5);
Define the timeout (or perhaps reuse some existing one).
With that - LGTM.
Reviewed-by: Michał Winiarski <michal.winiarski at intel.com>
Thanks,
-Michał
> +}
> +
> int xe_guc_submit_reset_prepare(struct xe_guc *guc)
> {
> int ret;
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
> index f1cf271492ae..45b978b047c9 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -20,6 +20,9 @@ void xe_guc_submit_stop(struct xe_guc *guc);
> int xe_guc_submit_start(struct xe_guc *guc);
> void xe_guc_submit_pause(struct xe_guc *guc);
> void xe_guc_submit_unpause(struct xe_guc *guc);
> +int xe_guc_submit_reset_block(struct xe_guc *guc);
> +void xe_guc_submit_reset_unblock(struct xe_guc *guc);
> +int xe_guc_wait_reset_unblock(struct xe_guc *guc);
> void xe_guc_submit_wedge(struct xe_guc *guc);
>
> int xe_guc_read_stopped(struct xe_guc *guc);
> diff --git a/drivers/gpu/drm/xe/xe_guc_types.h b/drivers/gpu/drm/xe/xe_guc_types.h
> index 1fde7614fcc5..c7b9642b41ba 100644
> --- a/drivers/gpu/drm/xe/xe_guc_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_types.h
> @@ -85,6 +85,12 @@ struct xe_guc {
> struct xarray exec_queue_lookup;
> /** @submission_state.stopped: submissions are stopped */
> atomic_t stopped;
> + /**
> + * @submission_state.reset_blocked: reset attempts are blocked;
> + * blocking reset in order to delay it may be required if running
> + * an operation which is sensitive to resets.
> + */
> + atomic_t reset_blocked;
> /** @submission_state.lock: protects submission state */
> struct mutex lock;
> /** @submission_state.enabled: submission is enabled */
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c
> index c66b17da1ce7..71d28b30de43 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
> @@ -163,9 +163,15 @@ static void vf_post_migration_shutdown(struct xe_device *xe)
> {
> struct xe_gt *gt;
> unsigned int id;
> + int ret = 0;
>
> - for_each_gt(gt, xe, id)
> + for_each_gt(gt, xe, id) {
> xe_guc_submit_pause(&gt->uc.guc);
> + ret |= xe_guc_submit_reset_block(&gt->uc.guc);
> + }
> +
> + if (ret)
> + drm_info(&xe->drm, "migration recovery encountered ongoing reset\n");
> }
>
> /**
> @@ -187,8 +193,10 @@ static void vf_post_migration_kickstart(struct xe_device *xe)
> */
> xe_irq_resume(xe);
>
> - for_each_gt(gt, xe, id)
> + for_each_gt(gt, xe, id) {
> + xe_guc_submit_reset_unblock(&gt->uc.guc);
> xe_guc_submit_unpause(&gt->uc.guc);
> + }
> }
>
> static bool gt_vf_post_migration_needed(struct xe_gt *gt)
> --
> 2.25.1
>
More information about the Intel-xe
mailing list