[PATCH 05/14] drm/amdgpu/userq: implement resets

Fri May 30 22:18:01 UTC 2025

On Fri, May 30, 2025 at 5:04 AM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher at amd.com>
>
> If map or unmap fails, or a user fence times out, attempt to reset the queue.  If that fails, schedule a GPU reset.
>
> Acked-by: Christian König <christian.koenig at amd.com>
> Reviewed-by: Sunil Khatri <sunil.khatri at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 123 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
>  4 files changed, 128 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a5ccd0ada16a..7365558f47a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1303,6 +1303,7 @@ struct amdgpu_device {
>         struct list_head                userq_mgr_list;
>         struct mutex                    userq_mutex;
>         bool                            userq_halt_for_enforce_isolation;
> +       struct work_struct              userq_reset_work;
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 72e41781afb0..2c90a7d256e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4426,6 +4426,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>         adev->gfx.gfx_off_req_count = 1;
>         adev->gfx.gfx_off_residency = 0;
> @@ -5768,6 +5769,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                                 if (r)
>                                         goto out;
>
> +                               r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                               if (r)
> +                                       goto out;
> +
>                                 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                                 /*
> @@ -5990,6 +5995,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>         if (!amdgpu_sriov_vf(adev))
>                 cancel_work(&adev->reset_work);
>  #endif
> +       cancel_work(&adev->userq_reset_work);
>
>         if (adev->kfd.dev)
>                 cancel_work(&adev->kfd.reset_work);
> @@ -6100,6 +6106,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                       amdgpu_device_ip_need_full_reset(tmp_adev))
>                         amdgpu_ras_suspend(tmp_adev);
>
> +               amdgpu_userq_pre_reset(tmp_adev);
> +
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 295e7186e156..d4f807256383 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -26,7 +26,10 @@
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
>
> +#include <drm/drm_drv.h>
> +
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> @@ -44,6 +47,44 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +
> +       if (amdgpu_device_should_recover_gpu(adev))
> +               amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                            &adev->userq_reset_work);
> +}
> +
> +static bool
> +amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
> +                               struct amdgpu_usermode_queue *queue)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const struct amdgpu_userq_funcs *userq_funcs =
> +               adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
> +       int r;
> +
> +       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +               dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +       } else if (amdgpu_gpu_recovery && userq_funcs->reset) {
> +               r = userq_funcs->reset(uq_mgr, queue);
> +               if (r) {
> +                       dev_err(adev->dev, "userq reset failed\n");
> +                       gpu_reset = true;
> +               } else {
> +                       dev_err(adev->dev, "userq reset succeeded\n");
> +                       atomic_inc(&adev->gpu_reset_counter);
> +                       amdgpu_userq_fence_driver_force_completion(queue);
> +                       drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> +               }
> +       } else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
> +               gpu_reset = true;
> +       }
> +
> +       return gpu_reset;
> +}

After our discussions with the MES team, I think we should approach
this slightly differently.  I see two options:

1. Use the existing userq reset callback but rather than trying to use
the reset single queue API, use the detect and reset API instead.
2. Add a new higher level callback (e.g., mes vs. umsch, etc.) for
detect and reset and update the queue status for all hung queues.
Then call that new callback in all the places amdgpu_userq.c where we
call amdgpu_userq_unmap_helper().  E.g.,

    /* detect all the queues that are hung, reset, and update their
status to hung */
    amdgpu_userq_detect_and_reset_queues()
    idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
                /* only unmap the queues that are not already marked as hung */
                r = amdgpu_userq_unmap_helper(uq_mgr, queue);
               ...
    }

The second option has the advantage of only needing to call the MES
API once rather than per queue.

Alex

> +
>  static int
>  amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>                           struct amdgpu_usermode_queue *queue)
> @@ -51,15 +92,22 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
> -               if (r)
> +               if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> -               else
> +                       gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr, queue);
> +               } else {
>                         queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +               }
>         }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
>         return r;
>  }
>
> @@ -70,16 +118,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
>                 r = userq_funcs->map(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr, queue);
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
>         return r;
>  }
>
> @@ -703,6 +757,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                 userq_reset_work);
> +       struct amdgpu_reset_context reset_context;
> +
> +       memset(&reset_context, 0, sizeof(reset_context));
> +
> +       reset_context.method = AMD_RESET_METHOD_NONE;
> +       reset_context.reset_req_dev = adev;
> +       reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -729,22 +800,18 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                    struct amdgpu_eviction_fence *ev_fence)
>  {
> -       int ret;
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +       int ret;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -       if (ret) {
> +       if (ret)
>                 drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> -               return;
> -       }
>
>         ret = amdgpu_userq_evict_all(uq_mgr);
> -       if (ret) {
> +       if (ret)
>                 drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -               return;
> -       }
>
>         /* Signal current eviction fence */
>         amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -922,3 +989,41 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>         mutex_unlock(&adev->userq_mutex);
>         return ret;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm, *tmp;
> +       int queue_id;
> +
> +       mutex_lock(&adev->userq_mutex);
> +       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +               cancel_delayed_work_sync(&uqm->resume_work);
> +               mutex_lock(&uqm->userq_mutex);
> +               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                       if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                               amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                               userq_funcs = adev->userq_funcs[queue->queue_type];
> +                               userq_funcs->unmap(uqm, queue);
> +                               /* just mark all queues as hung at this point.
> +                                * if unmap succeeds, we could map again
> +                                * in amdgpu_userq_post_reset() if vram is not lost
> +                                */
> +                               queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                               amdgpu_userq_fence_driver_force_completion(queue);
> +                       }
> +               }
> +               mutex_unlock(&uqm->userq_mutex);
> +       }
> +       mutex_unlock(&adev->userq_mutex);
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +       /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +        * at this point, we should be able to map it again
> +        * and continue if vram is not lost.
> +        */
> +       return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 3cc0ad8919f4..9d62befcdb24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -134,4 +134,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                    u32 idx);
>
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  #endif
> --
> 2.49.0
>