[PATCH 5/7] drm/amdgpu/userq: implement resets

Thu May 8 06:31:11 UTC 2025

[Public]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Zhang,
> Jesse(Jie)
> Sent: Wednesday, May 7, 2025 1:24 PM
> To: Deucher, Alexander <Alexander.Deucher at amd.com>; amd-
> gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian
> <Christian.Koenig at amd.com>; Khatri, Sunil <Sunil.Khatri at amd.com>
> Subject: RE: [PATCH 5/7] drm/amdgpu/userq: implement resets
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Alex
> Deucher
> Sent: Tuesday, May 6, 2025 11:59 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian
> <Christian.Koenig at amd.com>; Khatri, Sunil <Sunil.Khatri at amd.com>
> Subject: [PATCH 5/7] drm/amdgpu/userq: implement resets
>
> If map or unmap fails, or a user fence times out, attempt to reset the queue.  If
> that fails, schedule a GPU reset.
>
> Acked-by: Christian König <christian.koenig at amd.com>
> Reviewed-by: Sunil Khatri <sunil.khatri at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 123 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
>  4 files changed, 128 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index de57ab032cc70..ec49047e4cdd3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1302,6 +1302,7 @@ struct amdgpu_device {
>         struct list_head                userq_mgr_list;
>         struct mutex                    userq_mutex;
>         bool                            userq_halt_for_enforce_isolation;
> +       struct work_struct              userq_reset_work;
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff --
> git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8330e30f0cafe..8c738928091c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4425,6 +4425,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>         adev->gfx.gfx_off_req_count = 1;
>         adev->gfx.gfx_off_residency = 0; @@ -5782,6 +5783,10 @@ int
> amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                                 if (r)
>                                         goto out;
>
> +                               r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                               if (r)
> +                                       goto out;
> +
>                                 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                                 /*
> @@ -6004,6 +6009,7 @@ static inline void
> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>         if (!amdgpu_sriov_vf(adev))
>                 cancel_work(&adev->reset_work);  #endif
> +       cancel_work(&adev->userq_reset_work);
>
>         if (adev->kfd.dev)
>                 cancel_work(&adev->kfd.reset_work);
> @@ -6114,6 +6120,8 @@ static int amdgpu_device_halt_activities(struct
> amdgpu_device *adev,
>                       amdgpu_device_ip_need_full_reset(tmp_adev))
>                         amdgpu_ras_suspend(tmp_adev);
>
> +               amdgpu_userq_pre_reset(tmp_adev);
> +
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 394b036be1d9e..e09d0e5b9762a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -26,7 +26,10 @@
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
>
> +#include <drm/drm_drv.h>
> +
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> @@ -44,6 +47,44 @@ u32 amdgpu_userq_get_supported_ip_mask(struct
> amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev) {
> +
> +       if (amdgpu_device_should_recover_gpu(adev))
> +               amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                            &adev->userq_reset_work); }
> +
> +static bool
> +amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
> +                               struct amdgpu_usermode_queue *queue) {
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const struct amdgpu_userq_funcs *userq_funcs =
> +               adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
> +       int r;
> +
> +       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +               dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +       } else if (amdgpu_gpu_recovery && userq_funcs->reset) {
>   Do you need to check the queue's hang information (e.g., queue ID, pipe ID,
> ME/MEC ID) before resetting?
>    such as, by checking  regCP_HQD_ACTIVE or regCP_GFX_HQD_ACTIVE for
> each queue to filter the hang queue.

If the hang userq reset tries to use the MES_SCH_API_RESET method, then the MES firmware will be responsible for detecting the hang queue/pipe and vmid self rather than driver input.

> Thanks
> Jesse
> +               r = userq_funcs->reset(uq_mgr, queue);
> +               if (r) {
> +                       dev_err(adev->dev, "userq reset failed\n");
> +                       gpu_reset = true;
> +               } else {
> +                       dev_err(adev->dev, "userq reset succeeded\n");
> +                       atomic_inc(&adev->gpu_reset_counter);
> +                       amdgpu_userq_fence_driver_force_completion(queue);
> +                       drm_dev_wedged_event(adev_to_drm(adev),
> DRM_WEDGE_RECOVERY_NONE);
> +               }
> +       } else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
> +               gpu_reset = true;
> +       }
> +
> +       return gpu_reset;
> +}
> +
>  static int
>  amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>                           struct amdgpu_usermode_queue *queue) @@ -51,15 +92,22
> @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
> -               if (r)
> +               if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> -               else
> +                       gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr,
> queue);
> +               } else {
>                         queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +               }
>         }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
>         return r;
>  }
>
> @@ -70,16 +118,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr
> *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
>                 r = userq_funcs->map(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       gpu_reset =
> + amdgpu_userq_queue_reset_helper(uq_mgr, queue);
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
>         return r;
>  }
>
> @@ -699,6 +753,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr
> *uq_mgr)
>         return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work) {
> +       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                 userq_reset_work);
> +       struct amdgpu_reset_context reset_context;
> +
> +       memset(&reset_context, 0, sizeof(reset_context));
> +
> +       reset_context.method = AMD_RESET_METHOD_NONE;
> +       reset_context.reset_req_dev = adev;
> +       reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +       amdgpu_device_gpu_recover(adev, NULL, &reset_context); }
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)  { @@ -
> 725,22 +796,18 @@ void  amdgpu_userq_evict(struct amdgpu_userq_mgr
> *uq_mgr,
>                    struct amdgpu_eviction_fence *ev_fence)  {
> -       int ret;
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +       int ret;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -       if (ret) {
> +       if (ret)
>                 drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for
> work\n");
> -               return;
> -       }
>
>         ret = amdgpu_userq_evict_all(uq_mgr);
> -       if (ret) {
> +       if (ret)
>                 drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -               return;
> -       }
>
>         /* Signal current eviction fence */
>         amdgpu_eviction_fence_signal(evf_mgr, ev_fence); @@ -917,3 +984,41
> @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device
> *adev,
>         mutex_unlock(&adev->userq_mutex);
>         return ret;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev) {
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm, *tmp;
> +       int queue_id;
> +
> +       mutex_lock(&adev->userq_mutex);
> +       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +               cancel_delayed_work_sync(&uqm->resume_work);
> +               mutex_lock(&uqm->userq_mutex);
> +               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                       if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                               amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                               userq_funcs = adev->userq_funcs[queue->queue_type];
> +                               userq_funcs->unmap(uqm, queue);
> +                               /* just mark all queues as hung at this point.
> +                                * if unmap succeeds, we could map again
> +                                * in amdgpu_userq_post_reset() if vram is not lost
> +                                */
> +                               queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                               amdgpu_userq_fence_driver_force_completion(queue);
> +                       }
> +               }
> +               mutex_unlock(&uqm->userq_mutex);
> +       }
> +       mutex_unlock(&adev->userq_mutex); }
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +       /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +        * at this point, we should be able to map it again
> +        * and continue if vram is not lost.
> +        */
> +       return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 3cc0ad8919f42..9d62befcdb24f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -134,4 +134,9 @@ int
> amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device
> *adev,
>                                                    u32 idx);
>
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev); int
> +amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  #endif
> --
> 2.49.0