[v12 11/11] drm/amdgpu: Implement user queue reset functionality
Alex Deucher
alexdeucher at gmail.com
Thu Aug 28 21:01:26 UTC 2025
On Thu, Aug 28, 2025 at 4:54 AM Zhang, Jesse(Jie) <Jesse.Zhang at amd.com> wrote:
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Zhang, Jesse(Jie)
> Sent: Thursday, August 28, 2025 4:16 PM
> To: Alex Deucher <alexdeucher at gmail.com>
> Cc: amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
> Subject: RE: [v12 11/11] drm/amdgpu: Implement user queue reset functionality
>
> -----Original Message-----
> From: Alex Deucher <alexdeucher at gmail.com>
> Sent: Tuesday, August 26, 2025 10:49 PM
> To: Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>
> Cc: amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
> Subject: Re: [v12 11/11] drm/amdgpu: Implement user queue reset functionality
>
> On Mon, Aug 25, 2025 at 10:43 PM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
> >
> > From: Alex Deucher <alexander.deucher at amd.com>
> >
> > This patch adds robust reset handling for user queues (userq) to
> > improve recovery from queue failures. The key components include:
> >
> > 1. Queue detection and reset logic:
> > - amdgpu_userq_detect_and_reset_queues() identifies failed queues
> > - Per-IP detect_and_reset callbacks for targeted recovery
> > - Falls back to full GPU reset when needed
> >
> > 2. Reset infrastructure:
> > - Adds userq_reset_work workqueue for async reset handling
> > - Implements pre/post reset handlers for queue state management
> > - Integrates with existing GPU reset framework
> >
> > 3. Error handling improvements:
> > - Enhanced state tracking with HUNG state
> > - Automatic reset triggering on critical failures
> > - VRAM loss handling during recovery
> >
> > 4. Integration points:
> > - Added to device init/reset paths
> > - Called during queue destroy, suspend, and isolation events
> > - Handles both individual queue and full GPU resets
> >
> > The reset functionality works with both compute and graphics queues,
> > providing better resilience against queue failures while minimizing
> > disruption to unaffected queues.
> >
> > v2: Separate SDMA and GFX/COMPUTE queue reset handling
> >
> > Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> > Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
> > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 215 +++++++++++++++++++--
> > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
> > 4 files changed, 216 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index ef3af170dda4..9db05cdc7304 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1302,6 +1302,7 @@ struct amdgpu_device {
> > struct list_head userq_mgr_list;
> > struct mutex userq_mutex;
> > bool userq_halt_for_enforce_isolation;
> > + struct work_struct userq_reset_work;
> > };
> >
> > static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 3757634613c3..1dc88b0055dd 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> > }
> >
> > INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> > + INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
> >
> > adev->gfx.gfx_off_req_count = 1;
> > adev->gfx.gfx_off_residency = 0;
> > @@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
> > if (r)
> > goto out;
> >
> > + r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> > + if (r)
> > + goto out;
> > +
> >
> > drm_client_dev_resume(adev_to_drm(tmp_adev), false);
> >
> > /*
> > @@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
> > if (!amdgpu_sriov_vf(adev))
> > cancel_work(&adev->reset_work);
> > #endif
> > + cancel_work(&adev->userq_reset_work);
> >
> > if (adev->kfd.dev)
> > cancel_work(&adev->kfd.reset_work);
> > @@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
> > amdgpu_device_ip_need_full_reset(tmp_adev))
> > amdgpu_ras_suspend(tmp_adev);
> >
> > + amdgpu_userq_pre_reset(tmp_adev);
> > +
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = tmp_adev->rings[i];
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > index decedf8057ac..996c7397a57f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > @@ -25,8 +25,10 @@
> > #include <drm/drm_auth.h>
> > #include <drm/drm_exec.h>
> > #include <linux/pm_runtime.h>
> > +#include <drm/drm_drv.h>
> >
> > #include "amdgpu.h"
> > +#include "amdgpu_reset.h"
> > #include "amdgpu_vm.h"
> > #include "amdgpu_userq.h"
> > #include "amdgpu_userq_fence.h"
> > @@ -44,6 +46,102 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
> > return userq_ip_mask;
> > }
> >
> > +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> > +{
> > + if (amdgpu_device_should_recover_gpu(adev)) {
> > + amdgpu_reset_domain_schedule(adev->reset_domain,
> > + &adev->userq_reset_work);
> > + /* Wait for the reset job to complete */
> > + flush_work(&adev->userq_reset_work);
> > + }
> > +}
> > +
> > +static int
> > +amdgpu_userq_detect_and_reset_sdma(struct amdgpu_userq_mgr *uq_mgr)
> > +{
> > + struct amdgpu_device *adev = uq_mgr->adev;
> > + const struct amdgpu_userq_funcs *userq_sdma_funcs =
> > + adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
> > + struct amdgpu_usermode_queue *userq;
> > + bool has_sdma = false;
> > + bool gpu_reset = false;
> > + int id, r = 0;
> > +
> > + /* Detect if SDMA queues are present */
> > + idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
> > + if (userq->queue_type == AMDGPU_RING_TYPE_SDMA) {
> > + has_sdma = true;
> > + break;
> > + }
> > + }
> > +
> > + if (has_sdma && userq_sdma_funcs && userq_sdma_funcs->detect_and_reset) {
> > + r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
> > + if (r)
> > + gpu_reset = true;
> > + }
> > +
> > + if (gpu_reset)
> > + amdgpu_userq_gpu_reset(adev);
> > +
> > + return r;
> > +}
> > +
> > +static int
> > +amdgpu_userq_detect_and_reset_gfx_compute(struct amdgpu_userq_mgr *uq_mgr)
> > +{
> > + struct amdgpu_device *adev = uq_mgr->adev;
> > + const struct amdgpu_userq_funcs *userq_gfx_funcs =
> > + adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
> > + const struct amdgpu_userq_funcs *userq_compute_funcs =
> > + adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
> > + struct amdgpu_usermode_queue *userq;
> > + bool has_gfx = false, has_compute = false;
> > + bool gpu_reset = false;
> > + int id, r = 0;
> > +
> > + /* Detect which types of queues are present (excluding SDMA) */
> > + idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
> > + switch (userq->queue_type) {
> > + case AMDGPU_RING_TYPE_GFX:
> > + has_gfx = true;
> > + break;
> > + case AMDGPU_RING_TYPE_COMPUTE:
> > + has_compute = true;
> > + break;
> > + default:
> > + break;
> > + }
> > + }
> > +
> > + if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> > + dev_err(adev->dev, "userq reset disabled by debug mask\n");
> > + } else if (amdgpu_gpu_recovery) {
> > + if (has_compute && userq_compute_funcs->detect_and_reset) {
> > + r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
> > + if (r) {
> > + gpu_reset = true;
> > + goto gpu_reset;
> > + }
> > + }
> > +
> > + if (has_gfx && userq_gfx_funcs->detect_and_reset) {
> > + r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
> > + if (r) {
> > + gpu_reset = true;
> > + goto gpu_reset;
> > + }
> > + }
> > + }
> > +
> > +gpu_reset:
> > + if (gpu_reset)
> > + amdgpu_userq_gpu_reset(adev);
> > +
> > + return r;
> > +}
>
> > Why split these into two functions?  I think it would be more efficient
> > to have a single function (amdgpu_userq_detect_and_reset()) that works
> > at the device level: check what types of user queues are present, call
> > detect_and_reset() for each queue type that is present, and then update
> > the queue state of all queues that are hung.  Then just call this
> > function in all the places where we call preempt or unmap.  E.g.,
> >
> > amdgpu_userq_detect_and_reset()
> > idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> >         /* evict, unmap, ... */
> > }
> >
> > We probably also need some way to determine whether the queue reset was
> > successful or not, because we ultimately need to decide when to do a
> > full adapter reset.
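> >
> > A fleshed-out sketch of that device-level helper might look like this
> > (the name, the array bound, and the locking are illustrative
> > assumptions, not an existing API):
> >
> > static int amdgpu_userq_detect_and_reset(struct amdgpu_userq_mgr *uq_mgr)
> > {
> >         struct amdgpu_device *adev = uq_mgr->adev;
> >         const struct amdgpu_userq_funcs *funcs;
> >         struct amdgpu_usermode_queue *queue;
> >         bool present[AMDGPU_HW_IP_NUM] = {};
> >         int queue_id, i, r = 0;
> >
> >         /* pass 1: note which user queue types are in use */
> >         idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id)
> >                 present[queue->queue_type] = true;
> >
> >         /* pass 2: run the per-IP hook for each type that is present */
> >         for (i = 0; i < AMDGPU_HW_IP_NUM; i++) {
> >                 funcs = adev->userq_funcs[i];
> >                 if (present[i] && funcs && funcs->detect_and_reset) {
> >                         r = funcs->detect_and_reset(adev, i);
> >                         if (r)
> >                                 break;  /* caller escalates to adapter reset */
> >                 }
> >         }
> >
> >         /* pass 3 (not shown): mark the queues the hooks flagged as hung,
> >          * i.e. queue->state = AMDGPU_USERQ_STATE_HUNG
> >          */
> >         return r;
> > }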
>
> Thanks, Alex.
> You are right.  I will revert the changes and call it everywhere we call
> preempt and unmap, including when preempt/unmap fails.
How about we keep a per-device userq counter for each user queue type?
It would get incremented when a queue is created and decremented when
it's destroyed.  Then we can just check the counters and call
detect_and_reset() for all queue types where the counter is non-zero,
and update the state of any affected queues.  Take all the locks, do
whatever preempt/unmap is requested, and update the queue state.  Then
afterwards, call detect_and_reset() again with all the locks still held.
At that point we can do a full adapter reset if necessary.  I'm not
clear on how we determine whether we need one or not.
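
Roughly, the bookkeeping could look like this (the field and helper
names are illustrative assumptions, not an existing API):

/* assumed new field in struct amdgpu_device:
 *     atomic_t userq_count[AMDGPU_HW_IP_NUM];
 */

static void amdgpu_userq_count_get(struct amdgpu_device *adev, int ip_type)
{
        atomic_inc(&adev->userq_count[ip_type]);        /* on queue create */
}

static void amdgpu_userq_count_put(struct amdgpu_device *adev, int ip_type)
{
        atomic_dec(&adev->userq_count[ip_type]);        /* on queue destroy */
}

The detection pass would then only need to test
atomic_read(&adev->userq_count[i]) for each queue type instead of
walking every queue manager's IDR.
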
Alex
>
> Jesse
>
> Hi Alex,
>
> For SDMA, preemption on GFX12 always succeeds even if the SDMA queue is pending.  Therefore, we must detect the hung queues before preempting/unmapping.
>
> For compute/GFX queues, the API does not detect hung queues before preempting/unmapping.
> However, if a GFX/compute queue is hung, preemption will fail.  After preemption/unmapping fails, we call the detection and reset API, and MES resets the unmapped queue.
>
> For this reason, I've split these into two functions.
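>
> The resulting control flow differs like this (an illustrative sketch
> using the helper names from the patch, just to show the shape of the
> two paths):
>
> /* SDMA: preemption reports success even for a hung queue, so run
>  * detection first.
>  */
> amdgpu_userq_detect_and_reset_sdma(uq_mgr);
> r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
>
> /* GFX/compute: preempting a hung queue fails, so detect on failure
>  * and let MES reset the unmapped queue.
>  */
> r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
> if (r)
>         amdgpu_userq_detect_and_reset_gfx_compute(uq_mgr);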
>
> Thanks
> Jesse
>
>
> Alex
>
> > +
> > static int
> > amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> > struct amdgpu_usermode_queue *queue)
> > @@ -56,6 +154,7 @@ amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> > if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> > r = userq_funcs->preempt(uq_mgr, queue);
> > if (r) {
> > + amdgpu_userq_detect_and_reset_gfx_compute(uq_mgr);
> > queue->state = AMDGPU_USERQ_STATE_HUNG;
> > } else {
> > queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
> > @@ -72,17 +171,21 @@ amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
> > struct amdgpu_device *adev = uq_mgr->adev;
> > const struct amdgpu_userq_funcs *userq_funcs =
> > adev->userq_funcs[queue->queue_type];
> > + bool gpu_reset = false;
> > int r = 0;
> >
> > if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
> > r = userq_funcs->restore(uq_mgr, queue);
> > if (r) {
> > queue->state = AMDGPU_USERQ_STATE_HUNG;
> > + gpu_reset = true;
> > } else {
> > queue->state = AMDGPU_USERQ_STATE_MAPPED;
> > }
> > }
> >
> > + if (gpu_reset)
> > + amdgpu_userq_gpu_reset(adev);
> > return r;
> > }
> >
> > @@ -98,11 +201,14 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
> > if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> > (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> > r = userq_funcs->unmap(uq_mgr, queue);
> > - if (r)
> > + if (r) {
> > + amdgpu_userq_detect_and_reset_gfx_compute(uq_mgr);
> > queue->state = AMDGPU_USERQ_STATE_HUNG;
> > - else
> > + } else {
> > queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> > + }
> > }
> > +
> > return r;
> > }
> >
> > @@ -113,16 +219,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
> > struct amdgpu_device *adev = uq_mgr->adev;
> > const struct amdgpu_userq_funcs *userq_funcs =
> > adev->userq_funcs[queue->queue_type];
> > + bool gpu_reset = false;
> > int r = 0;
> >
> > if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
> > r = userq_funcs->map(uq_mgr, queue);
> > if (r) {
> > queue->state = AMDGPU_USERQ_STATE_HUNG;
> > + gpu_reset = true;
> > } else {
> > queue->state = AMDGPU_USERQ_STATE_MAPPED;
> > }
> > }
> > +
> > + if (gpu_reset)
> > + amdgpu_userq_gpu_reset(adev);
> > +
> > return r;
> > }
> >
> > @@ -361,6 +473,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
> > amdgpu_bo_unreserve(queue->db_obj.obj);
> > }
> > amdgpu_bo_unref(&queue->db_obj.obj);
> > + amdgpu_userq_detect_and_reset_sdma(uq_mgr);
> > r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> > amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
> > mutex_unlock(&uq_mgr->userq_mutex);
> > @@ -734,6 +847,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> > int queue_id;
> > int ret = 0, r;
> >
> > + amdgpu_userq_detect_and_reset_sdma(uq_mgr);
> > /* Try to unmap all the queues in this process ctx */
> > idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> > r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
> > @@ -746,6 +860,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> > return ret;
> > }
> >
> > +void amdgpu_userq_reset_work(struct work_struct *work)
> > +{
> > + struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> > + userq_reset_work);
> > + struct amdgpu_reset_context reset_context;
> > +
> > + memset(&reset_context, 0, sizeof(reset_context));
> > +
> > + reset_context.method = AMD_RESET_METHOD_NONE;
> > + reset_context.reset_req_dev = adev;
> > + reset_context.src = AMDGPU_RESET_SRC_USERQ;
> > + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> > + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> > +
> > + amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> > +}
> > +
> > static int
> > amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
> > {
> > @@ -772,22 +903,19 @@ void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
> > struct amdgpu_eviction_fence *ev_fence)
> > {
> > - int ret;
> > struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> > struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> > + struct amdgpu_device *adev = uq_mgr->adev;
> > + int ret;
> >
> > /* Wait for any pending userqueue fence work to finish */
> > ret = amdgpu_userq_wait_for_signal(uq_mgr);
> > - if (ret) {
> > - drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> > - return;
> > - }
> > + if (ret)
> > + dev_err(adev->dev, "Not evicting userqueue, timeout
> > + waiting for work\n");
> >
> > ret = amdgpu_userq_evict_all(uq_mgr);
> > - if (ret) {
> > - drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> > - return;
> > - }
> > + if (ret)
> > + dev_err(adev->dev, "Failed to evict userqueue\n");
> >
> > /* Signal current eviction fence */
> > amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> > @@ -828,6 +956,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> >
> > mutex_lock(&adev->userq_mutex);
> > mutex_lock(&userq_mgr->userq_mutex);
> > + amdgpu_userq_detect_and_reset_sdma(userq_mgr);
> > idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> > amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
> > amdgpu_userq_unmap_helper(userq_mgr, queue);
> > @@ -861,10 +990,13 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
> > list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> > cancel_delayed_work_sync(&uqm->resume_work);
> > mutex_lock(&uqm->userq_mutex);
> > + amdgpu_userq_detect_and_reset_sdma(uqm);
> > idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> > r = amdgpu_userq_unmap_helper(uqm, queue);
> > - if (r)
> > + if (r) {
> > + amdgpu_userq_detect_and_reset_gfx_compute(uqm);
> > ret = r;
> > + }
> > }
> > mutex_unlock(&uqm->userq_mutex);
> > }
> > @@ -917,13 +1049,16 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> > list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> > cancel_delayed_work_sync(&uqm->resume_work);
> > mutex_lock(&uqm->userq_mutex);
> > + amdgpu_userq_detect_and_reset_sdma(uqm);
> > idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> > if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> > (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> > (queue->xcp_id == idx)) {
> > r = amdgpu_userq_unmap_helper(uqm, queue);
> > - if (r)
> > + if (r) {
> > + amdgpu_userq_detect_and_reset_gfx_compute(uqm);
> > ret = r;
> > + }
> > }
> > }
> > mutex_unlock(&uqm->userq_mutex);
> > @@ -965,3 +1100,57 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> > mutex_unlock(&adev->userq_mutex);
> > return ret;
> > }
> > +
> > +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> > +{
> > + const struct amdgpu_userq_funcs *userq_funcs;
> > + struct amdgpu_usermode_queue *queue;
> > + struct amdgpu_userq_mgr *uqm, *tmp;
> > + int queue_id;
> > +
> > + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> > + cancel_delayed_work_sync(&uqm->resume_work);
> > + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> > + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> > + amdgpu_userq_wait_for_last_fence(uqm, queue);
> > + userq_funcs = adev->userq_funcs[queue->queue_type];
> > + userq_funcs->unmap(uqm, queue);
> > + /* just mark all queues as hung at this point.
> > + * if unmap succeeds, we could map again
> > + * in amdgpu_userq_post_reset() if vram is not lost
> > + */
> > + queue->state = AMDGPU_USERQ_STATE_HUNG;
> > + amdgpu_userq_fence_driver_force_completion(queue);
> > + }
> > + }
> > + }
> > +}
> > +
> > +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> > +{
> > + /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> > + * at this point, we should be able to map it again
> > + * and continue if vram is not lost.
> > + */
> > + struct amdgpu_userq_mgr *uqm;
> > + struct amdgpu_usermode_queue *queue;
> > + const struct amdgpu_userq_funcs *userq_funcs;
> > + int queue_id, r = 0;
> > +
> > + list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
> > + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> > + if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> > + userq_funcs = adev->userq_funcs[queue->queue_type];
> > + /* re-map queue */
> > + r = userq_funcs->map(uqm, queue);
> > + if (r) {
> > + dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
> > + continue;
> > + }
> > + queue->state = AMDGPU_USERQ_STATE_MAPPED;
> > + }
> > + }
> > + }
> > +
> > + return r;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > index 9fa0d1a88d71..e68bb144b26f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > @@ -138,4 +138,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> > int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> > u32 idx);
> >
> > +void amdgpu_userq_reset_work(struct work_struct *work);
> > +
> > +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> > +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> > +
> > #endif
> > --
> > 2.49.0
> >