[v7 11/11] drm/amdgpu: Implement user queue reset handling and recovery
Alex Deucher
alexdeucher at gmail.com
Fri Aug 8 19:58:11 UTC 2025
On Tue, Aug 5, 2025 at 10:29 PM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher at amd.com>
>
> This patch introduces queue reset and GPU reset support for userq.
> It adds detection and recovery logic for gfx, compute, and SDMA queues when errors occur during queue operations (e.g., map/unmap failures).
> This enhances user queue resiliency and ensures graceful recovery in case of individual queue failure without immediately requiring full GPU reset.
>
I'd split this patch into two patches:
1. switch over to using preempt/restore for cases where it is
applicable. We still need map/unmap for initial userq setup and
removal and for system suspend.
2. implement reset
Alex
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 202 +++++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
> 4 files changed, 201 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index ef3af170dda4..9db05cdc7304 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1302,6 +1302,7 @@ struct amdgpu_device {
> struct list_head userq_mgr_list;
> struct mutex userq_mutex;
> bool userq_halt_for_enforce_isolation;
> + struct work_struct userq_reset_work;
> };
>
> static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3757634613c3..1dc88b0055dd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> }
>
> INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> + INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
> adev->gfx.gfx_off_req_count = 1;
> adev->gfx.gfx_off_residency = 0;
> @@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
> if (r)
> goto out;
>
> + r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> + if (r)
> + goto out;
> +
> drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
> /*
> @@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
> if (!amdgpu_sriov_vf(adev))
> cancel_work(&adev->reset_work);
> #endif
> + cancel_work(&adev->userq_reset_work);
>
> if (adev->kfd.dev)
> cancel_work(&adev->kfd.reset_work);
> @@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
> amdgpu_device_ip_need_full_reset(tmp_adev))
> amdgpu_ras_suspend(tmp_adev);
>
> + amdgpu_userq_pre_reset(tmp_adev);
> +
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 1496544cd1c2..edec881e6fa5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,9 @@
> #include <drm/drm_auth.h>
> #include <drm/drm_exec.h>
> #include <linux/pm_runtime.h>
> -
> +#include <drm/drm_drv.h>
> #include "amdgpu.h"
> +#include "amdgpu_reset.h"
> #include "amdgpu_vm.h"
> #include "amdgpu_userq.h"
> #include "amdgpu_userq_fence.h"
> @@ -44,6 +45,88 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
> return userq_ip_mask;
> }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +
> + if (amdgpu_device_should_recover_gpu(adev)) {
> + amdgpu_reset_domain_schedule(adev->reset_domain,
> + &adev->userq_reset_work);
> + /* Wait for the reset job to complete */
> + flush_work(&adev->userq_reset_work);
> + }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + const struct amdgpu_userq_funcs *userq_gfx_funcs =
> + adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
> + const struct amdgpu_userq_funcs *userq_compute_funcs =
> + adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
> + const struct amdgpu_userq_funcs *userq_sdma_funcs =
> + adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
> +
> + bool has_gfx = false, has_compute = false, has_sdma = false;
> + struct amdgpu_usermode_queue *userq;
> + bool gpu_reset = false;
> + int id, r = 0;
> +
> + if (idr_is_empty(&uq_mgr->userq_idr))
> + return false;
> +
> + /* Detect which types of queues are present */
> + idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
> + switch (userq->queue_type) {
> + case AMDGPU_RING_TYPE_GFX:
> + has_gfx = true;
> + break;
> + case AMDGPU_RING_TYPE_COMPUTE:
> + has_compute = true;
> + break;
> + case AMDGPU_RING_TYPE_SDMA:
> + has_sdma = true;
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> + dev_err(adev->dev, "userq reset disabled by debug mask\n");
> + } else if (amdgpu_gpu_recovery) {
> + if (has_compute && userq_compute_funcs->detect_and_reset) {
> + r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
> + if (r) {
> + gpu_reset = true;
> + goto gpu_reset;
> + }
> + }
> +
> + if (has_gfx && userq_gfx_funcs->detect_and_reset) {
> + r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
> + if (r) {
> + gpu_reset = true;
> + goto gpu_reset;
> + }
> + }
> +
> + if (has_sdma && userq_sdma_funcs && userq_sdma_funcs->detect_and_reset) {
> + r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
> + if (r) {
> + gpu_reset = true;
> + goto gpu_reset;
> + }
> + }
> + }
> +
> +gpu_reset:
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> + return r;
> +}
> +
> static int
> amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_usermode_queue *queue)
> @@ -99,15 +182,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_device *adev = uq_mgr->adev;
> const struct amdgpu_userq_funcs *userq_funcs =
> adev->userq_funcs[queue->queue_type];
> + bool gpu_reset = false;
> int r = 0;
>
> - if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> + (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> r = userq_funcs->unmap(uq_mgr, queue);
> - if (r)
> + if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> - else
> + gpu_reset = true;
> + } else {
> queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> + }
> }
> +
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> return r;
> }
>
> @@ -118,16 +209,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_device *adev = uq_mgr->adev;
> const struct amdgpu_userq_funcs *userq_funcs =
> adev->userq_funcs[queue->queue_type];
> + bool gpu_reset = false;
> int r = 0;
>
> if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
> r = userq_funcs->map(uq_mgr, queue);
> if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> + gpu_reset = true;
> } else {
> queue->state = AMDGPU_USERQ_STATE_MAPPED;
> }
> }
> +
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> return r;
> }
>
> @@ -366,6 +463,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
> amdgpu_bo_unreserve(queue->db_obj.obj);
> }
> amdgpu_bo_unref(&queue->db_obj.obj);
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
> mutex_unlock(&uq_mgr->userq_mutex);
> @@ -584,7 +682,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>
> /* Resume all the queues for this process */
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> - r = amdgpu_userq_map_helper(uq_mgr, queue);
> + r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
> if (r)
> ret = r;
> }
> @@ -741,7 +839,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>
> /* Try to unmap all the queues in this process ctx */
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> - r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> + r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
> if (r)
> ret = r;
> }
> @@ -751,6 +849,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> return ret;
> }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> + struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> + userq_reset_work);
> + struct amdgpu_reset_context reset_context;
> +
> + memset(&reset_context, 0, sizeof(reset_context));
> +
> + reset_context.method = AMD_RESET_METHOD_NONE;
> + reset_context.reset_req_dev = adev;
> + reset_context.src = AMDGPU_RESET_SRC_USERQ;
> + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> + amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
> static int
> amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
> {
> @@ -777,22 +892,19 @@ void
> amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_eviction_fence *ev_fence)
> {
> - int ret;
> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> + struct amdgpu_device *adev = uq_mgr->adev;
> + int ret;
>
> /* Wait for any pending userqueue fence work to finish */
> ret = amdgpu_userq_wait_for_signal(uq_mgr);
> - if (ret) {
> - drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> - return;
> - }
> + if (ret)
> + dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
> ret = amdgpu_userq_evict_all(uq_mgr);
> - if (ret) {
> - drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> - return;
> - }
> + if (ret)
> + dev_err(adev->dev, "Failed to evict userqueue\n");
>
> /* Signal current eviction fence */
> amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -833,6 +945,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>
> mutex_lock(&adev->userq_mutex);
> mutex_lock(&userq_mgr->userq_mutex);
> + amdgpu_userq_detect_and_reset_queues(userq_mgr);
> idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
> amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -866,6 +979,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> cancel_delayed_work_sync(&uqm->resume_work);
> mutex_lock(&uqm->userq_mutex);
> + amdgpu_userq_detect_and_reset_queues(uqm);
> idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> r = amdgpu_userq_unmap_helper(uqm, queue);
> if (r)
> @@ -922,6 +1036,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> cancel_delayed_work_sync(&uqm->resume_work);
> mutex_lock(&uqm->userq_mutex);
> + amdgpu_userq_detect_and_reset_queues(uqm);
> idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> @@ -970,3 +1085,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> mutex_unlock(&adev->userq_mutex);
> return ret;
> }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> + const struct amdgpu_userq_funcs *userq_funcs;
> + struct amdgpu_usermode_queue *queue;
> + struct amdgpu_userq_mgr *uqm, *tmp;
> + int queue_id;
> +
> + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> + cancel_delayed_work_sync(&uqm->resume_work);
> + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + amdgpu_userq_wait_for_last_fence(uqm, queue);
> + userq_funcs = adev->userq_funcs[queue->queue_type];
> + userq_funcs->unmap(uqm, queue);
> + /* just mark all queues as hung at this point.
> + * if unmap succeeds, we could map again
> + * in amdgpu_userq_post_reset() if vram is not lost
> + */
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + amdgpu_userq_fence_driver_force_completion(queue);
> + }
> + }
> + }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> + /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> + * at this point, we should be able to map it again
> + * and continue if vram is not lost.
> + */
> + struct amdgpu_userq_mgr *uqm;
> + struct amdgpu_usermode_queue *queue;
> + const struct amdgpu_userq_funcs *userq_funcs;
> + int queue_id, r = 0;
> +
> + list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
> + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> + if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> + userq_funcs = adev->userq_funcs[queue->queue_type];
> +
> + r = userq_funcs->map(uqm, queue); // Re-map queue
> + if (r) {
> + dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
> + continue;
> + }
> + queue->state = AMDGPU_USERQ_STATE_MAPPED;
> + }
> + }
> +
> + /* Restart resume work after reset */
> + //queue_delayed_work(system_wq, &uqm->resume_work, msecs_to_jiffies(100));
> + }
> +
> + return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 9fa0d1a88d71..e68bb144b26f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -138,4 +138,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> u32 idx);
>
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
> #endif
> --
> 2.49.0
>
More information about the amd-gfx
mailing list