[v6 11/12] drm/amdgpu: Add GPU reset handling for user mode queues
Alexander.Deucher at amd.com
Fri Aug 1 06:21:09 UTC 2025
From: Alex Deucher <alexander.deucher at amd.com>
Add GPU reset support for user mode queues (userq).
Detection and recovery logic is added for gfx, compute, and SDMA queues
when errors occur during queue operations (e.g., map/unmap failures).
Key changes include:
- New function `amdgpu_userq_detect_and_reset_queues()` to scan active queues and attempt recovery via queue-specific handlers.
- Integration of userq reset logic into suspend/resume, eviction, and reset paths.
- New pre-reset and post-reset hooks (`amdgpu_userq_pre_reset()` and `amdgpu_userq_post_reset()`) to clean up and reinitialize user queues during a full GPU reset.
- A new work_struct (`userq_reset_work`) is introduced and used in `amdgpu_userq_gpu_reset()` to trigger GPU recovery using the standard GPU reset framework.
These changes improve user queue resilience and allow graceful recovery from individual queue failures without immediately requiring a full GPU reset.
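During a full GPU reset the new hooks are intended to run in the following order (a condensed sketch of the flow implemented below; locking and error handling omitted):

    /* condensed flow of the new hooks in the full-reset path */
    amdgpu_userq_pre_reset(adev);    /* unmap queues, mark them HUNG,
                                      * force-complete their fences */
    /* ... ASIC reset and IP block re-initialization ... */
    r = amdgpu_userq_post_reset(adev, vram_lost);  /* re-map HUNG queues
                                                    * if VRAM survived */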
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 200 +++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
4 files changed, 203 insertions(+), 11 deletions(-)
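Note for reviewers: the map/unmap helpers now share a common error path, roughly as follows (a simplified sketch, not part of the applied diff):

    r = userq_funcs->unmap(uq_mgr, queue);
    if (r) {
            /* the queue is considered wedged; escalate via the
             * userq reset work and wait for recovery to finish */
            queue->state = AMDGPU_USERQ_STATE_HUNG;
            amdgpu_userq_gpu_reset(adev);
    } else {
            queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
    }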
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ef3af170dda4..9db05cdc7304 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1302,6 +1302,7 @@ struct amdgpu_device {
struct list_head userq_mgr_list;
struct mutex userq_mutex;
bool userq_halt_for_enforce_isolation;
+ struct work_struct userq_reset_work;
};
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3757634613c3..1dc88b0055dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+ INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
adev->gfx.gfx_off_req_count = 1;
adev->gfx.gfx_off_residency = 0;
@@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
+ r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+ if (r)
+ goto out;
+
drm_client_dev_resume(adev_to_drm(tmp_adev), false);
/*
@@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
+ cancel_work(&adev->userq_reset_work);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);
@@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
+ amdgpu_userq_pre_reset(tmp_adev);
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index aac0de86f3e8..0c91302162fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -26,7 +26,10 @@
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
+
#include "amdgpu.h"
+#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_userq_fence.h"
@@ -44,6 +47,92 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
return userq_ip_mask;
}
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->userq_reset_work);
+ /* Wait for the reset job to complete */
+ flush_work(&adev->userq_reset_work);
+ }
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_gfx_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
+ const struct amdgpu_userq_funcs *userq_compute_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
+ const struct amdgpu_userq_funcs *userq_sdma_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
+
+ bool has_gfx = false, has_compute = false, has_sdma = false;
+ struct amdgpu_usermode_queue *userq;
+ bool gpu_reset = false;
+ int gpu_suspend = -1, id, r = 0; /* -1 = MES not suspended */
+
+ if (idr_is_empty(&uq_mgr->userq_idr))
+ return 0;
+
+ /* Detect which types of queues are present */
+ idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
+ switch (userq->queue_type) {
+ case AMDGPU_RING_TYPE_GFX:
+ has_gfx = true;
+ break;
+ case AMDGPU_RING_TYPE_COMPUTE:
+ has_compute = true;
+ break;
+ case AMDGPU_RING_TYPE_SDMA:
+ has_sdma = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ } else if (amdgpu_gpu_recovery) {
+ if (has_compute && userq_compute_funcs && userq_compute_funcs->detect_and_reset) {
+ gpu_suspend = amdgpu_mes_suspend(adev);
+ r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
+ if (r) {
+ gpu_reset = true;
+ goto out;
+ }
+ }
+
+ if (has_gfx && userq_gfx_funcs && userq_gfx_funcs->detect_and_reset) {
+ r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
+ if (r) {
+ gpu_reset = true;
+ goto out;
+ }
+ }
+
+ if (has_sdma && userq_sdma_funcs && userq_sdma_funcs->detect_and_reset) {
+ r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
+ if (r) {
+ gpu_reset = true;
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ if (!gpu_suspend && has_compute)
+ amdgpu_mes_resume(adev);
+
+ return r;
+}
+
static int
amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
@@ -51,15 +140,22 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->unmap(uq_mgr, queue);
- if (r)
+ if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
- else
+ gpu_reset = true;
+ } else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+ }
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -70,16 +166,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
r = userq_funcs->map(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ gpu_reset = true;
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -318,6 +420,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
amdgpu_bo_unreserve(queue->db_obj.obj);
}
amdgpu_bo_unref(&queue->db_obj.obj);
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
mutex_unlock(&uq_mgr->userq_mutex);
@@ -691,6 +794,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
int queue_id;
int ret = 0, r;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
@@ -703,6 +807,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
return ret;
}
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+ struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+ userq_reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -729,22 +850,19 @@ void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
{
- int ret;
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ int ret;
/* Wait for any pending userqueue fence work to finish */
ret = amdgpu_userq_wait_for_signal(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
ret = amdgpu_userq_evict_all(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -785,6 +903,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
mutex_lock(&adev->userq_mutex);
mutex_lock(&userq_mgr->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(userq_mgr);
idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -818,6 +937,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
cancel_delayed_work_sync(&uqm->resume_work);
mutex_lock(&uqm->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(uqm);
idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
r = amdgpu_userq_unmap_helper(uqm, queue);
if (r)
@@ -874,6 +994,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
cancel_delayed_work_sync(&uqm->resume_work);
mutex_lock(&uqm->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(uqm);
idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
(queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
@@ -922,3 +1043,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
mutex_unlock(&adev->userq_mutex);
return ret;
}
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+ const struct amdgpu_userq_funcs *userq_funcs;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm, *tmp;
+ int queue_id;
+
+ list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+ cancel_delayed_work_sync(&uqm->resume_work);
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ amdgpu_userq_wait_for_last_fence(uqm, queue);
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ userq_funcs->unmap(uqm, queue);
+ /* Just mark all queues as hung at this point; if the
+ * unmap succeeded, they can be mapped again in
+ * amdgpu_userq_post_reset() provided VRAM was not lost.
+ */
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_fence_driver_force_completion(queue);
+ }
+ }
+ }
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+ /* Queues were unmapped and marked AMDGPU_USERQ_STATE_HUNG in
+ * amdgpu_userq_pre_reset(); if VRAM was not lost, they can be
+ * mapped again here and continue.
+ */
+ struct amdgpu_userq_mgr *uqm;
+ struct amdgpu_usermode_queue *queue;
+ const struct amdgpu_userq_funcs *userq_funcs;
+ int queue_id, r = 0;
+
+ list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+
+ r = userq_funcs->map(uqm, queue); /* re-map the queue */
+ if (r) {
+ dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
+ continue;
+ }
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+ }
+
+ return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 68e46d01bed2..c136d7e7a763 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -139,4 +139,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
#endif
--
2.49.0