[v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API
Jesse.Zhang
Jesse.Zhang at amd.com
Mon Aug 4 08:38:31 UTC 2025
Replace the queue remove/add approach with suspend/resume semantics
for user queue preemption. This change:
1. Maintains queue scheduling registration while only preempting execution
   - The previously used remove_queue/add_queue path fully deregistered queues
   - The new suspend/resume approach keeps scheduler state while preempting
2. Introduces proper preemption helpers (see the illustrative sketch below):
   - amdgpu_userqueue_preempt_helper(): Suspends queue execution
     - Transitions MAPPED→PREEMPTED state on success
     - Marks the queue as HUNG and runs queue detection/reset on failure
   - amdgpu_userqueue_restore_helper(): Resumes queue execution
     - Transitions PREEMPTED→MAPPED state on success
     - Triggers a GPU reset on failure
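For reviewers skimming the change, the following is a minimal, self-contained
userspace sketch of the state machine the two helpers implement. It is not
driver code: the enum, struct and fake_preempt()/fake_restore() stubs are
stand-ins for the real AMDGPU_USERQ_STATE_* values and the amdgpu_userq_funcs
->preempt()/->restore() hooks, and the detect-and-reset / GPU-reset escalation
paths are reduced to comments.

/*
 * Illustrative only: a stand-alone model of the preempt/restore state
 * machine.  Names prefixed "fake_" and the simplified error handling are
 * placeholders, not part of the patch below.
 */
#include <stdio.h>

enum userq_state {
	USERQ_STATE_UNMAPPED,
	USERQ_STATE_MAPPED,
	USERQ_STATE_PREEMPTED,
	USERQ_STATE_HUNG,
};

struct userq {
	enum userq_state state;
};

/* Stand-ins for the IP-specific ->preempt()/->restore() callbacks. */
static int fake_preempt(struct userq *q) { (void)q; return 0; }
static int fake_restore(struct userq *q) { (void)q; return 0; }

/* Models amdgpu_userqueue_preempt_helper(): only MAPPED queues are touched. */
static int preempt_helper(struct userq *q)
{
	int r = 0;

	if (q->state == USERQ_STATE_MAPPED) {
		r = fake_preempt(q);
		/* On failure the real helper also runs queue detection/reset. */
		q->state = r ? USERQ_STATE_HUNG : USERQ_STATE_PREEMPTED;
	}
	return r;
}

/* Models amdgpu_userqueue_restore_helper(): only PREEMPTED queues resume. */
static int restore_helper(struct userq *q)
{
	int r = 0;

	if (q->state == USERQ_STATE_PREEMPTED) {
		r = fake_restore(q);
		/* On failure the real helper escalates to a GPU reset. */
		q->state = r ? USERQ_STATE_HUNG : USERQ_STATE_MAPPED;
	}
	return r;
}

int main(void)
{
	struct userq q = { .state = USERQ_STATE_MAPPED };

	preempt_helper(&q);	/* MAPPED -> PREEMPTED, scheduler state kept */
	restore_helper(&q);	/* PREEMPTED -> MAPPED */
	printf("final state: %d\n", q.state);
	return 0;
}

As the diff below shows, amdgpu_userq_evict_all() and amdgpu_userq_restore_all()
switch from the unmap/map helpers to these preempt/restore helpers, so eviction
no longer deregisters the queues from the scheduler.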
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 64 +++++++++++++++++++----
1 file changed, 53 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0c91302162fa..3a8da1f47159 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -72,7 +72,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
bool has_gfx = false, has_compute = false, has_sdma = false;
struct amdgpu_usermode_queue *userq;
bool gpu_reset = false;
- int gpu_suspend, id, r = 0;
+ int id, r = 0;
if (idr_is_empty(&uq_mgr->userq_idr))
return false;
@@ -98,7 +98,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
dev_err(adev->dev, "userq reset disabled by debug mask\n");
} else if (amdgpu_gpu_recovery) {
if (has_compute && userq_compute_funcs->detect_and_reset) {
- gpu_suspend = amdgpu_mes_suspend(adev);
r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
if (r) {
gpu_reset = true;
@@ -127,9 +126,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
if (gpu_reset)
amdgpu_userq_gpu_reset(adev);
- if ((!gpu_suspend) && has_compute)
- amdgpu_mes_resume(adev);
-
return r;
}
@@ -143,7 +139,8 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
bool gpu_reset = false;
int r = 0;
- if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
+ (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
r = userq_funcs->unmap(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
@@ -185,6 +182,54 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
return r;
}
+static int
+amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_funcs =
+ adev->userq_funcs[queue->queue_type];
+ int r = 0;
+
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ r = userq_funcs->preempt(uq_mgr, queue);
+ if (r) {
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ } else {
+ queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
+ }
+ }
+
+ return r;
+}
+
+static int
+amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_funcs =
+ adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
+ int r = 0;
+
+ if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
+ r = userq_funcs->restore(uq_mgr, queue);
+ if (r) {
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ gpu_reset = true;
+ } else {
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ return r;
+}
+
static void
amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
@@ -639,7 +684,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
/* Resume all the queues for this process */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
- r = amdgpu_userq_map_helper(uq_mgr, queue);
+ r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
if (r)
ret = r;
}
@@ -794,10 +839,9 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
int queue_id;
int ret = 0, r;
- amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
- r = amdgpu_userq_unmap_helper(uq_mgr, queue);
+ r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
if (r)
ret = r;
}
@@ -900,7 +944,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
uint32_t queue_id;
cancel_delayed_work_sync(&userq_mgr->resume_work);
-
mutex_lock(&adev->userq_mutex);
mutex_lock(&userq_mgr->userq_mutex);
amdgpu_userq_detect_and_reset_queues(userq_mgr);
@@ -909,7 +952,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
amdgpu_userq_unmap_helper(userq_mgr, queue);
amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
}
-
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
if (uqm == userq_mgr) {
list_del(&uqm->list);
--
2.49.0