[v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API
Alex Deucher
alexdeucher at gmail.com
Mon Aug 4 17:22:52 UTC 2025
On Mon, Aug 4, 2025 at 4:41 AM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
>
> Replace the queue remove/add approach with suspend/resume semantics
> for user queue preemption. This change:
>
> 1. Maintains queue scheduling registration while only preempting execution
>    - The previous remove_queue/add_queue approach fully deregistered queues
>    - The new suspend/resume approach keeps scheduler state while the queue
>      is preempted
>
> 2. Introduces proper preemption helpers:
>    - amdgpu_userqueue_preempt_helper(): suspends queue execution
>      - Transitions MAPPED→PREEMPTED state on success
>      - Marks the queue as HUNG and triggers a queue reset on failure
>    - amdgpu_userqueue_restore_helper(): resumes queue execution
>      - Transitions PREEMPTED→MAPPED state on success
>      - Marks the queue as HUNG and triggers a GPU reset on failure
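For reference, the transitions described above amount to a small per-queue
state machine. The sketch below is a standalone model of it (the state names
mirror the AMDGPU_USERQ_STATE_* values used in the patch; everything else is
illustrative only, not driver code):

/*
 * Standalone model of the user queue state transitions described in the
 * commit message. Illustrative only; not driver code.
 */
#include <stdio.h>

enum userq_state {
	USERQ_STATE_UNMAPPED,
	USERQ_STATE_MAPPED,
	USERQ_STATE_PREEMPTED,
	USERQ_STATE_HUNG,
};

/* Preempt: only a MAPPED queue can be preempted; failure marks it HUNG. */
static enum userq_state model_preempt(enum userq_state s, int err)
{
	if (s != USERQ_STATE_MAPPED)
		return s;
	return err ? USERQ_STATE_HUNG : USERQ_STATE_PREEMPTED;
}

/* Restore: only a PREEMPTED queue can be restored; failure marks it HUNG
 * (and, in the driver, also triggers a GPU reset). */
static enum userq_state model_restore(enum userq_state s, int err)
{
	if (s != USERQ_STATE_PREEMPTED)
		return s;
	return err ? USERQ_STATE_HUNG : USERQ_STATE_MAPPED;
}

int main(void)
{
	enum userq_state s = USERQ_STATE_MAPPED;

	s = model_preempt(s, 0);	/* MAPPED -> PREEMPTED */
	s = model_restore(s, 0);	/* PREEMPTED -> MAPPED */
	printf("final state: %d (MAPPED)\n", s);
	return 0;
}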
I would move the preempt/restore patches to the start of the series.
Use preempt/restore for all of the cases where we need to preempt the
queues and only use map/unmap for device init/fini and system
suspend/resume.
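A rough sketch of that split (the types, helper names, and call sites below
are stand-ins for illustration only, not the actual driver functions):

/*
 * Sketch of which paths would use preempt/restore vs. map/unmap.
 * All types and helpers here are placeholders.
 */
struct uq_mgr;	/* stand-in for struct amdgpu_userq_mgr */

static int preempt_helper(struct uq_mgr *m) { (void)m; return 0; }
static int restore_helper(struct uq_mgr *m) { (void)m; return 0; }
static int unmap_helper(struct uq_mgr *m)   { (void)m; return 0; }
static int map_helper(struct uq_mgr *m)     { (void)m; return 0; }

/* Runtime paths that only need to preempt execution (e.g. eviction and the
 * later restore of a process's queues): keep scheduler registration intact. */
int runtime_evict(struct uq_mgr *m)   { return preempt_helper(m); }
int runtime_restore(struct uq_mgr *m) { return restore_helper(m); }

/* Device init/fini and system suspend/resume: fully unmap/map instead. */
int device_fini(struct uq_mgr *m)     { return unmap_helper(m); }
int device_resume(struct uq_mgr *m)   { return map_helper(m); }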
Alex
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 64 +++++++++++++++++++----
> 1 file changed, 53 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0c91302162fa..3a8da1f47159 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -72,7 +72,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> bool has_gfx = false, has_compute = false, has_sdma = false;
> struct amdgpu_usermode_queue *userq;
> bool gpu_reset = false;
> - int gpu_suspend, id, r = 0;
> + int id, r = 0;
>
> if (idr_is_empty(&uq_mgr->userq_idr))
> return false;
> @@ -98,7 +98,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> dev_err(adev->dev, "userq reset disabled by debug mask\n");
> } else if (amdgpu_gpu_recovery) {
> if (has_compute && userq_compute_funcs->detect_and_reset) {
> - gpu_suspend = amdgpu_mes_suspend(adev);
> r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
> if (r) {
> gpu_reset = true;
> @@ -127,9 +126,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> if (gpu_reset)
> amdgpu_userq_gpu_reset(adev);
>
> - if ((!gpu_suspend) && has_compute)
> - amdgpu_mes_resume(adev);
> -
> return r;
> }
>
> @@ -143,7 +139,8 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
> bool gpu_reset = false;
> int r = 0;
>
> - if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> + (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> r = userq_funcs->unmap(uq_mgr, queue);
> if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> @@ -185,6 +182,54 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
> return r;
> }
>
> +static int
> +amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + const struct amdgpu_userq_funcs *userq_funcs =
> + adev->userq_funcs[queue->queue_type];
> + int r = 0;
> +
> + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + r = userq_funcs->preempt(uq_mgr, queue);
> + if (r) {
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + } else {
> + queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
> + }
> + }
> +
> + return r;
> +}
> +
> +static int
> +amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + const struct amdgpu_userq_funcs *userq_funcs =
> + adev->userq_funcs[queue->queue_type];
> + bool gpu_reset = false;
> + int r = 0;
> +
> + if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
> + r = userq_funcs->restore(uq_mgr, queue);
> + if (r) {
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + gpu_reset = true;
> + } else {
> + queue->state = AMDGPU_USERQ_STATE_MAPPED;
> + }
> + }
> +
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> + return r;
> +}
> +
> static void
> amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_usermode_queue *queue)
> @@ -639,7 +684,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>
> /* Resume all the queues for this process */
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> - r = amdgpu_userq_map_helper(uq_mgr, queue);
> + r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
> if (r)
> ret = r;
> }
> @@ -794,10 +839,9 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> int queue_id;
> int ret = 0, r;
>
> - amdgpu_userq_detect_and_reset_queues(uq_mgr);
> /* Try to unmap all the queues in this process ctx */
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> - r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> + r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
> if (r)
> ret = r;
> }
> @@ -900,7 +944,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> uint32_t queue_id;
>
> cancel_delayed_work_sync(&userq_mgr->resume_work);
> -
> mutex_lock(&adev->userq_mutex);
> mutex_lock(&userq_mgr->userq_mutex);
> amdgpu_userq_detect_and_reset_queues(userq_mgr);
> @@ -909,7 +952,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> amdgpu_userq_unmap_helper(userq_mgr, queue);
> amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
> }
> -
> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> if (uqm == userq_mgr) {
> list_del(&uqm->list);
> --
> 2.49.0
>