[PATCH 14/14] drm/amd/amdgpu: Improve compute queue reset handling

Jesse.Zhang Jesse.Zhang at amd.com
Fri May 30 09:00:15 UTC 2025


This commit enhances compute queue reset reliability by:

1. Verifying that a compute queue is actually hung before resetting it
   - Checks for an HQD address match to confirm the hang
   - Returns early if no hang is detected
   - Prevents unnecessary resets of healthy queues

2. Suspending and resuming MES around the compute queue reset
   - Suspends MES before resetting a compute queue
   - Resumes MES after the reset attempt if the suspend succeeded
   - Prevents potential race conditions during reset

3. Enabling the MMIO path for compute queue resets
   - Sets the use_mmio flag in the MES reset input for compute queues

Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 18 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/mes_userqueue.c |  3 ++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 028989e1538c..6d3597244ac4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -142,11 +142,25 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
 	bool gpu_reset = false;
+	bool gpu_suspend = false;
 	int r;
 
 	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
 		dev_err(adev->dev, "userq reset disabled by debug mask\n");
 	} else if (amdgpu_gpu_recovery && userq_funcs->reset) {
+		if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+			if (!amdgpu_userqueue_detect_hang(uq_mgr, queue)) {
+				dev_err(adev->dev, "userq not detected hang\n");
+				return true;
+			}
+
+			r = amdgpu_mes_suspend(adev);
+			if (!r) {
+				dev_err(adev->dev, "userq suspend gangs from MES succeeded\n");
+				gpu_suspend = true;
+			}
+		}
+
 		r = userq_funcs->reset(uq_mgr, queue);
 		if (r) {
 			dev_err(adev->dev, "userq reset failed\n");
@@ -157,6 +171,10 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
 			amdgpu_userq_fence_driver_force_completion(queue);
 			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
 		}
+
+		if (gpu_suspend)
+			amdgpu_mes_resume(adev);
+
 	} else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
 		gpu_reset = true;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 2b5bd3691766..997b25f9fe45 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -360,7 +360,8 @@ static int mes_userq_reset(struct amdgpu_userq_mgr *uq_mgr,
 	queue_input.queue_type = queue->queue_type;
 	if (queue->queue_type == AMDGPU_RING_TYPE_GFX)
 		queue_input.hang_detect_then_reset = true;
-
+	else if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE)
+		queue_input.use_mmio = true;
 	amdgpu_mes_lock(&adev->mes);
 	r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input);
 	amdgpu_mes_unlock(&adev->mes);
-- 
2.49.0



More information about the amd-gfx mailing list