[PATCH 14/14] drm/amd/amdgpu: Improve compute queue reset handling
Jesse.Zhang
Jesse.Zhang at amd.com
Fri May 30 09:00:15 UTC 2025
This commit improves the reliability of compute queue resets by:
1. Adding hang-detection verification before a compute queue reset
   - Checks for an HQD address match to confirm an actual hang
   - Returns early if no hang is detected
   - Prevents unnecessary resets of healthy queues
2. Implementing MES suspend/resume around the compute queue reset
   - Suspends MES before resetting a compute queue
   - Resumes MES after the reset completes
   - Prevents potential race conditions during the reset
3. Enabling the MMIO path for compute queue resets
   - Sets the use_mmio flag in the MES reset input for compute queues
Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 18 ++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 3 ++-
2 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 028989e1538c..6d3597244ac4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -142,11 +142,25 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
bool gpu_reset = false;
+ bool gpu_suspend = false;
int r;
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "userq reset disabled by debug mask\n");
} else if (amdgpu_gpu_recovery && userq_funcs->reset) {
+ if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+ if (!amdgpu_userqueue_detect_hang(uq_mgr, queue)) {
+ dev_err(adev->dev, "userq not detected hang\n");
+ return true;
+ }
+
+ r = amdgpu_mes_suspend(adev);
+ if (!r) {
+ dev_err(adev->dev, "userq suspend gangs from MES succeeded\n");
+ gpu_suspend = true;
+ }
+ }
+
r = userq_funcs->reset(uq_mgr, queue);
if (r) {
dev_err(adev->dev, "userq reset failed\n");
@@ -157,6 +171,10 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
amdgpu_userq_fence_driver_force_completion(queue);
drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
}
+
+ if (gpu_suspend)
+ amdgpu_mes_resume(adev);
+
} else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
gpu_reset = true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 2b5bd3691766..997b25f9fe45 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -360,7 +360,8 @@ static int mes_userq_reset(struct amdgpu_userq_mgr *uq_mgr,
queue_input.queue_type = queue->queue_type;
if (queue->queue_type == AMDGPU_RING_TYPE_GFX)
queue_input.hang_detect_then_reset = true;
-
+ else if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE)
+ queue_input.use_mmio = true;
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
--
2.49.0
More information about the amd-gfx mailing list