[PATCH 13/14] drm/amd/amdgpu: Implement usermode queue hang detection

Jesse.Zhang Jesse.Zhang at amd.com
Fri May 30 09:00:14 UTC 2025


This commit introduces hang detection infrastructure for usermode queues by:

1. Adding userq_hang_info structure to track:
   - Queue identification (ME/MEC, pipe, queue)
   - VMID and XCC ID
   - Queue GPU address

2. Implementing amdgpu_userqueue_detect_hang() which:
   - Scans active hardware queues to find matching HQD addresses
   - Supports both GFX and Compute queue types
   - Identifies each hardware queue by its ME/MEC, pipe and queue indices
   - Stores found queue information in hang_info structure

3. Integrating hang detection with the existing queue structures by:
   - Adding a hang_info member to struct amdgpu_usermode_queue
   - Maintaining compatibility with the existing reset mechanisms

The implementation:
- Matches hardware queue organization (ME->pipe->queue)
- Uses existing MQD functions for HQD address lookup
- Provides all necessary information for targeted resets

 Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 79 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 12 ++++
 2 files changed, 91 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index b7b1c26067f5..028989e1538c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -55,6 +55,85 @@ static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
 					     &adev->userq_reset_work);
 }
 
+/**
+ * amdgpu_userqueue_detect_hang - locate the hardware queue backing a user queue
+ * @uqm: user queue manager; only uqm->adev is read here
+ * @queue: usermode queue to look up; queue->hang_info is filled in on a match
+ *
+ * Walks every GFX (me/pipe/queue) or compute (mec/pipe/queue) HQD slot, asks
+ * the per-IP hqd_get_pq_addr() callback for the slot's address and compares
+ * it with the queue's hqd_base_gpu_addr.
+ *
+ * Return: true if a matching HQD was found, false otherwise.
+ */
+static bool amdgpu_userqueue_detect_hang(struct amdgpu_userq_mgr *uqm,
+					 struct amdgpu_usermode_queue *queue)
+{
+	struct amdgpu_device *adev = uqm->adev;
+	struct userq_hang_info *hang_info = &queue->hang_info;
+	struct amdgpu_mqd *mqd_hw_default = &adev->mqds[queue->queue_type];
+	int queue_type = queue->queue_type;
+	uint64_t hqd_pq_base = queue->userq_prop->hqd_base_gpu_addr;
+	uint64_t hqd_addr;
+	uint32_t mec, me, pipe, q, vmid = 0;
+
+	/* Without the lookup callback nothing can be matched. */
+	if (!mqd_hw_default->hqd_get_pq_addr)
+		return false;
+
+	switch (queue_type) {
+	case AMDGPU_HW_IP_GFX:
+		for (me = 0; me < adev->gfx.me.num_me; me++) {
+			for (q = 0; q < adev->gfx.me.num_queue_per_pipe; q++) {
+				for (pipe = 0; pipe < adev->gfx.me.num_pipe_per_me; pipe++) {
+					hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev, queue_type,
+						me, pipe, q, 0, &vmid);
+					/* Skip slots that report no address or a different one */
+					if (!hqd_addr || hqd_addr != hqd_pq_base)
+						continue;
+					hang_info->me = me;
+					hang_info->pipe = pipe;
+					hang_info->queue = q;
+					hang_info->xcc_id = 0;
+					hang_info->queue_address = hqd_addr;
+					hang_info->vmid = vmid;
+					return true;
+				}
+			}
+		}
+		break;
+	case AMDGPU_HW_IP_COMPUTE:
+		for (mec = 0; mec < adev->gfx.mec.num_mec; mec++) {
+			/* mec0 is me1; keep the loop counter intact so no MEC is skipped */
+			me = mec + 1;
+			for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+				for (pipe = 0; pipe < adev->gfx.mec.num_pipe_per_mec; pipe++) {
+					hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev, queue_type,
+									me, pipe, q, 0, &vmid);
+					if (!hqd_addr || hqd_addr != hqd_pq_base)
+						continue;
+					hang_info->mec = me;
+					hang_info->pipe = pipe;
+					hang_info->queue = q;
+					hang_info->xcc_id = 0;
+					hang_info->queue_address = hqd_addr;
+					hang_info->vmid = vmid;
+					return true;
+				}
+			}
+		}
+		break;
+	case AMDGPU_HW_IP_DMA:
+	case AMDGPU_HW_IP_VCN_ENC:
+	case AMDGPU_HW_IP_VPE:
+	default:
+		/* These queue types are not yet supported in hang detection */
+		break;
+	}
+
+	return false;
+}
+
 static bool
 amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
 				struct amdgpu_usermode_queue *queue)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index cdc3dcc62dce..0b33df8f0793 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -47,6 +47,16 @@ struct amdgpu_userq_obj {
 	struct amdgpu_bo *obj;
 };
 
+struct userq_hang_info {	/* HQD slot matched to a user queue by hang detection */
+	int me;			/* GFX ME index of the matching HQD */
+	int mec;		/* compute ME id of the matching HQD (mec0 is me1) */
+	int pipe;		/* pipe index within the ME/MEC */
+	int queue;		/* queue index within the pipe */
+	int xcc_id;		/* XCC id; NOTE(review): confirm who fills this in */
+	int vmid;		/* VMID reported by hqd_get_pq_addr() for the match */
+	uint64_t queue_address;	/* HQD address that matched hqd_base_gpu_addr */
+};
+
 struct amdgpu_usermode_queue {
 	int			queue_type;
 	enum amdgpu_userq_state state;
@@ -66,6 +76,8 @@ struct amdgpu_usermode_queue {
 	u32			xcp_id;
 	int			priority;
 	uint64_t		generation;
+	/* for per-queue reset support */
+	struct userq_hang_info hang_info;
 };
 
 struct amdgpu_userq_funcs {
-- 
2.49.0



More information about the amd-gfx mailing list