[PATCH 13/14] drm/amd/amdgpu: Implement usermode queue hang detection
Jesse.Zhang
Jesse.Zhang at amd.com
Fri May 30 09:00:14 UTC 2025
This commit introduces hang detection infrastructure for usermode queues by:
1. Adding userq_hang_info structure to track:
- Queue identification (ME/MEC, pipe, queue)
- VMID and XCC ID
- Queue GPU address
2. Implementing amdgpu_userqueue_detect_hang() which:
- Scans active hardware queues to find matching HQD addresses
- Supports both GFX and Compute queue types
- Properly handles queue identification hierarchy
- Stores found queue information in hang_info structure
3. Integrating hang detection with existing queue structures:
- Adding hang_info to amdgpu_usermode_queue
- Maintaining compatibility with existing reset mechanisms
The implementation:
- Matches hardware queue organization (ME->pipe->queue)
- Uses existing MQD functions for HQD address lookup
- Provides all necessary information for targeted resets
Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 79 +++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 12 ++++
2 files changed, 91 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index b7b1c26067f5..028989e1538c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -55,6 +55,85 @@ static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
&adev->userq_reset_work);
}
+/**
+ * amdgpu_userqueue_detect_hang - locate the hardware queue backing a user queue
+ * @uqm: user queue manager; supplies the amdgpu device to scan
+ * @queue: usermode queue whose hardware queue slot should be located
+ *
+ * Walks every GFX or compute hardware queue slot and compares the address
+ * reported by the per-IP hqd_get_pq_addr() callback with the base address
+ * recorded in the queue properties.  On a match, the slot coordinates, the
+ * VMID and the address are saved in @queue->hang_info.
+ *
+ * NOTE(review): the sixth callback argument is always passed as 0; it is
+ * presumably the XCC id - confirm against the callback prototype.
+ * hang_info->xcc_id is not written here.
+ *
+ * Return: true if a matching HQD was found and hang_info was filled in,
+ * false otherwise (including queue types that are not supported yet and a
+ * missing lookup callback).
+ */
+static bool amdgpu_userqueue_detect_hang(struct amdgpu_userq_mgr *uqm,
+					 struct amdgpu_usermode_queue *queue)
+{
+	struct amdgpu_device *adev = uqm->adev;
+	struct userq_hang_info *hang_info = &queue->hang_info;
+	struct amdgpu_mqd *mqd_hw_default = &adev->mqds[queue->queue_type];
+	int queue_type = queue->queue_type;
+	uint64_t hqd_pq_base = queue->userq_prop->hqd_base_gpu_addr;
+	uint64_t hqd_addr;
+	uint32_t mec, me, pipe, q;
+	/* Initialised so a callback that leaves it untouched stores 0, not garbage. */
+	uint32_t vmid = 0;
+
+	/* Without a lookup callback there is nothing to compare against. */
+	if (!mqd_hw_default->hqd_get_pq_addr)
+		return false;
+
+	switch (queue_type) {
+	case AMDGPU_HW_IP_GFX:
+		for (me = 0; me < adev->gfx.me.num_me; me++) {
+			for (q = 0; q < adev->gfx.me.num_queue_per_pipe; q++) {
+				for (pipe = 0; pipe < adev->gfx.me.num_pipe_per_me; pipe++) {
+					hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev, queue_type,
+							me, pipe, q, 0, &vmid);
+					/* A zero address never matches, even a zero base. */
+					if (!hqd_addr || hqd_addr != hqd_pq_base)
+						continue;
+
+					hang_info->me = me;
+					hang_info->pipe = pipe;
+					hang_info->queue = q;
+					hang_info->queue_address = hqd_addr;
+					hang_info->vmid = vmid;
+					return true;
+				}
+			}
+		}
+		break;
+	case AMDGPU_HW_IP_COMPUTE:
+		for (mec = 0; mec < adev->gfx.mec.num_mec; mec++) {
+			/*
+			 * mec0 is me1.  Derive the ME id in a separate variable:
+			 * incrementing the loop counter in the body made it
+			 * advance by two per iteration, so with two MECs the
+			 * second one was never scanned.
+			 */
+			me = mec + 1;
+			for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+				for (pipe = 0; pipe < adev->gfx.mec.num_pipe_per_mec; pipe++) {
+					hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev, queue_type,
+							me, pipe, q, 0, &vmid);
+					/* A zero address never matches, even a zero base. */
+					if (!hqd_addr || hqd_addr != hqd_pq_base)
+						continue;
+
+					/* Stored in ME numbering, i.e. the value used for the lookup. */
+					hang_info->mec = me;
+					hang_info->pipe = pipe;
+					hang_info->queue = q;
+					hang_info->queue_address = hqd_addr;
+					hang_info->vmid = vmid;
+					return true;
+				}
+			}
+		}
+		break;
+	case AMDGPU_HW_IP_DMA:
+	case AMDGPU_HW_IP_VCN_ENC:
+	case AMDGPU_HW_IP_VPE:
+	default:
+		/* These queue types are not yet supported in hang detection */
+		break;
+	}
+
+	return false;
+}
+
static bool
amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index cdc3dcc62dce..0b33df8f0793 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -47,6 +47,16 @@ struct amdgpu_userq_obj {
struct amdgpu_bo *obj;
};
+/*
+ * Location of the hardware queue (HQD) found to back a usermode queue.
+ * Filled in by amdgpu_userqueue_detect_hang() when a matching HQD address
+ * is found.
+ */
+struct userq_hang_info {
+ /* ME index of the matching slot; written for GFX queues */
+ int me;
+ /* MEC identifier of the matching slot; written for compute queues */
+ int mec;
+ /* pipe index of the matching slot */
+ int pipe;
+ /* queue index within the pipe */
+ int queue;
+ /* XCC id; not written by the detection code in this patch */
+ int xcc_id;
+ /* VMID reported by the HQD address lookup for the matching slot */
+ int vmid;
+ /* address read back from the matching HQD */
+ uint64_t queue_address;
+};
+
struct amdgpu_usermode_queue {
int queue_type;
enum amdgpu_userq_state state;
@@ -66,6 +76,8 @@ struct amdgpu_usermode_queue {
u32 xcp_id;
int priority;
uint64_t generation;
+ /* for per-queue reset support */
+ struct userq_hang_info hang_info;
};
struct amdgpu_userq_funcs {
--
2.49.0
More information about the amd-gfx
mailing list