[PATCH i-g-t 2/3] lib/amdgpu: enhance command submission and error handling with user queue support
Jesse.Zhang
Jesse.Zhang at amd.com
Thu May 8 06:07:05 UTC 2025
This patch introduces several improvements to AMDGPU command submission
and error handling infrastructure:
1. Fixed typos in time_out field initialization (rint_context -> ring_context)
across multiple helper functions.
2. Enhanced bad_access_helper() to support user queues (see the sketch after this list):
- Added a user_queue parameter to select the submission path
- Implemented separate handling for the user queue and legacy paths; the context-creation assert now applies only to the legacy path, since r is never set when a user queue is created
- Added proper time_out initialization
- Switched to syncobj-based buffer allocation (amdgpu_bo_alloc_and_map_sync)
- Added a timeline syncobj wait after the user queue buffer mapping
- Destroyed the user queue before freeing pm4 and ring_context, which are now released on both paths
3. Updated bad_access_ring_helper() to:
- Accept a user_queue parameter and pass it through to bad_access_helper()
- Skip the per-ring iteration for user queues, which are scheduled by hardware
- Skip user queue testing when the IP major version is below 11
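For reference, a minimal sketch (not part of the diff) of the user-queue
setup sequence bad_access_helper() now follows; ctx stands in for the
ring_context pointer, dev for the device handle, and error paths are elided:

    /* create the hardware-scheduled user queue for this IP */
    amdgpu_user_queue_create(dev, ctx, ip_type);

    /* allocation and VM mapping signal a timeline syncobj point */
    r = amdgpu_bo_alloc_and_map_sync(dev,
            ctx->write_length * sizeof(uint32_t),
            4096, AMDGPU_GEM_DOMAIN_GTT,
            AMDGPU_GEM_CREATE_CPU_GTT_USWC,
            AMDGPU_VM_MTYPE_UC,
            &ctx->bo, (void **)&ctx->bo_cpu,
            &ctx->bo_mc, &ctx->va_handle,
            ctx->timeline_syncobj_handle,
            ++ctx->point, true /* user_queue */);
    igt_assert_eq(r, 0);

    /* wait for the mapping before the CPU touches the buffer */
    r = amdgpu_timeline_syncobj_wait(dev, ctx->timeline_syncobj_handle,
                                     ctx->point);
    igt_assert_eq(r, 0);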
Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>
Cc: Sunil Khatri <sunil.khatri at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Cc: Alexander Deucher <alexander.deucher at amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
---
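Note for reviewers: a hypothetical caller-side sketch of the updated entry
point; the test body and the cmd_error value are illustrative, not taken
from this series:

    /* legacy path: iterates rings via the debugfs sched mask */
    bad_access_ring_helper(device, CMD_STREAM_TRANS_BAD_MEM_ADDRESS,
                           AMD_IP_GFX, &pci, false);

    /* user queue path: one hardware-scheduled submission; the helper
     * returns early when hw_ip_version_major < 11
     */
    bad_access_ring_helper(device, CMD_STREAM_TRANS_BAD_MEM_ADDRESS,
                           AMD_IP_GFX, &pci, true);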
lib/amdgpu/amd_deadlock_helpers.c | 54 ++++++++++++++++++++++++-------
lib/amdgpu/amd_deadlock_helpers.h | 2 +-
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
index c8a48930e..f52bf70e5 100644
--- a/lib/amdgpu/amd_deadlock_helpers.c
+++ b/lib/amdgpu/amd_deadlock_helpers.c
@@ -12,6 +12,7 @@
#include <signal.h>
#include "amd_memory.h"
#include "amd_deadlock_helpers.h"
+#include "lib/amdgpu/amd_userq.h"
#include "lib/amdgpu/amd_command_submission.h"
#define MAX_JOB_COUNT 200
@@ -274,7 +275,7 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
static void
bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
- unsigned int ip_type, uint32_t priority)
+ unsigned int ip_type, uint32_t priority, bool user_queue)
{
const struct amdgpu_ip_block_version *ip_block = NULL;
@@ -287,10 +288,14 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	ring_context = calloc(1, sizeof(*ring_context));
 	igt_assert(ring_context);
-	if (priority == AMDGPU_CTX_PRIORITY_HIGH)
-		r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
-	else
-		r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
-	igt_assert_eq(r, 0);
+	if (user_queue) {
+		amdgpu_user_queue_create(device_handle, ring_context, ip_type);
+	} else {
+		if (priority == AMDGPU_CTX_PRIORITY_HIGH)
+			r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
+		else
+			r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
+		igt_assert_eq(r, 0);
+	}
 	/* setup parameters */
@@ -299,16 +304,28 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	ring_context->pm4_size = pm4_dw;
 	ring_context->res_cnt = 1;
 	ring_context->ring_id = 0;
+	ring_context->user_queue = user_queue;
+	ring_context->time_out = 0x7ffff;
 	igt_assert(ring_context->pm4);
 	ip_block = get_ip_block(device_handle, ip_type);
-	r = amdgpu_bo_alloc_and_map(device_handle,
+	r = amdgpu_bo_alloc_and_map_sync(device_handle,
 				    ring_context->write_length * sizeof(uint32_t),
 				    4096, AMDGPU_GEM_DOMAIN_GTT,
-				    AMDGPU_GEM_CREATE_CPU_GTT_USWC, &ring_context->bo,
+				    AMDGPU_GEM_CREATE_CPU_GTT_USWC,
+				    AMDGPU_VM_MTYPE_UC,
+				    &ring_context->bo,
 				    (void **)&ring_context->bo_cpu,
 				    &ring_context->bo_mc,
-				    &ring_context->va_handle);
+				    &ring_context->va_handle,
+				    ring_context->timeline_syncobj_handle,
+				    ++ring_context->point, user_queue);
 	igt_assert_eq(r, 0);
+	if (user_queue) {
+		r = amdgpu_timeline_syncobj_wait(device_handle,
+						 ring_context->timeline_syncobj_handle,
+						 ring_context->point);
+		igt_assert_eq(r, 0);
+	}
 	memset((void *)ring_context->bo_cpu, 0, ring_context->write_length * sizeof(uint32_t));
 	ring_context->resources[0] = ring_context->bo;
@@ -320,8 +337,12 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle, ring_context->bo_mc,
 				 ring_context->write_length * sizeof(uint32_t));
+	/* the user queue must be destroyed before ring_context is freed */
+	if (user_queue)
+		amdgpu_user_queue_destroy(device_handle, ring_context, ip_block->type);
+
 	free(ring_context->pm4);
 	free(ring_context);
 }
#define MAX_DMABUF_COUNT 0x20000
@@ -419,7 +440,7 @@ amdgpu_hang_sdma_helper(amdgpu_device_handle device_handle, uint8_t hang_type)
free_cmd_base(base_cmd);
}
-void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci)
+void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue)
{
int r;
FILE *fp;
@@ -436,6 +457,15 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
if (!info.available_rings)
igt_info("SKIP ... as there's no ring for ip %d\n", ip_type);
+	if (user_queue) {
+		if (info.hw_ip_version_major < 11) {
+			igt_info("SKIP ... as user queues are not supported on ip %d\n", ip_type);
+			return;
+		}
+		/* no need to iterate each ring; user queues are scheduled by hardware */
+		bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
+		return;
+	}
support_page = is_support_page_queue(ip_type, pci);
if (ip_type == AMD_IP_GFX)
snprintf(sysfs, sizeof(sysfs) - 1, "/sys/kernel/debug/dri/%04x:%02x:%02x.%01x/amdgpu_gfx_sched_mask",
@@ -505,7 +535,7 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
igt_assert_eq(r, 0);
}
- bad_access_helper(device_handle, cmd_error, ip_type, prio);
+ bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
}
/* recover the sched mask */
diff --git a/lib/amdgpu/amd_deadlock_helpers.h b/lib/amdgpu/amd_deadlock_helpers.h
index 1d654c490..fdbeb409c 100644
--- a/lib/amdgpu/amd_deadlock_helpers.h
+++ b/lib/amdgpu/amd_deadlock_helpers.h
@@ -29,7 +29,7 @@
void
amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int ip_type, struct pci_addr *pci);
void
-bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci);
+bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue);
void
amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t hang_type, struct pci_addr *pci);
--
2.49.0