[PATCH i-g-t 2/3] lib/amdgpu: enhance command submission and error handling with user queue support
Jesse.Zhang
Jesse.Zhang at amd.com
Thu May 8 06:07:05 UTC 2025
This patch introduces several improvements to AMDGPU command submission
and error handling infrastructure:
1. Fixed typos in time_out field initialization (rint_context -> ring_context)
across multiple helper functions.
2. Enhanced bad_access_helper() to support user queues (see the sketch after this list):
- Added a user_queue parameter to select the submission path
- Implemented separate handling for the user queue and legacy paths; the context-creation assert now applies only to the legacy path, since r is never set when a user queue is created
- Added proper time_out initialization
- Switched to syncobj-based buffer allocation (amdgpu_bo_alloc_and_map_sync)
- Added a timeline syncobj wait after the user queue buffer mapping
- Destroyed the user queue before freeing pm4 and ring_context, which are now released on both paths
3. Updated bad_access_ring_helper() to:
- Accept a user_queue parameter and pass it through to bad_access_helper()
- Skip the per-ring iteration for user queues, which are scheduled by hardware
- Skip user queue testing when the IP major version is below 11
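For reference, a minimal sketch (not part of the diff) of the user-queue
setup sequence bad_access_helper() now follows; ctx stands in for the
ring_context pointer, dev for the device handle, and error paths are elided:

    /* create the hardware-scheduled user queue for this IP */
    amdgpu_user_queue_create(dev, ctx, ip_type);

    /* allocation and VM mapping signal a timeline syncobj point */
    r = amdgpu_bo_alloc_and_map_sync(dev,
            ctx->write_length * sizeof(uint32_t),
            4096, AMDGPU_GEM_DOMAIN_GTT,
            AMDGPU_GEM_CREATE_CPU_GTT_USWC,
            AMDGPU_VM_MTYPE_UC,
            &ctx->bo, (void **)&ctx->bo_cpu,
            &ctx->bo_mc, &ctx->va_handle,
            ctx->timeline_syncobj_handle,
            ++ctx->point, true /* user_queue */);
    igt_assert_eq(r, 0);

    /* wait for the mapping before the CPU touches the buffer */
    r = amdgpu_timeline_syncobj_wait(dev, ctx->timeline_syncobj_handle,
                                     ctx->point);
    igt_assert_eq(r, 0);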
Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>
Cc: Sunil Khatri <sunil.khatri at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Cc: Alexander Deucher <alexander.deucher at amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
---
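Note for reviewers: a hypothetical caller-side sketch of the updated entry
point; the test body and the cmd_error value are illustrative, not taken
from this series:

    /* legacy path: iterates rings via the debugfs sched mask */
    bad_access_ring_helper(device, CMD_STREAM_TRANS_BAD_MEM_ADDRESS,
                           AMD_IP_GFX, &pci, false);

    /* user queue path: one hardware-scheduled submission; the helper
     * returns early when hw_ip_version_major < 11
     */
    bad_access_ring_helper(device, CMD_STREAM_TRANS_BAD_MEM_ADDRESS,
                           AMD_IP_GFX, &pci, true);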
lib/amdgpu/amd_deadlock_helpers.c | 54 ++++++++++++++++++++++++-------
lib/amdgpu/amd_deadlock_helpers.h | 2 +-
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
index c8a48930e..f52bf70e5 100644
--- a/lib/amdgpu/amd_deadlock_helpers.c
+++ b/lib/amdgpu/amd_deadlock_helpers.c
@@ -12,6 +12,7 @@
#include <signal.h>
#include "amd_memory.h"
#include "amd_deadlock_helpers.h"
+#include "lib/amdgpu/amd_userq.h"
#include "lib/amdgpu/amd_command_submission.h"
#define MAX_JOB_COUNT 200
@@ -274,7 +275,7 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
static void
bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
- unsigned int ip_type, uint32_t priority)
+ unsigned int ip_type, uint32_t priority, bool user_queue)
{
const struct amdgpu_ip_block_version *ip_block = NULL;
@@ -287,10 +288,14 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	ring_context = calloc(1, sizeof(*ring_context));
 	igt_assert(ring_context);
-	if (priority == AMDGPU_CTX_PRIORITY_HIGH)
-		r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
-	else
-		r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
-	igt_assert_eq(r, 0);
+	if (user_queue) {
+		amdgpu_user_queue_create(device_handle, ring_context, ip_type);
+	} else {
+		if (priority == AMDGPU_CTX_PRIORITY_HIGH)
+			r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
+		else
+			r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
+		igt_assert_eq(r, 0);
+	}
 	/* setup parameters */
@@ -299,16 +304,28 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	ring_context->pm4_size = pm4_dw;
 	ring_context->res_cnt = 1;
 	ring_context->ring_id = 0;
+	ring_context->user_queue = user_queue;
+	ring_context->time_out = 0x7ffff;
 	igt_assert(ring_context->pm4);
 	ip_block = get_ip_block(device_handle, ip_type);
-	r = amdgpu_bo_alloc_and_map(device_handle,
+	r = amdgpu_bo_alloc_and_map_sync(device_handle,
 				    ring_context->write_length * sizeof(uint32_t),
 				    4096, AMDGPU_GEM_DOMAIN_GTT,
-				    AMDGPU_GEM_CREATE_CPU_GTT_USWC, &ring_context->bo,
+				    AMDGPU_GEM_CREATE_CPU_GTT_USWC,
+				    AMDGPU_VM_MTYPE_UC,
+				    &ring_context->bo,
 				    (void **)&ring_context->bo_cpu,
 				    &ring_context->bo_mc,
-				    &ring_context->va_handle);
+				    &ring_context->va_handle,
+				    ring_context->timeline_syncobj_handle,
+				    ++ring_context->point, user_queue);
 	igt_assert_eq(r, 0);
+	if (user_queue) {
+		r = amdgpu_timeline_syncobj_wait(device_handle,
+						 ring_context->timeline_syncobj_handle,
+						 ring_context->point);
+		igt_assert_eq(r, 0);
+	}
 	memset((void *)ring_context->bo_cpu, 0, ring_context->write_length * sizeof(uint32_t));
 	ring_context->resources[0] = ring_context->bo;
@@ -320,8 +337,12 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
 	amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle, ring_context->bo_mc,
 				 ring_context->write_length * sizeof(uint32_t));
+	/* the user queue must be destroyed before ring_context is freed */
+	if (user_queue)
+		amdgpu_user_queue_destroy(device_handle, ring_context, ip_block->type);
+
 	free(ring_context->pm4);
 	free(ring_context);
 }
#define MAX_DMABUF_COUNT 0x20000
@@ -419,7 +440,7 @@ amdgpu_hang_sdma_helper(amdgpu_device_handle device_handle, uint8_t hang_type)
free_cmd_base(base_cmd);
}
-void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci)
+void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue)
{
int r;
FILE *fp;
@@ -436,6 +457,15 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
if (!info.available_rings)
igt_info("SKIP ... as there's no ring for ip %d\n", ip_type);
+	if (user_queue) {
+		if (info.hw_ip_version_major < 11) {
+			igt_info("SKIP ... as user queues are not supported on ip %d\n", ip_type);
+			return;
+		}
+		/* no need to iterate each ring; user queues are scheduled by hardware */
+		bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
+		return;
+	}
support_page = is_support_page_queue(ip_type, pci);
if (ip_type == AMD_IP_GFX)
snprintf(sysfs, sizeof(sysfs) - 1, "/sys/kernel/debug/dri/%04x:%02x:%02x.%01x/amdgpu_gfx_sched_mask",
@@ -505,7 +535,7 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
igt_assert_eq(r, 0);
}
- bad_access_helper(device_handle, cmd_error, ip_type, prio);
+ bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
}
/* recover the sched mask */
diff --git a/lib/amdgpu/amd_deadlock_helpers.h b/lib/amdgpu/amd_deadlock_helpers.h
index 1d654c490..fdbeb409c 100644
--- a/lib/amdgpu/amd_deadlock_helpers.h
+++ b/lib/amdgpu/amd_deadlock_helpers.h
@@ -29,7 +29,7 @@
void
amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int ip_type, struct pci_addr *pci);
void
-bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci);
+bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue);
void
amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t hang_type, struct pci_addr *pci);
--
2.49.0